{ "id": 23, "title": { "en": "Advanced Ingestion Pipeline", "de": "Erweiterte Ingestion Pipeline", "zh": "编排复杂的 Ingestion Pipeline" }, "description": { "en": "This template demonstrates how to use an LLM to generate summaries, keywords, Q&A, and metadata for each chunk to support diverse retrieval needs.", "de": "Diese Vorlage demonstriert, wie ein LLM verwendet wird, um Zusammenfassungen, Schlüsselwörter, Fragen & Antworten und Metadaten für jedes Segment zu generieren, um vielfältige Abrufanforderungen zu unterstützen.", "zh": "此模板演示如何利用大模型为切片生成摘要、关键词、问答及元数据,以满足多样化的召回需求。" }, "canvas_type": "Ingestion Pipeline", "canvas_category": "dataflow_canvas", "dsl": { "components": { "File": { "obj": { "component_name": "File", "params": {} }, "downstream": [ "Parser:HipSignsRhyme" ], "upstream": [] }, "Parser:HipSignsRhyme": { "obj": { "component_name": "Parser", "params": { "outputs": { "html": { "type": "string", "value": "" }, "json": { "type": "Array", "value": [] }, "markdown": { "type": "string", "value": "" }, "text": { "type": "string", "value": "" } }, "setups": { "pdf": { "output_format": "markdown", "suffix": [ "pdf" ], "parse_method": "DeepDOC" }, "spreadsheet": { "output_format": "html", "suffix": [ "xls", "xlsx", "csv" ] }, "image": { "output_format": "text", "suffix": [ "jpg", "jpeg", "png", "gif" ], "parse_method": "ocr" }, "email": { "output_format": "text", "suffix": [ "eml", "msg" ], "fields": [ "from", "to", "cc", "bcc", "date", "subject", "body", "attachments" ] }, "text&markdown": { "output_format": "text", "suffix": [ "md", "markdown", "mdx", "txt" ] }, "word": { "output_format": "json", "suffix": [ "doc", "docx" ] }, "slides": { "output_format": "json", "suffix": [ "pptx" ] } } } }, "downstream": [ "Splitter:KindDingosJam" ], "upstream": [ "File" ] }, "Splitter:KindDingosJam": { "obj": { "component_name": "Splitter", "params": { "chunk_token_size": 512, "delimiters": [ "\n" ], "outputs": { "chunks": { "type": "Array", "value": [] } }, 
"overlapped_percent": 0.002 } }, "downstream": [ "Extractor:NineTiesSin" ], "upstream": [ "Parser:HipSignsRhyme" ] }, "Extractor:NineTiesSin": { "obj": { "component_name": "Extractor", "params": { "field_name": "summary", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": [ { "content": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", "role": "user" } ], "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. 
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 } }, "downstream": [ "Extractor:TastyPointsLay" ], "upstream": [ "Splitter:KindDingosJam" ] }, "Extractor:TastyPointsLay": { "obj": { "component_name": "Extractor", "params": { "field_name": "keywords", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": [ { "content": "Text Content:\n{Splitter:KindDingosJam@chunks}\n", "role": "user" } ], "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 } }, "downstream": [ "Extractor:BlueResultsWink" ], "upstream": [ "Extractor:NineTiesSin" ] }, "Extractor:BlueResultsWink": { "obj": { "component_name": "Extractor", "params": { "field_name": "questions", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": [ { "content": "Text Content:\n\n{Splitter:KindDingosJam@chunks}\n", "role": "user" } ], "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the 
text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 } }, "downstream": [ "Extractor:CuteBusesBet" ], "upstream": [ "Extractor:TastyPointsLay" ] }, "Extractor:CuteBusesBet": { "obj": { "component_name": "Extractor", "params": { "field_name": "metadata", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": [ { "content": "Content: \n\n{Splitter:KindDingosJam@chunks}", "role": "user" } ], "sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 } }, "downstream": [ "Tokenizer:LegalHorsesCheer" ], "upstream": [ "Extractor:BlueResultsWink" ] }, "Tokenizer:LegalHorsesCheer": { "obj": { "component_name": "Tokenizer", "params": { "fields": "text", "filename_embd_weight": 0.1, "outputs": {}, "search_method": [ "embedding", "full_text" ] } }, "downstream": [], "upstream": [ "Extractor:CuteBusesBet" ] } }, "globals": {}, "graph": { "nodes": [ { "data": { "label": "File", "name": "File" }, "dragging": false, "id": "File", "measured": { "height": 48, "width": 200 }, "position": { "x": -301.4128436198721, "y": 375.86728431988394 }, "selected": false, "sourcePosition": "left", "targetPosition": "right", "type": "beginNode" }, { "data": { "form": { "outputs": { "html": { "type": "string", "value": "" }, "json": { "type": "Array", "value": 
[] }, "markdown": { "type": "string", "value": "" }, "text": { "type": "string", "value": "" } }, "setups": [ { "fileFormat": "pdf", "output_format": "markdown", "parse_method": "DeepDOC" }, { "fileFormat": "spreadsheet", "output_format": "html" }, { "fileFormat": "image", "output_format": "text", "parse_method": "ocr" }, { "fields": [ "from", "to", "cc", "bcc", "date", "subject", "body", "attachments" ], "fileFormat": "email", "output_format": "text" }, { "fileFormat": "text&markdown", "output_format": "text" }, { "fileFormat": "word", "output_format": "json" }, { "fileFormat": "slides", "output_format": "json" } ] }, "label": "Parser", "name": "Parser" }, "dragging": false, "id": "Parser:HipSignsRhyme", "measured": { "height": 56, "width": 200 }, "position": { "x": -297.12089864837964, "y": 532.2084591689336 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "parserNode" }, { "data": { "form": { "chunk_token_size": 512, "delimiters": [ { "value": "\n" } ], "outputs": { "chunks": { "type": "Array", "value": [] } }, "overlapped_percent": 0.2 }, "label": "Splitter", "name": "Token Chunker" }, "dragging": false, "id": "Splitter:KindDingosJam", "measured": { "height": 80, "width": 200 }, "position": { "x": 7.288275851418206, "y": 371.19722568785704 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "splitterNode" }, { "data": { "form": { "field_name": "summary", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. 
Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 }, "label": "Extractor", "name": "Summarization" }, "dragging": false, "id": "Extractor:NineTiesSin", "measured": { "height": 84, "width": 200 }, "position": { "x": 9.537168313582939, "y": 461.26662127765564 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "contextNode" }, { "data": { "form": { "field_name": "keywords", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": "Text Content:\n{Splitter:KindDingosJam@chunks}\n", "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 }, "label": "Extractor", "name": "Auto Keywords" }, "dragging": false, "id": "Extractor:TastyPointsLay", "measured": { "height": 84, "width": 200 }, "position": { "x": 7.473032067783009, "y": 533.0519245332371 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "contextNode" }, { "data": { "form": { "field_name": "questions", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, 
"llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": "Text Content:\n\n{Splitter:KindDingosJam@chunks}\n", "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 }, "label": "Extractor", "name": "Auto Questions" }, "dragging": false, "id": "Extractor:BlueResultsWink", "measured": { "height": 84, "width": 200 }, "position": { "x": 2.905601749296892, "y": 617.0420857433816 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "contextNode" }, { "data": { "form": { "field_name": "metadata", "frequencyPenaltyEnabled": false, "frequency_penalty": 0.7, "llm_id": "deepseek-chat@DeepSeek", "maxTokensEnabled": false, "max_tokens": 256, "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, "prompts": "Content: \n\n{Splitter:KindDingosJam@chunks}", "sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. 
If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.", "temperature": 0.1, "temperatureEnabled": false, "topPEnabled": false, "top_p": 0.3 }, "label": "Extractor", "name": "Generate Metadata" }, "dragging": false, "id": "Extractor:CuteBusesBet", "measured": { "height": 84, "width": 200 }, "position": { "x": 327.16477358029204, "y": 374.11630810111944 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "contextNode" }, { "data": { "form": { "fields": "text", "filename_embd_weight": 0.1, "outputs": {}, "search_method": [ "embedding", "full_text" ] }, "label": "Tokenizer", "name": "Indexer" }, "dragging": false, "id": "Tokenizer:LegalHorsesCheer", "measured": { "height": 120, "width": 200 }, "position": { "x": 345.50155210663667, "y": 533.0511852267863 }, "selected": false, "sourcePosition": "right", "targetPosition": "left", "type": "tokenizerNode" }, { "id": "Note:CruelSidesStick", "type": "noteNode", "position": { "x": -29, "y": 765 }, "data": { "label": "Note", "name": "Add more attributes", "form": { "text": "Use an LLM to generate summaries, keywords, Q&A, and metadata for each chunk." 
} }, "sourcePosition": "right", "targetPosition": "left", "dragHandle": ".note-drag-handle", "measured": { "width": 281, "height": 130 }, "width": 281, "height": 130, "resizing": false } ], "edges": [ { "data": { "isHovered": false }, "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", "source": "File", "sourceHandle": "start", "target": "Parser:HipSignsRhyme", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Splitter:KindDingosJamstart-Extractor:NineTiesSinend", "source": "Splitter:KindDingosJam", "sourceHandle": "start", "target": "Extractor:NineTiesSin", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Extractor:NineTiesSinstart-Extractor:TastyPointsLayend", "source": "Extractor:NineTiesSin", "sourceHandle": "start", "target": "Extractor:TastyPointsLay", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Extractor:TastyPointsLaystart-Extractor:BlueResultsWinkend", "source": "Extractor:TastyPointsLay", "sourceHandle": "start", "target": "Extractor:BlueResultsWink", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Extractor:BlueResultsWinkstart-Extractor:CuteBusesBetend", "source": "Extractor:BlueResultsWink", "sourceHandle": "start", "target": "Extractor:CuteBusesBet", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Extractor:CuteBusesBetstart-Tokenizer:LegalHorsesCheerend", "source": "Extractor:CuteBusesBet", "sourceHandle": "start", "target": "Tokenizer:LegalHorsesCheer", "targetHandle": "end" }, { "data": { "isHovered": false }, "id": "xy-edge__Parser:HipSignsRhymestart-Splitter:KindDingosJamend", "markerEnd": "logo", "source": "Parser:HipSignsRhyme", "sourceHandle": "start", "style": { "stroke": "rgba(91, 93, 106, 1)", "strokeWidth": 1 }, "target": "Splitter:KindDingosJam", "targetHandle": "end", "type": "buttonEdge", "zIndex": 1001 } ] }, "history": [], "messages": [], "path": [], "retrieval": [] }, "avatar": "" }