Files
ragflow/agent/templates/advanced_ingestion_pipeline.json
LeonTung 67529825e2 Feat: Contribute ingestion pipeline templates (#10551)
### Type of change

- [x] Other (please describe): contribute agent templates
2025-10-14 21:29:42 +08:00

726 lines
41 KiB
JSON

{
"id": 23,
"title": {
"en": "Advanced Ingestion Pipeline",
"zh": "编排复杂的 Ingestion Pipeline"
},
"description": {
"en": "This template demonstrates how to use an LLM to generate summaries, keywords, questions, and metadata for each chunk to support diverse retrieval needs.",
"zh": "此模板演示如何利用大模型为切片生成摘要、关键词、问题及元数据,以满足多样化的召回需求。"
},
"canvas_type": "Ingestion Pipeline",
"canvas_category": "dataflow_canvas",
"dsl": {
"components": {
"File": {
"obj": {
"component_name": "File",
"params": {}
},
"downstream": [
"Parser:HipSignsRhyme"
],
"upstream": []
},
"Parser:HipSignsRhyme": {
"obj": {
"component_name": "Parser",
"params": {
"outputs": {
"html": {
"type": "string",
"value": ""
},
"json": {
"type": "Array<object>",
"value": []
},
"markdown": {
"type": "string",
"value": ""
},
"text": {
"type": "string",
"value": ""
}
},
"setups": {
"pdf": {
"output_format": "markdown",
"suffix": [
"pdf"
],
"parse_method": "DeepDOC"
},
"spreadsheet": {
"output_format": "html",
"suffix": [
"xls",
"xlsx",
"csv"
]
},
"image": {
"output_format": "text",
"suffix": [
"jpg",
"jpeg",
"png",
"gif"
],
"parse_method": "ocr"
},
"email": {
"output_format": "text",
"suffix": [
"eml",
"msg"
],
"fields": [
"from",
"to",
"cc",
"bcc",
"date",
"subject",
"body",
"attachments"
]
},
"text&markdown": {
"output_format": "text",
"suffix": [
"md",
"markdown",
"mdx",
"txt"
]
},
"word": {
"output_format": "json",
"suffix": [
"doc",
"docx"
]
},
"slides": {
"output_format": "json",
"suffix": [
"pptx"
]
}
}
}
},
"downstream": [
"Splitter:KindDingosJam"
],
"upstream": [
"File"
]
},
"Splitter:KindDingosJam": {
"obj": {
"component_name": "Splitter",
"params": {
"chunk_token_size": 512,
"delimiters": [
"\n"
],
"outputs": {
"chunks": {
"type": "Array<Object>",
"value": []
}
},
"overlapped_percent": 0.002
}
},
"downstream": [
"Extractor:NineTiesSin"
],
"upstream": [
"Parser:HipSignsRhyme"
]
},
"Extractor:NineTiesSin": {
"obj": {
"component_name": "Extractor",
"params": {
"field_name": "summary",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": [
{
"content": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}",
"role": "user"
}
],
"sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
}
},
"downstream": [
"Extractor:TastyPointsLay"
],
"upstream": [
"Splitter:KindDingosJam"
]
},
"Extractor:TastyPointsLay": {
"obj": {
"component_name": "Extractor",
"params": {
"field_name": "keywords",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": [
{
"content": "Text Content:\n{Splitter:KindDingosJam@chunks}\n",
"role": "user"
}
],
"sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
}
},
"downstream": [
"Extractor:BlueResultsWink"
],
"upstream": [
"Extractor:NineTiesSin"
]
},
"Extractor:BlueResultsWink": {
"obj": {
"component_name": "Extractor",
"params": {
"field_name": "questions",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": [
{
"content": "Text Content:\n\n{Splitter:KindDingosJam@chunks}\n",
"role": "user"
}
],
"sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
}
},
"downstream": [
"Extractor:CuteBusesBet"
],
"upstream": [
"Extractor:TastyPointsLay"
]
},
"Extractor:CuteBusesBet": {
"obj": {
"component_name": "Extractor",
"params": {
"field_name": "metadata",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": [
{
"content": "Content: \n\n{Splitter:KindDingosJam@chunks}",
"role": "user"
}
],
"sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
}
},
"downstream": [
"Tokenizer:LegalHorsesCheer"
],
"upstream": [
"Extractor:BlueResultsWink"
]
},
"Tokenizer:LegalHorsesCheer": {
"obj": {
"component_name": "Tokenizer",
"params": {
"fields": "text",
"filename_embd_weight": 0.1,
"outputs": {},
"search_method": [
"embedding",
"full_text"
]
}
},
"downstream": [],
"upstream": [
"Extractor:CuteBusesBet"
]
}
},
"globals": {},
"graph": {
"nodes": [
{
"data": {
"label": "File",
"name": "File"
},
"dragging": false,
"id": "File",
"measured": {
"height": 48,
"width": 200
},
"position": {
"x": -301.4128436198721,
"y": 375.86728431988394
},
"selected": false,
"sourcePosition": "left",
"targetPosition": "right",
"type": "beginNode"
},
{
"data": {
"form": {
"outputs": {
"html": {
"type": "string",
"value": ""
},
"json": {
"type": "Array<object>",
"value": []
},
"markdown": {
"type": "string",
"value": ""
},
"text": {
"type": "string",
"value": ""
}
},
"setups": [
{
"fileFormat": "pdf",
"output_format": "markdown",
"parse_method": "DeepDOC"
},
{
"fileFormat": "spreadsheet",
"output_format": "html"
},
{
"fileFormat": "image",
"output_format": "text",
"parse_method": "ocr"
},
{
"fields": [
"from",
"to",
"cc",
"bcc",
"date",
"subject",
"body",
"attachments"
],
"fileFormat": "email",
"output_format": "text"
},
{
"fileFormat": "text&markdown",
"output_format": "text"
},
{
"fileFormat": "word",
"output_format": "json"
},
{
"fileFormat": "slides",
"output_format": "json"
}
]
},
"label": "Parser",
"name": "Parser"
},
"dragging": false,
"id": "Parser:HipSignsRhyme",
"measured": {
"height": 56,
"width": 200
},
"position": {
"x": -297.12089864837964,
"y": 532.2084591689336
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "parserNode"
},
{
"data": {
"form": {
"chunk_token_size": 512,
"delimiters": [
{
"value": "\n"
}
],
"outputs": {
"chunks": {
"type": "Array<Object>",
"value": []
}
},
"overlapped_percent": 0.2
},
"label": "Splitter",
"name": "Token Chunker"
},
"dragging": false,
"id": "Splitter:KindDingosJam",
"measured": {
"height": 80,
"width": 200
},
"position": {
"x": 7.288275851418206,
"y": 371.19722568785704
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "splitterNode"
},
{
"data": {
"form": {
"field_name": "summary",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}",
"sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
},
"label": "Extractor",
"name": "Summarization"
},
"dragging": false,
"id": "Extractor:NineTiesSin",
"measured": {
"height": 84,
"width": 200
},
"position": {
"x": 9.537168313582939,
"y": 461.26662127765564
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "contextNode"
},
{
"data": {
"form": {
"field_name": "keywords",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": "Text Content:\n{Splitter:KindDingosJam@chunks}\n",
"sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
},
"label": "Extractor",
"name": "Auto Keywords"
},
"dragging": false,
"id": "Extractor:TastyPointsLay",
"measured": {
"height": 84,
"width": 200
},
"position": {
"x": 7.473032067783009,
"y": 533.0519245332371
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "contextNode"
},
{
"data": {
"form": {
"field_name": "questions",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": "Text Content:\n\n{Splitter:KindDingosJam@chunks}\n",
"sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
},
"label": "Extractor",
"name": "Auto Questions"
},
"dragging": false,
"id": "Extractor:BlueResultsWink",
"measured": {
"height": 84,
"width": 200
},
"position": {
"x": 2.905601749296892,
"y": 617.0420857433816
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "contextNode"
},
{
"data": {
"form": {
"field_name": "metadata",
"frequencyPenaltyEnabled": false,
"frequency_penalty": 0.7,
"llm_id": "deepseek-chat@DeepSeek",
"maxTokensEnabled": false,
"max_tokens": 256,
"outputs": {},
"presencePenaltyEnabled": false,
"presence_penalty": 0.4,
"prompts": "Content: \n\n{Splitter:KindDingosJam@chunks}",
"sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.",
"temperature": 0.1,
"temperatureEnabled": false,
"topPEnabled": false,
"top_p": 0.3
},
"label": "Extractor",
"name": "Generate Metadata"
},
"dragging": false,
"id": "Extractor:CuteBusesBet",
"measured": {
"height": 84,
"width": 200
},
"position": {
"x": 327.16477358029204,
"y": 374.11630810111944
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "contextNode"
},
{
"data": {
"form": {
"fields": "text",
"filename_embd_weight": 0.1,
"outputs": {},
"search_method": [
"embedding",
"full_text"
]
},
"label": "Tokenizer",
"name": "Indexer"
},
"dragging": false,
"id": "Tokenizer:LegalHorsesCheer",
"measured": {
"height": 120,
"width": 200
},
"position": {
"x": 345.50155210663667,
"y": 533.0511852267863
},
"selected": false,
"sourcePosition": "right",
"targetPosition": "left",
"type": "tokenizerNode"
},
{
"id": "Note:CruelSidesStick",
"type": "noteNode",
"position": {
"x": -29,
"y": 765
},
"data": {
"label": "Note",
"name": "Add more attributes",
"form": {
"text": "Using an LLM to generate summaries, keywords, questions, and metadata."
}
},
"sourcePosition": "right",
"targetPosition": "left",
"dragHandle": ".note-drag-handle",
"measured": {
"width": 281,
"height": 130
},
"width": 281,
"height": 130,
"resizing": false
}
],
"edges": [
{
"data": {
"isHovered": false
},
"id": "xy-edge__Filestart-Parser:HipSignsRhymeend",
"source": "File",
"sourceHandle": "start",
"target": "Parser:HipSignsRhyme",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Splitter:KindDingosJamstart-Extractor:NineTiesSinend",
"source": "Splitter:KindDingosJam",
"sourceHandle": "start",
"target": "Extractor:NineTiesSin",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Extractor:NineTiesSinstart-Extractor:TastyPointsLayend",
"source": "Extractor:NineTiesSin",
"sourceHandle": "start",
"target": "Extractor:TastyPointsLay",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Extractor:TastyPointsLaystart-Extractor:BlueResultsWinkend",
"source": "Extractor:TastyPointsLay",
"sourceHandle": "start",
"target": "Extractor:BlueResultsWink",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Extractor:BlueResultsWinkstart-Extractor:CuteBusesBetend",
"source": "Extractor:BlueResultsWink",
"sourceHandle": "start",
"target": "Extractor:CuteBusesBet",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Extractor:CuteBusesBetstart-Tokenizer:LegalHorsesCheerend",
"source": "Extractor:CuteBusesBet",
"sourceHandle": "start",
"target": "Tokenizer:LegalHorsesCheer",
"targetHandle": "end"
},
{
"data": {
"isHovered": false
},
"id": "xy-edge__Parser:HipSignsRhymestart-Splitter:KindDingosJamend",
"markerEnd": "logo",
"source": "Parser:HipSignsRhyme",
"sourceHandle": "start",
"style": {
"stroke": "rgba(91, 93, 106, 1)",
"strokeWidth": 1
},
"target": "Splitter:KindDingosJam",
"targetHandle": "end",
"type": "buttonEdge",
"zIndex": 1001
}
]
},
"history": [],
"messages": [],
"path": [],
"retrieval": []
},
"avatar": ""
}