diff --git a/agent/templates/advanced_ingestion_pipeline.json b/agent/templates/advanced_ingestion_pipeline.json new file mode 100644 index 000000000..57651558f --- /dev/null +++ b/agent/templates/advanced_ingestion_pipeline.json @@ -0,0 +1,726 @@ +{ + "id": 23, + "title": { + "en": "Advanced Ingestion Pipeline", + "zh": "编排复杂的 Ingestion Pipeline" + }, + "description": { + "en": "This template demonstrates how to use an LLM to generate summaries, keywords, Q&A, and metadata for each chunk to support diverse retrieval needs.", + "zh": "此模板演示如何利用大模型为切片生成摘要、关键词、问答及元数据,以满足多样化的召回需求。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "obj": { + "component_name": "File", + "params": {} + }, + "downstream": [ + "Parser:HipSignsRhyme" + ], + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "pdf": { + "output_format": "markdown", + "suffix": [ + "pdf" + ], + "parse_method": "DeepDOC" + }, + "spreadsheet": { + "output_format": "html", + "suffix": [ + "xls", + "xlsx", + "csv" + ] + }, + "image": { + "output_format": "text", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "parse_method": "ocr" + }, + "email": { + "output_format": "text", + "suffix": [ + "eml", + "msg" + ], + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ] + }, + "text&markdown": { + "output_format": "text", + "suffix": [ + "md", + "markdown", + "mdx", + "txt" + ] + }, + "word": { + "output_format": "json", + "suffix": [ + "doc", + "docx" + ] + }, + "slides": { + "output_format": "json", + "suffix": [ + "pptx" + ] + } + } + } + }, + "downstream": [ + "Splitter:KindDingosJam" + ], + 
"upstream": [ + "File" + ] + }, + "Splitter:KindDingosJam": { + "obj": { + "component_name": "Splitter", + "params": { + "chunk_token_size": 512, + "delimiters": [ + "\n" + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0.002 + } + }, + "downstream": [ + "Extractor:NineTiesSin" + ], + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Extractor:NineTiesSin": { + "obj": { + "component_name": "Extractor", + "params": { + "field_name": "summary", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": [ + { + "content": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", + "role": "user" + } + ], + "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. 
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + } + }, + "downstream": [ + "Extractor:TastyPointsLay" + ], + "upstream": [ + "Splitter:KindDingosJam" + ] + }, + "Extractor:TastyPointsLay": { + "obj": { + "component_name": "Extractor", + "params": { + "field_name": "keywords", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": [ + { + "content": "Text Content:\n{Splitter:KindDingosJam@chunks}\n", + "role": "user" + } + ], + "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + } + }, + "downstream": [ + "Extractor:BlueResultsWink" + ], + "upstream": [ + "Extractor:NineTiesSin" + ] + }, + "Extractor:BlueResultsWink": { + "obj": { + "component_name": "Extractor", + "params": { + "field_name": "questions", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": [ + { + "content": "Text Content:\n\n{Splitter:KindDingosJam@chunks}\n", + "role": "user" + } + ], + "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 
important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + } + }, + "downstream": [ + "Extractor:CuteBusesBet" + ], + "upstream": [ + "Extractor:TastyPointsLay" + ] + }, + "Extractor:CuteBusesBet": { + "obj": { + "component_name": "Extractor", + "params": { + "field_name": "metadata", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": [ + { + "content": "Content: \n\n{Splitter:KindDingosJam@chunks}", + "role": "user" + } + ], + "sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. 
If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + } + }, + "downstream": [ + "Tokenizer:LegalHorsesCheer" + ], + "upstream": [ + "Extractor:BlueResultsWink" + ] + }, + "Tokenizer:LegalHorsesCheer": { + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "downstream": [], + "upstream": [ + "Extractor:CuteBusesBet" + ] + } + }, + "globals": {}, + "graph": { + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "dragging": false, + "id": "File", + "measured": { + "height": 48, + "width": 200 + }, + "position": { + "x": -301.4128436198721, + "y": 375.86728431988394 + }, + "selected": false, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "markdown", + "parse_method": "DeepDOC" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text" + }, + { + "fileFormat": "text&markdown", + "output_format": "text" + }, + { + "fileFormat": "word", + "output_format": "json" + }, + { + "fileFormat": "slides", + "output_format": "json" + } + ] + }, + "label": "Parser", + 
"name": "Parser" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 56, + "width": 200 + }, + "position": { + "x": -297.12089864837964, + "y": 532.2084591689336 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "chunk_token_size": 512, + "delimiters": [ + { + "value": "\n" + } + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0.2 + }, + "label": "Splitter", + "name": "Token Chunker" + }, + "dragging": false, + "id": "Splitter:KindDingosJam", + "measured": { + "height": 80, + "width": 200 + }, + "position": { + "x": 7.288275851418206, + "y": 371.19722568785704 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "splitterNode" + }, + { + "data": { + "form": { + "field_name": "summary", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", + "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. 
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + }, + "label": "Extractor", + "name": "Summarization" + }, + "dragging": false, + "id": "Extractor:NineTiesSin", + "measured": { + "height": 84, + "width": 200 + }, + "position": { + "x": 9.537168313582939, + "y": 461.26662127765564 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "contextNode" + }, + { + "data": { + "form": { + "field_name": "keywords", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": "Text Content:\n{Splitter:KindDingosJam@chunks}\n", + "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nExtract the most important keywords/phrases of a given piece of text content.\n\nRequirements\n- Summarize the text content, and give the top 5 important keywords/phrases.\n- The keywords MUST be in the same language as the given piece of text content.\n- The keywords are delimited by ENGLISH COMMA.\n- Output keywords ONLY.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + }, + "label": "Extractor", + "name": "Auto Keywords" + }, + "dragging": false, + "id": "Extractor:TastyPointsLay", + "measured": { + "height": 84, + "width": 200 + }, + "position": { + "x": 7.473032067783009, + "y": 533.0519245332371 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "contextNode" + }, + { + "data": { + "form": { + "field_name": "questions", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": 
"Text Content:\n\n{Splitter:KindDingosJam@chunks}\n", + "sys_prompt": "Role\nYou are a text analyzer.\n\nTask\nPropose 3 questions about a given piece of text content.\n\nRequirements\n- Understand and summarize the text content, and propose the top 3 important questions.\n- The questions SHOULD NOT have overlapping meanings.\n- The questions SHOULD cover the main content of the text as much as possible.\n- The questions MUST be in the same language as the given piece of text content.\n- One question per line.\n- Output questions ONLY.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + }, + "label": "Extractor", + "name": "Auto Questions" + }, + "dragging": false, + "id": "Extractor:BlueResultsWink", + "measured": { + "height": 84, + "width": 200 + }, + "position": { + "x": 2.905601749296892, + "y": 617.0420857433816 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "contextNode" + }, + { + "data": { + "form": { + "field_name": "metadata", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": {}, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": "Content: \n\n{Splitter:KindDingosJam@chunks}", + "sys_prompt": "Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. 
If no important structured information is found, output an empty JSON object: {}.\n\nImportant structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + }, + "label": "Extractor", + "name": "Generate Metadata" + }, + "dragging": false, + "id": "Extractor:CuteBusesBet", + "measured": { + "height": 84, + "width": 200 + }, + "position": { + "x": 327.16477358029204, + "y": 374.11630810111944 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "contextNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer" + }, + "dragging": false, + "id": "Tokenizer:LegalHorsesCheer", + "measured": { + "height": 120, + "width": 200 + }, + "position": { + "x": 345.50155210663667, + "y": 533.0511852267863 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + }, + { + "id": "Note:CruelSidesStick", + "type": "noteNode", + "position": { + "x": -29, + "y": 765 + }, + "data": { + "label": "Note", + "name": "Add more attributes", + "form": { + "text": "Uses an LLM to generate summaries, keywords, Q&A, and metadata for each chunk."
+ } + }, + "sourcePosition": "right", + "targetPosition": "left", + "dragHandle": ".note-drag-handle", + "measured": { + "width": 281, + "height": 130 + }, + "width": 281, + "height": 130, + "resizing": false + } + ], + "edges": [ + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Splitter:KindDingosJamstart-Extractor:NineTiesSinend", + "source": "Splitter:KindDingosJam", + "sourceHandle": "start", + "target": "Extractor:NineTiesSin", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Extractor:NineTiesSinstart-Extractor:TastyPointsLayend", + "source": "Extractor:NineTiesSin", + "sourceHandle": "start", + "target": "Extractor:TastyPointsLay", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Extractor:TastyPointsLaystart-Extractor:BlueResultsWinkend", + "source": "Extractor:TastyPointsLay", + "sourceHandle": "start", + "target": "Extractor:BlueResultsWink", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Extractor:BlueResultsWinkstart-Extractor:CuteBusesBetend", + "source": "Extractor:BlueResultsWink", + "sourceHandle": "start", + "target": "Extractor:CuteBusesBet", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Extractor:CuteBusesBetstart-Tokenizer:LegalHorsesCheerend", + "source": "Extractor:CuteBusesBet", + "sourceHandle": "start", + "target": "Tokenizer:LegalHorsesCheer", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Parser:HipSignsRhymestart-Splitter:KindDingosJamend", + "markerEnd": "logo", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "style": { + "stroke": "rgba(91, 93, 106, 1)", + "strokeWidth": 1 + }, + "target": 
"Splitter:KindDingosJam", + "targetHandle": "end", + "type": "buttonEdge", + "zIndex": 1001 + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [] + }, + "avatar": "" +} \ No newline at end of file diff --git a/agent/templates/chunk_summary.json b/agent/templates/chunk_summary.json new file mode 100644 index 000000000..270d206c5 --- /dev/null +++ b/agent/templates/chunk_summary.json @@ -0,0 +1,493 @@ +{ + "id": 24, + "title": { + "en": "Chunk Summary", + "zh": "总结切片" + }, + "description": { + "en": "This template uses an LLM to generate chunk summaries for building text and vector indexes. During retrieval, summaries enhance matching, and the original chunks are returned as results.", + "zh": "此模板利用大模型生成切片摘要,并据此建立全文索引与向量。检索时以摘要提升匹配效果,最终召回对应的原文切片。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "obj": { + "component_name": "File", + "params": {} + }, + "downstream": [ + "Parser:HipSignsRhyme" + ], + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "pdf": { + "output_format": "json", + "suffix": [ + "pdf" + ], + "parse_method": "DeepDOC" + }, + "spreadsheet": { + "output_format": "html", + "suffix": [ + "xls", + "xlsx", + "csv" + ] + }, + "image": { + "output_format": "text", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "parse_method": "ocr" + }, + "email": { + "output_format": "text", + "suffix": [ + "eml", + "msg" + ], + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ] + }, + "text&markdown": { + "output_format": "text", + "suffix": [ + "md", + "markdown", + "mdx", + "txt" + ] + }, + "word": { + 
"output_format": "json", + "suffix": [ + "doc", + "docx" + ] + }, + "slides": { + "output_format": "json", + "suffix": [ + "pptx" + ] + } + } + } + }, + "downstream": [ + "Splitter:LateExpertsFeel" + ], + "upstream": [ + "File" + ] + }, + "Splitter:LateExpertsFeel": { + "obj": { + "component_name": "Splitter", + "params": { + "chunk_token_size": 512, + "delimiters": [ + "\n" + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0 + } + }, + "downstream": [ + "Extractor:YummyGhostsType" + ], + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:EightRocketsAppear": { + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "summary", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "downstream": [], + "upstream": [ + "Extractor:YummyGhostsType" + ] + }, + "Extractor:YummyGhostsType": { + "obj": { + "component_name": "Extractor", + "params": { + "field_name": "summary", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": [ + { + "content": "Text to Summarize:\n\n\n{Splitter:LateExpertsFeel@chunks}", + "role": "user" + } + ], + "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. 
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + } + }, + "downstream": [ + "Tokenizer:EightRocketsAppear" + ], + "upstream": [ + "Splitter:LateExpertsFeel" + ] + } + }, + "globals": {}, + "graph": { + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 48, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text" + }, + { + "fileFormat": "text&markdown", + "output_format": "text" + }, + { + "fileFormat": "word", + "output_format": "json" + }, + { + "fileFormat": "slides", + "output_format": "json" + } + ] + }, + "label": "Parser", + "name": "Parser" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 412, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "chunk_token_size": 512, + "delimiters": [ + { + "value": "\n" + } + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0 + }, + 
"label": "Splitter", + "name": "Token Chunker" + }, + "dragging": false, + "id": "Splitter:LateExpertsFeel", + "measured": { + "height": 80, + "width": 200 + }, + "position": { + "x": 600.5891036507014, + "y": 197.6804920892271 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "splitterNode" + }, + { + "data": { + "form": { + "fields": "summary", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer" + }, + "dragging": false, + "id": "Tokenizer:EightRocketsAppear", + "measured": { + "height": 120, + "width": 200 + }, + "position": { + "x": 1136.0745258879847, + "y": 202.22674640530906 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + }, + { + "data": { + "form": { + "field_name": "summary", + "frequencyPenaltyEnabled": false, + "frequency_penalty": 0.7, + "llm_id": "deepseek-chat@DeepSeek", + "maxTokensEnabled": false, + "max_tokens": 256, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "presencePenaltyEnabled": false, + "presence_penalty": 0.4, + "prompts": "Text to Summarize:\n\n\n{Splitter:LateExpertsFeel@chunks}", + "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4.
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", + "temperature": 0.1, + "temperatureEnabled": false, + "topPEnabled": false, + "top_p": 0.3 + }, + "label": "Extractor", + "name": "Transformer" + }, + "dragging": false, + "id": "Extractor:YummyGhostsType", + "measured": { + "height": 84, + "width": 200 + }, + "position": { + "x": 870.1728208672672, + "y": 201.4516837225608 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "contextNode" + }, + { + "id": "Note:MightyPandasWatch", + "type": "noteNode", + "position": { + "x": 1128.1996486833773, + "y": 342.4601052720091 + }, + "data": { + "label": "Note", + "name": "Index summary", + "form": { + "text": "Uses the chunk summaries to build both full-text and vector indexes." + } + }, + "sourcePosition": "right", + "targetPosition": "left", + "dragHandle": ".note-drag-handle", + "measured": { + "width": 249, + "height": 128 + }, + "selected": false, + "dragging": false + } + ], + "edges": [ + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Parser:HipSignsRhymestart-Splitter:LateExpertsFeelend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "Splitter:LateExpertsFeel", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Splitter:LateExpertsFeelstart-Extractor:YummyGhostsTypeend", + "source": "Splitter:LateExpertsFeel", + "sourceHandle": "start", + "target": "Extractor:YummyGhostsType", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Extractor:YummyGhostsTypestart-Tokenizer:EightRocketsAppearend", + "markerEnd": "logo", + "source": "Extractor:YummyGhostsType", + "sourceHandle": "start", + "style": { + "stroke": "rgba(91, 93, 106, 1)", +
"strokeWidth": 1 + }, + "target": "Tokenizer:EightRocketsAppear", + "targetHandle": "end", + "type": "buttonEdge", + "zIndex": 1001 + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [] + }, + "avatar": "" +} \ No newline at end of file diff --git a/agent/templates/title_chunker.json b/agent/templates/title_chunker.json new file mode 100644 index 000000000..7b8c8a745 --- /dev/null +++ b/agent/templates/title_chunker.json @@ -0,0 +1,369 @@ +{ + "id": 25, + "title": { + "en": "Title Chunker", + "zh": "标题切片" + }, + "description": { + "en": "This template slices the parsed file based on its title structure. It is ideal for documents with well-defined headings, such as product manuals, legal contracts, research reports, and academic papers.", + "zh": "此模板将解析后的文件按标题结构进行切片,适用于具有清晰标题层级的文档类型,如产品手册、合同法规、研究报告和学术论文等。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "obj": { + "component_name": "File", + "params": {} + }, + "downstream": [ + "Parser:HipSignsRhyme" + ], + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "pdf": { + "output_format": "json", + "suffix": [ + "pdf" + ], + "parse_method": "DeepDOC" + }, + "text&markdown": { + "output_format": "text", + "suffix": [ + "md", + "markdown", + "mdx", + "txt" + ] + }, + "word": { + "output_format": "json", + "suffix": [ + "doc", + "docx" + ] + } + } + } + }, + "downstream": [ + "HierarchicalMerger:BusyPoetsSearch" + ], + "upstream": [ + "File" + ] + }, + "Tokenizer:NeatRadiosEnd": { + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + 
"embedding", + "full_text" + ] + } + }, + "downstream": [], + "upstream": [ + "HierarchicalMerger:BusyPoetsSearch" + ] + }, + "HierarchicalMerger:BusyPoetsSearch": { + "obj": { + "component_name": "HierarchicalMerger", + "params": { + "hierarchy": 3, + "levels": [ + [ + "^#[^#]" + ], + [ + "^##[^#]" + ], + [ + "^###[^#]" + ], + [ + "^####[^#]" + ] + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + } + } + }, + "downstream": [ + "Tokenizer:NeatRadiosEnd" + ], + "upstream": [ + "Parser:HipSignsRhyme" + ] + } + }, + "globals": {}, + "graph": { + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 48, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC" + }, + { + "fileFormat": "text&markdown", + "output_format": "text" + }, + { + "fileFormat": "word", + "output_format": "json" + } + ] + }, + "label": "Parser", + "name": "Parser" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 204, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer" + }, + "dragging": false, + "id": "Tokenizer:NeatRadiosEnd", + "measured": { + "height": 120, + "width": 200 + }, + "position": { + "x": 
855.3572909622682, + "y": 199.08562542263914 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + }, + { + "data": { + "form": { + "hierarchy": "3", + "levels": [ + { + "expressions": [ + { + "expression": "^#[^#]" + } + ] + }, + { + "expressions": [ + { + "expression": "^##[^#]" + } + ] + }, + { + "expressions": [ + { + "expression": "^###[^#]" + } + ] + }, + { + "expressions": [ + { + "expression": "^####[^#]" + } + ] + } + ], + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + } + }, + "label": "HierarchicalMerger", + "name": "Title Chunker" + }, + "dragging": false, + "id": "HierarchicalMerger:BusyPoetsSearch", + "measured": { + "height": 80, + "width": 200 + }, + "position": { + "x": 587.0312356829183, + "y": 197.9169308584236 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "splitterNode" + }, + { + "data": { + "form": { + "text": "It is ideal for documents with well-defined headings, such as product manuals, legal contracts, research reports, and academic papers." 
+ }, + "label": "Note", + "name": "Chunk by Title" + }, + "dragHandle": ".note-drag-handle", + "dragging": false, + "height": 159, + "id": "Note:KhakiBerriesPick", + "measured": { + "height": 159, + "width": 323 + }, + "position": { + "x": 623.9675370532708, + "y": 369.74281927307146 + }, + "resizing": false, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "noteNode", + "width": 323 + } + ], + "edges": [ + { + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "id": "xy-edge__Parser:HipSignsRhymestart-HierarchicalMerger:BusyPoetsSearchend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "HierarchicalMerger:BusyPoetsSearch", + "targetHandle": "end", + "data": { + "isHovered": false + } + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__HierarchicalMerger:BusyPoetsSearchstart-Tokenizer:NeatRadiosEndend", + "markerEnd": "logo", + "source": "HierarchicalMerger:BusyPoetsSearch", + "sourceHandle": "start", + "style": { + "stroke": "rgba(91, 93, 106, 1)", + "strokeWidth": 1 + }, + "target": "Tokenizer:NeatRadiosEnd", + "targetHandle": "end", + "type": "buttonEdge", + "zIndex": 1001 + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [] + }, + "avatar": "" +} \ No newline at end of file