diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml index ec756a149..6b3cef80e 100644 --- a/conf/service_conf.yaml +++ b/conf/service_conf.yaml @@ -147,5 +147,3 @@ user_default_llm: # secret_id: 'tencent_secret_id' # secret_key: 'tencent_secret_key' # region: 'tencent_region' -# table_result_type: '1' -# markdown_image_response_type: '1' diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py index 920b6f1a1..8d704baed 100644 --- a/deepdoc/parser/tcadp_parser.py +++ b/deepdoc/parser/tcadp_parser.py @@ -192,12 +192,16 @@ class TencentCloudAPIClient: class TCADPParser(RAGFlowPdfParser): - def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"): + def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou", + table_result_type: str = None, markdown_image_response_type: str = None): super().__init__() # First initialize logger self.logger = logging.getLogger(self.__class__.__name__) + # Log received parameters + self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}") + # Priority: read configuration from RAGFlow configuration system (service_conf.yaml) try: tcadp_parser = get_base_config("tcadp_config", {}) @@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser): self.secret_id = secret_id or tcadp_parser.get("secret_id") self.secret_key = secret_key or tcadp_parser.get("secret_key") self.region = region or tcadp_parser.get("region", "ap-guangzhou") - self.table_result_type = tcadp_parser.get("table_result_type", "1") - self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1") - self.logger.info("[TCADP] Configuration read from service_conf.yaml") + # Set table_result_type and markdown_image_response_type from config or parameters + self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1") + self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1") + else: self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first") + # If config file is empty, use provided parameters or defaults + self.secret_id = secret_id + self.secret_key = secret_key + self.region = region or "ap-guangzhou" + self.table_result_type = table_result_type if table_result_type is not None else "1" + self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1" except ImportError: self.logger.info("[TCADP] Configuration module import failed") + # If config file is not available, use provided parameters or defaults + self.secret_id = secret_id + self.secret_key = secret_key + self.region = region or "ap-guangzhou" + self.table_result_type = table_result_type if table_result_type is not None else "1" + self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1" + + # Log final values + self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}") if not self.secret_id or not self.secret_key: raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml") @@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser): "TableResultType": self.table_result_type, "MarkdownImageResponseType": self.markdown_image_response_type } + + self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}") result = client.reconstruct_document_sse( file_type=file_type, diff --git a/docker/service_conf.yaml.template b/docker/service_conf.yaml.template index 14b1b71f9..fa85453ab 100644 --- a/docker/service_conf.yaml.template +++ b/docker/service_conf.yaml.template @@ -150,5 +150,3 @@ user_default_llm: # secret_id: '${TENCENT_SECRET_ID}' # secret_key: '${TENCENT_SECRET_KEY}' # region: '${TENCENT_REGION}' -# table_result_type: '1' -# markdown_image_response_type: '1' diff --git a/rag/app/naive.py b/rag/app/naive.py index 293e4a8b9..49dca17af 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No else: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese")) pdf_parser = VisionParser(vision_model=vision_model, **kwargs) - + sections, tables = pdf_parser( filename if not binary else binary, from_page=from_page, @@ -504,7 +504,7 @@ class Markdown(MarkdownParser): return images if images else None - def __call__(self, filename, binary=None, separate_tables=True,delimiter=None): + def __call__(self, filename, binary=None, separate_tables=True, delimiter=None): if binary: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") @@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, _SerializedRelationships.load_from_xml = load_from_xml_v2 sections, tables = Docx()(filename, binary) - tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs) + tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") @@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 - + res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - excel_parser = ExcelParser() - if parser_config.get("html4excel"): - sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] + + # Check if tcadp_parser is selected for spreadsheet files + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + if layout_recognizer == "TCADP Parser": + table_result_type = parser_config.get("table_result_type", "1") + markdown_image_response_type = parser_config.get("markdown_image_response_type", "1") + tcadp_parser = TCADPParser( + table_result_type=table_result_type, + markdown_image_response_type=markdown_image_response_type + ) + if not tcadp_parser.check_installation(): + callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") + return res + + # Determine file type based on extension + file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV" + + sections, tables = tcadp_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), + file_type=file_type + ) + parser_config["chunk_token_num"] = 0 + res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") else: - sections = [(_, "") for _ in excel_parser(binary) if _] - parser_config["chunk_token_num"] = 12800 + # Default DeepDOC parser + excel_parser = ExcelParser() + if parser_config.get("html4excel"): + sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] + else: + sections = [(_, "") for _ in excel_parser(binary) if _] + parser_config["chunk_token_num"] = 12800 elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") @@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) - sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?")) + sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?")) try: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index e3d95a470..2ba5cfa7b 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -16,6 +16,7 @@ import io import json import os import random +import re from functools import partial import trio @@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase): "output_format": "json", }, "spreadsheet": { + "parse_method": "deepdoc", # deepdoc/tcadp_parser "output_format": "html", "suffix": [ "xls", @@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase): "output_format": "json", }, "slides": { + "parse_method": "deepdoc", # deepdoc/tcadp_parser "suffix": [ "pptx", + "ppt" ], "output_format": "json", }, @@ -245,7 +249,12 @@ class Parser(ProcessBase): bboxes.append(box) elif conf.get("parse_method").lower() == "tcadp parser": # ADP is a document parsing tool using Tencent Cloud API - tcadp_parser = TCADPParser() + table_result_type = conf.get("table_result_type", "1") + markdown_image_response_type = conf.get("markdown_image_response_type", "1") + tcadp_parser = TCADPParser( + table_result_type=table_result_type, + markdown_image_response_type=markdown_image_response_type + ) sections, _ = tcadp_parser.parse_pdf( filepath=name, binary=blob, @@ -301,14 +310,86 @@ class Parser(ProcessBase): self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.") conf = self._param.setups["spreadsheet"] self.set_output("output_format", conf["output_format"]) - spreadsheet_parser = ExcelParser() - if conf.get("output_format") == "html": - htmls = spreadsheet_parser.html(blob, 1000000000) - self.set_output("html", htmls[0]) - elif conf.get("output_format") == "json": - self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt]) - elif conf.get("output_format") == "markdown": - self.set_output("markdown", spreadsheet_parser.markdown(blob)) + + parse_method = conf.get("parse_method", "deepdoc") + + # Handle TCADP parser + if parse_method.lower() == "tcadp parser": + table_result_type = conf.get("table_result_type", "1") + markdown_image_response_type = conf.get("markdown_image_response_type", "1") + tcadp_parser = TCADPParser( + table_result_type=table_result_type, + markdown_image_response_type=markdown_image_response_type + ) + if not tcadp_parser.check_installation(): + raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") + + # Determine file type based on extension + if re.search(r"\.xlsx?$", name, re.IGNORECASE): + file_type = "XLSX" + else: + file_type = "CSV" + + self.callback(0.2, f"Using TCADP parser for {file_type} file.") + sections, tables = tcadp_parser.parse_pdf( + filepath=name, + binary=blob, + callback=self.callback, + file_type=file_type, + file_start_page=1, + file_end_page=1000 + ) + + # Process TCADP parser output based on configured output_format + output_format = conf.get("output_format", "html") + + if output_format == "html": + # For HTML output, combine sections and tables into HTML + html_content = "" + for section, position_tag in sections: + if section: + html_content += section + "\n" + for table in tables: + if table: + html_content += table + "\n" + + self.set_output("html", html_content) + + elif output_format == "json": + # For JSON output, create a list of text items + result = [] + # Add sections as text + for section, position_tag in sections: + if section: + result.append({"text": section}) + # Add tables as text + for table in tables: + if table: + result.append({"text": table}) + + self.set_output("json", result) + + elif output_format == "markdown": + # For markdown output, combine into markdown + md_content = "" + for section, position_tag in sections: + if section: + md_content += section + "\n\n" + for table in tables: + if table: + md_content += table + "\n\n" + + self.set_output("markdown", md_content) + else: + # Default DeepDOC parser + spreadsheet_parser = ExcelParser() + if conf.get("output_format") == "html": + htmls = spreadsheet_parser.html(blob, 1000000000) + self.set_output("html", htmls[0]) + elif conf.get("output_format") == "json": + self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt]) + elif conf.get("output_format") == "markdown": + self.set_output("markdown", spreadsheet_parser.markdown(blob)) def _word(self, name, blob): self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document") @@ -326,22 +407,69 @@ class Parser(ProcessBase): self.set_output("markdown", markdown_text) def _slides(self, name, blob): - from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser - self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document") conf = self._param.setups["slides"] self.set_output("output_format", conf["output_format"]) - ppt_parser = ppt_parser() - txts = ppt_parser(blob, 0, 100000, None) + parse_method = conf.get("parse_method", "deepdoc") - sections = [{"text": section} for section in txts if section.strip()] + # Handle TCADP parser + if parse_method.lower() == "tcadp parser": + table_result_type = conf.get("table_result_type", "1") + markdown_image_response_type = conf.get("markdown_image_response_type", "1") + tcadp_parser = TCADPParser( + table_result_type=table_result_type, + markdown_image_response_type=markdown_image_response_type + ) + if not tcadp_parser.check_installation(): + raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") - # json - assert conf.get("output_format") == "json", "have to be json for ppt" - if conf.get("output_format") == "json": - self.set_output("json", sections) + # Determine file type based on extension + if re.search(r"\.pptx?$", name, re.IGNORECASE): + file_type = "PPTX" + else: + file_type = "PPT" + + self.callback(0.2, f"Using TCADP parser for {file_type} file.") + + sections, tables = tcadp_parser.parse_pdf( + filepath=name, + binary=blob, + callback=self.callback, + file_type=file_type, + file_start_page=1, + file_end_page=1000 + ) + + # Process TCADP parser output - PPT only supports json format + output_format = conf.get("output_format", "json") + if output_format == "json": + # For JSON output, create a list of text items + result = [] + # Add sections as text + for section, position_tag in sections: + if section: + result.append({"text": section}) + # Add tables as text + for table in tables: + if table: + result.append({"text": table}) + + self.set_output("json", result) + else: + # Default DeepDOC parser (supports .pptx format) + from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser + + ppt_parser = ppt_parser() + txts = ppt_parser(blob, 0, 100000, None) + + sections = [{"text": section} for section in txts if section.strip()] + + # json + assert conf.get("output_format") == "json", "have to be json for ppt" + if conf.get("output_format") == "json": + self.set_output("json", sections) def _markdown(self, name, blob): from functools import reduce @@ -579,6 +707,7 @@ class Parser(ProcessBase): "video": self._video, "email": self._email, } + try: from_upstream = ParserFromUpstream.model_validate(kwargs) except Exception as e: diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 38bc6c212..40a6a072c 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru The Indexer will store the content in the corresponding data structures for the selected methods.`, // file: 'File', parserMethod: 'PDF parser', + tableResultType: 'Table Result Type', + markdownImageResponseType: 'Markdown Image Response Type', // systemPrompt: 'System Prompt', systemPromptPlaceholder: 'Enter system prompt for image analysis, if empty the system default value will be used', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 763bd6304..5b684a131 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1629,6 +1629,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 Tokenizer 会根据所选方式将内容存储为对应的数据结构。`, filenameEmbdWeight: '文件名嵌入权重', parserMethod: '解析方法', + tableResultType: '表格返回形式', + markdownImageResponseType: '图片返回形式', systemPromptPlaceholder: '请输入用于图像分析的系统提示词,若为空则使用系统缺省值', exportJson: '导出 JSON', diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index e098df997..849340bfa 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -169,6 +169,7 @@ export const initialParserValues = { { fileFormat: FileType.Spreadsheet, output_format: SpreadsheetOutputFormat.Html, + parse_method: ParseDocumentType.DeepDOC, }, { fileFormat: FileType.Image, @@ -192,6 +193,7 @@ export const initialParserValues = { { fileFormat: FileType.PowerPoint, output_format: PptOutputFormat.Json, + parse_method: ParseDocumentType.DeepDOC, }, ], }; @@ -243,7 +245,7 @@ export const FileTypeSuffixMap = { [FileType.Email]: ['eml', 'msg'], [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'], [FileType.Docx]: ['doc', 'docx'], - [FileType.PowerPoint]: ['pptx'], + [FileType.PowerPoint]: ['pptx', 'ppt'], [FileType.Video]: ['mp4', 'avi', 'mkv'], [FileType.Audio]: [ 'da', diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx index 2584c7960..1942b2d05 100644 --- a/web/src/pages/agent/form/parser-form/index.tsx +++ b/web/src/pages/agent/form/parser-form/index.tsx @@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields'; import { EmailFormFields } from './email-form-fields'; import { ImageFormFields } from './image-form-fields'; import { PdfFormFields } from './pdf-form-fields'; +import { PptFormFields } from './ppt-form-fields'; +import { SpreadsheetFormFields } from './spreadsheet-form-fields'; import { buildFieldNameWithPrefix } from './utils'; import { AudioFormFields, VideoFormFields } from './video-form-fields'; @@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs); const FileFormatWidgetMap = { [FileType.PDF]: PdfFormFields, + [FileType.Spreadsheet]: SpreadsheetFormFields, + [FileType.PowerPoint]: PptFormFields, [FileType.Video]: VideoFormFields, [FileType.Audio]: AudioFormFields, [FileType.Email]: EmailFormFields, @@ -65,6 +69,8 @@ export const FormSchema = z.object({ fields: z.array(z.string()).optional(), llm_id: z.string().optional(), system_prompt: z.string().optional(), + table_result_type: z.string().optional(), + markdown_image_response_type: z.string().optional(), }), ), }); @@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => { lang: '', fields: [], llm_id: '', + table_result_type: '', + markdown_image_response_type: '', }); }, [append]); diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx index 020032c5c..82c976f0f 100644 --- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx @@ -1,13 +1,30 @@ import { ParseDocumentType } from '@/components/layout-recognize-form-field'; +import { + SelectWithSearch, + SelectWithSearchFlagOptionType, +} from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; import { isEmpty } from 'lodash'; import { useEffect, useMemo } from 'react'; import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; import { LanguageFormField, ParserMethodFormField } from './common-form-fields'; import { CommonProps } from './interface'; import { useSetInitialLanguage } from './use-set-initial-language'; import { buildFieldNameWithPrefix } from './utils'; +const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'Markdown', value: '0' }, + { label: 'HTML', value: '1' }, +]; + +const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'URL', value: '0' }, + { label: 'Text', value: '1' }, +]; + export function PdfFormFields({ prefix }: CommonProps) { + const { t } = useTranslation(); const form = useFormContext(); const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); @@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) { ); }, [parseMethod]); + const tcadpOptionsShown = useMemo(() => { + return ( + !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser + ); + }, [parseMethod]); + useSetInitialLanguage({ prefix, languageShown }); useEffect(() => { @@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) { } }, [form, parseMethodName]); + // Set default values for TCADP options when TCADP is selected + useEffect(() => { + if (tcadpOptionsShown) { + const tableResultTypeName = buildFieldNameWithPrefix( + 'table_result_type', + prefix, + ); + const markdownImageResponseTypeName = buildFieldNameWithPrefix( + 'markdown_image_response_type', + prefix, + ); + + if (isEmpty(form.getValues(tableResultTypeName))) { + form.setValue(tableResultTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + if (isEmpty(form.getValues(markdownImageResponseTypeName))) { + form.setValue(markdownImageResponseTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + } + }, [tcadpOptionsShown, form, prefix]); + return ( <> {languageShown && } + {tcadpOptionsShown && ( + <> + + {(field) => ( + + )} + + + {(field) => ( + + )} + + + )} ); } diff --git a/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx b/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx new file mode 100644 index 000000000..18f924959 --- /dev/null +++ b/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx @@ -0,0 +1,125 @@ +import { ParseDocumentType } from '@/components/layout-recognize-form-field'; +import { + SelectWithSearch, + SelectWithSearchFlagOptionType, +} from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { isEmpty } from 'lodash'; +import { useEffect, useMemo } from 'react'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { ParserMethodFormField } from './common-form-fields'; +import { CommonProps } from './interface'; +import { buildFieldNameWithPrefix } from './utils'; + +const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'Markdown', value: '0' }, + { label: 'HTML', value: '1' }, +]; + +const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'URL', value: '0' }, + { label: 'Text', value: '1' }, +]; + +export function PptFormFields({ prefix }: CommonProps) { + const { t } = useTranslation(); + const form = useFormContext(); + + const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); + + const parseMethod = useWatch({ + name: parseMethodName, + }); + + // PPT only supports DeepDOC and TCADPParser + const optionsWithoutLLM = [ + { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC }, + { + label: ParseDocumentType.TCADPParser, + value: ParseDocumentType.TCADPParser, + }, + ]; + + const tcadpOptionsShown = useMemo(() => { + return ( + !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser + ); + }, [parseMethod]); + + useEffect(() => { + if (isEmpty(form.getValues(parseMethodName))) { + form.setValue(parseMethodName, ParseDocumentType.DeepDOC, { + shouldValidate: true, + shouldDirty: true, + }); + } + }, [form, parseMethodName]); + + // Set default values for TCADP options when TCADP is selected + useEffect(() => { + if (tcadpOptionsShown) { + const tableResultTypeName = buildFieldNameWithPrefix( + 'table_result_type', + prefix, + ); + const markdownImageResponseTypeName = buildFieldNameWithPrefix( + 'markdown_image_response_type', + prefix, + ); + + if (isEmpty(form.getValues(tableResultTypeName))) { + form.setValue(tableResultTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + if (isEmpty(form.getValues(markdownImageResponseTypeName))) { + form.setValue(markdownImageResponseTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + } + }, [tcadpOptionsShown, form, prefix]); + + return ( + <> + + {tcadpOptionsShown && ( + <> + + {(field) => ( + + )} + + + {(field) => ( + + )} + + + )} + + ); +} diff --git a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx new file mode 100644 index 000000000..407150991 --- /dev/null +++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx @@ -0,0 +1,125 @@ +import { ParseDocumentType } from '@/components/layout-recognize-form-field'; +import { + SelectWithSearch, + SelectWithSearchFlagOptionType, +} from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { isEmpty } from 'lodash'; +import { useEffect, useMemo } from 'react'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { ParserMethodFormField } from './common-form-fields'; +import { CommonProps } from './interface'; +import { buildFieldNameWithPrefix } from './utils'; + +const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'Markdown', value: '0' }, + { label: 'HTML', value: '1' }, +]; + +const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [ + { label: 'URL', value: '0' }, + { label: 'Text', value: '1' }, +]; + +export function SpreadsheetFormFields({ prefix }: CommonProps) { + const { t } = useTranslation(); + const form = useFormContext(); + + const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); + + const parseMethod = useWatch({ + name: parseMethodName, + }); + + // Spreadsheet only supports DeepDOC and TCADPParser + const optionsWithoutLLM = [ + { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC }, + { + label: ParseDocumentType.TCADPParser, + value: ParseDocumentType.TCADPParser, + }, + ]; + + const tcadpOptionsShown = useMemo(() => { + return ( + !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser + ); + }, [parseMethod]); + + useEffect(() => { + if (isEmpty(form.getValues(parseMethodName))) { + form.setValue(parseMethodName, ParseDocumentType.DeepDOC, { + shouldValidate: true, + shouldDirty: true, + }); + } + }, [form, parseMethodName]); + + // Set default values for TCADP options when TCADP is selected + useEffect(() => { + if (tcadpOptionsShown) { + const tableResultTypeName = buildFieldNameWithPrefix( + 'table_result_type', + prefix, + ); + const markdownImageResponseTypeName = buildFieldNameWithPrefix( + 'markdown_image_response_type', + prefix, + ); + + if (isEmpty(form.getValues(tableResultTypeName))) { + form.setValue(tableResultTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + if (isEmpty(form.getValues(markdownImageResponseTypeName))) { + form.setValue(markdownImageResponseTypeName, '1', { + shouldValidate: true, + shouldDirty: true, + }); + } + } + }, [tcadpOptionsShown, form, prefix]); + + return ( + <> + + {tcadpOptionsShown && ( + <> + + {(field) => ( + + )} + + + {(field) => ( + + )} + + + )} + + ); +} diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index f40015012..f4e4a4b1d 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) { parse_method: cur.parse_method, lang: cur.lang, }; + // Only include TCADP parameters if TCADP Parser is selected + if (cur.parse_method?.toLowerCase() === 'tcadp parser') { + filteredSetup.table_result_type = cur.table_result_type; + filteredSetup.markdown_image_response_type = + cur.markdown_image_response_type; + } + break; + case FileType.Spreadsheet: + filteredSetup = { + ...filteredSetup, + parse_method: cur.parse_method, + }; + // Only include TCADP parameters if TCADP Parser is selected + if (cur.parse_method?.toLowerCase() === 'tcadp parser') { + filteredSetup.table_result_type = cur.table_result_type; + filteredSetup.markdown_image_response_type = + cur.markdown_image_response_type; + } + break; + case FileType.PowerPoint: + filteredSetup = { + ...filteredSetup, + parse_method: cur.parse_method, + }; + // Only include TCADP parameters if TCADP Parser is selected + if (cur.parse_method?.toLowerCase() === 'tcadp parser') { + filteredSetup.table_result_type = cur.table_result_type; + filteredSetup.markdown_image_response_type = + cur.markdown_image_response_type; + } break; case FileType.Image: filteredSetup = { diff --git a/web/src/pages/data-flow/constant.tsx b/web/src/pages/data-flow/constant.tsx new file mode 100644 index 000000000..e69de29bb diff --git a/web/src/pages/data-flow/form/parser-form/index.tsx b/web/src/pages/data-flow/form/parser-form/index.tsx new file mode 100644 index 000000000..e69de29bb diff --git a/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx b/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx new file mode 100644 index 000000000..59b179498 --- /dev/null +++ b/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx @@ -0,0 +1,40 @@ +import { ParseDocumentType } from '@/components/layout-recognize-form-field'; +import { isEmpty } from 'lodash'; +import { useEffect } from 'react'; +import { useFormContext } from 'react-hook-form'; +import { ParserMethodFormField } from './common-form-fields'; +import { CommonProps } from './interface'; +import { buildFieldNameWithPrefix } from './utils'; + +export function PptFormFields({ prefix }: CommonProps) { + const form = useFormContext(); + + const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); + + // PPT only supports DeepDOC and TCADPParser + const optionsWithoutLLM = [ + { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC }, + { + label: ParseDocumentType.TCADPParser, + value: ParseDocumentType.TCADPParser, + }, + ]; + + useEffect(() => { + if (isEmpty(form.getValues(parseMethodName))) { + form.setValue(parseMethodName, ParseDocumentType.DeepDOC, { + shouldValidate: true, + shouldDirty: true, + }); + } + }, [form, parseMethodName]); + + return ( + <> + + + ); +} diff --git a/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx new file mode 100644 index 000000000..443ff6e18 --- /dev/null +++ b/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx @@ -0,0 +1,40 @@ +import { ParseDocumentType } from '@/components/layout-recognize-form-field'; +import { isEmpty } from 'lodash'; +import { useEffect } from 'react'; +import { useFormContext } from 'react-hook-form'; +import { ParserMethodFormField } from './common-form-fields'; +import { CommonProps } from './interface'; +import { buildFieldNameWithPrefix } from './utils'; + +export function SpreadsheetFormFields({ prefix }: CommonProps) { + const form = useFormContext(); + + const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); + + // Spreadsheet only supports DeepDOC and TCADPParser + const optionsWithoutLLM = [ + { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC }, + { + label: ParseDocumentType.TCADPParser, + value: ParseDocumentType.TCADPParser, + }, + ]; + + useEffect(() => { + if (isEmpty(form.getValues(parseMethodName))) { + form.setValue(parseMethodName, ParseDocumentType.DeepDOC, { + shouldValidate: true, + shouldDirty: true, + }); + } + }, [form, parseMethodName]); + + return ( + <> + + + ); +} diff --git a/web/src/pages/data-flow/utils.ts b/web/src/pages/data-flow/utils.ts new file mode 100644 index 000000000..e69de29bb