mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)
### What problem does this PR solve?

- Added TCADP Parser configuration fields to the PDF, PPT, and spreadsheet parsing forms
- Implemented support for setting the table result type (Markdown/HTML) and the Markdown image response type (URL/Text)
- Updated the TCADP Parser to take return-format settings from configuration or from parameters
- Enhanced the frontend to dynamically show TCADP options based on the selected parsing method
- Modified the backend to pass format parameters when calling the TCADP API
- Optimized the default-value logic of the form for TCADP configuration items
- Updated the multilingual resource files for the new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
|
||||
else:
|
||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||
|
||||
|
||||
sections, tables = pdf_parser(
|
||||
filename if not binary else binary,
|
||||
from_page=from_page,
|
||||
@ -504,7 +504,7 @@ class Markdown(MarkdownParser):
|
||||
|
||||
return images if images else None
|
||||
|
||||
def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
|
||||
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
_SerializedRelationships.load_from_xml = load_from_xml_v2
|
||||
sections, tables = Docx()(filename, binary)
|
||||
|
||||
tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
|
||||
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
|
||||
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
excel_parser = ExcelParser()
|
||||
if parser_config.get("html4excel"):
|
||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||
|
||||
# Check if tcadp_parser is selected for spreadsheet files
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
if layout_recognizer == "TCADP Parser":
|
||||
table_result_type = parser_config.get("table_result_type", "1")
|
||||
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
return res
|
||||
|
||||
# Determine file type based on extension
|
||||
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
|
||||
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||
file_type=file_type
|
||||
)
|
||||
parser_config["chunk_token_num"] = 0
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
else:
|
||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||
parser_config["chunk_token_num"] = 12800
|
||||
# Default DeepDOC parser
|
||||
excel_parser = ExcelParser()
|
||||
if parser_config.get("html4excel"):
|
||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||
else:
|
||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||
parser_config["chunk_token_num"] = 12800
|
||||
|
||||
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
||||
sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
|
||||
try:
|
||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
||||
|
||||
@ -16,6 +16,7 @@ import io
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
|
||||
"output_format": "json",
|
||||
},
|
||||
"spreadsheet": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
"output_format": "html",
|
||||
"suffix": [
|
||||
"xls",
|
||||
@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
|
||||
"output_format": "json",
|
||||
},
|
||||
"slides": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
"suffix": [
|
||||
"pptx",
|
||||
"ppt"
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
@ -245,7 +249,12 @@ class Parser(ProcessBase):
|
||||
bboxes.append(box)
|
||||
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||
# ADP is a document parsing tool using Tencent Cloud API
|
||||
tcadp_parser = TCADPParser()
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
sections, _ = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
@ -301,14 +310,86 @@ class Parser(ProcessBase):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
||||
conf = self._param.setups["spreadsheet"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", htmls[0])
|
||||
elif conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
parse_method = conf.get("parse_method", "deepdoc")
|
||||
|
||||
# Handle TCADP parser
|
||||
if parse_method.lower() == "tcadp parser":
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
|
||||
# Determine file type based on extension
|
||||
if re.search(r"\.xlsx?$", name, re.IGNORECASE):
|
||||
file_type = "XLSX"
|
||||
else:
|
||||
file_type = "CSV"
|
||||
|
||||
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
file_type=file_type,
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
)
|
||||
|
||||
# Process TCADP parser output based on configured output_format
|
||||
output_format = conf.get("output_format", "html")
|
||||
|
||||
if output_format == "html":
|
||||
# For HTML output, combine sections and tables into HTML
|
||||
html_content = ""
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
html_content += section + "\n"
|
||||
for table in tables:
|
||||
if table:
|
||||
html_content += table + "\n"
|
||||
|
||||
self.set_output("html", html_content)
|
||||
|
||||
elif output_format == "json":
|
||||
# For JSON output, create a list of text items
|
||||
result = []
|
||||
# Add sections as text
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
result.append({"text": section})
|
||||
# Add tables as text
|
||||
for table in tables:
|
||||
if table:
|
||||
result.append({"text": table})
|
||||
|
||||
self.set_output("json", result)
|
||||
|
||||
elif output_format == "markdown":
|
||||
# For markdown output, combine into markdown
|
||||
md_content = ""
|
||||
for section, position_tag in sections:
|
||||
if section:
|
||||
md_content += section + "\n\n"
|
||||
for table in tables:
|
||||
if table:
|
||||
md_content += table + "\n\n"
|
||||
|
||||
self.set_output("markdown", md_content)
|
||||
else:
|
||||
# Default DeepDOC parser
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", htmls[0])
|
||||
elif conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
def _word(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
@ -326,22 +407,69 @@ class Parser(ProcessBase):
|
||||
self.set_output("markdown", markdown_text)
|
||||
|
||||
def _slides(self, name, blob):
    """Parse a PowerPoint document and publish its content as the "json" output.

    The parser is chosen by ``parse_method`` in the "slides" setup:

    * ``"tcadp parser"`` (case-insensitive): the file is handed to
      ``TCADPParser.parse_pdf`` and every non-empty section and table it
      returns becomes one ``{"text": ...}`` item.
    * anything else: the file is parsed with ``RAGFlowPptParser`` and every
      non-blank text it yields becomes one ``{"text": ...}`` item.

    Args:
        name: File name of the document; on the TCADP path its extension
            decides the file type reported to the parser.
        blob: Raw content of the document.

    Raises:
        RuntimeError: The TCADP parser was selected but
            ``check_installation()`` reported it as unavailable.
        AssertionError: The default parser was selected and the configured
            ``output_format`` is not ``"json"``.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")

    conf = self._param.setups["slides"]
    self.set_output("output_format", conf["output_format"])

    parse_method = conf.get("parse_method", "deepdoc")

    # Handle TCADP parser
    if parse_method.lower() == "tcadp parser":
        tcadp_parser = TCADPParser(
            table_result_type=conf.get("table_result_type", "1"),
            markdown_image_response_type=conf.get("markdown_image_response_type", "1"),
        )
        if not tcadp_parser.check_installation():
            raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

        # Only a ".pptx" suffix is reported as PPTX. The previous pattern
        # r"\.pptx?$" also matched ".ppt", so legacy files were mislabelled
        # as PPTX and the "PPT" branch could never be taken for them.
        file_type = "PPTX" if re.search(r"\.pptx$", name, re.IGNORECASE) else "PPT"

        self.callback(0.2, f"Using TCADP parser for {file_type} file.")

        sections, tables = tcadp_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            file_type=file_type,
            file_start_page=1,
            file_end_page=1000,
        )

        # PPT only supports the json format; for any other configured
        # format no content output is set on this path.
        output_format = conf.get("output_format", "json")
        if output_format == "json":
            # Sections come as (text, position_tag) pairs; only the text is kept.
            result = [{"text": section} for section, _ in sections if section]
            # Tables are appended after the sections, also as plain text items.
            result.extend({"text": table} for table in tables if table)
            self.set_output("json", result)
    else:
        # Default DeepDOC parser (supports .pptx format)
        from deepdoc.parser.ppt_parser import RAGFlowPptParser

        ppt_parser = RAGFlowPptParser()
        txts = ppt_parser(blob, 0, 100000, None)

        sections = [{"text": section} for section in txts if section.strip()]

        # json
        assert conf.get("output_format") == "json", "have to be json for ppt"
        if conf.get("output_format") == "json":
            self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, name, blob):
|
||||
from functools import reduce
|
||||
@ -579,6 +707,7 @@ class Parser(ProcessBase):
|
||||
"video": self._video,
|
||||
"email": self._email,
|
||||
}
|
||||
|
||||
try:
|
||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user