diff --git a/api/apps/sdk/chat.py b/api/apps/sdk/chat.py index 435631af8..fb426b0f0 100644 --- a/api/apps/sdk/chat.py +++ b/api/apps/sdk/chat.py @@ -252,7 +252,6 @@ async def delete_chats(tenant_id): continue temp_dict = {"status": StatusEnum.INVALID.value} success_count += DialogService.update_by_id(id, temp_dict) - print(success_count, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$", flush=True) if errors: if success_count > 0: diff --git a/common/parser_config_utils.py b/common/parser_config_utils.py new file mode 100644 index 000000000..0a79f3ad1 --- /dev/null +++ b/common/parser_config_utils.py @@ -0,0 +1,30 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any + + +def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | None]: + parser_model_name: str | None = None + layout_recognizer = layout_recognizer_raw + + if isinstance(layout_recognizer_raw, str): + lowered = layout_recognizer_raw.lower() + if lowered.endswith("@mineru"): + parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + layout_recognizer = "MinerU" + + return layout_recognizer, parser_model_name diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index d86a3c87f..f22c6e48b 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -262,10 +262,8 @@ class MinerUParser(RAGFlowPdfParser): elif self.mineru_server_url: data["server_url"] = self.mineru_server_url - print("--------------------------------", flush=True) - print(f"{data=}", flush=True) - print(f"{options=}", flush=True) - print("--------------------------------", flush=True) + self.logger.info(f"[MinerU] request {data=}") + self.logger.info(f"[MinerU] request {options=}") headers = {"Accept": "application/json"} try: diff --git a/rag/app/book.py b/rag/app/book.py index b392d4139..ab5c1d2de 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -21,6 +21,7 @@ from io import BytesIO from deepdoc.parser.utils import get_text from rag.app import naive from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import bullets_category, is_english,remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ tokenize_chunks, attach_media_context @@ -96,7 +97,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -114,6 +117,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback = callback, pdf_cls = Pdf, layout_recognizer = layout_recognizer, + mineru_llm_name=parser_model_name, **kwargs ) diff --git a/rag/app/laws.py b/rag/app/laws.py index e09bb4d67..97b58ca15 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -26,6 +26,7 @@ from rag.nlp import bullets_category, remove_contents_table, \ from rag.nlp import rag_tokenizer, Node from deepdoc.parser import PdfParser, DocxParser, HtmlParser from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer @@ -155,7 +156,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return tokenize_chunks(chunks, doc, eng, None) elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -173,6 +176,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback = callback, pdf_cls = Pdf, layout_recognizer = layout_recognizer, + mineru_llm_name=parser_model_name, **kwargs ) diff --git a/rag/app/manual.py b/rag/app/manual.py index 54a05f192..108b2542f 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -27,6 +27,7 @@ from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision from docx import Document from PIL import Image from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer class Pdf(PdfParser): def __init__(self): @@ -196,7 +197,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, # is it English eng = lang.lower() == "english" # pdf_parser.is_english if re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -205,6 +208,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") + kwargs.pop("parse_method", None) + kwargs.pop("mineru_llm_name", None) sections, tbls, pdf_parser = pdf_parser( filename = filename, binary = binary, @@ -214,6 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback = callback, pdf_cls = Pdf, layout_recognizer = layout_recognizer, + mineru_llm_name=parser_model_name, parse_method = "manual", **kwargs ) @@ -232,7 +238,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, poss = pdf_parser.extract_positions(poss) if poss: first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] + pn = first[0] if isinstance(pn, list) and pn: pn = pn[0] # [pn] -> pn poss[0] = (pn, *first[1:]) diff --git a/rag/app/naive.py b/rag/app/naive.py index 579ed8380..7756f7a9a 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -36,10 +36,11 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser +from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context -def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): +def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() @@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese return sections, tables, pdf_parser -def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): - parse_method = kwargs.get("parse_method", "raw") - mineru_llm_name = kwargs.get("mineru_llm_name") - tenant_id = kwargs.get("tenant_id") - +def by_mineru( + filename, + binary=None, + from_page=0, + to_page=100000, + lang="Chinese", + callback=None, + pdf_cls=None, + parse_method: str = "raw", + mineru_llm_name: str | None = None, + tenant_id: str | None = None, + **kwargs, +): pdf_parser = None if tenant_id: if not mineru_llm_name: @@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" callback=callback, parse_method=parse_method, lang=lang, - **kwargs + **kwargs, ) return sections, tables, pdf_parser except Exception as e: @@ -97,9 +106,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" return None, None, None - - -def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): +def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs): pdf_parser = DoclingParser() parse_method = kwargs.get("parse_method", "raw") @@ -118,7 +125,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese return sections, tables, pdf_parser -def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): +def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs): tcadp_parser = TCADPParser() if not tcadp_parser.check_installation(): @@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): - if kwargs.get("layout_recognizer", "") == "Plain Text": + layout_recognizer = (kwargs.get("layout_recognizer") or "").strip() + if (not layout_recognizer) or (layout_recognizer == "Plain Text"): pdf_parser = PlainParser() else: - vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese")) + tenant_id = kwargs.get("tenant_id") + if not tenant_id: + raise ValueError("tenant_id is required when using vision layout recognizer") + vision_model = LLMBundle( + tenant_id, + LLMType.IMAGE2TEXT, + llm_name=layout_recognizer, + lang=kwargs.get("lang", "Chinese"), + ) pdf_parser = VisionParser(vision_model=vision_model, **kwargs) sections, tables = pdf_parser( @@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC") - parser_model_name = None - layout_recognizer = layout_recognizer_raw - if isinstance(layout_recognizer_raw, str): - lowered = layout_recognizer_raw.lower() - if lowered.endswith("@mineru"): - parser_model_name = layout_recognizer_raw.split("@", 1)[0] - layout_recognizer = "MinerU" + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_pdf(binary) diff --git a/rag/app/one.py b/rag/app/one.py index 7cd1bb785..b3358f864 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, @@ -82,7 +83,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -100,6 +103,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback = callback, pdf_cls = Pdf, layout_recognizer = layout_recognizer, + mineru_llm_name=parser_model_name, **kwargs ) diff --git a/rag/app/paper.py b/rag/app/paper.py index d84d5645d..22b57738c 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bull from deepdoc.parser import PdfParser import numpy as np from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer class Pdf(PdfParser): @@ -149,7 +150,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, "parser_config", { "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) if re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -163,6 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, paper = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) else: + kwargs.pop("parse_method", None) + kwargs.pop("mineru_llm_name", None) sections, tables, pdf_parser = pdf_parser( filename=filename, binary=binary, @@ -171,6 +176,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang=lang, callback=callback, pdf_cls=Pdf, + layout_recognizer=layout_recognizer, + mineru_llm_name=parser_model_name, parse_method="paper", **kwargs ) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 32a9850b9..e4a093634 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -24,6 +24,7 @@ from PyPDF2 import PdfReader as pdf2_read from deepdoc.parser import PdfParser, PptParser, PlainParser from rag.app.naive import by_plaintext, PARSERS +from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import rag_tokenizer from rag.nlp import tokenize, is_english @@ -195,7 +196,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res.append(d) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + layout_recognizer, parser_model_name = normalize_layout_recognizer( + parser_config.get("layout_recognize", "DeepDOC") + ) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -213,6 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=callback, pdf_cls=Pdf, layout_recognizer=layout_recognizer, + mineru_llm_name=parser_model_name, **kwargs )