mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-21 13:32:49 +08:00
Fix: model not authorized (#12001)
### What problem does this PR solve? Fix model not authorized. #11973. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -252,7 +252,6 @@ async def delete_chats(tenant_id):
|
|||||||
continue
|
continue
|
||||||
temp_dict = {"status": StatusEnum.INVALID.value}
|
temp_dict = {"status": StatusEnum.INVALID.value}
|
||||||
success_count += DialogService.update_by_id(id, temp_dict)
|
success_count += DialogService.update_by_id(id, temp_dict)
|
||||||
print(success_count, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$", flush=True)
|
|
||||||
|
|
||||||
if errors:
|
if errors:
|
||||||
if success_count > 0:
|
if success_count > 0:
|
||||||
|
|||||||
30
common/parser_config_utils.py
Normal file
30
common/parser_config_utils.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | None]:
|
||||||
|
parser_model_name: str | None = None
|
||||||
|
layout_recognizer = layout_recognizer_raw
|
||||||
|
|
||||||
|
if isinstance(layout_recognizer_raw, str):
|
||||||
|
lowered = layout_recognizer_raw.lower()
|
||||||
|
if lowered.endswith("@mineru"):
|
||||||
|
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||||
|
layout_recognizer = "MinerU"
|
||||||
|
|
||||||
|
return layout_recognizer, parser_model_name
|
||||||
@ -262,10 +262,8 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
elif self.mineru_server_url:
|
elif self.mineru_server_url:
|
||||||
data["server_url"] = self.mineru_server_url
|
data["server_url"] = self.mineru_server_url
|
||||||
|
|
||||||
print("--------------------------------", flush=True)
|
self.logger.info(f"[MinerU] request {data=}")
|
||||||
print(f"{data=}", flush=True)
|
self.logger.info(f"[MinerU] request {options=}")
|
||||||
print(f"{options=}", flush=True)
|
|
||||||
print("--------------------------------", flush=True)
|
|
||||||
|
|
||||||
headers = {"Accept": "application/json"}
|
headers = {"Accept": "application/json"}
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -21,6 +21,7 @@ from io import BytesIO
|
|||||||
from deepdoc.parser.utils import get_text
|
from deepdoc.parser.utils import get_text
|
||||||
from rag.app import naive
|
from rag.app import naive
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
from rag.nlp import bullets_category, is_english,remove_contents_table, \
|
from rag.nlp import bullets_category, is_english,remove_contents_table, \
|
||||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
|
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
|
||||||
tokenize_chunks, attach_media_context
|
tokenize_chunks, attach_media_context
|
||||||
@ -96,7 +97,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -114,6 +117,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback = callback,
|
callback = callback,
|
||||||
pdf_cls = Pdf,
|
pdf_cls = Pdf,
|
||||||
layout_recognizer = layout_recognizer,
|
layout_recognizer = layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -26,6 +26,7 @@ from rag.nlp import bullets_category, remove_contents_table, \
|
|||||||
from rag.nlp import rag_tokenizer, Node
|
from rag.nlp import rag_tokenizer, Node
|
||||||
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
|
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -155,7 +156,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
return tokenize_chunks(chunks, doc, eng, None)
|
return tokenize_chunks(chunks, doc, eng, None)
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -173,6 +176,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback = callback,
|
callback = callback,
|
||||||
pdf_cls = Pdf,
|
pdf_cls = Pdf,
|
||||||
layout_recognizer = layout_recognizer,
|
layout_recognizer = layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -27,6 +27,7 @@ from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision
|
|||||||
from docx import Document
|
from docx import Document
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -196,7 +197,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
# is it English
|
# is it English
|
||||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -205,6 +208,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
pdf_parser = PARSERS.get(name, by_plaintext)
|
pdf_parser = PARSERS.get(name, by_plaintext)
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
|
|
||||||
|
kwargs.pop("parse_method", None)
|
||||||
|
kwargs.pop("mineru_llm_name", None)
|
||||||
sections, tbls, pdf_parser = pdf_parser(
|
sections, tbls, pdf_parser = pdf_parser(
|
||||||
filename = filename,
|
filename = filename,
|
||||||
binary = binary,
|
binary = binary,
|
||||||
@ -214,6 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback = callback,
|
callback = callback,
|
||||||
pdf_cls = Pdf,
|
pdf_cls = Pdf,
|
||||||
layout_recognizer = layout_recognizer,
|
layout_recognizer = layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
parse_method = "manual",
|
parse_method = "manual",
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|||||||
@ -36,10 +36,11 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
|
|||||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||||
from deepdoc.parser.docling_parser import DoclingParser
|
from deepdoc.parser.docling_parser import DoclingParser
|
||||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
||||||
|
|
||||||
|
|
||||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
|
||||||
callback = callback
|
callback = callback
|
||||||
binary = binary
|
binary = binary
|
||||||
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
||||||
@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
|||||||
return sections, tables, pdf_parser
|
return sections, tables, pdf_parser
|
||||||
|
|
||||||
|
|
||||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
def by_mineru(
|
||||||
parse_method = kwargs.get("parse_method", "raw")
|
filename,
|
||||||
mineru_llm_name = kwargs.get("mineru_llm_name")
|
binary=None,
|
||||||
tenant_id = kwargs.get("tenant_id")
|
from_page=0,
|
||||||
|
to_page=100000,
|
||||||
|
lang="Chinese",
|
||||||
|
callback=None,
|
||||||
|
pdf_cls=None,
|
||||||
|
parse_method: str = "raw",
|
||||||
|
mineru_llm_name: str | None = None,
|
||||||
|
tenant_id: str | None = None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
pdf_parser = None
|
pdf_parser = None
|
||||||
if tenant_id:
|
if tenant_id:
|
||||||
if not mineru_llm_name:
|
if not mineru_llm_name:
|
||||||
@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
|||||||
callback=callback,
|
callback=callback,
|
||||||
parse_method=parse_method,
|
parse_method=parse_method,
|
||||||
lang=lang,
|
lang=lang,
|
||||||
**kwargs
|
**kwargs,
|
||||||
)
|
)
|
||||||
return sections, tables, pdf_parser
|
return sections, tables, pdf_parser
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -97,9 +106,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
|||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
|
||||||
|
|
||||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
|
||||||
pdf_parser = DoclingParser()
|
pdf_parser = DoclingParser()
|
||||||
parse_method = kwargs.get("parse_method", "raw")
|
parse_method = kwargs.get("parse_method", "raw")
|
||||||
|
|
||||||
@ -118,7 +125,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
|||||||
return sections, tables, pdf_parser
|
return sections, tables, pdf_parser
|
||||||
|
|
||||||
|
|
||||||
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
|
||||||
tcadp_parser = TCADPParser()
|
tcadp_parser = TCADPParser()
|
||||||
|
|
||||||
if not tcadp_parser.check_installation():
|
if not tcadp_parser.check_installation():
|
||||||
@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
|
|||||||
|
|
||||||
|
|
||||||
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||||
if kwargs.get("layout_recognizer", "") == "Plain Text":
|
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
|
||||||
|
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
|
||||||
pdf_parser = PlainParser()
|
pdf_parser = PlainParser()
|
||||||
else:
|
else:
|
||||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
tenant_id = kwargs.get("tenant_id")
|
||||||
|
if not tenant_id:
|
||||||
|
raise ValueError("tenant_id is required when using vision layout recognizer")
|
||||||
|
vision_model = LLMBundle(
|
||||||
|
tenant_id,
|
||||||
|
LLMType.IMAGE2TEXT,
|
||||||
|
llm_name=layout_recognizer,
|
||||||
|
lang=kwargs.get("lang", "Chinese"),
|
||||||
|
)
|
||||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||||
|
|
||||||
sections, tables = pdf_parser(
|
sections, tables = pdf_parser(
|
||||||
@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
parser_model_name = None
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
layout_recognizer = layout_recognizer_raw
|
)
|
||||||
if isinstance(layout_recognizer_raw, str):
|
|
||||||
lowered = layout_recognizer_raw.lower()
|
|
||||||
if lowered.endswith("@mineru"):
|
|
||||||
parser_model_name = layout_recognizer_raw.split("@", 1)[0]
|
|
||||||
layout_recognizer = "MinerU"
|
|
||||||
|
|
||||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||||
urls = extract_links_from_pdf(binary)
|
urls = extract_links_from_pdf(binary)
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize
|
|||||||
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
|
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
|
||||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
@ -82,7 +83,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -100,6 +103,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback = callback,
|
callback = callback,
|
||||||
pdf_cls = Pdf,
|
pdf_cls = Pdf,
|
||||||
layout_recognizer = layout_recognizer,
|
layout_recognizer = layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bull
|
|||||||
from deepdoc.parser import PdfParser
|
from deepdoc.parser import PdfParser
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
@ -149,7 +150,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
"parser_config", {
|
"parser_config", {
|
||||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -163,6 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
paper = pdf_parser(filename if not binary else binary,
|
paper = pdf_parser(filename if not binary else binary,
|
||||||
from_page=from_page, to_page=to_page, callback=callback)
|
from_page=from_page, to_page=to_page, callback=callback)
|
||||||
else:
|
else:
|
||||||
|
kwargs.pop("parse_method", None)
|
||||||
|
kwargs.pop("mineru_llm_name", None)
|
||||||
sections, tables, pdf_parser = pdf_parser(
|
sections, tables, pdf_parser = pdf_parser(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
binary=binary,
|
binary=binary,
|
||||||
@ -171,6 +176,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
lang=lang,
|
lang=lang,
|
||||||
callback=callback,
|
callback=callback,
|
||||||
pdf_cls=Pdf,
|
pdf_cls=Pdf,
|
||||||
|
layout_recognizer=layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
parse_method="paper",
|
parse_method="paper",
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|||||||
@ -24,6 +24,7 @@ from PyPDF2 import PdfReader as pdf2_read
|
|||||||
|
|
||||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
from rag.nlp import rag_tokenizer
|
from rag.nlp import rag_tokenizer
|
||||||
from rag.nlp import tokenize, is_english
|
from rag.nlp import tokenize, is_english
|
||||||
|
|
||||||
@ -195,7 +196,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
res.append(d)
|
res.append(d)
|
||||||
return res
|
return res
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||||
|
parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
)
|
||||||
|
|
||||||
if isinstance(layout_recognizer, bool):
|
if isinstance(layout_recognizer, bool):
|
||||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||||
@ -213,6 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback=callback,
|
callback=callback,
|
||||||
pdf_cls=Pdf,
|
pdf_cls=Pdf,
|
||||||
layout_recognizer=layout_recognizer,
|
layout_recognizer=layout_recognizer,
|
||||||
|
mineru_llm_name=parser_model_name,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user