feat: add paddleocr parser (#12513)
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
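A rough usage sketch of how the new parser is reached, based on the hunks below: chunk() reads parser_config from kwargs, normalize_layout_recognizer() turns the layout_recognize value into a parser name plus model name, and the new "paddleocr" entry in PARSERS dispatches to by_paddleocr(). The layout_recognize label and the tenant_id value here are assumptions for illustration, not part of this diff.

```python
# Sketch only: routing a PDF through the new PaddleOCR parser.
# "PaddleOCR" as the layout_recognize label and "tenant-123" are assumed values;
# the real mapping is done by common.parser_config_utils.normalize_layout_recognizer.
from rag.app.naive import chunk

def progress(prog=None, msg=""):
    print(prog, msg)

parser_config = {
    "chunk_token_num": 512,
    "delimiter": "\n!?。;!?",
    "layout_recognize": "PaddleOCR",  # assumed label; resolves to the "paddleocr" entry
}

with open("sample.pdf", "rb") as f:
    chunks = chunk(
        "sample.pdf",
        binary=f.read(),
        lang="Chinese",
        callback=progress,
        parser_config=parser_config,
        tenant_id="tenant-123",  # hypothetical tenant with a PaddleOCR model configured
    )
```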
rag/app/naive.py: 283 changed lines
@@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string
 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
 from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
-from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
-    PdfParser, TxtParser
-from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
-    vision_figure_parser_pdf_wrapper
+from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
+from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from common.parser_config_utils import normalize_layout_recognizer
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
-    tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
-    attach_media_context  # noqa: F401
+from rag.nlp import (
+    concat_img,
+    find_codec,
+    naive_merge,
+    naive_merge_with_images,
+    naive_merge_docx,
+    rag_tokenizer,
+    tokenize_chunks,
+    doc_tokenize_chunks_with_images,
+    tokenize_table,
+    append_context2table_image4pdf,
+    tokenize_chunks_with_images,
+)  # noqa: F401
 
-def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
-               **kwargs):
+
+def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
     callback = callback
     binary = binary
     pdf_parser = pdf_cls() if pdf_cls else Pdf()
-    sections, tables = pdf_parser(
-        filename if not binary else binary,
-        from_page=from_page,
-        to_page=to_page,
-        callback=callback
-    )
+    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
 
     tables = vision_figure_parser_pdf_wrapper(
         tbls=tables,
@@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
 
 
 def by_mineru(
-        filename,
-        binary=None,
-        from_page=0,
-        to_page=100000,
-        lang="Chinese",
-        callback=None,
-        pdf_cls=None,
-        parse_method: str = "raw",
-        mineru_llm_name: str | None = None,
-        tenant_id: str | None = None,
-        **kwargs,
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    pdf_cls=None,
+    parse_method: str = "raw",
+    mineru_llm_name: str | None = None,
+    tenant_id: str | None = None,
+    **kwargs,
 ):
     pdf_parser = None
     if tenant_id:
@@ -115,8 +118,7 @@ def by_mineru(
     return None, None, None
 
 
-def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
-               **kwargs):
+def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
     pdf_parser = DoclingParser()
     parse_method = kwargs.get("parse_method", "raw")
 
@@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
         callback=callback,
         output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
-        parse_method=parse_method
+        parse_method=parse_method,
     )
     return sections, tables, pdf_parser
 
@@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
         callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
         return None, None, tcadp_parser
 
-    sections, tables = tcadp_parser.parse_pdf(
-        filepath=filename,
-        binary=binary,
-        callback=callback,
-        output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
-        file_type="PDF"
-    )
+    sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
     return sections, tables, tcadp_parser
 
 
+def by_paddleocr(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    pdf_cls=None,
+    parse_method: str = "raw",
+    paddleocr_llm_name: str | None = None,
+    tenant_id: str | None = None,
+    **kwargs,
+):
+    pdf_parser = None
+    if tenant_id:
+        if not paddleocr_llm_name:
+            try:
+                from api.db.services.tenant_llm_service import TenantLLMService
+
+                env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
+                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
+                if candidates:
+                    paddleocr_llm_name = candidates[0].llm_name
+                elif env_name:
+                    paddleocr_llm_name = env_name
+            except Exception as e:  # best-effort fallback
+                logging.warning(f"fallback to env paddleocr: {e}")
+
+        if paddleocr_llm_name:
+            try:
+                ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang)
+                pdf_parser = ocr_model.mdl
+                sections, tables = pdf_parser.parse_pdf(
+                    filepath=filename,
+                    binary=binary,
+                    callback=callback,
+                    parse_method=parse_method,
+                    **kwargs,
+                )
+                return sections, tables, pdf_parser
+            except Exception as e:
+                logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")
+
+        return None, None, None
+
+    if callback:
+        callback(-1, "PaddleOCR not found.")
+    return None, None, None
+
+
 def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
     if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
@@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
         )
         pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
 
-    sections, tables = pdf_parser(
-        filename if not binary else binary,
-        from_page=from_page,
-        to_page=to_page,
-        callback=callback
-    )
+    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
     return sections, tables, pdf_parser
 
 
@@ -182,6 +223,7 @@ PARSERS = {
     "mineru": by_mineru,
     "docling": by_docling,
     "tcadp": by_tcadp,
+    "paddleocr": by_paddleocr,
     "plaintext": by_plaintext,  # default
 }
 
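Every entry in PARSERS follows the same contract: it takes the filename or raw bytes plus keyword options and returns a (sections, tables, pdf_parser) triple, with (None, None, None) signalling that the backend is unavailable. A minimal sketch of calling the new entry directly; the tenant id is hypothetical and assumes a PaddleOCR OCR model is registered for that tenant (or resolvable by TenantLLMService.ensure_paddleocr_from_env):

```python
# Sketch of the shared by_* contract, using the new "paddleocr" entry.
from rag.app.naive import PARSERS

def progress(prog=None, msg=""):
    print(prog, msg)

parse = PARSERS["paddleocr"]
with open("scan.pdf", "rb") as f:
    sections, tables, pdf_parser = parse(
        "scan.pdf",
        binary=f.read(),
        lang="Chinese",
        callback=progress,
        parse_method="raw",
        tenant_id="tenant-123",  # hypothetical tenant id
    )

if sections is None:
    # by_paddleocr returns (None, None, None) when no PaddleOCR model can be resolved
    print("PaddleOCR not configured for this tenant")
```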
@@ -191,12 +233,12 @@ class Docx(DocxParser):
         pass
 
     def get_picture(self, document, paragraph):
-        imgs = paragraph._element.xpath('.//pic:pic')
+        imgs = paragraph._element.xpath(".//pic:pic")
         if not imgs:
             return None
         res_img = None
         for img in imgs:
-            embed = img.xpath('.//a:blip/@r:embed')
+            embed = img.xpath(".//a:blip/@r:embed")
             if not embed:
                 continue
             embed = embed[0]
@@ -219,7 +261,7 @@ class Docx(DocxParser):
                 logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
                 continue
             try:
-                image = Image.open(BytesIO(image_blob)).convert('RGB')
+                image = Image.open(BytesIO(image_blob)).convert("RGB")
                 if res_img is None:
                     res_img = image
                 else:
@@ -251,11 +293,11 @@ class Docx(DocxParser):
         try:
             # Iterate through all paragraphs and tables in document order
             for i, block in enumerate(self.doc._element.body):
-                if block.tag.endswith('p'):  # Paragraph
+                if block.tag.endswith("p"):  # Paragraph
                     p = Paragraph(block, self.doc)
-                    blocks.append(('p', i, p))
-                elif block.tag.endswith('tbl'):  # Table
-                    blocks.append(('t', i, None))  # Table object will be retrieved later
+                    blocks.append(("p", i, p))
+                elif block.tag.endswith("tbl"):  # Table
+                    blocks.append(("t", i, None))  # Table object will be retrieved later
         except Exception as e:
             logging.error(f"Error collecting blocks: {e}")
             return ""
@@ -264,7 +306,7 @@ class Docx(DocxParser):
         target_table_pos = -1
         table_count = 0
         for i, (block_type, pos, _) in enumerate(blocks):
-            if block_type == 't':
+            if block_type == "t":
                 if table_count == table_index:
                     target_table_pos = pos
                     break
@@ -280,7 +322,7 @@ class Docx(DocxParser):
             if pos >= target_table_pos:  # Skip blocks after the table
                 continue
 
-            if block_type != 'p':
+            if block_type != "p":
                 continue
 
             if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@@ -309,7 +351,7 @@ class Docx(DocxParser):
             if pos >= target_table_pos:  # Skip blocks after the table
                 continue
 
-            if block_type != 'p':
+            if block_type != "p":
                 continue
 
             if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@@ -340,8 +382,7 @@ class Docx(DocxParser):
         return ""
 
     def __call__(self, filename, binary=None, from_page=0, to_page=100000):
-        self.doc = Document(
-            filename) if not binary else Document(BytesIO(binary))
+        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
         last_image = None
@@ -357,7 +398,7 @@ class Docx(DocxParser):
             if pn > to_page:
                 break
 
-            if block.tag.endswith('p'):
+            if block.tag.endswith("p"):
                 p = Paragraph(block, self.doc)
 
                 if from_page <= pn < to_page:
@@ -417,7 +458,7 @@ class Docx(DocxParser):
                     if "w:br" in xml and 'type="page"' in xml:
                         pn += 1
 
-            elif block.tag.endswith('tbl'):
+            elif block.tag.endswith("tbl"):
                 if pn < from_page or pn > to_page:
                     table_idx += 1
                     continue
@@ -455,7 +496,6 @@ class Docx(DocxParser):
 
         return new_line
 
-
     def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
         """
         This function uses mammoth, licensed under the BSD 2-Clause License.
@@ -486,8 +526,7 @@ class Docx(DocxParser):
 
         try:
             if inline_images:
-                result = mammoth.convert_to_html(docx_file,
-                                                 convert_image=mammoth.images.img_element(_convert_image_to_base64))
+                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
             else:
                 result = mammoth.convert_to_html(docx_file)
 
@@ -505,18 +544,11 @@ class Pdf(PdfParser):
     def __init__(self):
         super().__init__()
 
-    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
         start = timer()
         first_start = start
         callback(msg="OCR started")
-        self.__images__(
-            filename if not binary else binary,
-            zoomin,
-            from_page,
-            to_page,
-            callback
-        )
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
         logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
 
@@ -559,13 +591,14 @@ class Markdown(MarkdownParser):
             return []
 
         from bs4 import BeautifulSoup
+
         html_content = markdown(text)
-        soup = BeautifulSoup(html_content, 'html.parser')
+        soup = BeautifulSoup(html_content, "html.parser")
         return soup
 
     def get_hyperlink_urls(self, soup):
         if soup:
-            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
+            return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
         return []
 
     def extract_image_urls_with_lines(self, text):
@@ -588,10 +621,10 @@ class Markdown(MarkdownParser):
         try:
             from bs4 import BeautifulSoup
 
-            soup = BeautifulSoup(text, 'html.parser')
+            soup = BeautifulSoup(text, "html.parser")
             newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
-            for img_tag in soup.find_all('img'):
-                src = img_tag.get('src')
+            for img_tag in soup.find_all("img"):
+                src = img_tag.get("src")
                 if not src:
                     continue
 
@@ -627,14 +660,14 @@ class Markdown(MarkdownParser):
                 continue
             img_obj = None
             try:
-                if url.startswith(('http://', 'https://')):
+                if url.startswith(("http://", "https://")):
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
-                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
+                    if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
+                        img_obj = Image.open(BytesIO(response.content)).convert("RGB")
                 else:
                     local_path = Path(url)
                     if local_path.exists():
-                        img_obj = Image.open(url).convert('RGB')
+                        img_obj = Image.open(url).convert("RGB")
                     else:
                         logging.warning(f"Local image file not found: {url}")
             except Exception as e:
@@ -652,7 +685,7 @@ class Markdown(MarkdownParser):
             with open(filename, "r") as f:
                 txt = f.read()
 
-        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
+        remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
@@ -678,7 +711,7 @@ class Markdown(MarkdownParser):
 
         tbls = []
         for table in tables:
-            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+            tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
         if return_section_images:
             return sections, tbls, section_images
         return sections, tbls
@@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
     if rels_item_xml is not None:
         rels_elm = parse_xml(rels_item_xml)
         for rel_elm in rels_elm.Relationship_lst:
-            if rel_elm.target_ref in ('../NULL', 'NULL'):
+            if rel_elm.target_ref in ("../NULL", "NULL"):
                 continue
             srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
     return srels
@@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
 
 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
-        Supported file formats are docx, pdf, excel, txt.
-        This method apply the naive ways to chunk files.
-        Successive text will be sliced into pieces using 'delimiter'.
-        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
+    Supported file formats are docx, pdf, excel, txt.
+    This method apply the naive ways to chunk files.
+    Successive text will be sliced into pieces using 'delimiter'.
+    Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
     urls = set()
     url_res = []
 
     is_english = lang.lower() == "english"  # is_english(cks)
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
 
-    child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode(
-        'latin1').decode('utf-8')
+    child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
     cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
     child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
     if cust_child_deli:
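The chunk() docstring in the hunk above summarises the strategy: text is first sliced at delimiter characters, then consecutive pieces are merged until a chunk reaches the configured token budget. A toy illustration of that slice-then-merge idea, using whitespace word counts in place of real tokenization; this is not the actual rag.nlp.naive_merge implementation:

```python
import re

def toy_naive_merge(text: str, max_tokens: int = 128, delimiter: str = "\n!?。;!?") -> list[str]:
    # Slice on any delimiter character, then merge successive pieces until the budget is hit.
    pieces = [p for p in re.split(f"[{re.escape(delimiter)}]", text) if p.strip()]
    chunks, current = [], ""
    for piece in pieces:
        candidate = (current + " " + piece).strip()
        # Whitespace word count stands in for num_tokens_from_string().
        if current and len(candidate.split()) > max_tokens:
            chunks.append(current)
            current = piece.strip()
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks
```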
@@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
     image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
 
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
@@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         # Recursively chunk each embedded file and collect results
         for embed_filename, embed_bytes in embeds:
             try:
-                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False,
-                                **kwargs) or []
+                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
                 embed_res.extend(sub_res)
             except Exception as e:
                 error_msg = f"Failed to chunk embed {embed_filename}: {e}"
@@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                 sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             except Exception as e:
                 logging.info(f"Failed to chunk url in registered file type {url}: {e}")
-                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False,
-                                    **kwargs)
+                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             url_res.extend(sub_url_res)
 
     # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
@@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
         # chunks list[dict]
        # images list - index of image chunk in chunks
-        chunks, images = naive_merge_docx(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"), table_context_size, image_context_size)
+        chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)
 
         vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
 
         callback(0.8, "Finish parsing.")
@@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         return res
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        layout_recognizer, parser_model_name = normalize_layout_recognizer(
-            parser_config.get("layout_recognize", "DeepDOC")
-        )
+        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
 
         if parser_config.get("analyze_hyperlink", False) and is_root:
             urls = extract_links_from_pdf(binary)
@@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             callback=callback,
             layout_recognizer=layout_recognizer,
             mineru_llm_name=parser_model_name,
-            **kwargs
+            paddleocr_llm_name=parser_model_name,
+            **kwargs,
         )
 
         if not sections and not tables:
@@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         if table_context_size or image_context_size:
             tables = append_context2table_image4pdf(sections, tables, image_context_size)
 
-        if name in ["tcadp", "docling", "mineru"]:
+        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
             parser_config["chunk_token_num"] = 0
 
         res = tokenize_table(tables, doc, is_english)
@@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         if layout_recognizer == "TCADP Parser":
             table_result_type = parser_config.get("table_result_type", "1")
             markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
-            tcadp_parser = TCADPParser(
-                table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
-            )
+            tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
             if not tcadp_parser.check_installation():
                 callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
                 return res
@@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             # Determine file type based on extension
             file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
 
-            sections, tables = tcadp_parser.parse_pdf(
-                filepath=filename,
-                binary=binary,
-                callback=callback,
-                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
-                file_type=file_type
-            )
+            sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
             parser_config["chunk_token_num"] = 0
             res = tokenize_table(tables, doc, is_english)
             callback(0.8, "Finish parsing.")
@@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename, binary,
-                               parser_config.get("chunk_token_num", 128),
-                               parser_config.get("delimiter", "\n!?;。;!?"))
+        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
@@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                     else:
                         section_images = [None] * len(sections)
                     section_images[idx] = combined_image
-                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
-                        ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
+                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
-                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]),
-                                     sections[idx][1])
+                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
 
                 else:
                     logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
         binary = BytesIO(binary)
         doc_parsed = tika_parser.from_buffer(binary)
-        if doc_parsed.get('content', None) is not None:
-            sections = doc_parsed['content'].split('\n')
+        if doc_parsed.get("content", None) is not None:
+            sections = doc_parsed["content"].split("\n")
             sections = [(_, "") for _ in sections if _]
             callback(0.8, "Finish parsing.")
         else:
@@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             logging.warning(error_msg)
             return []
     else:
-        raise NotImplementedError(
-            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
+        raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
     if is_markdown:
@@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         has_images = merged_images and any(img is not None for img in merged_images)
 
         if has_images:
-            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images,
-                                                   child_delimiters_pattern=child_deli))
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
         else:
             res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
     else:
@@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             section_images = None
 
         if section_images:
-            chunks, images = naive_merge_with_images(sections, section_images,
-                                                     int(parser_config.get(
-                                                         "chunk_token_num", 128)), parser_config.get(
-                                                         "delimiter", "\n!?。;!?"))
-            res.extend(
-                tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
+            chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
         else:
-            chunks = naive_merge(
-                sections, int(parser_config.get(
-                    "chunk_token_num", 128)), parser_config.get(
-                    "delimiter", "\n!?。;!?"))
+            chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
 
             res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
 
@@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
     import sys
 
-
     def dummy(prog=None, msg=""):
         pass
 
-
     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)