#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import os
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer
from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
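
# Each by_* helper below follows the same contract: parse the document and return a
# (sections, tables, parser) triple so chunk() can dispatch uniformly through PARSERS.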
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    pdf_parser = pdf_cls() if pdf_cls else Pdf()
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
callback=callback,
**kwargs)
return sections, tables, pdf_parser
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
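    # MinerU can run as a local executable or behind an API server; the binary path,
    # endpoint, backend, and output cleanup are all configured through MINERU_* env vars.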
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
if not pdf_parser.check_installation():
callback(-1, "MinerU not found.")
        return None, None, None
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
pdf_parser = DoclingParser()
if not pdf_parser.check_installation():
callback(-1, "Docling not found.")
        return None, None, None
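    # Note: output_dir and delete_output below reuse the MINERU_* env vars;
    # there are no Docling-specific equivalents here.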
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
        return None, None, None
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
return sections, tables, tcadp_parser
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
if kwargs.get("layout_recognizer", "") == "Plain Text":
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
return sections, tables, pdf_parser
PARSERS = {
"deepdoc": by_deepdoc,
"mineru": by_mineru,
"docling": by_docling,
"tcadp": by_tcadp,
"plaintext": by_plaintext, # default
}
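
# chunk() lowercases parser_config["layout_recognize"] (e.g. "DeepDOC", "MinerU") and looks
# it up in PARSERS; unrecognized values, including "Plain Text", fall back to by_plaintext.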
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
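        """Extract all inline images from a paragraph and merge them into a single PIL image; return None if there are none."""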
imgs = paragraph._element.xpath('.//pic:pic')
if not imgs:
return None
res_img = None
for img in imgs:
embed = img.xpath('.//a:blip/@r:embed')
if not embed:
continue
embed = embed[0]
try:
related_part = document.part.related_parts[embed]
image_blob = related_part.image.blob
except UnrecognizedImageError:
logging.info("Unrecognized image format. Skipping image.")
continue
except UnexpectedEndOfFileError:
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
continue
            except (InvalidImageStreamError, UnicodeDecodeError):
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
            except Exception:
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
if res_img is None:
res_img = image
else:
res_img = concat_img(res_img, image)
except Exception:
continue
return res_img
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
def __get_nearest_title(self, table_index, filename):
"""Get the hierarchical title structure before the table"""
        from docx.text.paragraph import Paragraph
titles = []
blocks = []
# Get document name from filename parameter
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
if not doc_name:
doc_name = "Untitled Document"
# Collect all document blocks while maintaining document order
try:
# Iterate through all paragraphs and tables in document order
for i, block in enumerate(self.doc._element.body):
if block.tag.endswith('p'): # Paragraph
p = Paragraph(block, self.doc)
blocks.append(('p', i, p))
elif block.tag.endswith('tbl'): # Table
blocks.append(('t', i, None)) # Table object will be retrieved later
except Exception as e:
logging.error(f"Error collecting blocks: {e}")
return ""
# Find the target table position
target_table_pos = -1
table_count = 0
        for block_type, pos, _ in blocks:
if block_type == 't':
if table_count == table_index:
target_table_pos = pos
break
table_count += 1
if target_table_pos == -1:
return "" # Target table not found
# Find the nearest heading paragraph in reverse order
nearest_title = None
for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
if level_match:
level = int(level_match.group(1))
if level <= 7: # Support up to 7 heading levels
title_text = block.text.strip()
if title_text: # Avoid empty titles
nearest_title = (level, title_text)
break
except Exception as e:
logging.error(f"Error parsing heading level: {e}")
if nearest_title:
# Add current title
titles.append(nearest_title)
current_level = nearest_title[0]
# Find all parent headings, allowing cross-level search
while current_level > 1:
found = False
for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
continue
                    if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try:
level_match = re.search(r"(\d+)", block.style.name)
if level_match:
level = int(level_match.group(1))
# Find any heading with a higher level
if level < current_level:
title_text = block.text.strip()
if title_text: # Avoid empty titles
titles.append((level, title_text))
current_level = level
found = True
break
except Exception as e:
logging.error(f"Error parsing parent heading: {e}")
if not found: # Break if no parent heading is found
break
# Sort by level (ascending, from highest to lowest)
titles.sort(key=lambda x: x[0])
# Organize titles (from highest to lowest)
hierarchy = [doc_name] + [t[1] for t in titles]
return " > ".join(hierarchy)
return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
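        """Parse paragraphs within [from_page, to_page) into (text, image) pairs and render tables as captioned HTML."""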
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page:
if p.text.strip():
if p.style and p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]
tbls = []
for i, tb in enumerate(self.doc.tables):
title = self.__get_nearest_title(i, filename)
html = "<table>"
if title:
html += f"<caption>Table Location: {title}</caption>"
for r in tb.rows:
html += "<tr>"
                ci = 0
                try:
                    while ci < len(r.cells):
                        span = 1
                        c = r.cells[ci]
                        # Merge runs of identical neighboring cells into a single colspan
                        for j in range(ci + 1, len(r.cells)):
                            if c.text == r.cells[j].text:
                                span += 1
                                ci = j
                            else:
                                break
                        ci += 1
                        html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
except Exception as e:
logging.warning(f"Error parsing table, ignore: {e}")
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return new_line, tbls
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
"""
This function uses mammoth, licensed under the BSD 2-Clause License.
"""
import base64
import uuid
import mammoth
from markdownify import markdownify
docx_file = BytesIO(binary) if binary else open(filename, "rb")
def _convert_image_to_base64(image):
try:
with image.open() as image_file:
image_bytes = image_file.read()
encoded = base64.b64encode(image_bytes).decode("utf-8")
base64_url = f"data:{image.content_type};base64,{encoded}"
alt_name = "image"
alt_name = f"img_{uuid.uuid4().hex[:8]}"
return {"src": base64_url, "alt": alt_name}
except Exception as e:
logging.warning(f"Failed to convert image to base64: {e}")
return {"src": "", "alt": "image"}
try:
if inline_images:
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
else:
result = mammoth.convert_to_html(docx_file)
html = result.value
markdown_text = markdownify(html)
return markdown_text
finally:
if not binary:
docx_file.close()
class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
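        """Run the DeepDOC pipeline: OCR, layout recognition, table analysis, then merge boxes into position-tagged text lines."""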
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))
start = timer()
self._text_merge(zoomin=zoomin)
callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
if separate_tables_figures:
tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
self._concat_downward()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
else:
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
self._concat_downward()
self._final_reading_order_merge()
# self._filter_forpages()
logging.info("layouts cost: {}s".format(timer() - first_start))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
class Markdown(MarkdownParser):
def md_to_html(self, sections):
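        """Render markdown (a str, or the first element of a list) to a BeautifulSoup tree; return [] if there is nothing to parse."""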
if not sections:
return []
if isinstance(sections, type("")):
text = sections
elif isinstance(sections[0], type("")):
text = sections[0]
else:
return []
from bs4 import BeautifulSoup
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
return soup
def get_picture_urls(self, soup):
if soup:
return [img.get('src') for img in soup.find_all('img') if img.get('src')]
return []
def get_hyperlink_urls(self, soup):
if soup:
            return {a.get('href') for a in soup.find_all('a') if a.get('href')}
        return set()
def get_pictures(self, text):
"""Download and open all images from markdown text."""
import requests
soup = self.md_to_html(text)
image_urls = self.get_picture_urls(soup)
images = []
# Find all image URLs in text
for url in image_urls:
try:
# check if the url is a local file or a remote URL
if url.startswith(('http://', 'https://')):
# For remote URLs, download the image
response = requests.get(url, stream=True, timeout=30)
                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
img = Image.open(BytesIO(response.content)).convert('RGB')
images.append(img)
else:
# For local file paths, open the image directly
from pathlib import Path
local_path = Path(url)
if not local_path.exists():
logging.warning(f"Local image file not found: {url}")
continue
img = Image.open(url).convert('RGB')
images.append(img)
except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}")
continue
return images if images else None
    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
        # To eliminate duplicate tables in the chunking result, pass `remainder` (with tables
        # already stripped) to MarkdownElementExtractor below and set separate_tables=True
        # when calling this parser.
# extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements(delimiter)
sections = [(element, "") for element in element_sections]
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls
def load_from_xml_v2(baseURI, rels_item_xml):
"""
Return |_SerializedRelationships| instance loaded with the
relationships contained in *rels_item_xml*. Returns an empty
collection if *rels_item_xml* is |None|.
"""
srels = _SerializedRelationships()
if rels_item_xml is not None:
rels_elm = parse_xml(rels_item_xml)
for rel_elm in rels_elm.Relationship_lst:
if rel_elm.target_ref in ('../NULL', 'NULL'):
continue
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
urls = set()
url_res = []
    is_english = lang.lower() == "english"
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
section_images = None
is_root = kwargs.get("is_root", True)
embed_res = []
if is_root:
# Only extract embedded files at the root call
embeds = []
if binary is not None:
embeds = extract_embed_file(binary)
        else:
            # Embedded-file extraction needs the raw bytes; when only a path is given,
            # skip it so the file itself can still be chunked (e.g. the __main__ example).
            logging.warning("Embedded-file extraction requires in-memory binary content; skipping for %s.", filename)
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
embed_res.extend(sub_res)
except Exception as e:
if callback:
callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
continue
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_docx(binary)
for index, url in enumerate(urls):
html_bytes, metadata = extract_html(url)
if not html_bytes:
continue
try:
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
        tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
st = timer()
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
chunks.extend(url_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
        sections, tables, pdf_parser = parser(
            filename=filename,
            binary=binary,
            from_page=from_page,
            to_page=to_page,
            lang=lang,
            callback=callback,
            **kwargs
        )
if not sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
# Process images for each section
section_images = []
for idx, (section_text, _) in enumerate(sections):
images = markdown_parser.get_pictures(section_text) if section_text else None
if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
section_images.append(combined_image)
                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
section_images.append(None)
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
if parser_config.get("hyperlink_urls", False) and is_root:
            for section_text, _ in sections:
soup = markdown_parser.md_to_html(section_text)
hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
urls.update(hyperlink_urls)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = HtmlParser()(filename, binary, chunk_token_num)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
chunk_token_num = int(parser_config.get("chunk_token_num", 128))
sections = JsonParser(chunk_token_num)(binary)
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
callback(0.8, f"tika.parser got empty content from {filename}.")
logging.warning(f"tika.parser got empty content from {filename}.")
return []
    else:
        raise NotImplementedError(
            "file type not supported yet (pdf, xlsx/csv, doc, docx, txt, md, html, json supported)")
st = timer()
if section_images:
# if all images are None, set section_images to None
if all(image is None for image in section_images):
section_images = None
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
for index, url in enumerate(urls):
html_bytes, metadata = extract_html(url)
if not html_bytes:
continue
try:
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
logging.info("naive_merge({}): {}".format(filename, timer() - st))
if embed_res:
res.extend(embed_res)
if url_res:
res.extend(url_res)
return res
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
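    # Example: python naive.py /path/to/document.pdf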
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)