# # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import logging import re import os from functools import reduce from io import BytesIO from timeit import default_timer as timer from docx import Document from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml from markdown import markdown from PIL import Image from common.token_utils import num_tokens_from_string from common.constants import LLMType from api.db.services.llm_service import LLMBundle from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() sections, tables = pdf_parser( filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback ) tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, **kwargs) return sections, tables, pdf_parser def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "MinerU not found.") return None, None, pdf_parser sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), backend=os.environ.get("MINERU_BACKEND", "pipeline"), server_url=os.environ.get("MINERU_SERVER_URL", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), parse_method=parse_method ) return sections, tables, pdf_parser def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): pdf_parser = DoclingParser() parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "Docling not found.") return None, None, pdf_parser sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), parse_method=parse_method ) return sections, tables, pdf_parser def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): tcadp_parser = TCADPParser() if not tcadp_parser.check_installation(): callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return None, None, tcadp_parser sections, tables = tcadp_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF" ) return sections, tables, tcadp_parser def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): if kwargs.get("layout_recognizer", "") == "Plain Text": pdf_parser = PlainParser() else: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese")) pdf_parser = VisionParser(vision_model=vision_model, **kwargs) sections, tables = pdf_parser( filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback ) return sections, tables, pdf_parser PARSERS = { "deepdoc": by_deepdoc, "mineru": by_mineru, "docling": by_docling, "tcadp": by_tcadp, "plaintext": by_plaintext, # default } class Docx(DocxParser): def __init__(self): pass def get_picture(self, document, paragraph): imgs = paragraph._element.xpath('.//pic:pic') if not imgs: return None res_img = None for img in imgs: embed = img.xpath('.//a:blip/@r:embed') if not embed: continue embed = embed[0] try: related_part = document.part.related_parts[embed] image_blob = related_part.image.blob except UnrecognizedImageError: logging.info("Unrecognized image format. Skipping image.") continue except UnexpectedEndOfFileError: logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.") continue except InvalidImageStreamError: logging.info("The recognized image stream appears to be corrupted. Skipping image.") continue except UnicodeDecodeError: logging.info("The recognized image stream appears to be corrupted. Skipping image.") continue except Exception: logging.info("The recognized image stream appears to be corrupted. Skipping image.") continue try: image = Image.open(BytesIO(image_blob)).convert('RGB') if res_img is None: res_img = image else: res_img = concat_img(res_img, image) except Exception: continue return res_img def __clean(self, line): line = re.sub(r"\u3000", " ", line).strip() return line def __get_nearest_title(self, table_index, filename): """Get the hierarchical title structure before the table""" import re from docx.text.paragraph import Paragraph titles = [] blocks = [] # Get document name from filename parameter doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) if not doc_name: doc_name = "Untitled Document" # Collect all document blocks while maintaining document order try: # Iterate through all paragraphs and tables in document order for i, block in enumerate(self.doc._element.body): if block.tag.endswith('p'): # Paragraph p = Paragraph(block, self.doc) blocks.append(('p', i, p)) elif block.tag.endswith('tbl'): # Table blocks.append(('t', i, None)) # Table object will be retrieved later except Exception as e: logging.error(f"Error collecting blocks: {e}") return "" # Find the target table position target_table_pos = -1 table_count = 0 for i, (block_type, pos, _) in enumerate(blocks): if block_type == 't': if table_count == table_index: target_table_pos = pos break table_count += 1 if target_table_pos == -1: return "" # Target table not found # Find the nearest heading paragraph in reverse order nearest_title = None for i in range(len(blocks)-1, -1, -1): block_type, pos, block = blocks[i] if pos >= target_table_pos: # Skip blocks after the table continue if block_type != 'p': continue if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): try: level_match = re.search(r"(\d+)", block.style.name) if level_match: level = int(level_match.group(1)) if level <= 7: # Support up to 7 heading levels title_text = block.text.strip() if title_text: # Avoid empty titles nearest_title = (level, title_text) break except Exception as e: logging.error(f"Error parsing heading level: {e}") if nearest_title: # Add current title titles.append(nearest_title) current_level = nearest_title[0] # Find all parent headings, allowing cross-level search while current_level > 1: found = False for i in range(len(blocks)-1, -1, -1): block_type, pos, block = blocks[i] if pos >= target_table_pos: # Skip blocks after the table continue if block_type != 'p': continue if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): try: level_match = re.search(r"(\d+)", block.style.name) if level_match: level = int(level_match.group(1)) # Find any heading with a higher level if level < current_level: title_text = block.text.strip() if title_text: # Avoid empty titles titles.append((level, title_text)) current_level = level found = True break except Exception as e: logging.error(f"Error parsing parent heading: {e}") if not found: # Break if no parent heading is found break # Sort by level (ascending, from highest to lowest) titles.sort(key=lambda x: x[0]) # Organize titles (from highest to lowest) hierarchy = [doc_name] + [t[1] for t in titles] return " > ".join(hierarchy) return "" def __call__(self, filename, binary=None, from_page=0, to_page=100000): self.doc = Document( filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] last_image = None for p in self.doc.paragraphs: if pn > to_page: break if from_page <= pn < to_page: if p.text.strip(): if p.style and p.style.name == 'Caption': former_image = None if lines and lines[-1][1] and lines[-1][2] != 'Caption': former_image = lines[-1][1].pop() elif last_image: former_image = last_image last_image = None lines.append((self.__clean(p.text), [former_image], p.style.name)) else: current_image = self.get_picture(self.doc, p) image_list = [current_image] if last_image: image_list.insert(0, last_image) last_image = None lines.append((self.__clean(p.text), image_list, p.style.name if p.style else "")) else: if current_image := self.get_picture(self.doc, p): if lines: lines[-1][1].append(current_image) else: last_image = current_image for run in p.runs: if 'lastRenderedPageBreak' in run._element.xml: pn += 1 continue if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: pn += 1 new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines] tbls = [] for i, tb in enumerate(self.doc.tables): title = self.__get_nearest_title(i, filename) html = "" if title: html += f"" for r in tb.rows: html += "" i = 0 try: while i < len(r.cells): span = 1 c = r.cells[i] for j in range(i + 1, len(r.cells)): if c.text == r.cells[j].text: span += 1 i = j else: break i += 1 html += f"" if span == 1 else f"" except Exception as e: logging.warning(f"Error parsing table, ignore: {e}") html += "" html += "
Table Location: {title}
{c.text}{c.text}
" tbls.append(((None, html), "")) return new_line, tbls def to_markdown(self, filename=None, binary=None, inline_images: bool = True): """ This function uses mammoth, licensed under the BSD 2-Clause License. """ import base64 import uuid import mammoth from markdownify import markdownify docx_file = BytesIO(binary) if binary else open(filename, "rb") def _convert_image_to_base64(image): try: with image.open() as image_file: image_bytes = image_file.read() encoded = base64.b64encode(image_bytes).decode("utf-8") base64_url = f"data:{image.content_type};base64,{encoded}" alt_name = "image" alt_name = f"img_{uuid.uuid4().hex[:8]}" return {"src": base64_url, "alt": alt_name} except Exception as e: logging.warning(f"Failed to convert image to base64: {e}") return {"src": "", "alt": "image"} try: if inline_images: result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64)) else: result = mammoth.convert_to_html(docx_file) html = result.value markdown_text = markdownify(html) return markdown_text finally: if not binary: docx_file.close() class Pdf(PdfParser): def __init__(self): super().__init__() def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False): start = timer() first_start = start callback(msg="OCR started") self.__images__( filename if not binary else binary, zoomin, from_page, to_page, callback ) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) start = timer() self._layouts_rec(zoomin) callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) start = timer() self._table_transformer_job(zoomin) callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start)) start = timer() self._text_merge(zoomin=zoomin) callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) if separate_tables_figures: tbls, figures = self._extract_table_figure(True, zoomin, True, True, True) self._concat_downward() logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures else: tbls = self._extract_table_figure(True, zoomin, True, True) self._naive_vertical_merge() self._concat_downward() self._final_reading_order_merge() # self._filter_forpages() logging.info("layouts cost: {}s".format(timer() - first_start)) return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls class Markdown(MarkdownParser): def md_to_html(self, sections): if not sections: return [] if isinstance(sections, type("")): text = sections elif isinstance(sections[0], type("")): text = sections[0] else: return [] from bs4 import BeautifulSoup html_content = markdown(text) soup = BeautifulSoup(html_content, 'html.parser') return soup def get_hyperlink_urls(self, soup): if soup: return set([a.get('href') for a in soup.find_all('a') if a.get('href')]) return [] def extract_image_urls_with_lines(self, text): md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)") html_img_re = re.compile(r'src=["\\\']([^"\\\'>\\s]+)', re.IGNORECASE) urls = [] seen = set() lines = text.splitlines() for idx, line in enumerate(lines): for url in md_img_re.findall(line): if (url, idx) not in seen: urls.append({"url": url, "line": idx}) seen.add((url, idx)) for url in html_img_re.findall(line): if (url, idx) not in seen: urls.append({"url": url, "line": idx}) seen.add((url, idx)) # cross-line try: from bs4 import BeautifulSoup soup = BeautifulSoup(text, 'html.parser') newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)] for img_tag in soup.find_all('img'): src = img_tag.get('src') if not src: continue tag_str = str(img_tag) pos = text.find(tag_str) if pos == -1: # fallback pos = max(text.find(src), 0) line_no = 0 for i, off in enumerate(newline_offsets): if pos <= off: line_no = i break if (src, line_no) not in seen: urls.append({"url": src, "line": line_no}) seen.add((src, line_no)) except Exception: pass return urls def load_images_from_urls(self, urls, cache=None): import requests from pathlib import Path cache = cache or {} images = [] for url in urls: if url in cache: if cache[url]: images.append(cache[url]) continue img_obj = None try: if url.startswith(('http://', 'https://')): response = requests.get(url, stream=True, timeout=30) if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'): img_obj = Image.open(BytesIO(response.content)).convert('RGB') else: local_path = Path(url) if local_path.exists(): img_obj = Image.open(url).convert('RGB') else: logging.warning(f"Local image file not found: {url}") except Exception as e: logging.error(f"Failed to download/open image from {url}: {e}") cache[url] = img_obj if img_obj: images.append(img_obj) return images, cache def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False): if binary: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") else: with open(filename, "r") as f: txt = f.read() remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410. # extractor = MarkdownElementExtractor(remainder) extractor = MarkdownElementExtractor(txt) image_refs = self.extract_image_urls_with_lines(txt) element_sections = extractor.extract_elements(delimiter, include_meta=True) sections = [] section_images = [] image_cache = {} for element in element_sections: content = element["content"] start_line = element["start_line"] end_line = element["end_line"] urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line] imgs = [] if urls_in_section: imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache) combined_image = None if imgs: combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0] sections.append((content, "")) section_images.append(combined_image) tbls = [] for table in tables: tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) if return_section_images: return sections, tbls, section_images return sections, tbls def load_from_xml_v2(baseURI, rels_item_xml): """ Return |_SerializedRelationships| instance loaded with the relationships contained in *rels_item_xml*. Returns an empty collection if *rels_item_xml* is |None|. """ srels = _SerializedRelationships() if rels_item_xml is not None: rels_elm = parse_xml(rels_item_xml) for rel_elm in rels_elm.Relationship_lst: if rel_elm.target_ref in ('../NULL', 'NULL'): continue srels._srels.append(_SerializedRelationship(baseURI, rel_elm)) return srels def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ Supported file formats are docx, pdf, excel, txt. This method apply the naive ways to chunk files. Successive text will be sliced into pieces using 'delimiter'. Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. """ urls = set() url_res = [] is_english = lang.lower() == "english" # is_english(cks) parser_config = kwargs.get( "parser_config", { "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0)) image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0)) final_sections = False doc = { "docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] pdf_parser = None section_images = None is_root = kwargs.get("is_root", True) embed_res = [] if is_root: # Only extract embedded files at the root call embeds = [] if binary is not None: embeds = extract_embed_file(binary) else: raise Exception("Embedding extraction from file path is not supported.") # Recursively chunk each embedded file and collect results for embed_filename, embed_bytes in embeds: try: sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or [] embed_res.extend(sub_res) except Exception as e: if callback: callback(0.05, f"Failed to chunk embed {embed_filename}: {e}") continue if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_docx(binary) for index, url in enumerate(urls): html_bytes, metadata = extract_html(url) if not html_bytes: continue try: sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) except Exception as e: logging.info(f"Failed to chunk url in registered file type {url}: {e}") sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 sections, tables = Docx()(filename, binary) tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") st = timer() chunks, images = naive_merge_docx( sections, int(parser_config.get( "chunk_token_num", 128)), parser_config.get( "delimiter", "\n!?。;!?")) if kwargs.get("section_only", False): chunks.extend(embed_res) chunks.extend(url_res) return chunks res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) logging.info("naive_merge({}): {}".format(filename, timer() - st)) res.extend(embed_res) res.extend(url_res) if table_context_size or image_context_size: attach_media_context(res, table_context_size, image_context_size) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_pdf(binary) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") sections, tables, pdf_parser = parser( filename = filename, binary = binary, from_page = from_page, to_page = to_page, lang = lang, callback = callback, layout_recognizer = layout_recognizer, **kwargs ) if not sections and not tables: return [] if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") # Check if tcadp_parser is selected for spreadsheet files layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") if layout_recognizer == "TCADP Parser": table_result_type = parser_config.get("table_result_type", "1") markdown_image_response_type = parser_config.get("markdown_image_response_type", "1") tcadp_parser = TCADPParser( table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type ) if not tcadp_parser.check_installation(): callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return res # Determine file type based on extension file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV" sections, tables = tcadp_parser.parse_pdf( filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type ) parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") else: # Default DeepDOC parser excel_parser = ExcelParser() if parser_config.get("html4excel"): sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] parser_config["chunk_token_num"] = 0 else: sections = [(_, "") for _ in excel_parser(binary) if _] elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) callback(0.8, "Finish parsing.") elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) sections, tables, section_images = markdown_parser( filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"), return_section_images=True, ) final_sections = True try: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") except Exception: vision_model = None if vision_model: # Process images for each section for idx, (section_text, _) in enumerate(sections): images = [] if section_images and len(section_images) > idx and section_images[idx] is not None: images.append(section_images[idx]) if images and len(images) > 0: # If multiple images found, combine them using concat_img combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] if section_images: section_images[idx] = combined_image else: section_images = [None] * len(sections) section_images[idx] = combined_image markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs) boosted_figures = markdown_vision_parser(callback=callback) sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1]) else: logging.warning("No visual model detected. Skipping figure parsing enhancement.") if parser_config.get("hyperlink_urls", False) and is_root: for idx, (section_text, _) in enumerate(sections): soup = markdown_parser.md_to_html(section_text) hyperlink_urls = markdown_parser.get_hyperlink_urls(soup) urls.update(hyperlink_urls) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) sections = HtmlParser()(filename, binary, chunk_token_num) sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) sections = JsonParser(chunk_token_num)(binary) sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") elif re.search(r"\.doc$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") try: from tika import parser as tika_parser except Exception as e: callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.") logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.") return [] binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) if doc_parsed.get('content', None) is not None: sections = doc_parsed['content'].split('\n') sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") else: callback(0.8, f"tika.parser got empty content from {filename}.") logging.warning(f"tika.parser got empty content from {filename}.") return [] else: raise NotImplementedError( "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") st = timer() if final_sections: merged_chunks = [] merged_images = [] chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128))) overlapped_percent = int(parser_config.get("overlapped_percent", 0)) overlapped_percent = max(0, min(overlapped_percent, 90)) current_text = "" current_tokens = 0 current_image = None for idx, sec in enumerate(sections): text = sec[0] if isinstance(sec, tuple) else sec sec_tokens = num_tokens_from_string(text) sec_image = section_images[idx] if section_images and idx < len(section_images) else None if current_text and current_tokens + sec_tokens > chunk_limit: merged_chunks.append(current_text) merged_images.append(current_image) overlap_part = "" if overlapped_percent > 0: overlap_len = int(len(current_text) * overlapped_percent / 100) if overlap_len > 0: overlap_part = current_text[-overlap_len:] current_text = overlap_part current_tokens = num_tokens_from_string(current_text) current_image = current_image if overlap_part else None if current_text: current_text += "\n" + text else: current_text = text current_tokens += sec_tokens if sec_image: current_image = concat_img(current_image, sec_image) if current_image else sec_image if current_text: merged_chunks.append(current_text) merged_images.append(current_image) chunks = merged_chunks has_images = merged_images and any(img is not None for img in merged_images) if kwargs.get("section_only", False): chunks.extend(embed_res) return chunks if has_images: res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images)) else: res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) else: if section_images: if all(image is None for image in section_images): section_images = None if section_images: chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get( "chunk_token_num", 128)), parser_config.get( "delimiter", "\n!?。;!?")) if kwargs.get("section_only", False): chunks.extend(embed_res) return chunks res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) else: chunks = naive_merge( sections, int(parser_config.get( "chunk_token_num", 128)), parser_config.get( "delimiter", "\n!?。;!?")) if kwargs.get("section_only", False): chunks.extend(embed_res) return chunks res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) if urls and parser_config.get("analyze_hyperlink", False) and is_root: for index, url in enumerate(urls): html_bytes, metadata = extract_html(url) if not html_bytes: continue try: sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) except Exception as e: logging.info(f"Failed to chunk url in registered file type {url}: {e}") sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) logging.info("naive_merge({}): {}".format(filename, timer() - st)) if embed_res: res.extend(embed_res) if url_res: res.extend(url_res) if table_context_size or image_context_size: attach_media_context(res, table_context_size, image_context_size) return res if __name__ == "__main__": import sys def dummy(prog=None, msg=""): pass chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)