Feat: parsing hyperlinks in docx and pdf & Fix: default parser config of toc extraction (#10877)

### What problem does this PR solve?

Feat: parsing hyperlinks in docx and pdf #10848
Fix: default parser config of toc extraction

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Billy Bao
Date: 2025-11-03 09:34:12 +08:00 (committed by GitHub)
Parent: 360f5c1179
Commit: fa210e7c58
3 changed files with 181 additions and 11 deletions


@@ -366,7 +366,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
     page_size = doc["parser_config"].get("task_page_size") or 12
     if doc["parser_id"] == "paper":
         page_size = doc["parser_config"].get("task_page_size") or 22
-    if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
+    if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc_extraction", False):
         page_size = 10 ** 9
     page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
     for s, e in page_ranges:
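
For reference, a minimal sketch of what the corrected default changes (the helper and config dicts below are made up for illustration, not part of the diff): with the old `.get("toc", True)`, any document whose parser_config never mentioned TOC extraction was forced into a single huge task; now that only happens when `toc_extraction` is explicitly enabled.

```python
# Hypothetical standalone sketch; only the "toc_extraction" key mirrors the hunk above.
def pick_page_size(parser_config: dict) -> int:
    page_size = parser_config.get("task_page_size") or 12
    if parser_config.get("toc_extraction", False):  # new default: off unless enabled
        page_size = 10 ** 9                         # whole document in one task
    return page_size

print(pick_page_size({}))                           # 12 -> normal pagination
print(pick_page_size({"toc_extraction": True}))     # 1000000000 -> single task
```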


@@ -28,10 +28,14 @@ import sys
 import tempfile
 import threading
 import zipfile
+import requests
+import PyPDF2
+from docx import Document
 from io import BytesIO
+from requests.exceptions import Timeout, RequestException

 # Typing
-from typing import List, Union, Tuple
+from typing import List, Union, Tuple, Optional, Dict

 # Third-party imports
 import olefile
@@ -373,3 +377,120 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
             return out

     return out
+
+
+def extract_links_from_docx(docx_bytes: bytes):
+    """
+    Extract all hyperlinks from a Word (.docx) document binary stream.
+
+    Args:
+        docx_bytes (bytes): Raw bytes of a .docx file.
+
+    Returns:
+        set[str]: A set of unique hyperlink URLs.
+    """
+    links = set()
+    with BytesIO(docx_bytes) as bio:
+        document = Document(bio)
+        # Each relationship may represent a hyperlink, image, footer, etc.
+        for rel in document.part.rels.values():
+            if rel.reltype == (
+                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
+            ):
+                links.add(rel.target_ref)
+    return links
+
+
+def extract_links_from_pdf(pdf_bytes: bytes):
+    """
+    Extract all clickable hyperlinks from a PDF binary stream.
+
+    Args:
+        pdf_bytes (bytes): Raw bytes of a PDF file.
+
+    Returns:
+        set[str]: A set of unique hyperlink URLs (unordered).
+    """
+    links = set()
+    with BytesIO(pdf_bytes) as bio:
+        pdf = PyPDF2.PdfReader(bio)
+        for page in pdf.pages:
+            annots = page.get("/Annots")
+            if not annots or isinstance(annots, PyPDF2.generic.IndirectObject):
+                continue
+            for annot in annots:
+                obj = annot.get_object()
+                a = obj.get("/A")
+                if a and a.get("/URI"):
+                    links.add(a["/URI"])
+    return links
+
+
+_GLOBAL_SESSION: Optional[requests.Session] = None
+
+
+def _get_session(headers: Optional[Dict[str, str]] = None) -> requests.Session:
+    """Get or create a global reusable session."""
+    global _GLOBAL_SESSION
+    if _GLOBAL_SESSION is None:
+        _GLOBAL_SESSION = requests.Session()
+        _GLOBAL_SESSION.headers.update({
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/121.0 Safari/537.36"
+            )
+        })
+    if headers:
+        _GLOBAL_SESSION.headers.update(headers)
+    return _GLOBAL_SESSION
+
+
+def extract_html(
+    url: str,
+    timeout: float = 60.0,
+    headers: Optional[Dict[str, str]] = None,
+    max_retries: int = 2,
+) -> Tuple[Optional[bytes], Dict[str, str]]:
+    """
+    Extract the full HTML page as raw bytes from a given URL.
+
+    Automatically reuses a persistent HTTP session and applies robust timeout & retry logic.
+
+    Args:
+        url (str): Target webpage URL.
+        timeout (float): Request timeout in seconds (applies to connect + read).
+        headers (dict, optional): Extra HTTP headers.
+        max_retries (int): Number of retries on timeout or transient errors.
+
+    Returns:
+        tuple(bytes|None, dict):
+            - html_bytes: Raw HTML content (or None if failed)
+            - metadata: HTTP info (status_code, content_type, final_url, error if any)
+    """
+    sess = _get_session(headers=headers)
+    metadata = {"final_url": url, "status_code": "", "content_type": "", "error": ""}
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            resp = sess.get(url, timeout=timeout)
+            resp.raise_for_status()
+            html_bytes = resp.content
+            metadata.update({
+                "final_url": resp.url,
+                "status_code": str(resp.status_code),
+                "content_type": resp.headers.get("Content-Type", ""),
+            })
+            return html_bytes, metadata
+        except Timeout:
+            metadata["error"] = f"Timeout after {timeout}s (attempt {attempt}/{max_retries})"
+            if attempt >= max_retries:
+                continue
+        except RequestException as e:
+            metadata["error"] = f"Request failed: {e}"
+            continue

+    return None, metadata
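
As a small usage sketch of the helpers added above (the file path is a placeholder; error handling kept minimal):

```python
# Sketch only: run the new helpers against a local .docx file.
from api.utils.file_utils import extract_links_from_docx, extract_html

with open("sample.docx", "rb") as f:   # placeholder path
    docx_bytes = f.read()

for url in extract_links_from_docx(docx_bytes):
    html_bytes, metadata = extract_html(url, timeout=30.0, max_retries=2)
    if html_bytes is None:
        print(f"fetch failed for {url}: {metadata['error']}")
        continue
    print(metadata["final_url"], metadata["status_code"], metadata["content_type"], len(html_bytes))
```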


@@ -30,7 +30,7 @@ from tika import parser
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
-from api.utils.file_utils import extract_embed_file
+from api.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
@@ -351,7 +351,7 @@ class Pdf(PdfParser):


 class Markdown(MarkdownParser):
-    def get_picture_urls(self, sections):
+    def md_to_html(self, sections):
         if not sections:
             return []
         if isinstance(sections, type("")):
@@ -364,13 +364,23 @@ class Markdown(MarkdownParser):
         from bs4 import BeautifulSoup
         html_content = markdown(text)
         soup = BeautifulSoup(html_content, 'html.parser')
-        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return html_images
+        return soup
+
+    def get_picture_urls(self, soup):
+        if soup:
+            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return []
+
+    def get_hyperlink_urls(self, soup):
+        if soup:
+            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
+        return []

     def get_pictures(self, text):
         """Download and open all images from markdown text."""
         import requests
-        image_urls = self.get_picture_urls(text)
+        soup = self.md_to_html(text)
+        image_urls = self.get_picture_urls(soup)
         images = []
         # Find all image URLs in text
         for url in image_urls:
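
A sketch of how the refactored helpers fit together, assuming `md` is an instance of the Markdown class above: one `md_to_html()` pass now feeds both the image and hyperlink extractors instead of re-parsing the markdown twice.

```python
# Sketch: a single BeautifulSoup tree serves both extractors.
text = "A [link](https://example.com) and an image ![logo](https://example.com/logo.png)"
soup = md.md_to_html(text)             # markdown -> BeautifulSoup document
print(md.get_picture_urls(soup))       # ['https://example.com/logo.png']
print(md.get_hyperlink_urls(soup))     # {'https://example.com'}
```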
@@ -439,12 +449,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     Successive text will be sliced into pieces using 'delimiter'.
     Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
+    urls = set()
+    url_res = []
     is_english = lang.lower() == "english"  # is_english(cks)
     parser_config = kwargs.get(
         "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
+            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -476,8 +488,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_docx(binary)
+            for index, url in enumerate(urls):
+                html_bytes, metadata = extract_html(url)
+                if not html_bytes:
+                    continue
+                try:
+                    sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+                except Exception as e:
+                    logging.info(f"Failed to chunk url in registered file type {url}: {e}")
+                    sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+                url_res.extend(sub_url_res)
+
         # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
         _SerializedRelationships.load_from_xml = load_from_xml_v2
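
A caller opts into this path through `parser_config`, roughly as sketched below. The keys follow the hunks above; the file path, `lang`, and the callback stub are illustrative, since the full `chunk()` signature is not shown in this diff.

```python
# Sketch: enable hyperlink analysis when chunking a .docx (values are illustrative).
with open("report.docx", "rb") as f:
    binary = f.read()

res = chunk(
    "report.docx",
    binary=binary,
    lang="English",
    callback=lambda prog=None, msg="": None,   # progress callback stub
    parser_config={
        "chunk_token_num": 512,
        "delimiter": "\n!?。;!?",
        "layout_recognize": "DeepDOC",
        "analyze_hyperlink": True,             # fetch linked pages and chunk them too
    },
)
```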
@@ -497,15 +519,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
+            chunks.extend(url_res)
             return chunks

         res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         res.extend(embed_res)
+        res.extend(url_res)
         return res

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_pdf(binary)
+
         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
         callback(0.1, "Start to parse.")
@@ -623,9 +650,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                         sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
                     else:
                         section_images.append(None)
             else:
                 logging.warning("No visual model detected. Skipping figure parsing enhancement.")

+        if parser_config.get("hyperlink_urls", False) and is_root:
+            for idx, (section_text, _) in enumerate(sections):
+                soup = markdown_parser.md_to_html(section_text)
+                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
+                urls.update(hyperlink_urls)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
@@ -645,6 +678,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
         if doc_parsed.get('content', None) is not None:
@@ -686,9 +720,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

+    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
+        for index, url in enumerate(urls):
+            html_bytes, metadata = extract_html(url)
+            if not html_bytes:
+                continue
+            try:
+                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+            except Exception as e:
+                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
+                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+            url_res.extend(sub_url_res)
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     if embed_res:
         res.extend(embed_res)
+    if url_res:
+        res.extend(url_res)
     return res