Feat: parsing hyperlinks in docx and pdf & Fix: default parser config of toc extraction (#10877)

### What problem does this PR solve?

Feat: parsing hyperlinks in docx and pdf #10848
Fix: default parser config of toc extraction
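For reference, both changes surface through `parser_config`. A minimal sketch of the relevant knowledge-base settings, assuming only the key names introduced in this diff (the surrounding API payload shape is not part of this PR):

```python
# Hedged illustration only: keys come from this diff; everything else is assumed.
parser_config = {
    "chunk_token_num": 512,
    "delimiter": "\n!?。;!?",
    "layout_recognize": "DeepDOC",
    "analyze_hyperlink": True,   # new: follow hyperlinks found in .docx/.pdf and chunk the fetched HTML
    "toc_extraction": False,     # fixed default: TOC extraction is now opt-in instead of implicitly on
}
```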

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Billy Bao
Date: 2025-11-03 09:34:12 +08:00
Committed by: GitHub
Parent: 360f5c1179
Commit: fa210e7c58
3 changed files with 181 additions and 11 deletions


@@ -366,7 +366,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        page_size = doc["parser_config"].get("task_page_size") or 12
        if doc["parser_id"] == "paper":
            page_size = doc["parser_config"].get("task_page_size") or 22
        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc_extraction", False):
            page_size = 10 ** 9
        page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
        for s, e in page_ranges:
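The effect of this one-line fix: a document is no longer forced into a single oversized task just because the old `toc` key was absent; only an explicit `toc_extraction: true` (or the other conditions on that line) disables paging. A rough before/after sketch, ignoring the `parser_id` and layout checks and assuming an empty `parser_config`:

```python
# Hedged sketch, not part of the diff.
parser_config = {}

# Before: parser_config.get("toc", True) -> True, so page_size was forced to 10 ** 9 (one task per file).
# After:  parser_config.get("toc_extraction", False) -> False, so the normal task_page_size (12) applies.
page_size = 10 ** 9 if parser_config.get("toc_extraction", False) else 12
```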


@@ -28,10 +28,14 @@ import sys
import tempfile
import threading
import zipfile
import requests
import PyPDF2
from docx import Document
from io import BytesIO
from requests.exceptions import Timeout, RequestException
# Typing
from typing import List, Union, Tuple
from typing import List, Union, Tuple, Optional, Dict
# Third-party imports
import olefile
@@ -373,3 +377,120 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
            return out
    return out


def extract_links_from_docx(docx_bytes: bytes):
    """
    Extract all hyperlinks from a Word (.docx) document binary stream.

    Args:
        docx_bytes (bytes): Raw bytes of a .docx file.

    Returns:
        set[str]: A set of unique hyperlink URLs.
    """
    links = set()
    with BytesIO(docx_bytes) as bio:
        document = Document(bio)
        # Each relationship may represent a hyperlink, image, footer, etc.
        for rel in document.part.rels.values():
            if rel.reltype == (
                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
            ):
                links.add(rel.target_ref)
    return links


def extract_links_from_pdf(pdf_bytes: bytes):
    """
    Extract all clickable hyperlinks from a PDF binary stream.

    Args:
        pdf_bytes (bytes): Raw bytes of a PDF file.

    Returns:
        set[str]: A set of unique hyperlink URLs (unordered).
    """
    links = set()
    with BytesIO(pdf_bytes) as bio:
        pdf = PyPDF2.PdfReader(bio)
        for page in pdf.pages:
            annots = page.get("/Annots")
            if not annots or isinstance(annots, PyPDF2.generic.IndirectObject):
                continue
            for annot in annots:
                obj = annot.get_object()
                a = obj.get("/A")
                if a and a.get("/URI"):
                    links.add(a["/URI"])
    return links


_GLOBAL_SESSION: Optional[requests.Session] = None


def _get_session(headers: Optional[Dict[str, str]] = None) -> requests.Session:
    """Get or create a global reusable session."""
    global _GLOBAL_SESSION
    if _GLOBAL_SESSION is None:
        _GLOBAL_SESSION = requests.Session()
        _GLOBAL_SESSION.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/121.0 Safari/537.36"
            )
        })
    if headers:
        _GLOBAL_SESSION.headers.update(headers)
    return _GLOBAL_SESSION


def extract_html(
    url: str,
    timeout: float = 60.0,
    headers: Optional[Dict[str, str]] = None,
    max_retries: int = 2,
) -> Tuple[Optional[bytes], Dict[str, str]]:
    """
    Extract the full HTML page as raw bytes from a given URL.
    Automatically reuses a persistent HTTP session and applies robust timeout & retry logic.

    Args:
        url (str): Target webpage URL.
        timeout (float): Request timeout in seconds (applies to connect + read).
        headers (dict, optional): Extra HTTP headers.
        max_retries (int): Number of retries on timeout or transient errors.

    Returns:
        tuple(bytes|None, dict):
            - html_bytes: Raw HTML content (or None if failed)
            - metadata: HTTP info (status_code, content_type, final_url, error if any)
    """
    sess = _get_session(headers=headers)
    metadata = {"final_url": url, "status_code": "", "content_type": "", "error": ""}

    for attempt in range(1, max_retries + 1):
        try:
            resp = sess.get(url, timeout=timeout)
            resp.raise_for_status()
            html_bytes = resp.content
            metadata.update({
                "final_url": resp.url,
                "status_code": str(resp.status_code),
                "content_type": resp.headers.get("Content-Type", ""),
            })
            return html_bytes, metadata
        except Timeout:
            metadata["error"] = f"Timeout after {timeout}s (attempt {attempt}/{max_retries})"
            if attempt >= max_retries:
                continue
        except RequestException as e:
            metadata["error"] = f"Request failed: {e}"
            continue

    return None, metadata
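A hedged usage sketch of the three helpers added above; the file names are made up and error handling is trimmed:

```python
# Illustration only: sample paths are hypothetical.
with open("report.docx", "rb") as f:
    docx_links = extract_links_from_docx(f.read())

with open("report.pdf", "rb") as f:
    pdf_links = extract_links_from_pdf(f.read())

for url in docx_links | pdf_links:
    html_bytes, meta = extract_html(url, timeout=30.0, max_retries=2)
    if html_bytes is None:
        print(f"skipping {url}: {meta['error']}")
        continue
    print(url, meta["status_code"], meta["content_type"], len(html_bytes))
```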


@@ -30,7 +30,7 @@ from tika import parser
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.file_utils import extract_embed_file
from api.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
@@ -351,7 +351,7 @@ class Pdf(PdfParser):
class Markdown(MarkdownParser):
    def get_picture_urls(self, sections):
    def md_to_html(self, sections):
        if not sections:
            return []
        if isinstance(sections, type("")):
@@ -364,13 +364,23 @@ class Markdown(MarkdownParser):
        from bs4 import BeautifulSoup
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return html_images
        return soup

    def get_picture_urls(self, soup):
        if soup:
            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return []

    def get_hyperlink_urls(self, soup):
        if soup:
            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
        return []

    def get_pictures(self, text):
        """Download and open all images from markdown text."""
        import requests
        image_urls = self.get_picture_urls(text)
        soup = self.md_to_html(text)
        image_urls = self.get_picture_urls(soup)
        images = []
        # Find all image URLs in text
        for url in image_urls:
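The refactor above splits the old `get_picture_urls(text)` into a shared `md_to_html()` step plus two extractors that reuse the parsed soup. A small sketch of the new call pattern (the markdown snippet and constructor argument are assumptions, not shown in this diff):

```python
# Hedged sketch only.
md = "See [the docs](https://example.com/docs) and ![logo](https://example.com/logo.png)"

markdown_parser = Markdown(512)                            # token budget argument assumed
soup = markdown_parser.md_to_html(md)                      # parse the markdown once
image_urls = markdown_parser.get_picture_urls(soup)        # reuse the soup for <img src>
hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)  # ... and for <a href> targets
```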
@@ -439,12 +449,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        Successive text will be sliced into pieces using 'delimiter'.
        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
    """
    urls = set()
    url_res = []
    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -476,8 +488,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")

        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_docx(binary)
            for index, url in enumerate(urls):
                html_bytes, metadata = extract_html(url)
                if not html_bytes:
                    continue
                try:
                    sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                except Exception as e:
                    logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                    sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                url_res.extend(sub_url_res)

        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2
@@ -497,15 +519,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        if kwargs.get("section_only", False):
            chunks.extend(embed_res)
            chunks.extend(url_res)
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        res.extend(embed_res)
        res.extend(url_res)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")

        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_pdf(binary)

        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
        callback(0.1, "Start to parse.")
@@ -623,9 +650,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
                else:
                    section_images.append(None)
        else:
            logging.warning("No visual model detected. Skipping figure parsing enhancement.")

        if parser_config.get("hyperlink_urls", False) and is_root:
            for idx, (section_text, _) in enumerate(sections):
                soup = markdown_parser.md_to_html(section_text)
                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
                urls.update(hyperlink_urls)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")
@@ -645,6 +678,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        if doc_parsed.get('content', None) is not None:
@@ -686,9 +720,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
        for index, url in enumerate(urls):
            html_bytes, metadata = extract_html(url)
            if not html_bytes:
                continue
            try:
                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            except Exception as e:
                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            url_res.extend(sub_url_res)

    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    if embed_res:
        res.extend(embed_res)
    if url_res:
        res.extend(url_res)
    return res
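Putting it together, a hedged end-to-end sketch of how the hyperlink path is exercised through `chunk()`; the file name, callback, and config values are illustrative, and `is_root` is presumed to default to True for top-level calls:

```python
# Illustration only: the real caller is the task executor.
def progress(prog=None, msg=""):
    print(prog, msg)

with open("handbook.docx", "rb") as f:
    results = chunk(
        "handbook.docx",
        binary=f.read(),
        lang="English",
        callback=progress,
        parser_config={"chunk_token_num": 512, "layout_recognize": "DeepDOC", "analyze_hyperlink": True},
    )
# With analyze_hyperlink enabled on the root call, every link found in the document is fetched via
# extract_html() and chunked recursively with is_root=False, and those url_res chunks are appended
# to the document's own chunks before chunk() returns.
```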