diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 6ed0fd0e3..5fcca824f 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -366,7 +366,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
         page_size = doc["parser_config"].get("task_page_size") or 12
         if doc["parser_id"] == "paper":
             page_size = doc["parser_config"].get("task_page_size") or 22
-        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
+        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc_extraction", False):
             page_size = 10 ** 9
         page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
         for s, e in page_ranges:
diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py
index 8eb84dcd2..45208da1e 100644
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@@ -28,10 +28,14 @@ import sys
 import tempfile
 import threading
 import zipfile
+import requests
+import PyPDF2
+from docx import Document
 from io import BytesIO
+from requests.exceptions import Timeout, RequestException
 
 # Typing
-from typing import List, Union, Tuple
+from typing import List, Union, Tuple, Optional, Dict
 
 # Third-party imports
 import olefile
@@ -372,4 +376,121 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
                 pass
         return out
 
-    return out
\ No newline at end of file
+    return out
+
+
+def extract_links_from_docx(docx_bytes: bytes):
+    """
+    Extract all hyperlinks from a Word (.docx) document binary stream.
+
+    Args:
+        docx_bytes (bytes): Raw bytes of a .docx file.
+
+    Returns:
+        set[str]: A set of unique hyperlink URLs.
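+
+    Example:
+        A minimal usage sketch; the file name below is only a placeholder::
+
+            with open("some_document.docx", "rb") as f:
+                urls = extract_links_from_docx(f.read())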
+    """
+    links = set()
+    with BytesIO(docx_bytes) as bio:
+        document = Document(bio)
+
+        # Each relationship may represent a hyperlink, image, footer, etc.
+        for rel in document.part.rels.values():
+            if rel.reltype == (
+                "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
+            ):
+                links.add(rel.target_ref)
+
+    return links
+
+
+def extract_links_from_pdf(pdf_bytes: bytes):
+    """
+    Extract all clickable hyperlinks from a PDF binary stream.
+
+    Args:
+        pdf_bytes (bytes): Raw bytes of a PDF file.
+
+    Returns:
+        set[str]: A set of unique hyperlink URLs (unordered).
+    """
+    links = set()
+    with BytesIO(pdf_bytes) as bio:
+        pdf = PyPDF2.PdfReader(bio)
+
+        for page in pdf.pages:
+            annots = page.get("/Annots")
+            # /Annots may be an indirect reference; resolve it rather than skip the page.
+            if isinstance(annots, PyPDF2.generic.IndirectObject):
+                annots = annots.get_object()
+            if not annots:
+                continue
+            for annot in annots:
+                obj = annot.get_object()
+                a = obj.get("/A")
+                if a and a.get("/URI"):
+                    links.add(a["/URI"])
+
+    return links
+
+
+_GLOBAL_SESSION: Optional[requests.Session] = None
+def _get_session(headers: Optional[Dict[str, str]] = None) -> requests.Session:
+    """Get or create a global reusable session."""
+    global _GLOBAL_SESSION
+    if _GLOBAL_SESSION is None:
+        _GLOBAL_SESSION = requests.Session()
+        _GLOBAL_SESSION.headers.update({
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/121.0 Safari/537.36"
+            )
+        })
+    if headers:
+        _GLOBAL_SESSION.headers.update(headers)
+    return _GLOBAL_SESSION
+
+
+def extract_html(
+    url: str,
+    timeout: float = 60.0,
+    headers: Optional[Dict[str, str]] = None,
+    max_retries: int = 2,
+) -> Tuple[Optional[bytes], Dict[str, str]]:
+    """
+    Fetch the full HTML page as raw bytes from a given URL.
+    Reuses a persistent HTTP session and applies timeout and retry handling.
+
+    Args:
+        url (str): Target webpage URL.
+        timeout (float): Request timeout in seconds (applies to connect + read).
+        headers (dict, optional): Extra HTTP headers.
+        max_retries (int): Number of attempts on timeout or transient errors.
+
+    Returns:
+        tuple(bytes|None, dict):
+            - html_bytes: Raw HTML content (or None if all attempts failed)
+            - metadata: HTTP info (status_code, content_type, final_url, error if any)
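+
+    Example:
+        A minimal usage sketch; the URL below is only a placeholder::
+
+            html_bytes, meta = extract_html("https://example.com/page")
+            if html_bytes is None:
+                print(meta["error"])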
""" + urls = set() + url_res = [] is_english = lang.lower() == "english" # is_english(cks) parser_config = kwargs.get( "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) + "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) doc = { "docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) @@ -476,8 +488,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - - + if parser_config.get("analyze_hyperlink", False) and is_root: + urls = extract_links_from_docx(binary) + for index, url in enumerate(urls): + html_bytes, metadata = extract_html(url) + if not html_bytes: + continue + try: + sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) + except Exception as e: + logging.info(f"Failed to chunk url in registered file type {url}: {e}") + sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) + url_res.extend(sub_url_res) # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 @@ -497,15 +519,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if kwargs.get("section_only", False): chunks.extend(embed_res) + chunks.extend(url_res) return chunks res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) logging.info("naive_merge({}): {}".format(filename, timer() - st)) res.extend(embed_res) + res.extend(url_res) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + if parser_config.get("analyze_hyperlink", False) and is_root: + urls = extract_links_from_pdf(binary) + if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" callback(0.1, "Start to parse.") @@ -623,9 +650,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1]) else: section_images.append(None) + else: logging.warning("No visual model detected. 
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_docx(binary)
+            for index, url in enumerate(urls):
+                html_bytes, metadata = extract_html(url)
+                if not html_bytes:
+                    continue
+                try:
+                    sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+                except Exception as e:
+                    logging.info(f"Failed to chunk linked URL {url} by its inferred file type: {e}. Falling back to generic HTML parsing.")
+                    sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+                url_res.extend(sub_url_res)
         # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
         _SerializedRelationships.load_from_xml = load_from_xml_v2
@@ -497,15 +519,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
+            chunks.extend(url_res)
             return chunks
 
         res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         res.extend(embed_res)
+        res.extend(url_res)
         return res
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            urls = extract_links_from_pdf(binary)
+
         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
         callback(0.1, "Start to parse.")
@@ -623,9 +650,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                     sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
                 else:
                     section_images.append(None)
+
         else:
             logging.warning("No visual model detected. Skipping figure parsing enhancement.")
+        if parser_config.get("analyze_hyperlink", False) and is_root:
+            for idx, (section_text, _) in enumerate(sections):
+                soup = markdown_parser.md_to_html(section_text)
+                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
+                urls.update(hyperlink_urls)
 
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
@@ -645,6 +678,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
+        binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
 
         if doc_parsed.get('content', None) is not None:
@@ -686,9 +720,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
 
+    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
+        for index, url in enumerate(urls):
+            html_bytes, metadata = extract_html(url)
+            if not html_bytes:
+                continue
+            try:
+                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+            except Exception as e:
+                logging.info(f"Failed to chunk linked URL {url} by its inferred file type: {e}. Falling back to generic HTML parsing.")
+                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
+            url_res.extend(sub_url_res)
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
+
     if embed_res:
         res.extend(embed_res)
+    if url_res:
+        res.extend(url_res)
 
     return res