fix 'KeyError: "There is no item named 'word/NULL' in the archive"' (#9455)

### What problem does this PR solve? Issue referring to: https://github.com/python-openxml/python-docx/issues/797 Fix referring to: https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-01 16:15:07 +08:00 · 2025-08-14 12:14:03 +08:00
parent 6e862553cb
commit 6d1078b538
1 changed files with 22 additions and 1 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -22,6 +22,8 @@ from timeit import default_timer as timer
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
 from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
 from tika import parser
@ -47,8 +49,8 @@ class Docx(DocxParser):
        if not embed:
            return None
        embed = embed[0]
        related_part = document.part.related_parts[embed]
        try:
            related_part = document.part.related_parts[embed]
            image_blob = related_part.image.blob
        except UnrecognizedImageError:
            logging.info("Unrecognized image format. Skipping image.")
@ -62,6 +64,9 @@ class Docx(DocxParser):
        except UnicodeDecodeError:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        except Exception:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        try:
            image = Image.open(BytesIO(image_blob)).convert('RGB')
            return image
@ -360,6 +365,20 @@ class Markdown(MarkdownParser):
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
        return sections, tbls
 def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
 def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
@ -391,6 +410,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        except Exception:
            vision_model = None
        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)
        if vision_model: