From 678a4f959cd2c6a8aa5e76f270b682875e638d4a Mon Sep 17 00:00:00 2001
From: MkDev11 <jaysmth689@gmail.com>
Date: Wed, 14 Jan 2026 06:08:46 -0500
Subject: [PATCH] Fix: skip internal bookmark references in DOCX parsing
 (#12604) (#12611)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

Fixes #12604 - DOCX files containing hyperlinks to internal bookmarks
(e.g., `#_文档目录`) cause a `KeyError` during parsing:

```
KeyError: "There is no item named 'word/#_文档目录' in the archive"
```

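This happens because python-docx resolves the bookmark's relationship
target against the part's base URI (here `word/`) and incorrectly tries
to read the result as a file from the ZIP archive. Relationship targets
starting with `#` are references to internal bookmarks within the
document, not actual files in the archive.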

This PR extends the existing `load_from_xml_v2` workaround (which
already handles `NULL` targets) to also skip relationship targets
starting with `#`.

Related upstream issue:
https://github.com/python-openxml/python-docx/issues/902

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---
Contribution by Gittensor, see my contribution statistics at
https://gittensor.io/miners/details?githubId=94194147
---
 rag/app/naive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index 86ac85bc8..b793b9fdc 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -727,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
     if rels_item_xml is not None:
         rels_elm = parse_xml(rels_item_xml)
         for rel_elm in rels_elm.Relationship_lst:
-            if rel_elm.target_ref in ("../NULL", "NULL"):
+            if rel_elm.target_ref in ("../NULL", "NULL") or rel_elm.target_ref.startswith("#"):
                 continue
             srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
     return srels