Fix: debug PDF positions. (#10365)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-03 09:05:07 +08:00 · 2025-09-30 09:24:44 +08:00
parent c49e81882c
commit 9989e06abb
9 changed files with 52 additions and 51 deletions
--- a/rag/flow/extractor/extractor.py
+++ b/rag/flow/extractor/extractor.py
@@ -13,6 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 import random
+from copy import deepcopy
 from agent.component.llm import LLMParam, LLM
 from rag.flow.base import ProcessBase, ProcessParamBase

@@ -40,7 +41,7 @@ class Extractor(ProcessBase, LLM):
        for k, v in inputs.items():
            args[k] = v["value"]
            if isinstance(args[k], list):
-                chunks = args[k]
+                chunks = deepcopy(args[k])
                chunks_key = k

        if chunks:
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -14,7 +14,6 @@
 #  limitations under the License.
 import io
 import json
-import logging
 import os
 import random
 from functools import partial
@@ -31,6 +30,7 @@ from api.utils import get_uuid
 from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
+from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
@@ -243,19 +243,13 @@ class Parser(ProcessBase):
            self.set_output("markdown", spreadsheet_parser.markdown(blob))

    def _word(self, name, blob):
-        from tika import parser as word_parser
-
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
-        doc_parsed = word_parser.from_buffer(blob)
-        sections = []
-        if doc_parsed.get("content"):
-            sections = doc_parsed["content"].split("\n")
-            sections = [{"text": section} for section in sections if section]
-        else:
-            logging.warning(f"tika.parser got empty content from {name}.")
-
+        docx_parser = Docx()
+        sections, tbls = docx_parser(name, binary=blob)
+        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
+        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
        # json
        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
--- a/rag/flow/splitter/splitter.py
+++ b/rag/flow/splitter/splitter.py
@@ -100,7 +100,7 @@ class Splitter(ProcessBase):
            {
                "text": RAGFlowPdfParser.remove_tag(c),
                "image": img,
-                "positions": [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
+                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
            }
            for c, img in zip(chunks, images) if c.strip()
        ]