Fix: TOC in pipeline (#11785)

### What problem does this PR solve?
Changes:
Fixes table-of-contents (TOC) generation in the data pipeline: `_build_TOC` is now an async coroutine awaited directly instead of being driven via `trio.run`, TOC chunks are tagged with the document ID, PDF chunk positions no longer apply an off-by-one page offset, and `run_dataflow` preserves pre-assigned chunk IDs instead of overwriting them.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
buua436
2025-12-08 09:42:20 +08:00
committed by GitHub
parent 6546f86b4e
commit 9b8971a9de
3 changed files with 14 additions and 10 deletions

View File

@ -15,9 +15,8 @@
import json import json
import logging import logging
import random import random
from copy import deepcopy, copy from copy import deepcopy
import trio
import xxhash import xxhash
from agent.component.llm import LLMParam, LLM from agent.component.llm import LLMParam, LLM
@ -38,13 +37,13 @@ class ExtractorParam(ProcessParamBase, LLMParam):
class Extractor(ProcessBase, LLM): class Extractor(ProcessBase, LLM):
component_name = "Extractor" component_name = "Extractor"
def _build_TOC(self, docs): async def _build_TOC(self, docs):
self.callback(message="Start to generate table of content ...") self.callback(0.2,message="Start to generate table of content ...")
docs = sorted(docs, key=lambda d:( docs = sorted(docs, key=lambda d:(
d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0), d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0) d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
)) ))
toc: list[dict] = trio.run(run_toc_from_text, [d["text"] for d in docs], self.chat_mdl) toc = await run_toc_from_text([d["text"] for d in docs], self.chat_mdl)
logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' ')) logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' '))
ii = 0 ii = 0
while ii < len(toc): while ii < len(toc):
@ -61,7 +60,8 @@ class Extractor(ProcessBase, LLM):
ii += 1 ii += 1
if toc: if toc:
d = copy.deepcopy(docs[-1]) d = deepcopy(docs[-1])
d["doc_id"] = self._canvas._doc_id
d["content_with_weight"] = json.dumps(toc, ensure_ascii=False) d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
d["toc_kwd"] = "toc" d["toc_kwd"] = "toc"
d["available_int"] = 0 d["available_int"] = 0
@ -85,7 +85,10 @@ class Extractor(ProcessBase, LLM):
if chunks: if chunks:
if self._param.field_name == "toc": if self._param.field_name == "toc":
toc = self._build_TOC(chunks) for ck in chunks:
ck["doc_id"] = self._canvas._doc_id
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
toc =await self._build_TOC(chunks)
chunks.append(toc) chunks.append(toc)
self.set_output("chunks", chunks) self.set_output("chunks", chunks)
return return

View File

@ -125,7 +125,7 @@ class Splitter(ProcessBase):
{ {
"text": RAGFlowPdfParser.remove_tag(c), "text": RAGFlowPdfParser.remove_tag(c),
"image": img, "image": img,
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)] "positions": [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
} }
for c, img in zip(chunks, images) if c.strip() for c, img in zip(chunks, images) if c.strip()
] ]

View File

@ -592,7 +592,8 @@ async def run_dataflow(task: dict):
ck["docnm_kwd"] = task["name"] ck["docnm_kwd"] = task["name"]
ck["create_time"] = str(datetime.now()).replace("T", " ")[:19] ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
ck["create_timestamp_flt"] = datetime.now().timestamp() ck["create_timestamp_flt"] = datetime.now().timestamp()
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest() if not ck.get("id"):
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
if "questions" in ck: if "questions" in ck:
if "question_tks" not in ck: if "question_tks" not in ck:
ck["question_kwd"] = ck["questions"].split("\n") ck["question_kwd"] = ck["questions"].split("\n")