mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: TOC in pipeline (#11785)
### What problem does this PR solve?

Change: fix TOC (table of contents) in pipeline.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -15,9 +15,8 @@
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
from copy import deepcopy, copy
|
from copy import deepcopy
|
||||||
|
|
||||||
import trio
|
|
||||||
import xxhash
|
import xxhash
|
||||||
|
|
||||||
from agent.component.llm import LLMParam, LLM
|
from agent.component.llm import LLMParam, LLM
|
||||||
@ -38,13 +37,13 @@ class ExtractorParam(ProcessParamBase, LLMParam):
|
|||||||
class Extractor(ProcessBase, LLM):
|
class Extractor(ProcessBase, LLM):
|
||||||
component_name = "Extractor"
|
component_name = "Extractor"
|
||||||
|
|
||||||
def _build_TOC(self, docs):
|
async def _build_TOC(self, docs):
|
||||||
self.callback(message="Start to generate table of content ...")
|
self.callback(0.2,message="Start to generate table of content ...")
|
||||||
docs = sorted(docs, key=lambda d:(
|
docs = sorted(docs, key=lambda d:(
|
||||||
d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
|
d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
|
||||||
d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
|
d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
|
||||||
))
|
))
|
||||||
toc: list[dict] = trio.run(run_toc_from_text, [d["text"] for d in docs], self.chat_mdl)
|
toc = await run_toc_from_text([d["text"] for d in docs], self.chat_mdl)
|
||||||
logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' '))
|
logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' '))
|
||||||
ii = 0
|
ii = 0
|
||||||
while ii < len(toc):
|
while ii < len(toc):
|
||||||
@ -61,7 +60,8 @@ class Extractor(ProcessBase, LLM):
|
|||||||
ii += 1
|
ii += 1
|
||||||
|
|
||||||
if toc:
|
if toc:
|
||||||
d = copy.deepcopy(docs[-1])
|
d = deepcopy(docs[-1])
|
||||||
|
d["doc_id"] = self._canvas._doc_id
|
||||||
d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
|
d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
|
||||||
d["toc_kwd"] = "toc"
|
d["toc_kwd"] = "toc"
|
||||||
d["available_int"] = 0
|
d["available_int"] = 0
|
||||||
@ -85,7 +85,10 @@ class Extractor(ProcessBase, LLM):
|
|||||||
|
|
||||||
if chunks:
|
if chunks:
|
||||||
if self._param.field_name == "toc":
|
if self._param.field_name == "toc":
|
||||||
toc = self._build_TOC(chunks)
|
for ck in chunks:
|
||||||
|
ck["doc_id"] = self._canvas._doc_id
|
||||||
|
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
|
||||||
|
toc =await self._build_TOC(chunks)
|
||||||
chunks.append(toc)
|
chunks.append(toc)
|
||||||
self.set_output("chunks", chunks)
|
self.set_output("chunks", chunks)
|
||||||
return
|
return
|
||||||
|
|||||||
@ -125,7 +125,7 @@ class Splitter(ProcessBase):
|
|||||||
{
|
{
|
||||||
"text": RAGFlowPdfParser.remove_tag(c),
|
"text": RAGFlowPdfParser.remove_tag(c),
|
||||||
"image": img,
|
"image": img,
|
||||||
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
|
"positions": [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
|
||||||
}
|
}
|
||||||
for c, img in zip(chunks, images) if c.strip()
|
for c, img in zip(chunks, images) if c.strip()
|
||||||
]
|
]
|
||||||
|
|||||||
@ -592,7 +592,8 @@ async def run_dataflow(task: dict):
|
|||||||
ck["docnm_kwd"] = task["name"]
|
ck["docnm_kwd"] = task["name"]
|
||||||
ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
|
ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
|
||||||
ck["create_timestamp_flt"] = datetime.now().timestamp()
|
ck["create_timestamp_flt"] = datetime.now().timestamp()
|
||||||
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
|
if not ck.get("id"):
|
||||||
|
ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
|
||||||
if "questions" in ck:
|
if "questions" in ck:
|
||||||
if "question_tks" not in ck:
|
if "question_tks" not in ck:
|
||||||
ck["question_kwd"] = ck["questions"].split("\n")
|
ck["question_kwd"] = ck["questions"].split("\n")
|
||||||
|
|||||||
Reference in New Issue
Block a user