Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-31 17:15:32 +08:00)
Refine: image/table context. (#12336)
### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
@@ -476,11 +476,13 @@ class RAGFlowPdfParser:
         self.boxes = bxs

     def _naive_vertical_merge(self, zoomin=3):
-        bxs = self._assign_column(self.boxes, zoomin)
+        #bxs = self._assign_column(self.boxes, zoomin)
+        bxs = self.boxes

         grouped = defaultdict(list)
         for b in bxs:
-            grouped[(b["page_number"], b.get("col_id", 0))].append(b)
+            # grouped[(b["page_number"], b.get("col_id", 0))].append(b)
+            grouped[(b["page_number"], "x")].append(b)

         merged_boxes = []
         for (pg, col), bxs in grouped.items():

@@ -551,7 +553,7 @@ class RAGFlowPdfParser:

             merged_boxes.extend(bxs)

-        self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
+        #self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))

     def _final_reading_order_merge(self, zoomin=3):
         if not self.boxes:

@@ -1206,7 +1208,7 @@ class RAGFlowPdfParser:
         start = timer()
         self._text_merge()
         self._concat_downward()
-        #self._naive_vertical_merge(zoomin)
+        self._naive_vertical_merge(zoomin)
         if callback:
             callback(0.92, "Text merged ({:.2f}s)".format(timer() - start))

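The first hunk changes `_naive_vertical_merge` to group text boxes per page only: the column-aware key `(page_number, col_id)` is replaced by the constant key `(page_number, "x")`, so merging no longer splits a page into columns. A minimal, self-contained sketch of that grouping pattern; the box dicts are made-up examples and the final top-down sort is only an assumption about how each bucket is consumed:

```python
# Sketch of the per-page grouping now used in _naive_vertical_merge.
# The box dicts are made-up; only "page_number" and "top" matter here.
from collections import defaultdict

boxes = [
    {"page_number": 1, "top": 120.0, "text": "left column line"},
    {"page_number": 1, "top": 125.0, "text": "right column line"},
    {"page_number": 2, "top": 80.0, "text": "next page line"},
]

grouped = defaultdict(list)
for b in boxes:
    # Key is (page, "x"): every box on a page lands in one bucket,
    # instead of one bucket per (page, col_id) as before.
    grouped[(b["page_number"], "x")].append(b)

for (pg, _), bxs in grouped.items():
    bxs.sort(key=lambda x: x["top"])  # assumed: merge candidates scanned top-down
    print(pg, [b["text"] for b in bxs])
```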
@@ -40,7 +40,7 @@ from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from common.parser_config_utils import normalize_layout_recognizer
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
-    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
+    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context, append_context2table_image4pdf


 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,

@@ -487,7 +487,7 @@ class Pdf(PdfParser):
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
         self._concat_downward()
-        self._final_reading_order_merge()
+        # self._final_reading_order_merge()
         # self._filter_forpages()
         logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls

@@ -776,6 +776,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     if not sections and not tables:
         return []

+    if table_context_size or image_context_size:
+        tables = append_context2table_image4pdf(sections, tables, image_context_size)
+
     if name in ["tcadp", "docling", "mineru"]:
         parser_config["chunk_token_num"] = 0

@@ -1006,8 +1009,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         res.extend(embed_res)
     if url_res:
         res.extend(url_res)
-    if table_context_size or image_context_size:
-        attach_media_context(res, table_context_size, image_context_size)
+    #if table_context_size or image_context_size:
+    #    attach_media_context(res, table_context_size, image_context_size)
     return res

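Together, the two chunk() hunks move context attachment for PDFs to the point where the raw `sections` and `tables` are still available, and disable the old post-tokenization `attach_media_context` pass. A toy sketch of that control flow; `fake_attach_context` and `chunk_pdf_like` are hypothetical stand-ins (not the real `append_context2table_image4pdf` or `chunk`), used only to show where the new call sits and the `((image, table), positions)` shape of each table item:

```python
# Toy control-flow sketch; every function here is a stand-in for illustration.
def fake_attach_context(sections, tables, context_size):
    if context_size <= 0:  # the real helper also returns its input unchanged for size <= 0
        return tables
    nearby = " ".join(txt for txt, _ in sections)[:context_size]
    # Each table item has the ((image, table), positions) shape seen in the diff.
    return [((img, f"{nearby}\n{tb}"), poss) for (img, tb), poss in tables]

def chunk_pdf_like(sections, tables, table_context_size=0, image_context_size=0):
    if table_context_size or image_context_size:
        # New call added by this commit, before tokenization:
        tables = fake_attach_context(sections, tables, image_context_size)
    # ... tokenize_table(tables), tokenize_chunks(sections), etc. ...
    return tables

sections = [("Paragraph right above the table.", "pos-tag"), ("Paragraph below it.", "pos-tag")]
tables = [((None, "<table><tr><td>42</td></tr></table>"), [(0, 10.0, 20.0, 30.0, 40.0)])]
print(chunk_pdf_like(sections, tables, image_context_size=64))
```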
@@ -16,7 +16,7 @@

 import logging
 import random
-from collections import Counter
+from collections import Counter, defaultdict

 from common.token_utils import num_tokens_from_string
 import re
@@ -667,6 +667,94 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     return chunks


+def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
+    from deepdoc.parser import PdfParser
+    if table_context_size <=0:
+        return tabls
+
+    page_bucket = defaultdict(list)
+    for i, (txt, poss) in enumerate(sections):
+        poss = PdfParser.extract_positions(poss)
+        for page, left, right, top, bottom in poss:
+            page = page[0]
+            page_bucket[page].append(((left, top, right, bottom), txt))
+
+    def upper_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            i = -1
+        while num_tokens_from_string(txt) < table_context_size:
+            if i < 0:
+                page -= 1
+                if page < 0 or page not in page_bucket:
+                    break
+                i = len(page_bucket[page]) -1
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!??;!\n]|\. )", cnt, flags=re.DOTALL)[::-1]
+            for j in range(0, len(txts), 2):
+                txt = (txts[j+1] if j+1<len(txts) else "") + txts[j] + txt
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i -= 1
+        return txt
+
+    def lower_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            return txt
+        while num_tokens_from_string(txt) < table_context_size:
+            if i >= len(page_bucket[page]):
+                page += 1
+                if page not in page_bucket:
+                    break
+                i = 0
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!??;!\n]|\. )", cnt, flags=re.DOTALL)
+            for j in range(0, len(txts), 2):
+                txt += txts[j] + (txts[j+1] if j+1<len(txts) else "")
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i += 1
+        return txt
+
+    res = []
+    for (img, tb), poss in tabls:
+        page, left, top, right, bott = poss[0]
+        _page, _left, _top, _right, _bott = poss[-1]
+        if isinstance(tb, list):
+            tb = "\n".join(tb)
+
+        i = 0
+        blks = page_bucket.get(page, [])
+        _tb = tb
+        while i < len(blks):
+            if i + 1 >= len(blks):
+                if _page > page:
+                    page += 1
+                    i = 0
+                    blks = page_bucket.get(page, [])
+                    continue
+                tb = upper_context(page, i) + tb + lower_context(page+1, 0)
+                break
+            (_, t, r, b), txt = blks[i]
+            if b > top:
+                break
+            (_, _t, _r, _b), _txt = blks[i+1]
+            if _t < _bott:
+                i += 1
+                continue
+
+            tb = upper_context(page, i) + tb + lower_context(page, i)
+            break
+
+        if _tb == tb:
+            tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
+        res.append(((img, tb), poss))
+    return res
+
+
 def add_positions(d, poss):
     if not poss:
         return
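Inside `append_context2table_image4pdf`, the nested `upper_context`/`lower_context` helpers walk neighbouring text blocks and keep prepending or appending sentence fragments until a token budget is exceeded. A self-contained sketch of that accumulation step; the whitespace token counter is a stand-in for `num_tokens_from_string`, for illustration only:

```python
import re

def count_tokens(s: str) -> int:
    # Stand-in for common.token_utils.num_tokens_from_string, for illustration only.
    return len(s.split())

def trailing_context(block: str, budget: int) -> str:
    """Collect sentences from the end of `block` until `budget` tokens are exceeded,
    mirroring the sentence-splitting loop in upper_context()."""
    ctx = ""
    # The capturing group keeps the sentence delimiters; [::-1] walks from the end.
    parts = re.split(r"([。!??;!\n]|\. )", block, flags=re.DOTALL)[::-1]
    for j in range(0, len(parts), 2):
        ctx = (parts[j + 1] if j + 1 < len(parts) else "") + parts[j] + ctx
        if count_tokens(ctx) > budget:
            break
    return ctx

print(trailing_context("First sentence. Second sentence. Third sentence right before the table.", 6))
```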
@@ -729,6 +729,8 @@ TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")

 # Generate TOC from text chunks with text llms
 async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
+    if callback:
+        callback(msg="")
     try:
         ans = await gen_json(
             PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),

@@ -738,8 +740,6 @@ async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
             gen_conf={"temperature": 0.0, "top_p": 0.9}
         )
         txt_info["toc"] = ans if ans and not isinstance(ans, str) else []
-        if callback:
-            callback(msg="")
     except Exception as e:
         logging.exception(e)

@@ -332,6 +332,9 @@ async def build_chunks(task, progress_callback):
     async def doc_keyword_extraction(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "keywords", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn})

@@ -362,6 +365,9 @@ async def build_chunks(task, progress_callback):
     async def doc_question_proposal(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "question", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await question_proposal(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "question", {"topn": topn})

@@ -392,6 +398,9 @@ async def build_chunks(task, progress_callback):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata",
                                task["parser_config"]["metadata"])
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await gen_metadata(chat_mdl,
                                             metadata_schema(task["parser_config"]["metadata"]),

@@ -457,6 +466,9 @@ async def build_chunks(task, progress_callback):
     async def doc_content_tagging(chat_mdl, d, topn_tags):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], all_tags, {"topn": topn_tags})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             picked_examples = random.choices(examples, k=2) if len(examples) > 2 else examples
             if not picked_examples:
                 picked_examples.append({"content": "This is an example", TAG_FLD: {'example': 1}})
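Each of the four enrichment helpers above now checks for cancellation before acquiring `chat_limiter`, so a cancelled task reports failure instead of queuing another LLM call. A minimal sketch of that guard pattern; `has_canceled`, `progress_callback`, the cancelled-task registry, and `doc_enrichment` are stand-ins for illustration:

```python
import asyncio

CANCELED = {"task-42"}  # made-up cancelled-task registry, for illustration only

def has_canceled(task_id: str) -> bool:
    # Stand-in for the task executor's real cancellation check.
    return task_id in CANCELED

def progress_callback(prog: float, msg: str = "") -> None:
    print(f"progress={prog} msg={msg}")

async def main():
    chat_limiter = asyncio.Semaphore(1)  # mirrors the shared chat_limiter

    async def doc_enrichment(task_id: str, content: str):
        # Guard added by this commit: bail out before acquiring the limiter
        # or spending an LLM call on a task that was already cancelled.
        if has_canceled(task_id):
            progress_callback(-1, msg="Task has been canceled.")
            return None
        async with chat_limiter:
            return f"enriched: {content}"  # placeholder for the cached LLM call

    print(await doc_enrichment("task-42", "hello"))  # cancelled -> None
    print(await doc_enrichment("task-7", "hello"))   # runs normally

asyncio.run(main())
```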