Feat: add context for figure and table (#11547)
### What problem does this PR solve?

Add surrounding context for figure and table chunks. `==================()` is used for demonstration purposes.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
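For illustration only (this snippet is not part of the PR; the surrounding keys are assumed from RAGFlow's existing `naive` defaults), the feature is driven by two new `parser_config` token budgets. Leaving them at `0` keeps the current behaviour; a positive value attaches that many tokens of neighbouring text to each table or figure chunk:

```python
# Hypothetical parser_config showing the two keys introduced by this PR.
parser_config = {
    "pages": [[1, 1000000]],
    "chunk_token_num": 512,
    "layout_recognize": "DeepDOC",
    "table_context_size": 128,   # tokens of surrounding text attached to each table chunk
    "image_context_size": 64,    # tokens of surrounding text attached to each figure/image chunk
}
```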
@@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
     parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
     pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
-    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
     pagerank = IntegerField(default=0, index=False)

     graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)

@@ -774,7 +774,7 @@ class Document(DataBaseModel):
     kb_id = CharField(max_length=256, null=False, index=True)
     parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
     pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
-    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
     source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
     type = CharField(max_length=32, null=False, help_text="file extension", index=True)
     created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)

@@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
         ParserType.AUDIO.value: audio,
         ParserType.EMAIL.value: email
     }
-    parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
+    parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
     exe = ThreadPoolExecutor(max_workers=12)
     threads = []
     doc_nm = {}

@@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
         chunk_method = "naive"

     # Define default configurations for each chunking method
+    base_defaults = {
+        "table_context_size": 0,
+        "image_context_size": 0,
+    }
     key_mapping = {
         "naive": {
             "layout_recognize": "DeepDOC",

@@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):

     default_config = key_mapping[chunk_method]

-    # If no parser_config provided, return default
+    # If no parser_config provided, return default merged with base defaults
     if not parser_config:
-        return default_config
+        if default_config is None:
+            return deep_merge(base_defaults, {})
+        return deep_merge(base_defaults, default_config)

     # If parser_config is provided, merge with defaults to ensure required fields exist
     if default_config is None:
-        return parser_config
+        return deep_merge(base_defaults, parser_config)

     # Ensure raptor and graphrag fields have default values if not provided
-    merged_config = deep_merge(default_config, parser_config)
+    merged_config = deep_merge(base_defaults, default_config)
+    merged_config = deep_merge(merged_config, parser_config)

     return merged_config
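A minimal sketch of the merge order above (not from the PR; plain dict unpacking stands in for `deep_merge`, which is assumed to let right-hand keys override): the base defaults supply the two context sizes, the chunk-method defaults layer on top, and the caller's `parser_config` wins last.

```python
# Stand-in for: merged = deep_merge(base_defaults, default_config)
#               merged = deep_merge(merged, parser_config)
base_defaults = {"table_context_size": 0, "image_context_size": 0}
default_config = {"layout_recognize": "DeepDOC", "chunk_token_num": 512}  # e.g. key_mapping["naive"], abridged
parser_config = {"table_context_size": 256}                               # caller-supplied overrides

merged = {**base_defaults, **default_config}
merged = {**merged, **parser_config}
assert merged == {"table_context_size": 256, "image_context_size": 0,
                  "layout_recognize": "DeepDOC", "chunk_token_num": 512}
```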
@@ -23,7 +23,7 @@ from rag.app import naive
 from rag.app.naive import by_plaintext, PARSERS
 from rag.nlp import bullets_category, is_english,remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
-    tokenize_chunks
+    tokenize_chunks, attach_media_context
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, HtmlParser
 from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper

@@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     res = tokenize_table(tbls, doc, eng)
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+    image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+    if table_ctx or image_ctx:
+        attach_media_context(res, table_ctx, image_ctx)

     return res

@@ -20,7 +20,7 @@ import re

 from common.constants import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
 from common.token_utils import num_tokens_from_string
 from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper

@@ -155,7 +155,7 @@ class Docx(DocxParser):
         sum_question = '\n'.join(question_stack)
         if sum_question:
             ti_list.append((f'{sum_question}\n{last_answer}', last_image))

         tbls = []
         for tb in self.doc.tables:
             html= "<table>"

@@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if isinstance(poss, str):
             poss = pdf_parser.extract_positions(poss)
         first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
         pn = first[0]

         if isinstance(pn, list):
             pn = pn[0]  # [pn] -> pn
         poss[0] = (pn, *first[1:])

         return (txt, layoutno, poss)


     sections = [_normalize_section(sec) for sec in sections]

@@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     if name in ["tcadp", "docling", "mineru"]:
         parser_config["chunk_token_num"] = 0

     callback(0.8, "Finish parsing.")

     if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:

@@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
         res = tokenize_table(tbls, doc, eng)
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+        if table_ctx or image_ctx:
+            attach_media_context(res, table_ctx, image_ctx)
         return res

     elif re.search(r"\.docx?$", filename, re.IGNORECASE):

@@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 d["doc_type_kwd"] = "image"
             tokenize(d, text, eng)
             res.append(d)
+        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+        if table_ctx or image_ctx:
+            attach_media_context(res, table_ctx, image_ctx)
         return res
     else:
         raise NotImplementedError("file type not supported yet(pdf and docx supported)")


 if __name__ == "__main__":
     import sys
@@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context

 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
     callback = callback

@@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
+    image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
     final_sections = False
     doc = {
         "docnm_kwd": filename,

@@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         res.extend(embed_res)
         res.extend(url_res)
+        if table_context_size or image_context_size:
+            attach_media_context(res, table_context_size, image_context_size)
         return res

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):

@@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     res.extend(embed_res)
     if url_res:
         res.extend(url_res)
+    if table_context_size or image_context_size:
+        attach_media_context(res, table_context_size, image_context_size)
     return res

@@ -20,7 +20,7 @@ import re

 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
 from common.constants import ParserType
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.app.naive import by_plaintext, PARSERS

@@ -150,7 +150,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")

         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"

@@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             chunks.append(txt)
             last_sid = sec_id
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+    image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+    if table_ctx or image_ctx:
+        attach_media_context(res, table_ctx, image_ctx)
     return res

@@ -20,11 +20,11 @@ import re
 import numpy as np
 from PIL import Image

-from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
-from deepdoc.vision import OCR
-from rag.nlp import rag_tokenizer, tokenize
+from common.constants import LLMType
 from common.string_utils import clean_markdown_block
+from deepdoc.vision import OCR
+from rag.nlp import attach_media_context, rag_tokenizer, tokenize

 ocr = OCR()

@@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     }
     eng = lang.lower() == "english"

+    parser_config = kwargs.get("parser_config", {}) or {}
+    image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+
     if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
         try:
-            doc.update({"doc_type_kwd": "video"})
+            doc.update(
+                {
+                    "doc_type_kwd": "video",
+                }
+            )
             cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
             ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
             callback(0.8, "CV LLM respond: %s ..." % ans[:32])

@@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
         if (eng and len(txt.split()) > 32) or len(txt) > 32:
             tokenize(doc, txt, eng)
             callback(0.8, "OCR results is too long to use CV LLM.")
-            return [doc]
+            return attach_media_context([doc], 0, image_ctx)

         try:
             callback(0.4, "Use CV LLM to describe the picture.")

@@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
             callback(0.8, "CV LLM respond: %s ..." % ans[:32])
             txt += "\n" + ans
             tokenize(doc, txt, eng)
-            return [doc]
+            return attach_media_context([doc], 0, image_ctx)
         except Exception as e:
             callback(prog=-1, msg=str(e))

@@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
         img_binary.seek(0)
         img_binary.truncate()
         img.save(img_binary, format="PNG")

         img_binary.seek(0)
         ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
         txt += "\n" + ans
@@ -19,16 +19,16 @@ import random
 import re
 from functools import partial

-import trio
 import numpy as np
+import trio
 from PIL import Image

-from common.constants import LLMType
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.llm_service import LLMBundle
+from common import settings
+from common.constants import LLMType
 from common.misc_utils import get_uuid
-from rag.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser

@@ -37,7 +37,8 @@ from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
-from common import settings
+from rag.nlp import attach_media_context
+from rag.utils.base64_image import image2id


 class ParserParam(ProcessParamBase):

@@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
                 "json",
             ],
             "image": [
-                "text"
+                "text",
+            ],
+            "email": [
+                "text",
+                "json",
             ],
-            "email": ["text", "json"],
             "text&markdown": [
                 "text",
-                "json"
+                "json",
             ],
             "audio": [
-                "json"
+                "json",
             ],
             "video": [],
         }

@@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
                     "pdf",
                 ],
                 "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
             },
             "spreadsheet": {
                 "parse_method": "deepdoc", # deepdoc/tcadp_parser

@@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
                     "xlsx",
                     "csv",
                 ],
+                "table_context_size": 0,
+                "image_context_size": 0,
             },
             "word": {
                 "suffix": [

@@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
                     "docx",
                 ],
                 "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
             },
             "text&markdown": {
                 "suffix": ["md", "markdown", "mdx", "txt"],
                 "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
             },
             "slides": {
                 "parse_method": "deepdoc", # deepdoc/tcadp_parser
                 "suffix": [
                     "pptx",
-                    "ppt"
+                    "ppt",
                 ],
                 "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
             },
             "image": {
                 "parse_method": "ocr",
@@ -121,13 +135,14 @@ class ParserParam(ProcessParamBase):
             },
             "email": {
                 "suffix": [
-                    "eml", "msg"
+                    "eml",
+                    "msg",
                 ],
                 "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
                 "output_format": "json",
             },
             "audio": {
-                "suffix":[
+                "suffix": [
                     "da",
                     "wave",
                     "wav",

@@ -142,15 +157,15 @@ class ParserParam(ProcessParamBase):
                     "realaudio",
                     "vqf",
                     "oggvorbis",
-                    "ape"
+                    "ape",
                 ],
                 "output_format": "text",
             },
             "video": {
-                "suffix":[
+                "suffix": [
                     "mp4",
                     "avi",
-                    "mkv"
+                    "mkv",
                 ],
                 "output_format": "text",
             },

@@ -253,7 +268,7 @@ class Parser(ProcessBase):
             markdown_image_response_type = conf.get("markdown_image_response_type", "1")
             tcadp_parser = TCADPParser(
                 table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
             )
             sections, _ = tcadp_parser.parse_pdf(
                 filepath=name,

@@ -261,7 +276,7 @@ class Parser(ProcessBase):
                 callback=self.callback,
                 file_type="PDF",
                 file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
             )
             bboxes = []
             for section, position_tag in sections:

@@ -269,17 +284,20 @@
                 # Extract position information from TCADP's position tag
                 # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
                 import re

                 match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                 if match:
                     pn, x0, x1, top, bott = match.groups()
-                    bboxes.append({
-                        "page_number": int(pn.split('-')[0]), # Take the first page number
-                        "x0": float(x0),
-                        "x1": float(x1),
-                        "top": float(top),
-                        "bottom": float(bott),
-                        "text": section
-                    })
+                    bboxes.append(
+                        {
+                            "page_number": int(pn.split("-")[0]), # Take the first page number
+                            "x0": float(x0),
+                            "x1": float(x1),
+                            "top": float(top),
+                            "bottom": float(bott),
+                            "text": section,
+                        }
+                    )
                 else:
                     # If no position info, add as text without position
                     bboxes.append({"text": section})

@@ -291,7 +309,30 @@ class Parser(ProcessBase):
             bboxes = []
             for t, poss in lines:
                 for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
-                    bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                    bboxes.append(
+                        {
+                            "page_number": int(pn[0]),
+                            "x0": float(x0),
+                            "x1": float(x1),
+                            "top": float(top),
+                            "bottom": float(bott),
+                            "text": t,
+                        }
+                    )
+
+            for b in bboxes:
+                text_val = b.get("text", "")
+                has_text = isinstance(text_val, str) and text_val.strip()
+                layout = b.get("layout_type")
+                if layout == "figure" or (b.get("image") and not has_text):
+                    b["doc_type_kwd"] = "image"
+                elif layout == "table":
+                    b["doc_type_kwd"] = "table"
+
+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                bboxes = attach_media_context(bboxes, table_ctx, image_ctx)

             if conf.get("output_format") == "json":
                 self.set_output("json", bboxes)
@@ -319,7 +360,7 @@ class Parser(ProcessBase):
             markdown_image_response_type = conf.get("markdown_image_response_type", "1")
             tcadp_parser = TCADPParser(
                 table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
             )
             if not tcadp_parser.check_installation():
                 raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

@@ -337,7 +378,7 @@ class Parser(ProcessBase):
                 callback=self.callback,
                 file_type=file_type,
                 file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
             )

             # Process TCADP parser output based on configured output_format

@@ -365,7 +406,12 @@ class Parser(ProcessBase):
                 # Add tables as text
                 for table in tables:
                     if table:
-                        result.append({"text": table})
+                        result.append({"text": table, "doc_type_kwd": "table"})
+
+                table_ctx = conf.get("table_context_size", 0) or 0
+                image_ctx = conf.get("image_context_size", 0) or 0
+                if table_ctx or image_ctx:
+                    result = attach_media_context(result, table_ctx, image_ctx)

                 self.set_output("json", result)

@@ -400,7 +446,13 @@ class Parser(ProcessBase):
         if conf.get("output_format") == "json":
             sections, tbls = docx_parser(name, binary=blob)
             sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-            sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
+            sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
+
+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                sections = attach_media_context(sections, table_ctx, image_ctx)
+
             self.set_output("json", sections)
         elif conf.get("output_format") == "markdown":
             markdown_text = docx_parser.to_markdown(name, binary=blob)

@@ -420,7 +472,7 @@ class Parser(ProcessBase):
             markdown_image_response_type = conf.get("markdown_image_response_type", "1")
             tcadp_parser = TCADPParser(
                 table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
             )
             if not tcadp_parser.check_installation():
                 raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

@@ -439,7 +491,7 @@ class Parser(ProcessBase):
                 callback=self.callback,
                 file_type=file_type,
                 file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
             )

             # Process TCADP parser output - PPT only supports json format

@@ -454,7 +506,12 @@ class Parser(ProcessBase):
                 # Add tables as text
                 for table in tables:
                     if table:
-                        result.append({"text": table})
+                        result.append({"text": table, "doc_type_kwd": "table"})
+
+                table_ctx = conf.get("table_context_size", 0) or 0
+                image_ctx = conf.get("image_context_size", 0) or 0
+                if table_ctx or image_ctx:
+                    result = attach_media_context(result, table_ctx, image_ctx)

                 self.set_output("json", result)
             else:

@@ -469,6 +526,10 @@ class Parser(ProcessBase):
         # json
         assert conf.get("output_format") == "json", "have to be json for ppt"
         if conf.get("output_format") == "json":
+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                sections = attach_media_context(sections, table_ctx, image_ctx)
             self.set_output("json", sections)

     def _markdown(self, name, blob):
@@ -508,11 +569,15 @@ class Parser(ProcessBase):

                 json_results.append(json_result)

+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                json_results = attach_media_context(json_results, table_ctx, image_ctx)
             self.set_output("json", json_results)
         else:
             self.set_output("text", "\n".join([section_text for section_text, _ in sections]))


     def _image(self, name, blob):
         from deepdoc.vision import OCR

@@ -588,7 +653,7 @@ class Parser(ProcessBase):
             from email.parser import BytesParser

             msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
-            email_content['metadata'] = {}
+            email_content["metadata"] = {}
             # handle header info
             for header, value in msg.items():
                 # get fields like from, to, cc, bcc, date, subject

@@ -600,6 +665,7 @@ class Parser(ProcessBase):
             # get body
             if "body" in target_fields:
                 body_text, body_html = [], []
+
                 def _add_content(m, content_type):
                     def _decode_payload(payload, charset, target_list):
                         try:

@@ -641,14 +707,17 @@ class Parser(ProcessBase):
                     if dispositions[0].lower() == "attachment":
                         filename = part.get_filename()
                         payload = part.get_payload(decode=True).decode(part.get_content_charset())
-                        attachments.append({
-                            "filename": filename,
-                            "payload": payload,
-                        })
+                        attachments.append(
+                            {
+                                "filename": filename,
+                                "payload": payload,
+                            }
+                        )
                 email_content["attachments"] = attachments
         else:
             # handle msg file
             import extract_msg

             print("handle a msg file.")
             msg = extract_msg.Message(blob)
             # handle header info

@@ -662,9 +731,9 @@ class Parser(ProcessBase):
             }
             email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
             # get metadata
-            email_content['metadata'] = {
-                'message_id': msg.messageId,
-                'in_reply_to': msg.inReplyTo,
+            email_content["metadata"] = {
+                "message_id": msg.messageId,
+                "in_reply_to": msg.inReplyTo,
             }
             # get body
             if "body" in target_fields:

@@ -675,29 +744,31 @@ class Parser(ProcessBase):
             if "attachments" in target_fields:
                 attachments = []
                 for t in msg.attachments:
-                    attachments.append({
-                        "filename": t.name,
-                        "payload": t.data.decode("utf-8")
-                    })
+                    attachments.append(
+                        {
+                            "filename": t.name,
+                            "payload": t.data.decode("utf-8"),
+                        }
+                    )
                 email_content["attachments"] = attachments

         if conf["output_format"] == "json":
             self.set_output("json", [email_content])
         else:
-            content_txt = ''
+            content_txt = ""
             for k, v in email_content.items():
                 if isinstance(v, str):
                     # basic info
-                    content_txt += f'{k}:{v}' + "\n"
+                    content_txt += f"{k}:{v}" + "\n"
                 elif isinstance(v, dict):
                     # metadata
-                    content_txt += f'{k}:{json.dumps(v)}' + "\n"
+                    content_txt += f"{k}:{json.dumps(v)}" + "\n"
                 elif isinstance(v, list):
                     # attachments or others
                     for fb in v:
                         if isinstance(fb, dict):
                             # attachments
-                            content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
+                            content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
                         else:
                             # str, usually plain text
                             content_txt += fb
@@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
         d = copy.deepcopy(doc)
         tokenize(d, rows, eng)
         d["content_with_weight"] = rows
+        d["doc_type_kwd"] = "table"
         if img:
             d["image"] = img
             d["doc_type_kwd"] = "image"

@@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
+            d["doc_type_kwd"] = "table"
             if img:
                 d["image"] = img
                 d["doc_type_kwd"] = "image"
@@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
     return res


+def attach_media_context(chunks, table_context_size=0, image_context_size=0):
+    """
+    Attach surrounding text chunk content to media chunks (table/image).
+    Best-effort ordering: if positional info exists on any chunk, use it to
+    order chunks before collecting context; otherwise keep original order.
+    """
+    if not chunks or (table_context_size <= 0 and image_context_size <= 0):
+        return chunks
+
+    def is_image_chunk(ck):
+        if ck.get("doc_type_kwd") == "image":
+            return True
+
+        text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
+        has_text = isinstance(text_val, str) and text_val.strip()
+        return bool(ck.get("image")) and not has_text
+
+    def is_table_chunk(ck):
+        return ck.get("doc_type_kwd") == "table"
+
+    def is_text_chunk(ck):
+        return not is_image_chunk(ck) and not is_table_chunk(ck)
+
+    def get_text(ck):
+        if isinstance(ck.get("content_with_weight"), str):
+            return ck["content_with_weight"]
+        if isinstance(ck.get("text"), str):
+            return ck["text"]
+        return ""
+
+    def split_sentences(text):
+        pattern = r"([.。!?!?;;::\n])"
+        parts = re.split(pattern, text)
+        sentences = []
+        buf = ""
+        for p in parts:
+            if not p:
+                continue
+            if re.fullmatch(pattern, p):
+                buf += p
+                sentences.append(buf)
+                buf = ""
+            else:
+                buf += p
+        if buf:
+            sentences.append(buf)
+        return sentences
+
+    def trim_to_tokens(text, token_budget, from_tail=False):
+        if token_budget <= 0 or not text:
+            return ""
+        sentences = split_sentences(text)
+        if not sentences:
+            return ""
+
+        collected = []
+        remaining = token_budget
+        seq = reversed(sentences) if from_tail else sentences
+        for s in seq:
+            tks = num_tokens_from_string(s)
+            if tks <= 0:
+                continue
+            if tks > remaining:
+                collected.append(s)
+                break
+            collected.append(s)
+            remaining -= tks
+
+        if from_tail:
+            collected = list(reversed(collected))
+        return "".join(collected)
+
+    def extract_position(ck):
+        pn = None
+        top = None
+        left = None
+        try:
+            if ck.get("page_num_int"):
+                pn = ck["page_num_int"][0]
+            elif ck.get("page_number") is not None:
+                pn = ck.get("page_number")
+
+            if ck.get("top_int"):
+                top = ck["top_int"][0]
+            elif ck.get("top") is not None:
+                top = ck.get("top")
+
+            if ck.get("position_int"):
+                left = ck["position_int"][0][1]
+            elif ck.get("x0") is not None:
+                left = ck.get("x0")
+        except Exception:
+            pn = top = left = None
+        return pn, top, left
+
+    indexed = list(enumerate(chunks))
+    positioned_indices = []
+    unpositioned_indices = []
+    for idx, ck in indexed:
+        pn, top, left = extract_position(ck)
+        if pn is not None and top is not None:
+            positioned_indices.append((idx, pn, top, left if left is not None else 0))
+        else:
+            unpositioned_indices.append(idx)
+
+    if positioned_indices:
+        positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
+        ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
+    else:
+        ordered_indices = [idx for idx, _ in indexed]

+    total = len(ordered_indices)
+    for sorted_pos, idx in enumerate(ordered_indices):
+        ck = chunks[idx]
+        token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
+        if token_budget <= 0:
+            continue
+
+        prev_ctx = []
+        remaining_prev = token_budget
+        for prev_idx in range(sorted_pos - 1, -1, -1):
+            if remaining_prev <= 0:
+                break
+            neighbor_idx = ordered_indices[prev_idx]
+            if not is_text_chunk(chunks[neighbor_idx]):
+                break
+            txt = get_text(chunks[neighbor_idx])
+            if not txt:
+                continue
+            tks = num_tokens_from_string(txt)
+            if tks <= 0:
+                continue
+            if tks > remaining_prev:
+                txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
+                tks = num_tokens_from_string(txt)
+            prev_ctx.append(txt)
+            remaining_prev -= tks
+        prev_ctx.reverse()
+
+        next_ctx = []
+        remaining_next = token_budget
+        for next_idx in range(sorted_pos + 1, total):
+            if remaining_next <= 0:
+                break
+            neighbor_idx = ordered_indices[next_idx]
+            if not is_text_chunk(chunks[neighbor_idx]):
+                break
+            txt = get_text(chunks[neighbor_idx])
+            if not txt:
+                continue
+            tks = num_tokens_from_string(txt)
+            if tks <= 0:
+                continue
+            if tks > remaining_next:
+                txt = trim_to_tokens(txt, remaining_next, from_tail=False)
+                tks = num_tokens_from_string(txt)
+            next_ctx.append(txt)
+            remaining_next -= tks
+
+        if not prev_ctx and not next_ctx:
+            continue
+
+        self_text = get_text(ck)
+        pieces = [*prev_ctx]
+        if self_text:
+            pieces.append(self_text)
+        pieces.extend(next_ctx)
+        combined = "\n".join(pieces)
+
+        original = ck.get("content_with_weight")
+        if "content_with_weight" in ck:
+            ck["content_with_weight"] = combined
+        elif "text" in ck:
+            original = ck.get("text")
+            ck["text"] = combined
+
+        if combined != original:
+            if "content_ltks" in ck:
+                ck["content_ltks"] = rag_tokenizer.tokenize(combined)
+            if "content_sm_ltks" in ck:
+                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
+
+    if positioned_indices:
+        chunks[:] = [chunks[i] for i in ordered_indices]
+
+    return chunks
+
+
 def add_positions(d, poss):
     if not poss:
         return
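A minimal usage sketch of the new `attach_media_context` helper (the chunk dicts below are made up for illustration; only the keys the helper inspects are shown). With no positional fields present, the original order is kept, and the table chunk's text gets the neighbouring text prepended and appended, trimmed to the token budget:

```python
from rag.nlp import attach_media_context

chunks = [
    {"text": "Quarterly revenue grew 12% year over year."},
    {"text": "<table><tr><td>Q1</td><td>Q2</td></tr></table>", "doc_type_kwd": "table"},
    {"text": "Growth was driven mainly by the APAC region."},
]

# 64-token budget for tables, none for images: chunks[1]["text"] becomes
# "<previous text>\n<table html>\n<next text>".
attach_media_context(chunks, table_context_size=64, image_context_size=0)
print(chunks[1]["text"])
```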
@@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
     "auto_keywords": 0,
     "auto_questions": 0,
     "html4excel": False,
+    "image_context_size": 0,
+    "table_context_size": 0,
     "topn_tags": 3,
     "raptor": {
         "use_raptor": True,

@@ -62,4 +64,4 @@ DEFAULT_PARSER_CONFIG = {
         ],
         "method": "light",
     },
 }