mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-31 15:45:08 +08:00
Feat: Redesign and refactor agent module (#9113)
### What problem does this PR solve? #9082 #6365 <u> **WARNING: it's not compatible with the older version of `Agent` module, which means that `Agent` from older versions can not work anymore.**</u> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -69,6 +69,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
Since a book is long and not all the parts are useful, if it's a PDF,
|
||||
please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.
|
||||
"""
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
@ -89,7 +92,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
|
||||
@ -40,7 +40,7 @@ def chunk(
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config",
|
||||
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
|
||||
{"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
|
||||
)
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
|
||||
@ -145,6 +145,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
"""
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
@ -163,7 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
for txt, poss in pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)[0]:
|
||||
|
||||
@ -45,9 +45,6 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
|
||||
# for bb in self.boxes:
|
||||
# for b in bb:
|
||||
# print(b)
|
||||
logging.debug("OCR: {}".format(timer() - start))
|
||||
|
||||
start = timer()
|
||||
@ -177,6 +174,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"""
|
||||
Only pdf is supported.
|
||||
"""
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
pdf_parser = None
|
||||
doc = {
|
||||
"docnm_kwd": filename
|
||||
@ -187,7 +187,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
@ -222,7 +222,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
|
||||
sid += 1
|
||||
sec_ids.append(sid)
|
||||
# print(lvl, self.boxes[i]["text"], most_level, sid)
|
||||
|
||||
sections = [(txt, sec_ids[i], poss)
|
||||
for i, (txt, _, poss) in enumerate(sections)]
|
||||
|
||||
@ -370,7 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
is_english = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
|
||||
@ -72,7 +72,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
One file forms a chunk which maintains original text order.
|
||||
"""
|
||||
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
@ -85,7 +87,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, _ = pdf_parser(
|
||||
filename if not binary else binary, to_page=to_page, callback=callback)
|
||||
|
||||
@ -143,8 +143,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
Only pdf is supported.
|
||||
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
|
||||
"""
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
if kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
paper = {
|
||||
"title": filename,
|
||||
|
||||
Reference in New Issue
Block a user