mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Light GraphRAG (#4585)
### What problem does this PR solve? #4543 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -88,9 +88,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
|
||||
|
||||
@ -40,7 +40,7 @@ def chunk(
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config",
|
||||
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
|
||||
{"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
|
||||
)
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
|
||||
@ -1,48 +0,0 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import re
|
||||
|
||||
from graphrag.index import build_knowledge_graph_chunks
|
||||
from rag.app import naive
|
||||
from rag.nlp import rag_tokenizer, tokenize_chunks
|
||||
|
||||
|
||||
def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
|
||||
eng = lang.lower() == "english"
|
||||
|
||||
parser_config["layout_recognize"] = True
|
||||
sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
|
||||
parser_config=parser_config, callback=callback)
|
||||
chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
|
||||
parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
|
||||
)
|
||||
for c in chunks:
|
||||
c["docnm_kwd"] = filename
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
|
||||
"knowledge_graph_kwd": "text"
|
||||
}
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
chunks.extend(tokenize_chunks(sections, doc, eng))
|
||||
|
||||
return chunks
|
||||
@ -162,9 +162,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return tokenize_chunks(chunks, doc, eng, None)
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
for txt, poss in pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)[0]:
|
||||
sections.append(txt + poss)
|
||||
|
||||
@ -184,9 +184,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
if sections and len(sections[0]) < 3:
|
||||
|
||||
@ -202,7 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
is_english = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
@ -231,8 +231,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if parser_config.get("layout_recognize", True) else PlainParser()
|
||||
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
|
||||
pdf_parser = Pdf()
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
|
||||
callback=callback)
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
|
||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
|
||||
@ -84,9 +84,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, _ = pdf_parser(
|
||||
filename if not binary else binary, to_page=to_page, callback=callback)
|
||||
sections = [s for s, _ in sections if s]
|
||||
|
||||
@ -144,7 +144,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
|
||||
"""
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
if not kwargs.get("parser_config", {}).get("layout_recognize", True):
|
||||
if kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
paper = {
|
||||
"title": filename,
|
||||
|
||||
@ -119,9 +119,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
res.append(d)
|
||||
return res
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainPdf()
|
||||
pdf_parser = Pdf()
|
||||
if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
|
||||
Reference in New Issue
Block a user