Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-31 23:55:06 +08:00 · 2025-01-22 19:43:14 +08:00
parent 1a367664f1
commit dd0ebbea35
55 changed files with 5461 additions and 4000 deletions
--- a/rag/app/book.py
+++ b/rag/app/book.py
@ -88,9 +88,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get(
-            "parser_config", {}).get(
-            "layout_recognize", True) else PlainParser()
+        layout_recognizer = kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC")
+        # layout_recognize is a parser_config entry, not a top-level kwarg; only "Plain Text" selects PlainParser.
+        pdf_parser = PlainParser() if layout_recognizer == "Plain Text" else Pdf()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)

--- a/rag/app/email.py
+++ b/rag/app/email.py
@ -40,7 +40,7 @@ def chunk(
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
-        {"chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": True},
+        {"chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"},
    )
    doc = {
        "docnm_kwd": filename,
--- a/rag/app/knowledge_graph.py
+++ b/rag/app/knowledge_graph.py
@ -1,48 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import re
-
-from graphrag.index import build_knowledge_graph_chunks
-from rag.app import naive
-from rag.nlp import rag_tokenizer, tokenize_chunks
-
-
-def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?;。；！？", "layout_recognize": True})
-    eng = lang.lower() == "english"
-
-    parser_config["layout_recognize"] = True
-    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
-                           parser_config=parser_config, callback=callback)
-    chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
-                                          parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
-                                          )
-    for c in chunks:
-        c["docnm_kwd"] = filename
-
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "knowledge_graph_kwd": "text"
-    }
-    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
-    chunks.extend(tokenize_chunks(sections, doc, eng))
-
-    return chunks
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@ -162,9 +162,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        return tokenize_chunks(chunks, doc, eng, None)

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get(
-            "parser_config", {}).get(
-            "layout_recognize", True) else PlainParser()
+        layout_recognizer = kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC")
+        # layout_recognize is a parser_config entry, not a top-level kwarg; only "Plain Text" selects PlainParser.
+        pdf_parser = PlainParser() if layout_recognizer == "Plain Text" else Pdf()
        for txt, poss in pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
            sections.append(txt + poss)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -184,9 +184,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get(
-            "parser_config", {}).get(
-            "layout_recognize", True) else PlainParser()
+        layout_recognizer = kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC")
+        # layout_recognize is a parser_config entry, not a top-level kwarg; only "Plain Text" selects PlainParser.
+        pdf_parser = PlainParser() if layout_recognizer == "Plain Text" else Pdf()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        if sections and len(sections[0]) < 3:
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -202,7 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
-            "chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": True})
+            "chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -231,8 +231,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if parser_config.get("layout_recognize", True) else PlainParser()
-        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
+        pdf_parser = Pdf()
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
+            pdf_parser = PlainParser()
+        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
+                                      callback=callback)
        res = tokenize_table(tables, doc, is_english)

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
--- a/rag/app/one.py
+++ b/rag/app/one.py
@ -84,9 +84,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get(
-            "parser_config", {}).get(
-            "layout_recognize", True) else PlainParser()
+        layout_recognizer = kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC")
+        # layout_recognize is a parser_config entry, not a top-level kwarg; only "Plain Text" selects PlainParser.
+        pdf_parser = PlainParser() if layout_recognizer == "Plain Text" else Pdf()
        sections, _ = pdf_parser(
            filename if not binary else binary, to_page=to_page, callback=callback)
        sections = [s for s, _ in sections if s]
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@ -144,7 +144,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
    """
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        if not kwargs.get("parser_config", {}).get("layout_recognize", True):
+        if kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC") == "Plain Text":
            pdf_parser = PlainParser()
            paper = {
                "title": filename,
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@ -119,9 +119,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            res.append(d)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get(
-            "parser_config", {}).get(
-            "layout_recognize", True) else PlainPdf()
+        layout_recognizer = kwargs.get("parser_config", {}).get("layout_recognize", "DeepDOC")
+        # NOTE(review): keeps PlainPdf, the pre-change fallback for the (filename, binary) call below - confirm.
+        pdf_parser = PlainPdf() if layout_recognizer == "Plain Text" else Pdf()
        for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
                                                   from_page=from_page, to_page=to_page, callback=callback)):
            d = copy.deepcopy(doc)