Add 'One' chunk method (#137)

2026-01-31 07:36:46 +08:00 · 2024-03-20 18:57:22 +08:00
parent fce14ee187
commit 5875c8ba08
11 changed files with 143 additions and 24 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -57,7 +57,7 @@ class Pdf(PdfParser):
        sec_ids = []
        sid = 0
        for i, lvl in enumerate(levels):
-            if lvl <= most_level: sid += 1
+            if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
            sec_ids.append(sid)
            #print(lvl, self.boxes[i]["text"], most_level)

@ -75,7 +75,7 @@ class Pdf(PdfParser):
                    continue
            chunks.append(txt + poss)
            if sec_id >-1: last_sid = sec_id
-        return chunks
+        return chunks, tbls


 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@ -86,7 +86,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
-        cks = pdf_parser(filename if not binary else binary,
+        cks, tbls = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else: raise NotImplementedError("file type not supported yet(pdf supported)")
    doc = {
@ -100,7 +100,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    i = 0
    chunk = []
    tk_cnt = 0
-    res = []
+    res = tokenize_table(tbls, doc, eng)
    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -49,7 +49,7 @@ class Pdf(PdfParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
-        Supported file formats are docx, pdf, txt.
+        Supported file formats are docx, pdf, excel, txt.
        This method apply the naive ways to chunk files.
        Successive text will be sliced into pieces using 'delimiter'.
        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
--- a/rag/app/one.py
+++ b/rag/app/one.py
@ -0,0 +1,108 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import copy
+import re
+from rag.app import laws
+from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
+from deepdoc.parser import PdfParser, ExcelParser
+from rag.settings import cron_logger
+
+
+class Pdf(PdfParser):
+    """PDF front end for the "one" chunk method.
+
+    Calling an instance runs the inherited recognition steps and returns the
+    text of every box and of every extracted table/figure as a flat list of
+    strings sorted by position.
+    """
+
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
+        """Parse a PDF and return its text pieces ordered by position.
+
+        Args:
+            filename: Source of the PDF; used only when ``binary`` is falsy.
+            binary: Raw PDF content; takes precedence over ``filename``.
+            from_page: Forwarded to ``__images__``; also used to rebase the
+                page index of table/figure positions.
+            to_page: Forwarded to ``__images__``.
+            zoomin: Zoom factor forwarded to the inherited processing steps.
+            callback: Progress reporter, invoked as ``callback(prog, msg)``
+                or ``callback(msg=...)``. ``None`` disables reporting.
+
+        Returns:
+            list: the text of each section, sorted by its first position.
+        """
+        def _noop(*args, **kwargs):
+            pass
+
+        # ``callback`` defaults to None; substitute a no-op so the progress
+        # reports below do not raise TypeError when no reporter is supplied.
+        report = callback if callback is not None else _noop
+
+        report(msg="OCR is  running...")
+        self.__images__(
+            filename if not binary else binary,
+            zoomin,
+            from_page,
+            to_page,
+            report
+        )
+        report(msg="OCR finished")
+
+        from timeit import default_timer as timer
+        start = timer()
+        self._layouts_rec(zoomin)
+        report(0.63, "Layout analysis finished.")
+        print("paddle layouts:", timer() - start)
+        self._table_transformer_job(zoomin)
+        report(0.65, "Table analysis finished.")
+        self._text_merge()
+        report(0.67, "Text merging finished")
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+        self._concat_downward()
+
+        # Every remaining text box contributes a (text, positions) pair.
+        sections = [(b["text"], self.get_position(b, zoomin)) for b in self.boxes]
+        for (img, rows), poss in tbls:
+            # Table/figure content is either a string or a sequence whose
+            # first item is used. The page index is shifted by
+            # ``1 - from_page`` -- presumably to match the convention of
+            # ``get_position``; TODO confirm.
+            sections.append((rows if isinstance(rows, str) else rows[0],
+                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+        # Order by fields 0, 3 and 1 of each section's first position
+        # (presumably page, top, left -- verify against get_position) and
+        # keep only the text.
+        return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
+
+
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, excel, txt.
+        One file forms a chunk which maintains original text order.
+
+        Args:
+            filename: Name of the file; its extension selects the parser and
+                it is opened from disk when ``binary`` is empty (txt only).
+            binary: Raw file content, used instead of reading ``filename``.
+            from_page, to_page: Page range forwarded to the PDF parser.
+            lang: "English" (case-insensitive) switches tokenizing to English.
+            callback: Progress reporter, invoked as ``callback(prog, msg)``.
+                ``None`` disables reporting.
+
+        Returns:
+            A list holding one tokenized document dict for the whole file.
+
+        Raises:
+            NotImplementedError: if the file extension is not supported.
+    """
+    def _noop(*args, **kwargs):
+        pass
+
+    # ``callback`` defaults to None; use a no-op so progress reports (here and
+    # inside the PDF parser) do not fail when no reporter is supplied.
+    report = callback if callback is not None else _noop
+
+    eng = lang.lower() == "english"
+
+    sections = []
+    if re.search(r"\.docx?$", filename, re.IGNORECASE):
+        report(0.1, "Start to parse.")
+        sections = list(laws.Docx()(filename, binary))
+        report(0.8, "Finish parsing.")
+    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
+        pdf_parser = Pdf()
+        # Forward from_page as well, so the requested range is honoured.
+        sections = pdf_parser(filename if not binary else binary,
+                              from_page=from_page, to_page=to_page, callback=report)
+    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        report(0.1, "Start to parse.")
+        excel_parser = ExcelParser()
+        sections = [excel_parser.html(binary)]
+        report(0.8, "Finish parsing.")
+    elif re.search(r"\.txt$", filename, re.IGNORECASE):
+        report(0.1, "Start to parse.")
+        if binary:
+            txt = binary.decode("utf-8")
+        else:
+            # Read with the same encoding the binary path assumes.
+            with open(filename, "r", encoding="utf-8") as f:
+                txt = f.read()
+        # Keep plain strings: the join below raises TypeError on tuples.
+        sections = [line for line in txt.split("\n") if line]
+        report(0.8, "Finish parsing.")
+    else:
+        raise NotImplementedError("file type not supported yet(docx, pdf, excel, txt supported)")
+
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    # The whole file becomes a single chunk, sections joined in order.
+    tokenize(doc, "\n".join(sections), eng)
+    return [doc]
+
+
+if __name__ == "__main__":
+    import sys
+
+
+    def _silent(prog=None, msg=""):
+        """Progress callback that discards every report."""
+        return None
+
+
+    # Chunk the file named by the first command-line argument, passing
+    # from_page=0 and to_page=10 and a callback that ignores progress.
+    chunk(sys.argv[1], callback=_silent, from_page=0, to_page=10)
--- a/rag/llm/init.py
+++ b/rag/llm/init.py
@ -21,8 +21,8 @@ from .cv_model import *
 EmbeddingModel = {
    "Local": HuEmbedding,
    "OpenAI": OpenAIEmbed,
-    "通义千问": HuEmbedding, #QWenEmbed,
-    "智谱AI": ZhipuEmbed,
+    "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
+    "ZHIPU-AI": ZhipuEmbed,
    "Moonshot": HuEmbedding
 }

@ -30,16 +30,16 @@ EmbeddingModel = {
 CvModel = {
    "OpenAI": GptV4,
    "Local": LocalCV,
-    "通义千问": QWenCV,
-    "智谱AI": Zhipu4V,
+    "Tongyi-Qianwen": QWenCV,
+    "ZHIPU-AI": Zhipu4V,
    "Moonshot": LocalCV
 }


 ChatModel = {
    "OpenAI": GptTurbo,
-    "智谱AI": ZhipuChat,
-    "通义千问": QWenChat,
+    "ZHIPU-AI": ZhipuChat,
+    "Tongyi-Qianwen": QWenChat,
    "Local": LocalLLM,
    "Moonshot": MoonshotChat
 }
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -194,7 +194,7 @@ class Dealer:
        return [float(t) for t in txt.split("\t")]

    def insert_citations(self, answer, chunks, chunk_v,
-                         embd_mdl, tkweight=0.7, vtweight=0.3):
+                         embd_mdl, tkweight=0.1, vtweight=0.9):
        assert len(chunks) == len(chunk_v)
        pieces = re.split(r"(```)", answer)
        if len(pieces) >= 3:
@ -243,7 +243,7 @@ class Dealer:
                                                            chunks_tks,
                                                            tkweight, vtweight)
            mx = np.max(sim) * 0.99
-            if mx < 0.7:
+            if mx < 0.65:
                continue
            cites[idx[i]] = list(
                set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
--- a/rag/svr/task_broker.py
+++ b/rag/svr/task_broker.py
@ -84,6 +84,7 @@ def dispatch():
            pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
            page_size = 5
            if r["parser_id"] == "paper": page_size = 12
+            if r["parser_id"] == "one": page_size = 1000000000
            for s,e in r["parser_config"].get("pages", [(0,100000)]):
                e = min(e, pages)
                for p in range(s, e, page_size):
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -39,7 +39,7 @@ from rag.nlp import search
 from io import BytesIO
 import pandas as pd

-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one

 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@ -60,6 +60,7 @@ FACTORY = {
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
+    ParserType.ONE.value: one,
 }