init README of deepdoc, add picture processor. (#71)

* init README of deepdoc, add picture processor.

* add resume parsing
KevinHuSh
2024-02-23 18:28:12 +08:00
committed by GitHub
parent d32322c081
commit 7fd1eca582
42 changed files with 58319 additions and 350 deletions

rag/app/book.py

@@ -12,7 +12,7 @@
 #
 import copy
 import re
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -47,7 +47,7 @@ class Pdf(PdfParser):
         return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         Since a book is long and not all the parts are useful, if it's a PDF,
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     sections = [t for t, _ in sections]
     # is it English
-    eng = is_english(random_choices(sections, k=218))
+    eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))
     res = []
     # add tables
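
Across these chunkers, the English/Chinese decision now follows the caller-supplied lang argument instead of sampling the text with is_english. A minimal sketch of calling the updated book chunker (the file name and the callback below are hypothetical; the signature is the one added above):

from rag.app import book

def progress(prog=None, msg=""):
    # hypothetical progress sink; chunk() reports progress and errors through it
    print(prog, msg)

# parse only the first 100 pages of an English-language book PDF
chunks = book.chunk("sample_book.pdf", from_page=0, to_page=100,
                    lang="English", callback=progress)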

rag/app/laws.py

@@ -14,7 +14,7 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
     """
@@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
     # is it English
-    eng = is_english(sections)
+    eng = lang.lower() == "english"#is_english(sections)
     # Remove 'Contents' part
     remove_contents_table(sections, eng)

rag/app/manual.py

@@ -1,7 +1,6 @@
 import copy
 import re
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 from rag.utils import num_tokens_from_string
@@ -57,7 +56,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
     """
@@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
     res = []
     # add tables

rag/app/naive.py

@@ -13,8 +13,7 @@
 import copy
 import re
 from rag.app import laws
-from deepdoc.parser import is_english, tokenize, naive_merge
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize, naive_merge
 from deepdoc.parser import PdfParser
 from rag.settings import cron_logger
@@ -38,7 +37,7 @@ class Pdf(PdfParser):
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         This method applies the naive way to chunk files.
@@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
     cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
-    eng = is_english(cks)
+    eng = lang.lower() == "english"#is_english(cks)
     res = []
     # wrap up to es documents
     for ck in cks:

rag/app/paper.py

@@ -15,8 +15,7 @@ import re
 from collections import Counter
 from api.db import ParserType
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -140,7 +139,7 @@ class Pdf(PdfParser):
         }
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
         The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
     print("It's English.....", eng)
     res = []

rag/app/picture.py (new file, 56 lines)

@@ -0,0 +1,56 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import io
+import numpy as np
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+from deepdoc.vision import OCR
+
+ocr = OCR()
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    try:
+        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+        return []
+
+    img = Image.open(io.BytesIO(binary))
+    doc = {
+        "docnm_kwd": filename,
+        "image": img
+    }
+    bxs = ocr(np.array(img))
+    txt = "\n".join([t[0] for _, t in bxs if t[0]])
+    eng = lang.lower() == "english"
+    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
+    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+        tokenize(doc, txt, eng)
+        callback(0.8, "OCR result is long enough; skip CV LLM.")
+        return [doc]
+
+    try:
+        callback(0.4, "Use CV LLM to describe the picture.")
+        ans = cv_mdl.describe(binary)
+        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
+        txt += "\n" + ans
+        tokenize(doc, txt, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+
+    return []
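
A sketch of how the new picture processor might be driven (the image path and tenant_id are made up; per the code above, OCR always runs first and the IMAGE2TEXT model is consulted only when OCR yields little text):

from rag.app import picture

def progress(prog=None, msg=""):
    # prog=-1 signals failure; otherwise a completion fraction
    print(prog, msg)

with open("photo.jpg", "rb") as f:
    binary = f.read()

docs = picture.chunk("photo.jpg", binary, tenant_id="some-tenant-id",
                     lang="English", callback=progress)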

rag/app/presentation.py

@@ -13,46 +13,14 @@
 import copy
 import re
 from io import BytesIO
 from pptx import Presentation
-from deepdoc.parser import tokenize, is_english
+from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, PptParser
-class Ppt(object):
-    def __init__(self):
-        super().__init__()
-
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-        if shape.has_text_frame:
-            return shape.text_frame.text
-        if shape.shape_type == 6:
-            texts = []
-            for p in shape.shapes:
-                t = self.__extract(p)
-                if t: texts.append(t)
-            return "\n".join(texts)
+class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
-            texts = []
-            for shape in slide.shapes:
-                txt = self.__extract(shape)
-                if txt: texts.append(txt)
-            txts.append("\n".join(texts))
+        txts = super().__call__(fnm, from_page, to_page)
         callback(0.5, "Text extraction finished.")
 import aspose.slides as slides

rag/app/qa.py

@@ -14,7 +14,7 @@ import re
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from deepdoc.parser import is_english, random_choices
+from rag.nlp import is_english, random_choices
 from rag.nlp import huqie, stemmer
 from deepdoc.parser import ExcelParser
@@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
     return d
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         If the file is in excel format, there should be 2 columns, question and answer, without a header.
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
                     break
                 txt += l
         lines = txt.split("\n")
-        eng = is_english([rmPrefix(l) for l in lines[:100]])
+        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
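
In the csv/txt branch, each line is expected to hold one TAB-separated question-answer pair; lines that do not split into two fields are collected in fails. A hypothetical input illustrating the expected shape (written as Python literals so the tabs stay visible):

"What is RAGFlow?\tRAGFlow is an open-source RAG engine."
"How many fields per line?\tExactly two: question TAB answer."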

rag/app/table.py

@@ -20,8 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from deepdoc.parser import is_english, tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize
 from deepdoc.parser import ExcelParser
@@ -112,7 +111,7 @@ def column_data_type(arr):
     return arr, ty
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         For csv or txt files, the delimiter between columns is TAB.
@@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                  for i in range(len(clmns))]
-    eng = is_english(txts)
+    eng = lang.lower() == "english"#is_english(txts)
     for ii, row in df.iterrows():
         d = {}
         row_txt = []
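
Here clmns holds the column names (presumably taken from the header row), column_data_type infers a per-column type (dates go through dateutil's parse imported above), and each row from df.iterrows() becomes one document. A hypothetical TAB-delimited input for this parser (names and values are made up; \t marks the TAB):

name\tage\tenrolled_at
Alice\t30\t2023-05-01
Bob\t27\t2024-01-15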