init README of deepdoc, add picture processor. (#71)

* init README of deepdoc, add picture processor.

* add resume parsing
KevinHuSh
2024-02-23 18:28:12 +08:00
committed by GitHub
parent d32322c081
commit 7fd1eca582
42 changed files with 58319 additions and 350 deletions

rag/app/book.py

@@ -12,7 +12,7 @@
 #
 import copy
 import re
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -47,7 +47,7 @@ class Pdf(PdfParser):
         return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         Since a book is long and not all the parts are useful, if it's a PDF,
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     sections = [t for t, _ in sections]
     # is it English
-    eng = is_english(random_choices(sections, k=218))
+    eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))
     res = []
     # add tables
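
Across these chunkers, the English/Chinese decision now follows the caller-supplied lang argument instead of sampling the text with is_english. A minimal sketch of calling the updated book chunker (the file name and the callback below are hypothetical; the signature is the one added above):

from rag.app import book

def progress(prog=None, msg=""):
    # hypothetical progress sink; chunk() reports progress and errors through it
    print(prog, msg)

# parse only the first 100 pages of an English-language book PDF
chunks = book.chunk("sample_book.pdf", from_page=0, to_page=100,
                    lang="English", callback=progress)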

rag/app/laws.py

@@ -14,7 +14,7 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
     """
@@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
     # is it English
-    eng = is_english(sections)
+    eng = lang.lower() == "english"#is_english(sections)
     # Remove 'Contents' part
     remove_contents_table(sections, eng)

rag/app/manual.py

@@ -1,7 +1,6 @@
 import copy
 import re
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 from rag.utils import num_tokens_from_string
@@ -57,7 +56,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
     """
@@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
     res = []
     # add tables

rag/app/naive.py

@@ -13,8 +13,7 @@
 import copy
 import re
 from rag.app import laws
-from deepdoc.parser import is_english, tokenize, naive_merge
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize, naive_merge
 from deepdoc.parser import PdfParser
 from rag.settings import cron_logger
@@ -38,7 +37,7 @@ class Pdf(PdfParser):
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         This method applies the naive way to chunk files.
@@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
     cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
-    eng = is_english(cks)
+    eng = lang.lower() == "english"#is_english(cks)
     res = []
     # wrap up to es documents
     for ck in cks:

rag/app/paper.py

@@ -15,8 +15,7 @@ import re
 from collections import Counter
 from api.db import ParserType
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -140,7 +139,7 @@ class Pdf(PdfParser):
         }
-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
         The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
     print("It's English.....", eng)
     res = []

rag/app/picture.py (new file, 56 lines)

@@ -0,0 +1,56 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import io
+import numpy as np
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+from deepdoc.vision import OCR
+
+ocr = OCR()
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    try:
+        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+        return []
+
+    img = Image.open(io.BytesIO(binary))
+    doc = {
+        "docnm_kwd": filename,
+        "image": img
+    }
+    bxs = ocr(np.array(img))
+    txt = "\n".join([t[0] for _, t in bxs if t[0]])
+    eng = lang.lower() == "english"
+    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
+    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+        tokenize(doc, txt, eng)
+        callback(0.8, "OCR result is long enough; skip CV LLM.")
+        return [doc]
+
+    try:
+        callback(0.4, "Use CV LLM to describe the picture.")
+        ans = cv_mdl.describe(binary)
+        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
+        txt += "\n" + ans
+        tokenize(doc, txt, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+
+    return []
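
A sketch of how the new picture processor might be driven (the image path and tenant_id are made up; per the code above, OCR always runs first and the IMAGE2TEXT model is consulted only when OCR yields little text):

from rag.app import picture

def progress(prog=None, msg=""):
    # prog=-1 signals failure; otherwise a completion fraction
    print(prog, msg)

with open("photo.jpg", "rb") as f:
    binary = f.read()

docs = picture.chunk("photo.jpg", binary, tenant_id="some-tenant-id",
                     lang="English", callback=progress)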

rag/app/presentation.py

@@ -13,46 +13,14 @@
 import copy
 import re
 from io import BytesIO
 from pptx import Presentation
-from deepdoc.parser import tokenize, is_english
+from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, PptParser
-class Ppt(object):
-    def __init__(self):
-        super().__init__()
-
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-        if shape.has_text_frame:
-            return shape.text_frame.text
-        if shape.shape_type == 6:
-            texts = []
-            for p in shape.shapes:
-                t = self.__extract(p)
-                if t: texts.append(t)
-            return "\n".join(texts)
+class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
-            texts = []
-            for shape in slide.shapes:
-                txt = self.__extract(shape)
-                if txt: texts.append(txt)
-            txts.append("\n".join(texts))
+        txts = super().__call__(fnm, from_page, to_page)
         callback(0.5, "Text extraction finished.")
 import aspose.slides as slides

rag/app/qa.py

@@ -14,7 +14,7 @@ import re
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from deepdoc.parser import is_english, random_choices
+from rag.nlp import is_english, random_choices
 from rag.nlp import huqie, stemmer
 from deepdoc.parser import ExcelParser
@@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
     return d
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         If the file is in excel format, there should be 2 columns, question and answer, without a header.
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
                     break
                 txt += l
         lines = txt.split("\n")
-        eng = is_english([rmPrefix(l) for l in lines[:100]])
+        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
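
In the csv/txt branch, each line is expected to hold one TAB-separated question-answer pair; lines that do not split into two fields are collected in fails. A hypothetical input illustrating the expected shape (written as Python literals so the tabs stay visible):

"What is RAGFlow?\tRAGFlow is an open-source RAG engine."
"How many fields per line?\tExactly two: question TAB answer."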

rag/app/table.py

@@ -20,8 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from deepdoc.parser import is_english, tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize
 from deepdoc.parser import ExcelParser
@@ -112,7 +111,7 @@ def column_data_type(arr):
     return arr, ty
-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         For csv or txt files, the delimiter between columns is TAB.
@@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i])
                  for i in range(len(clmns))]
-    eng = is_english(txts)
+    eng = lang.lower() == "english"#is_english(txts)
     for ii, row in df.iterrows():
         d = {}
         row_txt = []
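
Here clmns holds the column names (presumably taken from the header row), column_data_type infers a per-column type (dates go through dateutil's parse imported above), and each row from df.iterrows() becomes one document. A hypothetical TAB-delimited input for this parser (names and values are made up; \t marks the TAB):

name\tage\tenrolled_at
Alice\t30\t2023-05-01
Bob\t27\t2024-01-15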