Mirror of https://github.com/infiniflow/ragflow.git
init README of deepdoc, add picture processor. (#71)

* init README of deepdoc, add picture processor.
* add resume parsing
@@ -12,7 +12,7 @@
 #
 import copy
 import re
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -47,7 +47,7 @@ class Pdf(PdfParser):
         return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         Since a book is long and not all the parts are useful, if it's a PDF,
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     sections = [t for t, _ in sections]
     # is it English
-    eng = is_english(random_choices(sections, k=218))
+    eng = lang.lower() == "english"#is_english(random_choices(sections, k=218))

     res = []
     # add tables
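
The recurring change across these chunkers replaces runtime language detection (is_english over sampled sections) with an explicit, caller-supplied lang flag: eng is simply lang.lower() == "english". A minimal sketch of how a caller might drive the new signature — the file path and progress callback below are hypothetical, not part of the diff:

    # hypothetical caller; only the lang convention comes from the diff
    def progress(prog=None, msg=""):
        print(prog, msg)

    chunks = chunk("some_book.pdf", from_page=0, to_page=12,
                   lang="English",   # yields eng=True; any other value yields eng=False
                   callback=progress)
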
@@ -14,7 +14,7 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-from deepdoc.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
+from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes]


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
     """
@@ -106,7 +106,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

     # is it English
-    eng = is_english(sections)
+    eng = lang.lower() == "english"#is_english(sections)
     # Remove 'Contents' part
     remove_contents_table(sections, eng)
@@ -1,7 +1,6 @@
 import copy
 import re
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 from rag.utils import num_tokens_from_string
@@ -57,7 +56,7 @@ class Pdf(PdfParser):
         return [b["text"] + self._line_tag(b, zoomin) for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
     """
@@ -74,7 +73,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english

     res = []
     # add tables
@@ -13,8 +13,7 @@
 import copy
 import re
 from rag.app import laws
-from deepdoc.parser import is_english, tokenize, naive_merge
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize, naive_merge
 from deepdoc.parser import PdfParser
 from rag.settings import cron_logger
@@ -38,7 +37,7 @@ class Pdf(PdfParser):
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         This method apply the naive ways to chunk files.
@@ -80,7 +79,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):

     parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
     cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
-    eng = is_english(cks)
+    eng = lang.lower() == "english"#is_english(cks)
     res = []
     # wrap up to es documents
     for ck in cks:
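
In this naive chunker the granularity comes from parser_config, defaulting to {"chunk_token_num": 128, "delimiter": "\n!?。;!?"}. A hedged sketch of overriding it through kwargs — the file name and callback are placeholders:

    chunks = chunk("handbook.docx",
                   lang="Chinese",
                   parser_config={"chunk_token_num": 256,    # larger chunks
                                  "delimiter": "\n。;!?"},   # newline + CJK punctuation
                   callback=lambda prog=None, msg="": None)
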
@@ -15,8 +15,7 @@ import re
 from collections import Counter

 from api.db import ParserType
-from deepdoc.parser import tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, tokenize
 from deepdoc.parser import PdfParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -140,7 +139,7 @@ class Pdf(PdfParser):
         }


-def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
         The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
@@ -156,7 +155,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
     # is it English
-    eng = pdf_parser.is_english
+    eng = lang.lower() == "english"#pdf_parser.is_english
     print("It's English.....", eng)

     res = []
rag/app/picture.py (new file, 56 lines)
@@ -0,0 +1,56 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+
+import numpy as np
+from PIL import Image
+
+from api.db import LLMType
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+from deepdoc.vision import OCR
+
+ocr = OCR()
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    try:
+        cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+        return []
+    img = Image.open(io.BytesIO(binary))
+    doc = {
+        "docnm_kwd": filename,
+        "image": img
+    }
+    bxs = ocr(np.array(img))
+    txt = "\n".join([t[0] for _, t in bxs if t[0]])
+    eng = lang.lower() == "english"
+    callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
+    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+        tokenize(doc, txt, eng)
+        callback(0.8, "OCR results is too long to use CV LLM.")
+        return [doc]
+
+    try:
+        callback(0.4, "Use CV LLM to describe the picture.")
+        ans = cv_mdl.describe(binary)
+        callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
+        txt += "\n" + ans
+        tokenize(doc, txt, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+
+    return []
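
The new picture chunker runs OCR first and only falls back to the image-to-text (CV) LLM when OCR yields little text (roughly 32 words or characters or fewer). A usage sketch, assuming a configured IMAGE2TEXT model for the tenant — the tenant id, file name, and callback are placeholders:

    with open("diagram.png", "rb") as f:
        binary = f.read()

    def progress(prog=None, msg=""):
        print(prog, msg)

    docs = chunk("diagram.png", binary,
                 tenant_id="tenant-0000",   # hypothetical tenant
                 lang="English",
                 callback=progress)
    # docs is [] on failure, otherwise one doc carrying docnm_kwd, the PIL image and token fields
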
@@ -13,46 +13,14 @@
 import copy
 import re
 from io import BytesIO
 from pptx import Presentation
-from deepdoc.parser import tokenize, is_english
+from rag.nlp import tokenize, is_english
 from rag.nlp import huqie
-from deepdoc.parser import PdfParser
+from deepdoc.parser import PdfParser, PptParser


-class Ppt(object):
-    def __init__(self):
-        super().__init__()
-
-    def __extract(self, shape):
-        if shape.shape_type == 19:
-            tb = shape.table
-            rows = []
-            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
-            return "\n".join(rows)
-
-        if shape.has_text_frame:
-            return shape.text_frame.text
-
-        if shape.shape_type == 6:
-            texts = []
-            for p in shape.shapes:
-                t = self.__extract(p)
-                if t: texts.append(t)
-            return "\n".join(texts)
-
+class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
-        ppt = Presentation(fnm) if isinstance(
-            fnm, str) else Presentation(
-            BytesIO(fnm))
-        txts = []
-        self.total_page = len(ppt.slides)
-        for i, slide in enumerate(ppt.slides[from_page: to_page]):
-            texts = []
-            for shape in slide.shapes:
-                txt = self.__extract(shape)
-                if txt: texts.append(txt)
-            txts.append("\n".join(texts))
+        txts = super.__call__(fnm, from_page, to_page)

         callback(0.5, "Text extraction finished.")
         import aspose.slides as slides
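
The Ppt class now delegates slide-text extraction to deepdoc.parser.PptParser instead of walking pptx shapes itself. A minimal sketch of that delegation pattern in isolation, with a stand-in base class since PptParser's internals are not shown here; note that resolving the parent call needs super() with parentheses, i.e. super().__call__(...):

    class BaseExtractor:
        # stand-in for PptParser: returns one text blob per slide
        def __call__(self, fnm, from_page, to_page, callback=None):
            return ["slide %d text" % i for i in range(from_page, to_page)]

    class Ppt(BaseExtractor):
        def __call__(self, fnm, from_page, to_page, callback=None):
            txts = super().__call__(fnm, from_page, to_page)   # parentheses required
            if callback:
                callback(0.5, "Text extraction finished.")
            return txts
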
@@ -14,7 +14,7 @@ import re
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from deepdoc.parser import is_english, random_choices
+from rag.nlp import is_english, random_choices
 from rag.nlp import huqie, stemmer
 from deepdoc.parser import ExcelParser
@@ -81,7 +81,7 @@ def beAdoc(d, q, a, eng):
     return d


-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         If the file is in excel format, there should be 2 column question and answer without header.
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
             break
         txt += l
     lines = txt.split("\n")
-    eng = is_english([rmPrefix(l) for l in lines[:100]])
+    eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
     fails = []
     for i, line in enumerate(lines):
         arr = [l for l in line.split("\t") if len(l) > 1]
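
For csv/txt input the Q&A chunker appears to expect one tab-separated question/answer pair per line; columns of one character or less are discarded by the split above before a pair is wrapped into a document (as in beAdoc). A small illustration of that split on made-up lines:

    lines = "什么是RAG?\t检索增强生成。\nWhat is RAG?\tRetrieval-augmented generation.".split("\n")
    for line in lines:
        arr = [l for l in line.split("\t") if len(l) > 1]   # same filter as the diff
        if len(arr) == 2:
            question, answer = arr
            print(question, "->", answer)
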
@@ -20,8 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse

 from api.db.services.knowledgebase_service import KnowledgebaseService
-from deepdoc.parser import is_english, tokenize
-from rag.nlp import huqie
+from rag.nlp import huqie, is_english, tokenize
 from deepdoc.parser import ExcelParser
@@ -112,7 +111,7 @@ def column_data_type(arr):
     return arr, ty


-def chunk(filename, binary=None, callback=None, **kwargs):
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         For csv or txt file, the delimiter between columns is TAB.
@@ -192,7 +191,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j])
                  for i in range(len(clmns))]

-    eng = is_english(txts)
+    eng = lang.lower() == "english"#is_english(txts)
     for ii, row in df.iterrows():
         d = {}
         row_txt = []