Support a wide range of encodings for text files. (#458)
### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
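In short, each `chunk()` entry point stops assuming UTF-8 and instead asks the new `find_codec` helper (added to `rag.nlp` in this commit) which codec can decode the uploaded bytes. A minimal sketch of the pattern the diff applies throughout the parsers; the wrapper name `decode_text` is only illustrative and is not part of the repository:

```python
from rag.nlp import find_codec

def decode_text(binary: bytes) -> str:
    # Probe a list of known codecs and decode with the first one that works,
    # instead of hard-coding binary.decode("utf-8").
    encoding = find_codec(binary)
    return binary.decode(encoding)
```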
@@ -15,7 +15,8 @@ import re
 from io import BytesIO

 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, tokenize_chunks
+    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
+    tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
@@ -87,7 +88,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -17,7 +17,7 @@ from docx import Document

 from api.db import ParserType
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
-    make_colon_as_title, add_positions, tokenize_chunks
+    make_colon_as_title, add_positions, tokenize_chunks, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
@@ -111,7 +111,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -14,7 +14,7 @@ from io import BytesIO
 from docx import Document
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks
+from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
@@ -139,10 +139,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        try:
-            txt = binary.decode("utf-8")
-        except Exception as e:
-            txt = binary.decode("gb2312")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -12,7 +12,7 @@
 #
 import re
 from rag.app import laws
-from rag.nlp import huqie, tokenize
+from rag.nlp import huqie, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser
@@ -82,7 +82,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -15,7 +15,7 @@ from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices
+from rag.nlp import is_english, random_choices, find_codec
 from rag.nlp import huqie
 from deepdoc.parser import ExcelParser
@@ -106,7 +106,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse

 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie, is_english, tokenize
+from rag.nlp import huqie, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
@@ -147,7 +147,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
     callback(0.1, "Start to parse.")
     txt = ""
     if binary:
-        txt = binary.decode("utf-8")
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding)
     else:
         with open(filename, "r") as f:
             while True:
@@ -6,6 +6,35 @@ from . import huqie
 import re
 import copy

+all_codecs = [
+    'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
+    'cp037', 'cp273', 'cp424', 'cp437',
+    'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857',
+    'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869',
+    'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125',
+    'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+    'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr',
+    'gb2312', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
+    'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1',
+    'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
+    'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13',
+    'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
+    'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
+    'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213',
+    'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16_be', 'utf_16_le', 'utf_7'
+]
+
+
+def find_codec(blob):
+    global all_codecs
+    for c in all_codecs:
+        try:
+            blob.decode(c)
+            return c
+        except Exception as e:
+            pass
+    return "utf-8"
+
+
 BULLET_PATTERN = [[
     r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
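A quick, self-contained check of the helper above — a sketch that assumes the package is importable as `rag.nlp` from the repository root:

```python
from rag.nlp import find_codec

gb_bytes = "你好世界".encode("gb2312")    # not valid UTF-8, so 'utf-8' is skipped
utf8_bytes = "hello 世界".encode("utf-8")

print(find_codec(gb_bytes))    # expected: 'gb2312'
print(find_codec(utf8_bytes))  # expected: 'utf-8'
print(gb_bytes.decode(find_codec(gb_bytes)))  # 你好世界
```

Note that the probe is order-sensitive: whichever codec in `all_codecs` decodes the bytes first wins, and `utf-8` is returned as the fallback when nothing matches.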
@@ -8,6 +8,7 @@ import re
import string
import sys
from hanziconv import HanziConv
from huggingface_hub import snapshot_download
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
@@ -68,7 +68,7 @@ class Dealer:
         pg = int(req.get("page", 1)) - 1
         ps = int(req.get("size", 1000))
         topk = int(req.get("topk", 1024))
-        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id",
+        src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
                                  "image_id", "doc_id", "q_512_vec", "q_768_vec", "position_int",
                                  "q_1024_vec", "q_1536_vec", "available_int", "content_with_weight"])
@@ -289,8 +289,18 @@ class Dealer:
             sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
         if not ins_embd:
             return [], [], []
-        ins_tw = [sres.field[i][cfield].split(" ")
-                  for i in sres.ids]
+
+        for i in sres.ids:
+            if isinstance(sres.field[i].get("important_kwd", []), str):
+                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
+        ins_tw = []
+        for i in sres.ids:
+            content_ltks = sres.field[i][cfield].split(" ")
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            important_kwd = sres.field[i].get("important_kwd", [])
+            tks = content_ltks + title_tks + important_kwd
+            ins_tw.append(tks)
+
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
                                                         keywords,
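The rerank change above folds title tokens and per-chunk keywords into the token side of the hybrid similarity. A minimal sketch of that blending logic in isolation; field names follow the diff, while `blend_tokens` is only an illustrative name:

```python
def blend_tokens(field: dict, cfield: str = "content_ltks") -> list:
    # Normalize important_kwd to a list, then concatenate content, title and
    # keyword tokens so all three contribute to the token-based similarity.
    important_kwd = field.get("important_kwd", [])
    if isinstance(important_kwd, str):
        important_kwd = [important_kwd]
    content_ltks = field.get(cfield, "").split(" ")
    title_tks = [t for t in field.get("title_tks", "").split(" ") if t]
    return content_ltks + title_tks + important_kwd

print(blend_tokens({"content_ltks": "deep learning", "title_tks": "intro ai",
                    "important_kwd": "neural"}))
# ['deep', 'learning', 'intro', 'ai', 'neural']
```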
@@ -368,7 +378,7 @@ class Dealer:

     def sql_retrieval(self, sql, fetch_size=128, format="json"):
         from api.settings import chat_logger
-        sql = re.sub(r"[ ]+", " ", sql)
+        sql = re.sub(r"[ `]+", " ", sql)
         sql = sql.replace("%", "")
         es_logger.info(f"Get es sql: {sql}")
         replaces = []
@@ -121,6 +121,7 @@ def dispatch():
                    tsks.append(new_task())

            bulk_insert_into_db(Task, tsks, True)
            print("TSK:", len(tsks))
            set_dispatching(r["id"])
        except Exception as e:
            cron_logger.exception(e)
@@ -19,6 +19,7 @@ import logging
import os
import hashlib
import copy
import random
import re
import sys
import time
@@ -92,6 +93,7 @@ def set_progress(task_id, from_page=0, to_page=-1,

def collect(comm, mod, tm):
    tasks = TaskService.get_tasks(tm, mod, comm)
    #print(tasks)
    if len(tasks) == 0:
        time.sleep(1)
        return pd.DataFrame()
@@ -243,6 +245,7 @@ def main(comm, mod):
    tmf = open(tm_fnm, "a+")
    for _, r in rows.iterrows():
        callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
        #callback(random.random()/10., "Task has been received.")
        try:
            embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
        except Exception as e:
@@ -300,9 +303,8 @@ if __name__ == "__main__":
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)

-    from mpi4py import MPI
-
-    comm = MPI.COMM_WORLD
+    #from mpi4py import MPI
+    #comm = MPI.COMM_WORLD
     while True:
         main(int(sys.argv[2]), int(sys.argv[1]))
         close_connection()