mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
refine code (#595)
### What problem does this PR solve? ### Type of change - [x] Refactoring
This commit is contained in:
@ -16,7 +16,7 @@ from io import BytesIO
|
||||
from nltk import word_tokenize
|
||||
from openpyxl import load_workbook
|
||||
from rag.nlp import is_english, random_choices, find_codec
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
|
||||
aprefix = "Answer: " if eng else "回答:"
|
||||
d["content_with_weight"] = "\t".join(
|
||||
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
||||
d["content_ltks"] = huqie.qie(q)
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(q)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
return d
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
res = []
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
Reference in New Issue
Block a user