refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
KevinHuSh
2024-04-28 19:13:33 +08:00
committed by GitHub
parent aee8b48d2f
commit 8c07992b6c
24 changed files with 538 additions and 116 deletions

View File

@ -2,7 +2,7 @@ import copy
import re
from api.db import ParserType
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
from rag.utils import num_tokens_from_string
@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
doc = {
"docnm_kwd": filename
}
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english