refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
2026-02-03 00:55:10 +08:00 · 2024-04-28 19:13:33 +08:00
parent aee8b48d2f
commit 8c07992b6c
24 changed files with 538 additions and 116 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -2,7 +2,7 @@ import copy
 import re

 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string

@@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    doc = {
        "docnm_kwd": filename
    }
-    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english