Mirror of https://github.com/infiniflow/ragflow.git
Refactor function name (#11210)
### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
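
For orientation, a minimal before/after sketch of a call site affected by the renames; it is not part of the commit, the import path is an assumption, and the helpers' behavior is unchanged.

```python
# Hypothetical call site, only to illustrate the renames; the import path is an assumption.
from rag.nlp.query import FulltextQueryer

escaped = FulltextQueryer.sub_special_char("a*b(c):d")  # was: FulltextQueryer.subSpecialChar(...)
print(escaped)                                          # backslash-escapes query metacharacters
chinese = FulltextQueryer.is_chinese("静夜思 床前明月光")  # was: FulltextQueryer.isChinese(...)
print(chinese)
```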
@@ -38,11 +38,11 @@ class FulltextQueryer:
         ]
 
     @staticmethod
-    def subSpecialChar(line):
+    def sub_special_char(line):
         return re.sub(r"([:\{\}/\[\]\-\*\"\(\)\|\+~\^])", r"\\\1", line).strip()
 
     @staticmethod
-    def isChinese(line):
+    def is_chinese(line):
         arr = re.split(r"[ \t]+", line)
         if len(arr) <= 3:
             return True
@@ -92,7 +92,7 @@ class FulltextQueryer:
         otxt = txt
         txt = FulltextQueryer.rmWWW(txt)
 
-        if not self.isChinese(txt):
+        if not self.is_chinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
             tks = rag_tokenizer.tokenize(txt).split()
             keywords = [t for t in tks if t]
@@ -163,7 +163,7 @@ class FulltextQueryer:
                     )
                     for m in sm
                 ]
-                sm = [FulltextQueryer.subSpecialChar(m) for m in sm if len(m) > 1]
+                sm = [FulltextQueryer.sub_special_char(m) for m in sm if len(m) > 1]
                 sm = [m for m in sm if len(m) > 1]
 
                 if len(keywords) < 32:
@@ -171,7 +171,7 @@ class FulltextQueryer:
                     keywords.extend(sm)
 
                 tk_syns = self.syn.lookup(tk)
-                tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
+                tk_syns = [FulltextQueryer.sub_special_char(s) for s in tk_syns]
                 if len(keywords) < 32:
                     keywords.extend([s for s in tk_syns if s])
                 tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
@@ -180,7 +180,7 @@ class FulltextQueryer:
                 if len(keywords) >= 32:
                     break
 
-                tk = FulltextQueryer.subSpecialChar(tk)
+                tk = FulltextQueryer.sub_special_char(tk)
                 if tk.find(" ") > 0:
                     tk = '"%s"' % tk
                 if tk_syns:
@@ -198,7 +198,7 @@ class FulltextQueryer:
             syns = " OR ".join(
                 [
                     '"%s"'
-                    % rag_tokenizer.tokenize(FulltextQueryer.subSpecialChar(s))
+                    % rag_tokenizer.tokenize(FulltextQueryer.sub_special_char(s))
                     for s in syns
                 ]
             )
@@ -217,17 +217,17 @@ class FulltextQueryer:
         return None, keywords
 
     def hybrid_similarity(self, avec, bvecs, atks, btkss, tkweight=0.3, vtweight=0.7):
-        from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
+        from sklearn.metrics.pairwise import cosine_similarity
         import numpy as np
 
-        sims = CosineSimilarity([avec], bvecs)
+        sims = cosine_similarity([avec], bvecs)
         tksim = self.token_similarity(atks, btkss)
         if np.sum(sims[0]) == 0:
             return np.array(tksim), tksim, sims[0]
         return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
 
     def token_similarity(self, atks, btkss):
-        def toDict(tks):
+        def to_dict(tks):
             if isinstance(tks, str):
                 tks = tks.split()
             d = defaultdict(int)
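
Beyond the import rename, this hunk shows how hybrid_similarity blends dense and token similarity. Below is a minimal standalone sketch of that weighting with toy inputs; hybrid_score and the token scores are illustrative, not the class's API.

```python
# Standalone sketch of the weighting shown above; names and inputs are illustrative.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_score(avec, bvecs, tksims, tkweight=0.3, vtweight=0.7):
    sims = cosine_similarity([avec], bvecs)[0]  # dense similarity of the query vector to each candidate
    if np.sum(sims) == 0:                       # no vector signal: fall back to token similarity alone
        return np.array(tksims)
    return vtweight * sims + tkweight * np.array(tksims)

print(hybrid_score([0.1, 0.9], [[0.1, 0.9], [0.9, 0.1]], [1.0, 0.2]))
```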
@@ -236,8 +236,8 @@ class FulltextQueryer:
                 d[t] += c
             return d
 
-        atks = toDict(atks)
-        btkss = [toDict(tks) for tks in btkss]
+        atks = to_dict(atks)
+        btkss = [to_dict(tks) for tks in btkss]
         return [self.similarity(atks, btks) for btks in btkss]
 
     def similarity(self, qtwt, dtwt):
@@ -262,10 +262,10 @@ class FulltextQueryer:
         keywords = [f'"{k.strip()}"' for k in keywords]
         for tk, w in sorted(tks_w, key=lambda x: x[1] * -1)[:keywords_topn]:
             tk_syns = self.syn.lookup(tk)
-            tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
+            tk_syns = [FulltextQueryer.sub_special_char(s) for s in tk_syns]
             tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
             tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]
-            tk = FulltextQueryer.subSpecialChar(tk)
+            tk = FulltextQueryer.sub_special_char(tk)
             if tk.find(" ") > 0:
                 tk = '"%s"' % tk
             if tk_syns:
@@ -35,7 +35,7 @@ class RagTokenizer:
     def rkey_(self, line):
         return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
 
-    def loadDict_(self, fnm):
+    def _load_dict(self, fnm):
         logging.info(f"[HUQIE]:Build trie from {fnm}")
         try:
             of = open(fnm, "r", encoding='utf-8')
@@ -85,18 +85,18 @@ class RagTokenizer:
             self.trie_ = datrie.Trie(string.printable)
 
         # load data from dict file and save to trie file
-        self.loadDict_(self.DIR_ + ".txt")
+        self._load_dict(self.DIR_ + ".txt")
 
-    def loadUserDict(self, fnm):
+    def load_user_dict(self, fnm):
         try:
             self.trie_ = datrie.Trie.load(fnm + ".trie")
             return
         except Exception:
             self.trie_ = datrie.Trie(string.printable)
-        self.loadDict_(fnm)
+        self._load_dict(fnm)
 
-    def addUserDict(self, fnm):
-        self.loadDict_(fnm)
+    def add_user_dict(self, fnm):
+        self._load_dict(fnm)
 
     def _strQ2B(self, ustring):
         """Convert full-width characters to half-width characters"""
@@ -221,7 +221,7 @@ class RagTokenizer:
         logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
         return tks, B / len(tks) + L + F
 
-    def sortTks_(self, tkslist):
+    def _sort_tokens(self, tkslist):
         res = []
         for tfts in tkslist:
             tks, s = self.score_(tfts)
@@ -246,7 +246,7 @@ class RagTokenizer:
 
         return " ".join(res)
 
-    def maxForward_(self, line):
+    def _max_forward(self, line):
         res = []
         s = 0
         while s < len(line):
@@ -270,7 +270,7 @@ class RagTokenizer:
 
         return self.score_(res)
 
-    def maxBackward_(self, line):
+    def _max_backward(self, line):
         res = []
         s = len(line) - 1
         while s >= 0:
@@ -336,8 +336,8 @@ class RagTokenizer:
                 continue
 
             # use maxforward for the first time
-            tks, s = self.maxForward_(L)
-            tks1, s1 = self.maxBackward_(L)
+            tks, s = self._max_forward(L)
+            tks1, s1 = self._max_backward(L)
             if self.DEBUG:
                 logging.debug("[FW] {} {}".format(tks, s))
                 logging.debug("[BW] {} {}".format(tks1, s1))
@@ -369,7 +369,7 @@ class RagTokenizer:
             # backward tokens from_i to i are different from forward tokens from _j to j.
             tkslist = []
             self.dfs_("".join(tks[_j:j]), 0, [], tkslist)
-            res.append(" ".join(self.sortTks_(tkslist)[0][0]))
+            res.append(" ".join(self._sort_tokens(tkslist)[0][0]))
 
             same = 1
             while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
@@ -385,7 +385,7 @@ class RagTokenizer:
         assert "".join(tks1[_i:]) == "".join(tks[_j:])
         tkslist = []
         self.dfs_("".join(tks[_j:]), 0, [], tkslist)
-        res.append(" ".join(self.sortTks_(tkslist)[0][0]))
+        res.append(" ".join(self._sort_tokens(tkslist)[0][0]))
 
         res = " ".join(res)
         logging.debug("[TKS] {}".format(self.merge_(res)))
@@ -413,7 +413,7 @@ class RagTokenizer:
             if len(tkslist) < 2:
                 res.append(tk)
                 continue
-            stk = self.sortTks_(tkslist)[1][0]
+            stk = self._sort_tokens(tkslist)[1][0]
             if len(stk) == len(tk):
                 stk = tk
             else:
@@ -447,14 +447,13 @@ def is_number(s):
 
 
 def is_alphabet(s):
-    if (s >= u'\u0041' and s <= u'\u005a') or (
-            s >= u'\u0061' and s <= u'\u007a'):
+    if (u'\u0041' <= s <= u'\u005a') or (u'\u0061' <= s <= u'\u007a'):
         return True
     else:
         return False
 
 
-def naiveQie(txt):
+def naive_qie(txt):
     tks = []
     for t in txt.split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]
@@ -469,14 +468,14 @@ tokenize = tokenizer.tokenize
 fine_grained_tokenize = tokenizer.fine_grained_tokenize
 tag = tokenizer.tag
 freq = tokenizer.freq
-loadUserDict = tokenizer.loadUserDict
-addUserDict = tokenizer.addUserDict
+load_user_dict = tokenizer.load_user_dict
+add_user_dict = tokenizer.add_user_dict
 tradi2simp = tokenizer._tradi2simp
 strQ2B = tokenizer._strQ2B
 
 if __name__ == '__main__':
     tknzr = RagTokenizer(debug=True)
-    # huqie.addUserDict("/tmp/tmp.new.tks.dict")
+    # huqie.add_user_dict("/tmp/tmp.new.tks.dict")
     tks = tknzr.tokenize(
         "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
     logging.info(tknzr.fine_grained_tokenize(tks))
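
A hedged usage sketch of the renamed module-level aliases; the import path and dictionary files are assumptions, not part of this commit.

```python
# Hypothetical usage of the renamed aliases; the import path and file paths are assumptions.
from rag.nlp import rag_tokenizer

rag_tokenizer.load_user_dict("/tmp/my_terms.txt")    # was: loadUserDict — load or (re)build a user trie
rag_tokenizer.add_user_dict("/tmp/extra_terms.txt")  # was: addUserDict — merge more entries into the trie
print(rag_tokenizer.tokenize("检索增强生成 RAGFlow"))
```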
@@ -506,7 +505,7 @@ if __name__ == '__main__':
     if len(sys.argv) < 2:
         sys.exit()
     tknzr.DEBUG = False
-    tknzr.loadUserDict(sys.argv[1])
+    tknzr.load_user_dict(sys.argv[1])
     of = open(sys.argv[2], "r")
     while True:
         line = of.readline()
@@ -102,7 +102,7 @@ class Dealer:
                 orderBy.asc("top_int")
                 orderBy.desc("create_timestamp_flt")
             res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
-            total = self.dataStore.getTotal(res)
+            total = self.dataStore.get_total(res)
             logging.debug("Dealer.search TOTAL: {}".format(total))
         else:
             highlightFields = ["content_ltks", "title_tks"]
@@ -115,7 +115,7 @@ class Dealer:
                 matchExprs = [matchText]
                 res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                             idx_names, kb_ids, rank_feature=rank_feature)
-                total = self.dataStore.getTotal(res)
+                total = self.dataStore.get_total(res)
                 logging.debug("Dealer.search TOTAL: {}".format(total))
             else:
                 matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
@@ -127,20 +127,20 @@ class Dealer:
 
                 res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                             idx_names, kb_ids, rank_feature=rank_feature)
-                total = self.dataStore.getTotal(res)
+                total = self.dataStore.get_total(res)
                 logging.debug("Dealer.search TOTAL: {}".format(total))
 
                 # If result is empty, try again with lower min_match
                 if total == 0:
                     if filters.get("doc_id"):
                         res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
-                        total = self.dataStore.getTotal(res)
+                        total = self.dataStore.get_total(res)
                     else:
                         matchText, _ = self.qryr.question(qst, min_match=0.1)
                         matchDense.extra_options["similarity"] = 0.17
                         res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr],
                                                     orderBy, offset, limit, idx_names, kb_ids, rank_feature=rank_feature)
-                        total = self.dataStore.getTotal(res)
+                        total = self.dataStore.get_total(res)
                     logging.debug("Dealer.search 2 TOTAL: {}".format(total))
 
         for k in keywords:
@@ -153,17 +153,17 @@ class Dealer:
                     kwds.add(kk)
 
         logging.debug(f"TOTAL: {total}")
-        ids = self.dataStore.getChunkIds(res)
+        ids = self.dataStore.get_chunk_ids(res)
         keywords = list(kwds)
-        highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
-        aggs = self.dataStore.getAggregation(res, "docnm_kwd")
+        highlight = self.dataStore.get_highlight(res, keywords, "content_with_weight")
+        aggs = self.dataStore.get_aggregation(res, "docnm_kwd")
         return self.SearchResult(
             total=total,
             ids=ids,
             query_vector=q_vec,
             aggregation=aggs,
             highlight=highlight,
-            field=self.dataStore.getFields(res, src + ["_score"]),
+            field=self.dataStore.get_fields(res, src + ["_score"]),
             keywords=keywords
         )
 
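
The search-side edits are the same mechanical rename applied to the document-store accessors (getTotal → get_total, getChunkIds → get_chunk_ids, and so on). Below is a hypothetical stub showing the renamed surface that Dealer.search now calls; the class and return shapes are illustrative, not the project's real store implementations.

```python
# Hypothetical in-memory stub of the renamed accessor surface used above;
# real implementations back these methods with an actual document store.
class DummyDocStore:
    def get_total(self, res):
        return len(res)

    def get_chunk_ids(self, res):
        return [r["id"] for r in res]

    def get_highlight(self, res, keywords, field):
        return {r["id"]: r.get(field, "") for r in res}

    def get_aggregation(self, res, field):
        return [(r.get(field), 1) for r in res]

    def get_fields(self, res, fields):
        return {r["id"]: {f: r.get(f) for f in fields} for r in res}
```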
@@ -488,7 +488,7 @@ class Dealer:
         for p in range(offset, max_count, bs):
             es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id),
                                            kb_ids)
-            dict_chunks = self.dataStore.getFields(es_res, fields)
+            dict_chunks = self.dataStore.get_fields(es_res, fields)
             for id, doc in dict_chunks.items():
                 doc["id"] = id
             if dict_chunks:
@@ -501,11 +501,11 @@ class Dealer:
         if not self.dataStore.indexExist(index_name(tenant_id), kb_ids[0]):
             return []
         res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
-        return self.dataStore.getAggregation(res, "tag_kwd")
+        return self.dataStore.get_aggregation(res, "tag_kwd")
 
     def all_tags_in_portion(self, tenant_id: str, kb_ids: list[str], S=1000):
         res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
-        res = self.dataStore.getAggregation(res, "tag_kwd")
+        res = self.dataStore.get_aggregation(res, "tag_kwd")
         total = np.sum([c for _, c in res])
         return {t: (c + 1) / (total + S) for t, c in res}
 
@@ -513,7 +513,7 @@ class Dealer:
         idx_nm = index_name(tenant_id)
         match_txt = self.qryr.paragraph(doc["title_tks"] + " " + doc["content_ltks"], doc.get("important_kwd", []), keywords_topn)
         res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
-        aggs = self.dataStore.getAggregation(res, "tag_kwd")
+        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
         if not aggs:
             return False
         cnt = np.sum([c for _, c in aggs])
@@ -529,7 +529,7 @@ class Dealer:
         idx_nms = [index_name(tid) for tid in tenant_ids]
         match_txt, _ = self.qryr.question(question, min_match=0.0)
         res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nms, kb_ids, ["tag_kwd"])
-        aggs = self.dataStore.getAggregation(res, "tag_kwd")
+        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
         if not aggs:
             return {}
         cnt = np.sum([c for _, c in aggs])
@@ -552,7 +552,7 @@ class Dealer:
         es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [], OrderByExpr(), 0, 128, idx_nms,
                                        kb_ids)
         toc = []
-        dict_chunks = self.dataStore.getFields(es_res, ["content_with_weight"])
+        dict_chunks = self.dataStore.get_fields(es_res, ["content_with_weight"])
         for _, doc in dict_chunks.items():
             try:
                 toc.extend(json.loads(doc["content_with_weight"]))
@@ -113,20 +113,20 @@ class Dealer:
             res.append(tk)
         return res
 
-    def tokenMerge(self, tks):
-        def oneTerm(t): return len(t) == 1 or re.match(r"[0-9a-z]{1,2}$", t)
+    def token_merge(self, tks):
+        def one_term(t): return len(t) == 1 or re.match(r"[0-9a-z]{1,2}$", t)
 
         res, i = [], 0
         while i < len(tks):
             j = i
-            if i == 0 and oneTerm(tks[i]) and len(
+            if i == 0 and one_term(tks[i]) and len(
                     tks) > 1 and (len(tks[i + 1]) > 1 and not re.match(r"[0-9a-zA-Z]", tks[i + 1])):  # 多 工位
                 res.append(" ".join(tks[0:2]))
                 i = 2
                 continue
 
             while j < len(
-                    tks) and tks[j] and tks[j] not in self.stop_words and oneTerm(tks[j]):
+                    tks) and tks[j] and tks[j] not in self.stop_words and one_term(tks[j]):
                 j += 1
             if j - i > 1:
                 if j - i < 5:
@@ -232,7 +232,7 @@ class Dealer:
             tw = list(zip(tks, wts))
         else:
             for tk in tks:
-                tt = self.tokenMerge(self.pretoken(tk, True))
+                tt = self.token_merge(self.pretoken(tk, True))
                 idf1 = np.array([idf(freq(t), 10000000) for t in tt])
                 idf2 = np.array([idf(df(t), 1000000000) for t in tt])
                 wts = (0.3 * idf1 + 0.7 * idf2) * \
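
The last hunk touches the term-weighting path, which blends two IDF estimates as 0.3·idf1 + 0.7·idf2. Below is a small self-contained sketch of that blend; the idf formula and the counts are stand-ins for the Dealer's own helpers and corpus statistics.

```python
# Sketch of the IDF blend above; idf() and the counts are stand-ins, not the project's helpers.
import math
import numpy as np

def idf(occurrences, total):
    # a simple smoothed inverse-document-frequency variant, used only for illustration
    return math.log10(total / (occurrences + 1))

term_freqs = [120, 5, 0]   # hypothetical counts from a term-frequency statistic
doc_freqs = [900, 40, 1]   # hypothetical counts from a document-frequency statistic
idf1 = np.array([idf(f, 10000000) for f in term_freqs])
idf2 = np.array([idf(d, 1000000000) for d in doc_freqs])
wts = 0.3 * idf1 + 0.7 * idf2  # blend the two estimates, as in the hunk above
print(wts)
```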