Mirror of https://github.com/infiniflow/ragflow.git
Fix IDE warnings (#12281)
### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
@@ -273,7 +273,7 @@ def tokenize(d, txt, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


-def split_with_pattern(d, pattern:str, content:str, eng) -> list:
+def split_with_pattern(d, pattern: str, content: str, eng) -> list:
     docs = []
     txts = [txt for txt in re.split(r"(%s)" % pattern, content, flags=re.DOTALL)]
     for j in range(0, len(txts), 2):

@@ -281,7 +281,7 @@ def split_with_pattern(d, pattern:str, content:str, eng) -> list:
         if not txt:
             continue
         if j + 1 < len(txts):
-            txt += txts[j+1]
+            txt += txts[j + 1]
         dd = copy.deepcopy(d)
         tokenize(dd, txt, eng)
         docs.append(dd)

@@ -304,7 +304,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
        except NotImplementedError:
            pass
    else:
-        add_positions(d, [[ii]*5])
+        add_positions(d, [[ii] * 5])

        if child_delimiters_pattern:
            d["mom_with_weight"] = ck

@@ -325,7 +325,7 @@ def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_patte
        logging.debug("-- {}".format(ck))
        d = copy.deepcopy(doc)
        d["image"] = image
-        add_positions(d, [[ii]*5])
+        add_positions(d, [[ii] * 5])
        if child_delimiters_pattern:
            d["mom_with_weight"] = ck
            res.extend(split_with_pattern(d, child_delimiters_pattern, ck, eng))

@@ -658,7 +658,8 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
            if "content_ltks" in ck:
                ck["content_ltks"] = rag_tokenizer.tokenize(combined)
            if "content_sm_ltks" in ck:
-                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
+                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
+                    ck.get("content_ltks", rag_tokenizer.tokenize(combined)))

    if positioned_indices:
        chunks[:] = [chunks[i] for i in ordered_indices]

@@ -764,8 +765,8 @@ def not_title(txt):
        return True
    return re.search(r"[,;,。;!!]", txt)

-def tree_merge(bull, sections, depth):
+
+def tree_merge(bull, sections, depth):
    if not sections or bull < 0:
        return sections
    if isinstance(sections[0], type("")):

@@ -777,16 +778,17 @@ def tree_merge(bull, sections, depth):

    def get_level(bull, section):
        text, layout = section
-        text = re.sub(r"\u3000", " ", text).strip()
+        text = re.sub(r"\u3000", " ", text).strip()

        for i, title in enumerate(BULLET_PATTERN[bull]):
            if re.match(title, text.strip()):
-                return i+1, text
+                return i + 1, text
        else:
            if re.search(r"(title|head)", layout) and not not_title(text):
-                return len(BULLET_PATTERN[bull])+1, text
+                return len(BULLET_PATTERN[bull]) + 1, text
            else:
-                return len(BULLET_PATTERN[bull])+2, text
+                return len(BULLET_PATTERN[bull]) + 2, text

    level_set = set()
    lines = []
    for section in sections:
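
A note on the `get_level` helper reformatted above: it maps each `(text, layout)` section to a numeric outline level — the index of the first matching bullet regex plus one, `len(BULLET_PATTERN[bull]) + 1` for a layout-detected title, and `len(BULLET_PATTERN[bull]) + 2` for body text. A minimal standalone sketch of that mapping, using a hypothetical three-pattern stand-in for `BULLET_PATTERN[bull]`:

```python
import re

# Hypothetical, trimmed-down stand-in for rag.nlp.BULLET_PATTERN[bull]:
# patterns for "第一章", "第一节", and "1." style bullets.
BULLETS = [r"第[零一二三四五六七八九十百0-9]+章", r"第[零一二三四五六七八九十百0-9]+节", r"[0-9]+\."]

def get_level(section):
    """Return (level, text): bullet index + 1 on a bullet match,
    len(BULLETS) + 1 for a layout-detected title, len(BULLETS) + 2 for body text."""
    text, layout = section
    text = re.sub(r"\u3000", " ", text).strip()
    for i, title in enumerate(BULLETS):
        if re.match(title, text):
            return i + 1, text
    if re.search(r"(title|head)", layout):
        return len(BULLETS) + 1, text
    return len(BULLETS) + 2, text

print(get_level(("第一章 总则", "text")))   # (1, '第一章 总则')
print(get_level(("Overview", "title")))      # (4, 'Overview')
print(get_level(("正文内容……", "text")))     # (5, '正文内容……')
```

Lower numbers rank as higher-level headings, which is what `tree_merge` relies on when nesting sections.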
@@ -812,8 +814,8 @@ def tree_merge(bull, sections, depth):

    return [element for element in root.get_tree() if element]

-def hierarchical_merge(bull, sections, depth):
+
+def hierarchical_merge(bull, sections, depth):
    if not sections or bull < 0:
        return []
    if isinstance(sections[0], type("")):

@@ -922,10 +924,10 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
-        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
+        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
-                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
+                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
            if t.find(pos) < 0:
                t += pos
            cks.append(t)

@@ -957,7 +959,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
        return cks

    for sec, pos in sections:
-        add_chunk("\n"+sec, pos)
+        add_chunk("\n" + sec, pos)

    return cks

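
The expressions reformatted in `naive_merge` (and repeated in `naive_merge_with_images` below) implement overlapped chunking: the current chunk is closed once it exceeds `chunk_token_num * (100 - overlapped_percent) / 100` tokens, and the next chunk is seeded with the trailing `overlapped_percent` of the previous chunk's characters. A minimal sketch of that policy, with whitespace word count standing in for `num_tokens_from_string` and position tags omitted:

```python
def naive_merge_sketch(sections, chunk_token_num=128, overlapped_percent=10):
    """Greedy chunk merge with tail overlap; token count approximated by word count."""
    cks, tk_nums = [""], [0]

    def add_chunk(t):
        tnum = len(t.split())  # stand-in for num_tokens_from_string(t)
        # Open a new chunk once the current one reaches the pre-overlap budget.
        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
            overlapped = cks[-1]
            # Seed the new chunk with the trailing overlapped_percent of the last one.
            t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
            cks.append(t)
            tk_nums.append(tnum)
        else:
            cks[-1] += t
            tk_nums[-1] += tnum

    for sec in sections:
        add_chunk("\n" + sec)
    return [c for c in cks if c]
```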
@@ -978,10 +980,10 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
-        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
+        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
-                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
+                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
            if t.find(pos) < 0:
                t += pos
            cks.append(t)

@@ -1025,9 +1027,9 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
        if isinstance(text, tuple):
            text_str = text[0]
            text_pos = text[1] if len(text) > 1 else ""
-            add_chunk("\n"+text_str, image, text_pos)
+            add_chunk("\n" + text_str, image, text_pos)
        else:
-            add_chunk("\n"+text, image)
+            add_chunk("\n" + text, image)

    return cks, result_images

@@ -1042,7 +1044,7 @@ def docx_question_level(p, bull=-1):
    for j, title in enumerate(BULLET_PATTERN[bull]):
        if re.match(title, txt):
            return j + 1, txt
-    return len(BULLET_PATTERN[bull])+1, txt
+    return len(BULLET_PATTERN[bull]) + 1, txt


def concat_img(img1, img2):

@@ -1211,7 +1213,7 @@ class Node:
            child = node.get_children()

            if level == 0 and texts:
-                tree_list.append("\n".join(titles+texts))
+                tree_list.append("\n".join(titles + texts))

            # Titles within configured depth are accumulated into the current path
            if 1 <= level <= self.depth:

@@ -205,11 +205,11 @@ class FulltextQueryer(QueryBase):
        s = 1e-9
        for k, v in qtwt.items():
            if k in dtwt:
-                s += v #* dtwt[k]
+                s += v  # * dtwt[k]
        q = 1e-9
        for k, v in qtwt.items():
-            q += v #* v
-        return s/q #math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))
+            q += v  # * v
+        return s / q  # math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))

    def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
        if isinstance(content_tks, str):
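
The routine touched in the last hunk computes a token-weight overlap similarity: `s` accumulates the weights of query terms that also appear in the document's term-weight map, `q` accumulates all query-term weights, and `s / q` is the fraction of query weight the document covers. A self-contained sketch under that reading (`qtwt`/`dtwt` here are hypothetical term-to-weight dicts):

```python
def token_similarity_sketch(qtwt: dict[str, float], dtwt: dict[str, float]) -> float:
    """Fraction of total query term weight covered by the document (0..1)."""
    s = 1e-9  # weight of query terms present in the document
    for k, v in qtwt.items():
        if k in dtwt:
            s += v
    q = 1e-9  # total query term weight
    for v in qtwt.values():
        q += v
    return s / q

# A query whose heaviest term is missing from the document scores low:
print(token_similarity_sketch({"rag": 0.8, "flow": 0.2}, {"flow": 1.0}))  # ~0.2
```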
@@ -232,4 +232,5 @@ class FulltextQueryer(QueryBase):
            keywords.append(f"{tk}^{w}")

        return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                             {"minimum_should_match": min(3, len(keywords) / 10), "original_query": " ".join(origin_keywords)})
+                             {"minimum_should_match": min(3, len(keywords) / 10),
+                              "original_query": " ".join(origin_keywords)})
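
For the `minimum_should_match` expression above: `min(3, len(keywords) / 10)` grows with the keyword count but is capped at 3, so short queries require little agreement while long ones require a few matching terms. A quick check of the arithmetic:

```python
for n in (5, 20, 40):
    print(n, min(3, n / 10))  # 5 -> 0.5, 20 -> 2.0, 40 -> 3
```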
@@ -66,7 +66,8 @@ class Dealer:
            if key in req and req[key] is not None:
                condition[field] = req[key]
        # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
-        for key in ["knowledge_graph_kwd", "available_int", "entity_kwd", "from_entity_kwd", "to_entity_kwd", "removed_kwd"]:
+        for key in ["knowledge_graph_kwd", "available_int", "entity_kwd", "from_entity_kwd", "to_entity_kwd",
+                    "removed_kwd"]:
            if key in req and req[key] is not None:
                condition[key] = req[key]
        return condition

@@ -141,7 +142,8 @@ class Dealer:
            matchText, _ = self.qryr.question(qst, min_match=0.1)
            matchDense.extra_options["similarity"] = 0.17
            res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr],
-                                        orderBy, offset, limit, idx_names, kb_ids, rank_feature=rank_feature)
+                                        orderBy, offset, limit, idx_names, kb_ids,
+                                        rank_feature=rank_feature)
            total = self.dataStore.get_total(res)
            logging.debug("Dealer.search 2 TOTAL: {}".format(total))

@@ -218,8 +220,9 @@ class Dealer:
            ans_v, _ = embd_mdl.encode(pieces_)
            for i in range(len(chunk_v)):
                if len(ans_v[0]) != len(chunk_v[i]):
-                    chunk_v[i] = [0.0]*len(ans_v[0])
-                    logging.warning("The dimension of query and chunk do not match: {} vs. {}".format(len(ans_v[0]), len(chunk_v[i])))
+                    chunk_v[i] = [0.0] * len(ans_v[0])
+                    logging.warning(
+                        "The dimension of query and chunk do not match: {} vs. {}".format(len(ans_v[0]), len(chunk_v[i])))

            assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
                len(ans_v[0]), len(chunk_v[0]))

@@ -273,7 +276,7 @@ class Dealer:
        if not query_rfea:
            return np.array([0 for _ in range(len(search_res.ids))]) + pageranks

-        q_denor = np.sqrt(np.sum([s*s for t,s in query_rfea.items() if t != PAGERANK_FLD]))
+        q_denor = np.sqrt(np.sum([s * s for t, s in query_rfea.items() if t != PAGERANK_FLD]))
        for i in search_res.ids:
            nor, denor = 0, 0
            if not search_res.field[i].get(TAG_FLD):

@@ -286,8 +289,8 @@ class Dealer:
            if denor == 0:
                rank_fea.append(0)
            else:
-                rank_fea.append(nor/np.sqrt(denor)/q_denor)
-        return np.array(rank_fea)*10. + pageranks
+                rank_fea.append(nor / np.sqrt(denor) / q_denor)
+        return np.array(rank_fea) * 10. + pageranks

    def rerank(self, sres, query, tkweight=0.3,
               vtweight=0.7, cfield="content_ltks",
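
The `q_denor` / `nor` / `denor` bookkeeping in the last two hunks is a cosine similarity between the query's rank-feature vector and each chunk's tag vector, scaled by 10 and added to pagerank. A standalone sketch of that computation (the tag dicts are hypothetical):

```python
import numpy as np

def rank_feature_scores(query_rfea: dict[str, float], chunk_tags: list[dict[str, float]],
                        pageranks: np.ndarray) -> np.ndarray:
    """cosine(query rank features, chunk tags) * 10 + pagerank, per chunk."""
    q_denor = np.sqrt(np.sum([s * s for s in query_rfea.values()]))
    scores = []
    for tags in chunk_tags:
        nor = sum(query_rfea.get(t, 0) * w for t, w in tags.items())
        denor = sum(w * w for w in tags.values())
        scores.append(0 if denor == 0 else nor / np.sqrt(denor) / q_denor)
    return np.array(scores) * 10. + pageranks

# A chunk tagged like the query gets the full +10 boost; an untagged chunk keeps only pagerank:
print(rank_feature_scores({"ml": 1.0}, [{"ml": 2.0}, {}], np.array([0.1, 0.1])))  # [10.1, 0.1]
```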
@@ -358,21 +361,21 @@ class Dealer:
                                        rag_tokenizer.tokenize(inst).split())

    def retrieval(
-        self,
-        question,
-        embd_mdl,
-        tenant_ids,
-        kb_ids,
-        page,
-        page_size,
-        similarity_threshold=0.2,
-        vector_similarity_weight=0.3,
-        top=1024,
-        doc_ids=None,
-        aggs=True,
-        rerank_mdl=None,
-        highlight=False,
-        rank_feature: dict | None = {PAGERANK_FLD: 10},
+            self,
+            question,
+            embd_mdl,
+            tenant_ids,
+            kb_ids,
+            page,
+            page_size,
+            similarity_threshold=0.2,
+            vector_similarity_weight=0.3,
+            top=1024,
+            doc_ids=None,
+            aggs=True,
+            rerank_mdl=None,
+            highlight=False,
+            rank_feature: dict | None = {PAGERANK_FLD: 10},
    ):
        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
        if not question:

@@ -395,7 +398,8 @@ class Dealer:
        if isinstance(tenant_ids, str):
            tenant_ids = tenant_ids.split(",")

-        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight, rank_feature=rank_feature)
+        sres = self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight,
+                           rank_feature=rank_feature)

        if rerank_mdl and sres.total > 0:
            sim, tsim, vsim = self.rerank_by_model(

@@ -558,13 +562,14 @@ class Dealer:

    def tag_content(self, tenant_id: str, kb_ids: list[str], doc, all_tags, topn_tags=3, keywords_topn=30, S=1000):
        idx_nm = index_name(tenant_id)
-        match_txt = self.qryr.paragraph(doc["title_tks"] + " " + doc["content_ltks"], doc.get("important_kwd", []), keywords_topn)
+        match_txt = self.qryr.paragraph(doc["title_tks"] + " " + doc["content_ltks"], doc.get("important_kwd", []),
+                                        keywords_topn)
        res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
        if not aggs:
            return False
        cnt = np.sum([c for _, c in aggs])
-        tag_fea = sorted([(a, round(0.1*(c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
+        tag_fea = sorted([(a, round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
                         key=lambda x: x[1] * -1)[:topn_tags]
        doc[TAG_FLD] = {a.replace(".", "_"): c for a, c in tag_fea if c > 0}
        return True
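
The `tag_fea` expression reformatted here scores each candidate tag by its smoothed share of the matched chunks relative to its global prior (`all_tags`), keeping the `topn_tags` strongest. A sketch of just that scoring step, with made-up aggregation counts:

```python
def top_tags_sketch(aggs, all_tags, topn_tags=3, S=1000):
    """aggs: [(tag, count_in_matches)]; all_tags: tag -> global frequency prior."""
    cnt = sum(c for _, c in aggs)
    tag_fea = sorted(
        [(a, round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
        key=lambda x: x[1] * -1)[:topn_tags]
    return {a: c for a, c in tag_fea if c > 0}

# A tag that is rare globally but frequent in the matches wins:
print(top_tags_sketch([("nlp", 50), ("misc", 50)], {"nlp": 0.0001, "misc": 0.1}))  # {'nlp': 46}
```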
@@ -580,11 +585,11 @@ class Dealer:
        if not aggs:
            return {}
        cnt = np.sum([c for _, c in aggs])
-        tag_fea = sorted([(a, round(0.1*(c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
+        tag_fea = sorted([(a, round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
                         key=lambda x: x[1] * -1)[:topn_tags]
        return {a.replace(".", "_"): max(1, c) for a, c in tag_fea}

-    def retrieval_by_toc(self, query:str, chunks:list[dict], tenant_ids:list[str], chat_mdl, topn: int=6):
+    def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
        if not chunks:
            return []
        idx_nms = [index_name(tid) for tid in tenant_ids]

@@ -594,9 +599,10 @@ class Dealer:
                ranks[ck["doc_id"]] = 0
            ranks[ck["doc_id"]] += ck["similarity"]
            doc_id2kb_id[ck["doc_id"]] = ck["kb_id"]
-        doc_id = sorted(ranks.items(), key=lambda x: x[1]*-1.)[0][0]
+        doc_id = sorted(ranks.items(), key=lambda x: x[1] * -1.)[0][0]
        kb_ids = [doc_id2kb_id[doc_id]]
-        es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [], OrderByExpr(), 0, 128, idx_nms,
+        es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [],
+                                       OrderByExpr(), 0, 128, idx_nms,
                                       kb_ids)
        toc = []
        dict_chunks = self.dataStore.get_fields(es_res, ["content_with_weight"])

@@ -608,7 +614,7 @@ class Dealer:
        if not toc:
            return chunks

-        ids = asyncio.run(relevant_chunks_with_toc(query, toc, chat_mdl, topn*2))
+        ids = asyncio.run(relevant_chunks_with_toc(query, toc, chat_mdl, topn * 2))
        if not ids:
            return chunks

@@ -644,9 +650,9 @@ class Dealer:
                    break
            chunks.append(d)

-        return sorted(chunks, key=lambda x:x["similarity"]*-1)[:topn]
+        return sorted(chunks, key=lambda x: x["similarity"] * -1)[:topn]

-    def retrieval_by_children(self, chunks:list[dict], tenant_ids:list[str]):
+    def retrieval_by_children(self, chunks: list[dict], tenant_ids: list[str]):
        if not chunks:
            return []
        idx_nms = [index_name(tid) for tid in tenant_ids]

@@ -692,4 +698,4 @@ class Dealer:
                    break
            chunks.append(d)

-        return sorted(chunks, key=lambda x:x["similarity"]*-1)
+        return sorted(chunks, key=lambda x: x["similarity"] * -1)

@@ -14,129 +14,131 @@
 # limitations under the License.
 #

-m = set(["赵","钱","孙","李",
-         "周","吴","郑","王",
-         "冯","陈","褚","卫",
-         "蒋","沈","韩","杨",
-         "朱","秦","尤","许",
-         "何","吕","施","张",
-         "孔","曹","严","华",
-         "金","魏","陶","姜",
-         "戚","谢","邹","喻",
-         "柏","水","窦","章",
-         "云","苏","潘","葛",
-         "奚","范","彭","郎",
-         "鲁","韦","昌","马",
-         "苗","凤","花","方",
-         "俞","任","袁","柳",
-         "酆","鲍","史","唐",
-         "费","廉","岑","薛",
-         "雷","贺","倪","汤",
-         "滕","殷","罗","毕",
-         "郝","邬","安","常",
-         "乐","于","时","傅",
-         "皮","卞","齐","康",
-         "伍","余","元","卜",
-         "顾","孟","平","黄",
-         "和","穆","萧","尹",
-         "姚","邵","湛","汪",
-         "祁","毛","禹","狄",
-         "米","贝","明","臧",
-         "计","伏","成","戴",
-         "谈","宋","茅","庞",
-         "熊","纪","舒","屈",
-         "项","祝","董","梁",
-         "杜","阮","蓝","闵",
-         "席","季","麻","强",
-         "贾","路","娄","危",
-         "江","童","颜","郭",
-         "梅","盛","林","刁",
-         "钟","徐","邱","骆",
-         "高","夏","蔡","田",
-         "樊","胡","凌","霍",
-         "虞","万","支","柯",
-         "昝","管","卢","莫",
-         "经","房","裘","缪",
-         "干","解","应","宗",
-         "丁","宣","贲","邓",
-         "郁","单","杭","洪",
-         "包","诸","左","石",
-         "崔","吉","钮","龚",
-         "程","嵇","邢","滑",
-         "裴","陆","荣","翁",
-         "荀","羊","於","惠",
-         "甄","曲","家","封",
-         "芮","羿","储","靳",
-         "汲","邴","糜","松",
-         "井","段","富","巫",
-         "乌","焦","巴","弓",
-         "牧","隗","山","谷",
-         "车","侯","宓","蓬",
-         "全","郗","班","仰",
-         "秋","仲","伊","宫",
-         "宁","仇","栾","暴",
-         "甘","钭","厉","戎",
-         "祖","武","符","刘",
-         "景","詹","束","龙",
-         "叶","幸","司","韶",
-         "郜","黎","蓟","薄",
-         "印","宿","白","怀",
-         "蒲","邰","从","鄂",
-         "索","咸","籍","赖",
-         "卓","蔺","屠","蒙",
-         "池","乔","阴","鬱",
-         "胥","能","苍","双",
-         "闻","莘","党","翟",
-         "谭","贡","劳","逄",
-         "姬","申","扶","堵",
-         "冉","宰","郦","雍",
-         "郤","璩","桑","桂",
-         "濮","牛","寿","通",
-         "边","扈","燕","冀",
-         "郏","浦","尚","农",
-         "温","别","庄","晏",
-         "柴","瞿","阎","充",
-         "慕","连","茹","习",
-         "宦","艾","鱼","容",
-         "向","古","易","慎",
-         "戈","廖","庾","终",
-         "暨","居","衡","步",
-         "都","耿","满","弘",
-         "匡","国","文","寇",
-         "广","禄","阙","东",
-         "欧","殳","沃","利",
-         "蔚","越","夔","隆",
-         "师","巩","厍","聂",
-         "晁","勾","敖","融",
-         "冷","訾","辛","阚",
-         "那","简","饶","空",
-         "曾","母","沙","乜",
-         "养","鞠","须","丰",
-         "巢","关","蒯","相",
-         "查","后","荆","红",
-         "游","竺","权","逯",
-         "盖","益","桓","公",
-         "兰","原","乞","西","阿","肖","丑","位","曽","巨","德","代","圆","尉","仵","纳","仝","脱","丘","但","展","迪","付","覃","晗","特","隋","苑","奥","漆","谌","郄","练","扎","邝","渠","信","门","陳","化","原","密","泮","鹿","赫",
-         "万俟","司马","上官","欧阳",
-         "夏侯","诸葛","闻人","东方",
-         "赫连","皇甫","尉迟","公羊",
-         "澹台","公冶","宗政","濮阳",
-         "淳于","单于","太叔","申屠",
-         "公孙","仲孙","轩辕","令狐",
-         "钟离","宇文","长孙","慕容",
-         "鲜于","闾丘","司徒","司空",
-         "亓官","司寇","仉督","子车",
-         "颛孙","端木","巫马","公西",
-         "漆雕","乐正","壤驷","公良",
-         "拓跋","夹谷","宰父","榖梁",
-         "晋","楚","闫","法","汝","鄢","涂","钦",
-         "段干","百里","东郭","南门",
-         "呼延","归","海","羊舌","微","生",
-         "岳","帅","缑","亢","况","后","有","琴",
-         "梁丘","左丘","东门","西门",
-         "商","牟","佘","佴","伯","赏","南宫",
-         "墨","哈","谯","笪","年","爱","阳","佟",
-         "第五","言","福"])
+m = set(["赵", "钱", "孙", "李",
+         "周", "吴", "郑", "王",
+         "冯", "陈", "褚", "卫",
+         "蒋", "沈", "韩", "杨",
+         "朱", "秦", "尤", "许",
+         "何", "吕", "施", "张",
+         "孔", "曹", "严", "华",
+         "金", "魏", "陶", "姜",
+         "戚", "谢", "邹", "喻",
+         "柏", "水", "窦", "章",
+         "云", "苏", "潘", "葛",
+         "奚", "范", "彭", "郎",
+         "鲁", "韦", "昌", "马",
+         "苗", "凤", "花", "方",
+         "俞", "任", "袁", "柳",
+         "酆", "鲍", "史", "唐",
+         "费", "廉", "岑", "薛",
+         "雷", "贺", "倪", "汤",
+         "滕", "殷", "罗", "毕",
+         "郝", "邬", "安", "常",
+         "乐", "于", "时", "傅",
+         "皮", "卞", "齐", "康",
+         "伍", "余", "元", "卜",
+         "顾", "孟", "平", "黄",
+         "和", "穆", "萧", "尹",
+         "姚", "邵", "湛", "汪",
+         "祁", "毛", "禹", "狄",
+         "米", "贝", "明", "臧",
+         "计", "伏", "成", "戴",
+         "谈", "宋", "茅", "庞",
+         "熊", "纪", "舒", "屈",
+         "项", "祝", "董", "梁",
+         "杜", "阮", "蓝", "闵",
+         "席", "季", "麻", "强",
+         "贾", "路", "娄", "危",
+         "江", "童", "颜", "郭",
+         "梅", "盛", "林", "刁",
+         "钟", "徐", "邱", "骆",
+         "高", "夏", "蔡", "田",
+         "樊", "胡", "凌", "霍",
+         "虞", "万", "支", "柯",
+         "昝", "管", "卢", "莫",
+         "经", "房", "裘", "缪",
+         "干", "解", "应", "宗",
+         "丁", "宣", "贲", "邓",
+         "郁", "单", "杭", "洪",
+         "包", "诸", "左", "石",
+         "崔", "吉", "钮", "龚",
+         "程", "嵇", "邢", "滑",
+         "裴", "陆", "荣", "翁",
+         "荀", "羊", "於", "惠",
+         "甄", "曲", "家", "封",
+         "芮", "羿", "储", "靳",
+         "汲", "邴", "糜", "松",
+         "井", "段", "富", "巫",
+         "乌", "焦", "巴", "弓",
+         "牧", "隗", "山", "谷",
+         "车", "侯", "宓", "蓬",
+         "全", "郗", "班", "仰",
+         "秋", "仲", "伊", "宫",
+         "宁", "仇", "栾", "暴",
+         "甘", "钭", "厉", "戎",
+         "祖", "武", "符", "刘",
+         "景", "詹", "束", "龙",
+         "叶", "幸", "司", "韶",
+         "郜", "黎", "蓟", "薄",
+         "印", "宿", "白", "怀",
+         "蒲", "邰", "从", "鄂",
+         "索", "咸", "籍", "赖",
+         "卓", "蔺", "屠", "蒙",
+         "池", "乔", "阴", "鬱",
+         "胥", "能", "苍", "双",
+         "闻", "莘", "党", "翟",
+         "谭", "贡", "劳", "逄",
+         "姬", "申", "扶", "堵",
+         "冉", "宰", "郦", "雍",
+         "郤", "璩", "桑", "桂",
+         "濮", "牛", "寿", "通",
+         "边", "扈", "燕", "冀",
+         "郏", "浦", "尚", "农",
+         "温", "别", "庄", "晏",
+         "柴", "瞿", "阎", "充",
+         "慕", "连", "茹", "习",
+         "宦", "艾", "鱼", "容",
+         "向", "古", "易", "慎",
+         "戈", "廖", "庾", "终",
+         "暨", "居", "衡", "步",
+         "都", "耿", "满", "弘",
+         "匡", "国", "文", "寇",
+         "广", "禄", "阙", "东",
+         "欧", "殳", "沃", "利",
+         "蔚", "越", "夔", "隆",
+         "师", "巩", "厍", "聂",
+         "晁", "勾", "敖", "融",
+         "冷", "訾", "辛", "阚",
+         "那", "简", "饶", "空",
+         "曾", "母", "沙", "乜",
+         "养", "鞠", "须", "丰",
+         "巢", "关", "蒯", "相",
+         "查", "后", "荆", "红",
+         "游", "竺", "权", "逯",
+         "盖", "益", "桓", "公",
+         "兰", "原", "乞", "西", "阿", "肖", "丑", "位", "曽", "巨", "德", "代", "圆", "尉", "仵", "纳", "仝", "脱",
+         "丘", "但", "展", "迪", "付", "覃", "晗", "特", "隋", "苑", "奥", "漆", "谌", "郄", "练", "扎", "邝", "渠",
+         "信", "门", "陳", "化", "原", "密", "泮", "鹿", "赫",
+         "万俟", "司马", "上官", "欧阳",
+         "夏侯", "诸葛", "闻人", "东方",
+         "赫连", "皇甫", "尉迟", "公羊",
+         "澹台", "公冶", "宗政", "濮阳",
+         "淳于", "单于", "太叔", "申屠",
+         "公孙", "仲孙", "轩辕", "令狐",
+         "钟离", "宇文", "长孙", "慕容",
+         "鲜于", "闾丘", "司徒", "司空",
+         "亓官", "司寇", "仉督", "子车",
+         "颛孙", "端木", "巫马", "公西",
+         "漆雕", "乐正", "壤驷", "公良",
+         "拓跋", "夹谷", "宰父", "榖梁",
+         "晋", "楚", "闫", "法", "汝", "鄢", "涂", "钦",
+         "段干", "百里", "东郭", "南门",
+         "呼延", "归", "海", "羊舌", "微", "生",
+         "岳", "帅", "缑", "亢", "况", "后", "有", "琴",
+         "梁丘", "左丘", "东门", "西门",
+         "商", "牟", "佘", "佴", "伯", "赏", "南宫",
+         "墨", "哈", "谯", "笪", "年", "爱", "阳", "佟",
+         "第五", "言", "福"])

-def isit(n):return n.strip() in m
+
+def isit(n): return n.strip() in m
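
The `m` set above holds common Chinese single- and two-character surnames, and `isit` is a plain membership test on the stripped input. A trivial usage sketch (assuming the module lives at `rag/nlp/surname.py` as in the ragflow repo layout):

```python
from rag.nlp import surname

print(surname.isit("赵"))    # True: a common single-character surname
print(surname.isit("欧阳"))  # True: a two-character compound surname
print(surname.isit("苹果"))  # False
```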
@@ -1,4 +1,4 @@
-#
+#
 # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

@@ -108,13 +108,14 @@ class Dealer:
                if re.match(p, t):
                    tk = "#"
                    break
-            #tk = re.sub(r"([\+\\-])", r"\\\1", tk)
+            # tk = re.sub(r"([\+\\-])", r"\\\1", tk)
            if tk != "#" and tk:
                res.append(tk)
        return res

    def token_merge(self, tks):
-        def one_term(t): return len(t) == 1 or re.match(r"[0-9a-z]{1,2}$", t)
+        def one_term(t):
+            return len(t) == 1 or re.match(r"[0-9a-z]{1,2}$", t)

        res, i = [], 0
        while i < len(tks):

@@ -152,8 +153,8 @@ class Dealer:
        tks = []
        for t in re.sub(r"[ \t]+", " ", txt).split():
            if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
-               re.match(r".*[a-zA-Z]$", t) and tks and \
-               self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
+                    re.match(r".*[a-zA-Z]$", t) and tks and \
+                    self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
                tks[-1] = tks[-1] + " " + t
            else:
                tks.append(t)

@@ -220,14 +221,15 @@ class Dealer:

            return 3

-        def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
+        def idf(s, N):
+            return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))

        tw = []
        if not preprocess:
            idf1 = np.array([idf(freq(t), 10000000) for t in tks])
            idf2 = np.array([idf(df(t), 1000000000) for t in tks])
            wts = (0.3 * idf1 + 0.7 * idf2) * \
-                np.array([ner(t) * postag(t) for t in tks])
+                  np.array([ner(t) * postag(t) for t in tks])
            wts = [s for s in wts]
            tw = list(zip(tks, wts))
        else:
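
The `idf` helper reformatted above is a smoothed, BM25-style inverse document frequency; the term weight then blends it across two corpus sizes (0.3/0.7) and scales by NER and POS-tag factors. A quick numeric check of the idf curve:

```python
import math

def idf(s, N):
    # Smoothed BM25-style IDF: rare terms (small s) score high, common terms low.
    return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))

for s in (1, 1000, 1000000):
    print(s, round(idf(s, 10000000), 3))
# 1 -> ~6.82 (rare), 1000 -> ~4.0, 1000000 -> ~1.28 (common)
```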
@@ -236,7 +238,7 @@ class Dealer:
            idf1 = np.array([idf(freq(t), 10000000) for t in tt])
            idf2 = np.array([idf(df(t), 1000000000) for t in tt])
            wts = (0.3 * idf1 + 0.7 * idf2) * \
-                np.array([ner(t) * postag(t) for t in tt])
+                  np.array([ner(t) * postag(t) for t in tt])
            wts = [s for s in wts]
            tw.extend(zip(tt, wts))
