Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Use consistent log file names, introduced initLogger (#3403)
### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
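The hunks below only show call sites being migrated from the project-specific `logger` object (`api.utils.log_utils`) to the standard `logging` module; the `initLogger` helper named in the title is not part of this excerpt. As a rough, hypothetical sketch of what a centralized initializer with consistent log file names usually looks like (the function name, signature, paths and rotation settings here are assumptions, not the actual `api/utils/log_utils.py`):

```python
# Hypothetical sketch only -- the real api/utils/log_utils.py is not shown in this
# excerpt, and the actual initLogger name, signature and file layout may differ.
import logging
import os
from logging.handlers import RotatingFileHandler


def init_logger(service_name: str, log_dir: str = "logs") -> None:
    """Configure the root logger once so every module can simply call logging.*"""
    os.makedirs(log_dir, exist_ok=True)
    # One consistent file name per service, e.g. logs/ragflow_server.log
    log_path = os.path.join(log_dir, f"{service_name}.log")

    fmt = logging.Formatter(
        "%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d - %(message)s")

    file_handler = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(fmt)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(fmt)

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.handlers.clear()  # avoid duplicate handlers if called more than once
    root.addHandler(file_handler)
    root.addHandler(console_handler)
```

Once the root logger is configured this way at startup, modules no longer need to import a shared logger object, which is exactly what the hunks below remove.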
@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import random
from collections import Counter

@@ -26,7 +27,6 @@ from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
-from api.utils.log_utils import logger

all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
-logger.debug("-- {}".format(ck))
+logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
try:
@@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
-logger.debug("-- {}".format(ck))
+logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
tokenize(d, ck, eng)
@@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):

for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
-logger.info("\n* ".join(cks[i]))
+logging.debug("\n* ".join(cks[i]))

res = [[]]
num = [0]

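The resulting call-site pattern, taken from the hunk above, is the stdlib's module-level API; a minimal sketch (the helper name below is made up for illustration):

```python
import logging

def emit_chunks(chunks):
    # Hypothetical helper mirroring the loop above: module-level logging.* calls
    # are delegated to the root logger that the new initializer is expected to configure.
    for ck in chunks:
        if not ck.strip():
            continue
        logging.debug("-- {}".format(ck))

# A named logger (logging.getLogger(__name__)) would work equally well and keeps the
# module name in each record; this diff opts for the plain module-level calls.
```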
@@ -14,9 +14,9 @@
# limitations under the License.
#

+import logging
import json
import re
-import logging
from rag.utils.doc_store_conn import MatchTextExpr

from rag.nlp import rag_tokenizer, term_weight, synonym
@@ -88,7 +88,7 @@ class FulltextQueryer:
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
syns.append(" ".join(syn))

-q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
+q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)]
for i in range(1, len(tks_w)):
q.append(
'"%s %s"^%.4f'
@@ -121,7 +121,7 @@ class FulltextQueryer:
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns: keywords.extend(syns)
-logging.info(json.dumps(twts, ensure_ascii=False))
+logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (

@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import copy
import datrie
import math
@@ -25,7 +26,6 @@ from hanziconv import HanziConv
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class RagTokenizer:
@@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

def loadDict_(self, fnm):
-logger.info(f"[HUQIE]:Build trie {fnm}")
+logging.info(f"[HUQIE]:Build trie {fnm}")
try:
of = open(fnm, "r", encoding='utf-8')
while True:
@@ -53,7 +53,7 @@ class RagTokenizer:
self.trie_.save(fnm + ".trie")
of.close()
except Exception:
-logger.exception(f"[HUQIE]:Build trie {fnm} failed")
+logging.exception(f"[HUQIE]:Build trie {fnm} failed")

def __init__(self, debug=False):
self.DEBUG = debug
@@ -69,7 +69,7 @@ class RagTokenizer:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception:
-logger.exception("[HUQIE]:Build default trie")
+logging.exception("[HUQIE]:Build default trie")
self.trie_ = datrie.Trie(string.printable)

self.loadDict_(self.DIR_ + ".txt")
@@ -173,7 +173,7 @@ class RagTokenizer:
tks.append(tk)
F /= len(tks)
L /= len(tks)
-logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
+logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
return tks, B / len(tks) + L + F

def sortTks_(self, tkslist):
@@ -277,8 +277,8 @@ class RagTokenizer:
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
-logger.debug("[FW] {} {}".format(tks, s))
-logger.debug("[BW] {} {}".format(tks1, s1))
+logging.debug("[FW] {} {}".format(tks, s))
+logging.debug("[BW] {} {}".format(tks1, s1))

i, j, _i, _j = 0, 0, 0, 0
same = 0
@@ -325,7 +325,7 @@ class RagTokenizer:
res.append(" ".join(self.sortTks_(tkslist)[0][0]))

res = " ".join(self.english_normalize_(res))
-logger.debug("[TKS] {}".format(self.merge_(res)))
+logging.debug("[TKS] {}".format(self.merge_(res)))
return self.merge_(res)

def fine_grained_tokenize(self, tks):
@@ -416,30 +416,30 @@ if __name__ == '__main__':
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
@@ -449,5 +449,5 @@ if __name__ == '__main__':
line = of.readline()
if not line:
break
-logger.info(tknzr.tokenize(line))
+logging.info(tknzr.tokenize(line))
of.close()

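Several hunks above (`loadDict_` and `__init__`) route failures through `logging.exception`; for reference, that stdlib call logs at ERROR level and appends the active traceback, so trie-build failures stay diagnosable from the shared log file. A minimal, self-contained illustration (the path is made up):

```python
import logging

try:
    open("/nonexistent/huqie.txt", "r", encoding="utf-8")
except OSError:
    # Logs the "failed" message at ERROR level plus the full traceback of the OSError.
    logging.exception("[HUQIE]:Build trie /nonexistent/huqie.txt failed")
```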
@@ -14,12 +14,12 @@
# limitations under the License.
#

+import logging
import re
import json
from typing import List, Optional, Dict, Union
from dataclasses import dataclass

-from api.utils.log_utils import logger
from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
@@ -83,7 +83,7 @@ class Dealer:
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))
else:
highlightFields = ["content_ltks", "title_tks"] if highlight else []
matchText, keywords = self.qryr.question(qst, min_match=0.3)
@@ -91,7 +91,7 @@ class Dealer:
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))
else:
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
@@ -102,7 +102,7 @@ class Dealer:

res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))

# If result is empty, try again with lower min_match
if total == 0:
@@ -112,7 +112,7 @@ class Dealer:
matchDense.extra_options["similarity"] = 0.17
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search 2 TOTAL: {}".format(total))
+logging.debug("Dealer.search 2 TOTAL: {}".format(total))

for k in keywords:
kwds.add(k)
@@ -123,7 +123,7 @@ class Dealer:
continue
kwds.add(kk)

-logger.info(f"TOTAL: {total}")
+logging.debug(f"TOTAL: {total}")
ids=self.dataStore.getChunkIds(res)
keywords=list(kwds)
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@@ -180,7 +180,7 @@ class Dealer:
continue
idx.append(i)
pieces_.append(t)
-logger.info("{} => {}".format(answer, pieces_))
+logging.debug("{} => {}".format(answer, pieces_))
if not pieces_:
return answer, set([])

@@ -201,7 +201,7 @@ class Dealer:
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
-logger.info("{} SIM: {}".format(pieces_[i], mx))
+logging.debug("{} SIM: {}".format(pieces_[i], mx))
if mx < thr:
continue
cites[idx[i]] = list(

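Note that the `Dealer.search` hunks above also lower the "TOTAL" messages from `info` to `debug`, so they no longer appear at the default level. A quick way to surface them again during local debugging, assuming the root logger is the one the new initializer configures at INFO:

```python
import logging

# Temporarily raise verbosity; "Dealer.search TOTAL: ..." is now emitted at DEBUG.
logging.getLogger().setLevel(logging.DEBUG)
```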
@@ -14,13 +14,13 @@
# limitations under the License.
#

+import logging
import json
import os
import time
import re
from nltk.corpus import wordnet
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class Dealer:
@@ -33,14 +33,14 @@ class Dealer:
try:
self.dictionary = json.load(open(path, 'r'))
except Exception:
-logger.warn("Missing synonym.json")
+logging.warn("Missing synonym.json")
self.dictionary = {}

if not redis:
-logger.warning(
+logging.warning(
"Realtime synonym is disabled, since no redis connection.")
if not len(self.dictionary.keys()):
-logger.warning("Fail to load synonym")
+logging.warning("Fail to load synonym")

self.redis = redis
self.load()
@@ -64,7 +64,7 @@ class Dealer:
d = json.loads(d)
self.dictionary = d
except Exception as e:
-logger.error("Fail to load synonym!" + str(e))
+logging.error("Fail to load synonym!" + str(e))

def lookup(self, tk):
if re.match(r"[a-z]+$", tk):

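One detail in the synonym hunk above: the migrated call keeps `warn`, which the standard library treats as a deprecated alias of `warning`; the supported spelling would be:

```python
import logging

# warn() is a deprecated alias; warning() is the supported name for the same call.
logging.warning("Missing synonym.json")
```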
@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import math
import json
import re
@@ -21,7 +22,6 @@ import os
import numpy as np
from rag.nlp import rag_tokenizer
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class Dealer:
@@ -83,11 +83,11 @@ class Dealer:
try:
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
except Exception:
-logger.warning("Load ner.json FAIL!")
+logging.warning("Load ner.json FAIL!")
try:
self.df = load_dict(os.path.join(fnm, "term.freq"))
except Exception:
-logger.warning("Load term.freq FAIL!")
+logging.warning("Load term.freq FAIL!")

def pretoken(self, txt, num=False, stpwd=True):
patt = [
