Use consistent log file names, introduce initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names and introduce `initLogger` so logging is configured once per process (see the sketch after the checklist below).

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
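
The PR body isn't explicit about the initializer's internals, so here is a minimal sketch of what an `initLogger`-style function could look like. The function name `init_logger`, the `logs/` directory, the format string, and the rotation settings are illustrative assumptions, not the PR's actual code.

```python
import logging
import os
from logging.handlers import RotatingFileHandler


def init_logger(logfile_basename: str, log_dir: str = "logs") -> None:
    """Configure the root logger once so every module can call
    logging.* directly and all output lands in one consistently
    named file: <log_dir>/<logfile_basename>.log."""
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, f"{logfile_basename}.log")

    formatter = logging.Formatter(
        "%(asctime)s %(levelname)-8s %(process)d %(message)s")

    root = logging.getLogger()
    root.handlers.clear()  # repeated calls must not stack duplicate handlers

    file_handler = RotatingFileHandler(
        log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(formatter)
    root.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    root.addHandler(console_handler)

    root.setLevel(logging.INFO)
```

Each entry point would call the initializer once at startup with its own base name (e.g. `init_logger("ragflow_server")`), giving every process a predictably named log file while library modules need no logging setup at all.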
Author: Zhichang Yu
Committed: 2024-11-14 17:13:48 +08:00 (via GitHub)
Parent: ab4384e011
Commit: 30f6421760
75 changed files with 396 additions and 402 deletions

rag/nlp/__init__.py

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
+import logging
 import random
 from collections import Counter
@@ -26,7 +27,6 @@ from word2number import w2n
 from cn2an import cn2an
 from PIL import Image
 import json
-from api.utils.log_utils import logger

 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
     # wrap up as es documents
     for ck in chunks:
         if len(ck.strip()) == 0:continue
-        logger.debug("-- {}".format(ck))
+        logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         if pdf_parser:
             try:
@@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
     # wrap up as es documents
     for ck, image in zip(chunks, images):
         if len(ck.strip()) == 0:continue
-        logger.debug("-- {}".format(ck))
+        logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         d["image"] = image
         tokenize(d, ck, eng)
@@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
     for i in range(len(cks)):
         cks[i] = [sections[j] for j in cks[i][::-1]]
-        logger.info("\n* ".join(cks[i]))
+        logging.debug("\n* ".join(cks[i]))
     res = [[]]
     num = [0]
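
Every file in the diff applies the same mechanical substitution; a short illustrative before/after of the pattern (not lifted verbatim from the PR):

```python
import logging

# Before: each module imported a project-wide logger object.
#     from api.utils.log_utils import logger
#     logger.debug("-- {}".format(ck))

# After: modules call the stdlib directly; records propagate to
# whatever handlers the initLogger-style setup attached to the root logger.
logging.debug("-- {}".format("an example chunk"))
```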

rag/nlp/query.py

@@ -14,9 +14,9 @@
 # limitations under the License.
 #
+import logging
 import json
 import re
-import logging
 from rag.utils.doc_store_conn import MatchTextExpr
 from rag.nlp import rag_tokenizer, term_weight, synonym
@@ -88,7 +88,7 @@ class FulltextQueryer:
                syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
                syns.append(" ".join(syn))

-            q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
+            q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)]
             for i in range(1, len(tks_w)):
                 q.append(
                     '"%s %s"^%.4f'
@@ -121,7 +121,7 @@ class FulltextQueryer:
            twts = self.tw.weights([tt])
            syns = self.syn.lookup(tt)
            if syns: keywords.extend(syns)
-           logging.info(json.dumps(twts, ensure_ascii=False))
+           logging.debug(json.dumps(twts, ensure_ascii=False))
            tms = []
            for tk, w in sorted(twts, key=lambda x: x[1] * -1):
                sm = (

rag/nlp/rag_tokenizer.py

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
+import logging
 import copy
 import datrie
 import math
@@ -25,7 +26,6 @@ from hanziconv import HanziConv
 from nltk import word_tokenize
 from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger

 class RagTokenizer:
@@ -36,7 +36,7 @@ class RagTokenizer:
         return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

     def loadDict_(self, fnm):
-        logger.info(f"[HUQIE]:Build trie {fnm}")
+        logging.info(f"[HUQIE]:Build trie {fnm}")
         try:
             of = open(fnm, "r", encoding='utf-8')
             while True:
@@ -53,7 +53,7 @@ class RagTokenizer:
             self.trie_.save(fnm + ".trie")
             of.close()
         except Exception:
-            logger.exception(f"[HUQIE]:Build trie {fnm} failed")
+            logging.exception(f"[HUQIE]:Build trie {fnm} failed")

     def __init__(self, debug=False):
         self.DEBUG = debug
@@ -69,7 +69,7 @@ class RagTokenizer:
             self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
             return
         except Exception:
-            logger.exception("[HUQIE]:Build default trie")
+            logging.exception("[HUQIE]:Build default trie")
             self.trie_ = datrie.Trie(string.printable)

         self.loadDict_(self.DIR_ + ".txt")
@@ -173,7 +173,7 @@ class RagTokenizer:
             tks.append(tk)
         F /= len(tks)
         L /= len(tks)
-        logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
+        logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
         return tks, B / len(tks) + L + F

     def sortTks_(self, tkslist):
@@ -277,8 +277,8 @@ class RagTokenizer:
             tks, s = self.maxForward_(L)
             tks1, s1 = self.maxBackward_(L)
             if self.DEBUG:
-                logger.debug("[FW] {} {}".format(tks, s))
-                logger.debug("[BW] {} {}".format(tks1, s1))
+                logging.debug("[FW] {} {}".format(tks, s))
+                logging.debug("[BW] {} {}".format(tks1, s1))

             i, j, _i, _j = 0, 0, 0, 0
             same = 0
@@ -325,7 +325,7 @@ class RagTokenizer:
                 res.append(" ".join(self.sortTks_(tkslist)[0][0]))

         res = " ".join(self.english_normalize_(res))
-        logger.debug("[TKS] {}".format(self.merge_(res)))
+        logging.debug("[TKS] {}".format(self.merge_(res)))
         return self.merge_(res)

     def fine_grained_tokenize(self, tks):
@@ -416,30 +416,30 @@ if __name__ == '__main__':
     # huqie.addUserDict("/tmp/tmp.new.tks.dict")
     tks = tknzr.tokenize(
         "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize(
         "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize(
         "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize(
         "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize("虽然我不怎么玩")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize(
         "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     tks = tknzr.tokenize(
         "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
-    logger.info(tknzr.fine_grained_tokenize(tks))
+    logging.info(tknzr.fine_grained_tokenize(tks))
     if len(sys.argv) < 2:
         sys.exit()
     tknzr.DEBUG = False
@@ -449,5 +449,5 @@ if __name__ == '__main__':
         line = of.readline()
         if not line:
             break
-        logger.info(tknzr.tokenize(line))
+        logging.info(tknzr.tokenize(line))
     of.close()

rag/nlp/search.py

@@ -14,12 +14,12 @@
 # limitations under the License.
 #
+import logging
 import re
 import json
 from typing import List, Optional, Dict, Union
 from dataclasses import dataclass

-from api.utils.log_utils import logger
 from rag.utils import rmSpace
 from rag.nlp import rag_tokenizer, query
 import numpy as np
@@ -83,7 +83,7 @@ class Dealer:
                 orderBy.desc("create_timestamp_flt")
             res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
             total=self.dataStore.getTotal(res)
-            logger.info("Dealer.search TOTAL: {}".format(total))
+            logging.debug("Dealer.search TOTAL: {}".format(total))
         else:
             highlightFields = ["content_ltks", "title_tks"] if highlight else []
             matchText, keywords = self.qryr.question(qst, min_match=0.3)
@@ -91,7 +91,7 @@ class Dealer:
                 matchExprs = [matchText]
                 res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
                 total=self.dataStore.getTotal(res)
-                logger.info("Dealer.search TOTAL: {}".format(total))
+                logging.debug("Dealer.search TOTAL: {}".format(total))
             else:
                 matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
                 q_vec = matchDense.embedding_data
@@ -102,7 +102,7 @@ class Dealer:
                 res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
                 total=self.dataStore.getTotal(res)
-                logger.info("Dealer.search TOTAL: {}".format(total))
+                logging.debug("Dealer.search TOTAL: {}".format(total))

                 # If result is empty, try again with lower min_match
                 if total == 0:
@@ -112,7 +112,7 @@ class Dealer:
                     matchDense.extra_options["similarity"] = 0.17
                     res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
                     total=self.dataStore.getTotal(res)
-                    logger.info("Dealer.search 2 TOTAL: {}".format(total))
+                    logging.debug("Dealer.search 2 TOTAL: {}".format(total))

         for k in keywords:
             kwds.add(k)
@@ -123,7 +123,7 @@ class Dealer:
                 continue
             kwds.add(kk)

-        logger.info(f"TOTAL: {total}")
+        logging.debug(f"TOTAL: {total}")
         ids=self.dataStore.getChunkIds(res)
         keywords=list(kwds)
         highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@@ -180,7 +180,7 @@ class Dealer:
                 continue
             idx.append(i)
             pieces_.append(t)
-        logger.info("{} => {}".format(answer, pieces_))
+        logging.debug("{} => {}".format(answer, pieces_))
         if not pieces_:
             return answer, set([])
@@ -201,7 +201,7 @@ class Dealer:
                 chunks_tks,
                 tkweight, vtweight)
             mx = np.max(sim) * 0.99
-            logger.info("{} SIM: {}".format(pieces_[i], mx))
+            logging.debug("{} SIM: {}".format(pieces_[i], mx))
             if mx < thr:
                 continue
             cites[idx[i]] = list(

rag/nlp/synonym.py

@@ -14,13 +14,13 @@
 # limitations under the License.
 #
+import logging
 import json
 import os
 import time
 import re
 from nltk.corpus import wordnet
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger

 class Dealer:
@@ -33,14 +33,14 @@ class Dealer:
         try:
             self.dictionary = json.load(open(path, 'r'))
         except Exception:
-            logger.warn("Missing synonym.json")
+            logging.warn("Missing synonym.json")
             self.dictionary = {}

         if not redis:
-            logger.warning(
+            logging.warning(
                 "Realtime synonym is disabled, since no redis connection.")
         if not len(self.dictionary.keys()):
-            logger.warning("Fail to load synonym")
+            logging.warning("Fail to load synonym")

         self.redis = redis
         self.load()
@@ -64,7 +64,7 @@ class Dealer:
             d = json.loads(d)
             self.dictionary = d
         except Exception as e:
-            logger.error("Fail to load synonym!" + str(e))
+            logging.error("Fail to load synonym!" + str(e))

     def lookup(self, tk):
         if re.match(r"[a-z]+$", tk):

rag/nlp/term_weight.py

@@ -14,6 +14,7 @@
 # limitations under the License.
 #
+import logging
 import math
 import json
 import re
@@ -21,7 +22,6 @@ import os
 import numpy as np
 from rag.nlp import rag_tokenizer
 from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger

 class Dealer:
@@ -83,11 +83,11 @@ class Dealer:
         try:
             self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
         except Exception:
-            logger.warning("Load ner.json FAIL!")
+            logging.warning("Load ner.json FAIL!")
         try:
             self.df = load_dict(os.path.join(fnm, "term.freq"))
         except Exception:
-            logger.warning("Load term.freq FAIL!")
+            logging.warning("Load term.freq FAIL!")

     def pretoken(self, txt, num=False, stpwd=True):
         patt = [