Rework logging (#3358)

### What problem does this PR solve?

Unified all log files into one.

### Type of change

- [x] Refactoring
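
Every changed module now imports one shared `logger` from `api.utils.log_utils` instead of calling `print` or configuring the `logging` module locally. The `log_utils` module itself is not part of this diff; the block below is a minimal sketch of what such a unified-logger module could look like, assuming a single rotating log file (the file path, logger name, format, and rotation settings are illustrative, not the actual implementation):

```python
# Hypothetical sketch of api/utils/log_utils.py -- not the actual implementation.
# One shared logger writes everything to a single rotating log file plus the console.
import logging
import os
from logging.handlers import RotatingFileHandler

# Assumed location of the single unified log file.
LOG_FILE = os.path.join(os.path.expanduser("~"), "ragflow_logs", "ragflow.log")
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

_formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s")

# A single rotating file handler, so all modules share the same log file.
_file_handler = RotatingFileHandler(LOG_FILE, maxBytes=10 * 1024 * 1024, backupCount=5)
_file_handler.setFormatter(_formatter)
logger.addHandler(_file_handler)

# Mirror log records to stderr for interactive runs.
_console_handler = logging.StreamHandler()
_console_handler.setFormatter(_formatter)
logger.addHandler(_console_handler)
```

With a module like this in place, every call site in the diff simply does `from api.utils.log_utils import logger` and calls the appropriate method (`logger.debug`, `logger.info`, `logger.warning`, `logger.exception`) instead of printing.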
Zhichang Yu
2024-11-12 17:35:13 +08:00
committed by GitHub
parent 567a7563e7
commit a2a5631da4
75 changed files with 481 additions and 853 deletions


@@ -26,6 +26,7 @@ from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
from api.utils.log_utils import logger
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -235,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
print("--", ck)
logger.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
try:
@@ -254,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
print("--", ck)
logger.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
tokenize(d, ck, eng)
@@ -457,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
print("--------------\n", "\n* ".join(cks[i]))
logger.info("\n* ".join(cks[i]))
res = [[]]
num = [0]


@@ -22,10 +22,10 @@ import re
import string
import sys
from hanziconv import HanziConv
from huggingface_hub import snapshot_download
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class RagTokenizer:
@@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
def loadDict_(self, fnm):
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
logger.info(f"[HUQIE]:Build trie {fnm}")
try:
of = open(fnm, "r", encoding='utf-8')
while True:
@@ -52,8 +52,9 @@ class RagTokenizer:
self.trie_[self.rkey_(line[0])] = 1
self.trie_.save(fnm + ".trie")
of.close()
except Exception as e:
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
except Exception:
logger.exception(f"[HUQIE]:Build trie {fnm} failed")
def __init__(self, debug=False):
self.DEBUG = debug
@@ -68,8 +69,8 @@ class RagTokenizer:
try:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception as e:
print("[HUQIE]:Build default trie", file=sys.stderr)
except Exception:
logger.exception("[HUQIE]:Build default trie")
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(self.DIR_ + ".txt")
@@ -78,7 +79,7 @@ class RagTokenizer:
try:
self.trie_ = datrie.Trie.load(fnm + ".trie")
return
except Exception as e:
except Exception:
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(fnm)
@@ -173,8 +174,7 @@ class RagTokenizer:
tks.append(tk)
F /= len(tks)
L /= len(tks)
if self.DEBUG:
print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
return tks, B / len(tks) + L + F
def sortTks_(self, tkslist):
@@ -278,8 +278,8 @@ class RagTokenizer:
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
print("[FW]", tks, s)
print("[BW]", tks1, s1)
logger.debug("[FW] {} {}".format(tks, s))
logger.debug("[BW] {} {}".format(tks1, s1))
i, j, _i, _j = 0, 0, 0, 0
same = 0
@@ -326,8 +326,7 @@ class RagTokenizer:
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
res = " ".join(self.english_normalize_(res))
if self.DEBUG:
print("[TKS]", self.merge_(res))
logger.debug("[TKS] {}".format(self.merge_(res)))
return self.merge_(res)
def fine_grained_tokenize(self, tks):
@@ -418,30 +417,30 @@ if __name__ == '__main__':
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
@@ -451,5 +450,5 @@ if __name__ == '__main__':
line = of.readline()
if not line:
break
print(tknzr.tokenize(line))
logger.info(tknzr.tokenize(line))
of.close()


@@ -19,7 +19,7 @@ import json
from typing import List, Optional, Dict, Union
from dataclasses import dataclass
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
@@ -83,7 +83,7 @@ class Dealer:
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
else:
highlightFields = ["content_ltks", "title_tks"] if highlight else []
matchText, keywords = self.qryr.question(qst, min_match=0.3)
@@ -91,7 +91,7 @@
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
else:
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
@@ -102,7 +102,7 @@
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
# If result is empty, try again with lower min_match
if total == 0:
@@ -112,7 +112,7 @@
matchDense.extra_options["similarity"] = 0.17
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search 2 TOTAL: {}".format(total))
logger.info("Dealer.search 2 TOTAL: {}".format(total))
for k in keywords:
kwds.add(k)
@@ -123,7 +123,7 @@
continue
kwds.add(kk)
doc_store_logger.info(f"TOTAL: {total}")
logger.info(f"TOTAL: {total}")
ids=self.dataStore.getChunkIds(res)
keywords=list(kwds)
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@@ -180,7 +180,7 @@
continue
idx.append(i)
pieces_.append(t)
doc_store_logger.info("{} => {}".format(answer, pieces_))
logger.info("{} => {}".format(answer, pieces_))
if not pieces_:
return answer, set([])
@@ -201,7 +201,7 @@
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
doc_store_logger.info("{} SIM: {}".format(pieces_[i], mx))
logger.info("{} SIM: {}".format(pieces_[i], mx))
if mx < thr:
continue
cites[idx[i]] = list(


@@ -17,10 +17,10 @@
import json
import os
import time
import logging
import re
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@@ -32,15 +32,15 @@ class Dealer:
path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
try:
self.dictionary = json.load(open(path, 'r'))
except Exception as e:
logging.warn("Missing synonym.json")
except Exception:
logger.warning("Missing synonym.json")
self.dictionary = {}
if not redis:
logging.warning(
logger.warning(
"Realtime synonym is disabled, since no redis connection.")
if not len(self.dictionary.keys()):
logging.warning(f"Fail to load synonym")
logger.warning("Fail to load synonym")
self.redis = redis
self.load()
@@ -64,7 +64,7 @@
d = json.loads(d)
self.dictionary = d
except Exception as e:
logging.error("Fail to load synonym!" + str(e))
logger.error("Fail to load synonym!" + str(e))
def lookup(self, tk):
self.lookup_num += 1


@@ -21,6 +21,7 @@ import os
import numpy as np
from rag.nlp import rag_tokenizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@@ -81,12 +82,12 @@ class Dealer:
self.ne, self.df = {}, {}
try:
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
except Exception as e:
print("[WARNING] Load ner.json FAIL!")
except Exception:
logger.warning("Load ner.json FAIL!")
try:
self.df = load_dict(os.path.join(fnm, "term.freq"))
except Exception as e:
print("[WARNING] Load term.freq FAIL!")
except Exception:
logger.warning("Load term.freq FAIL!")
def pretoken(self, txt, num=False, stpwd=True):
patt = [