Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Use consistent log file names, introduced initLogger (#3403)
### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
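The hunks below only show call sites being migrated from the project-specific `logger` object (`api.utils.log_utils`) to the standard `logging` module; the `initLogger` helper named in the title is not part of this excerpt. As a rough, hypothetical sketch of what a centralized initializer with consistent log file names usually looks like (the function name, signature, paths and rotation settings here are assumptions, not the actual `api/utils/log_utils.py`):

```python
# Hypothetical sketch only -- the real api/utils/log_utils.py is not shown in this
# excerpt, and the actual initLogger name, signature and file layout may differ.
import logging
import os
from logging.handlers import RotatingFileHandler


def init_logger(service_name: str, log_dir: str = "logs") -> None:
    """Configure the root logger once so every module can simply call logging.*"""
    os.makedirs(log_dir, exist_ok=True)
    # One consistent file name per service, e.g. logs/ragflow_server.log
    log_path = os.path.join(log_dir, f"{service_name}.log")

    fmt = logging.Formatter(
        "%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d - %(message)s")

    file_handler = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(fmt)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(fmt)

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.handlers.clear()  # avoid duplicate handlers if called more than once
    root.addHandler(file_handler)
    root.addHandler(console_handler)
```

Once the root logger is configured this way at startup, modules no longer need to import a shared logger object, which is exactly what the hunks below remove.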
@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import random
from collections import Counter

@@ -26,7 +27,6 @@ from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
-from api.utils.log_utils import logger

all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
-logger.debug("-- {}".format(ck))
+logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
try:
@@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
-logger.debug("-- {}".format(ck))
+logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
tokenize(d, ck, eng)
@@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):

for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
-logger.info("\n* ".join(cks[i]))
+logging.debug("\n* ".join(cks[i]))

res = [[]]
num = [0]

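The resulting call-site pattern, taken from the hunk above, is the stdlib's module-level API; a minimal sketch (the helper name below is made up for illustration):

```python
import logging

def emit_chunks(chunks):
    # Hypothetical helper mirroring the loop above: module-level logging.* calls
    # are delegated to the root logger that the new initializer is expected to configure.
    for ck in chunks:
        if not ck.strip():
            continue
        logging.debug("-- {}".format(ck))

# A named logger (logging.getLogger(__name__)) would work equally well and keeps the
# module name in each record; this diff opts for the plain module-level calls.
```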
@@ -14,9 +14,9 @@
# limitations under the License.
#

+import logging
import json
import re
-import logging
from rag.utils.doc_store_conn import MatchTextExpr

from rag.nlp import rag_tokenizer, term_weight, synonym
@@ -88,7 +88,7 @@ class FulltextQueryer:
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
syns.append(" ".join(syn))

-q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
+q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)]
for i in range(1, len(tks_w)):
q.append(
'"%s %s"^%.4f'
@@ -121,7 +121,7 @@ class FulltextQueryer:
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns: keywords.extend(syns)
-logging.info(json.dumps(twts, ensure_ascii=False))
+logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (

@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import copy
import datrie
import math
@@ -25,7 +26,6 @@ from hanziconv import HanziConv
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class RagTokenizer:
@@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

def loadDict_(self, fnm):
-logger.info(f"[HUQIE]:Build trie {fnm}")
+logging.info(f"[HUQIE]:Build trie {fnm}")
try:
of = open(fnm, "r", encoding='utf-8')
while True:
@@ -53,7 +53,7 @@ class RagTokenizer:
self.trie_.save(fnm + ".trie")
of.close()
except Exception:
-logger.exception(f"[HUQIE]:Build trie {fnm} failed")
+logging.exception(f"[HUQIE]:Build trie {fnm} failed")

def __init__(self, debug=False):
self.DEBUG = debug
@@ -69,7 +69,7 @@ class RagTokenizer:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception:
-logger.exception("[HUQIE]:Build default trie")
+logging.exception("[HUQIE]:Build default trie")
self.trie_ = datrie.Trie(string.printable)

self.loadDict_(self.DIR_ + ".txt")
@@ -173,7 +173,7 @@ class RagTokenizer:
tks.append(tk)
F /= len(tks)
L /= len(tks)
-logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
+logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
return tks, B / len(tks) + L + F

def sortTks_(self, tkslist):
@@ -277,8 +277,8 @@ class RagTokenizer:
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
-logger.debug("[FW] {} {}".format(tks, s))
-logger.debug("[BW] {} {}".format(tks1, s1))
+logging.debug("[FW] {} {}".format(tks, s))
+logging.debug("[BW] {} {}".format(tks1, s1))

i, j, _i, _j = 0, 0, 0, 0
same = 0
@@ -325,7 +325,7 @@ class RagTokenizer:
res.append(" ".join(self.sortTks_(tkslist)[0][0]))

res = " ".join(self.english_normalize_(res))
-logger.debug("[TKS] {}".format(self.merge_(res)))
+logging.debug("[TKS] {}".format(self.merge_(res)))
return self.merge_(res)

def fine_grained_tokenize(self, tks):
@@ -416,30 +416,30 @@ if __name__ == '__main__':
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
-logger.info(tknzr.fine_grained_tokenize(tks))
+logging.info(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
@@ -449,5 +449,5 @@ if __name__ == '__main__':
line = of.readline()
if not line:
break
-logger.info(tknzr.tokenize(line))
+logging.info(tknzr.tokenize(line))
of.close()

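Several hunks above (`loadDict_` and `__init__`) route failures through `logging.exception`; for reference, that stdlib call logs at ERROR level and appends the active traceback, so trie-build failures stay diagnosable from the shared log file. A minimal, self-contained illustration (the path is made up):

```python
import logging

try:
    open("/nonexistent/huqie.txt", "r", encoding="utf-8")
except OSError:
    # Logs the "failed" message at ERROR level plus the full traceback of the OSError.
    logging.exception("[HUQIE]:Build trie /nonexistent/huqie.txt failed")
```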
@@ -14,12 +14,12 @@
# limitations under the License.
#

+import logging
import re
import json
from typing import List, Optional, Dict, Union
from dataclasses import dataclass

-from api.utils.log_utils import logger
from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
@@ -83,7 +83,7 @@ class Dealer:
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))
else:
highlightFields = ["content_ltks", "title_tks"] if highlight else []
matchText, keywords = self.qryr.question(qst, min_match=0.3)
@@ -91,7 +91,7 @@ class Dealer:
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))
else:
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
@@ -102,7 +102,7 @@ class Dealer:

res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search TOTAL: {}".format(total))
+logging.debug("Dealer.search TOTAL: {}".format(total))

# If result is empty, try again with lower min_match
if total == 0:
@@ -112,7 +112,7 @@ class Dealer:
matchDense.extra_options["similarity"] = 0.17
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
-logger.info("Dealer.search 2 TOTAL: {}".format(total))
+logging.debug("Dealer.search 2 TOTAL: {}".format(total))

for k in keywords:
kwds.add(k)
@@ -123,7 +123,7 @@ class Dealer:
continue
kwds.add(kk)

-logger.info(f"TOTAL: {total}")
+logging.debug(f"TOTAL: {total}")
ids=self.dataStore.getChunkIds(res)
keywords=list(kwds)
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@@ -180,7 +180,7 @@ class Dealer:
continue
idx.append(i)
pieces_.append(t)
-logger.info("{} => {}".format(answer, pieces_))
+logging.debug("{} => {}".format(answer, pieces_))
if not pieces_:
return answer, set([])

@@ -201,7 +201,7 @@ class Dealer:
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
-logger.info("{} SIM: {}".format(pieces_[i], mx))
+logging.debug("{} SIM: {}".format(pieces_[i], mx))
if mx < thr:
continue
cites[idx[i]] = list(

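Note that the `Dealer.search` hunks above also lower the "TOTAL" messages from `info` to `debug`, so they no longer appear at the default level. A quick way to surface them again during local debugging, assuming the root logger is the one the new initializer configures at INFO:

```python
import logging

# Temporarily raise verbosity; "Dealer.search TOTAL: ..." is now emitted at DEBUG.
logging.getLogger().setLevel(logging.DEBUG)
```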
@@ -14,13 +14,13 @@
# limitations under the License.
#

+import logging
import json
import os
import time
import re
from nltk.corpus import wordnet
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class Dealer:
@@ -33,14 +33,14 @@ class Dealer:
try:
self.dictionary = json.load(open(path, 'r'))
except Exception:
-logger.warn("Missing synonym.json")
+logging.warn("Missing synonym.json")
self.dictionary = {}

if not redis:
-logger.warning(
+logging.warning(
"Realtime synonym is disabled, since no redis connection.")
if not len(self.dictionary.keys()):
-logger.warning("Fail to load synonym")
+logging.warning("Fail to load synonym")

self.redis = redis
self.load()
@@ -64,7 +64,7 @@ class Dealer:
d = json.loads(d)
self.dictionary = d
except Exception as e:
-logger.error("Fail to load synonym!" + str(e))
+logging.error("Fail to load synonym!" + str(e))

def lookup(self, tk):
if re.match(r"[a-z]+$", tk):

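One detail in the synonym hunk above: the migrated call keeps `warn`, which the standard library treats as a deprecated alias of `warning`; the supported spelling would be:

```python
import logging

# warn() is a deprecated alias; warning() is the supported name for the same call.
logging.warning("Missing synonym.json")
```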
@@ -14,6 +14,7 @@
# limitations under the License.
#

+import logging
import math
import json
import re
@@ -21,7 +22,6 @@ import os
import numpy as np
from rag.nlp import rag_tokenizer
from api.utils.file_utils import get_project_base_directory
-from api.utils.log_utils import logger


class Dealer:
@@ -83,11 +83,11 @@ class Dealer:
try:
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
except Exception:
-logger.warning("Load ner.json FAIL!")
+logging.warning("Load ner.json FAIL!")
try:
self.df = load_dict(os.path.join(fnm, "term.freq"))
except Exception:
-logger.warning("Load term.freq FAIL!")
+logging.warning("Load term.freq FAIL!")

def pretoken(self, txt, num=False, stpwd=True):
patt = [
