Rework logging (#3358)

Unified all log files into one.

### What problem does this PR solve?

Unified all log files into one.
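
The hunks below replace the per-module loggers that used to live in `rag/settings.py` (`cron_logger`, `doc_store_logger`, `minio_logger`, `s3_logger`, `azure_logger`, ...) and the scattered `print()` calls with a single `logger` (plus a `LOG_FILE` path) imported from `api.utils.log_utils`. That module itself does not appear in the hunks shown here; a minimal sketch of what it might look like, reusing the formatter string removed from `rag/settings.py` (the handler type, rotation size, and file location are assumptions, not code from this PR):

```python
# Hypothetical sketch of api/utils/log_utils.py: one shared logger writing to a
# single rotating log file. The identifiers `logger` and `LOG_FILE` match the
# imports used in the diff; everything else here is an assumption.
import logging
import os
from logging.handlers import RotatingFileHandler

LOG_FILE = os.path.abspath(os.path.join("logs", "ragflow.log"))
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)

_handler = RotatingFileHandler(LOG_FILE, maxBytes=128 * 1024 * 1024, backupCount=5)
_handler.setFormatter(
    logging.Formatter("%(asctime)-15s %(levelname)-8s (%(process)d) %(message)s")
)
logger.addHandler(_handler)
```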

### Type of change

- [x] Refactoring
Zhichang Yu
2024-11-12 17:35:13 +08:00
committed by GitHub
parent 567a7563e7
commit a2a5631da4
75 changed files with 481 additions and 853 deletions

View File

@ -20,6 +20,7 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -38,7 +39,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
print("layouts:", timer() - start)
logger.info("layouts: {}".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()

View File

@ -18,7 +18,7 @@ import re
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from timeit import default_timer as timer
from rag.settings import cron_logger
from api.utils.log_utils import logger
import io
@ -86,7 +86,7 @@ def chunk(
)
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
# get the attachment info
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")

View File

@ -21,7 +21,7 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge,
make_colon_as_title, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from rag.settings import cron_logger
from api.utils.log_utils import logger
class Docx(DocxParser):
@ -122,8 +122,8 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
cron_logger.info("layouts:".format(
(timer() - start) / (self.total_page + 0.1)))
logger.info("layouts:".format(
(timer() - start) / (self.total_page + 0.1)))
self._naive_vertical_merge()
callback(0.8, "Text extraction finished")

View File

@ -24,6 +24,7 @@ from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from docx import Document
from PIL import Image
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -47,11 +48,11 @@ class Pdf(PdfParser):
# for bb in self.boxes:
# for b in bb:
# print(b)
print("OCR:", timer() - start)
logger.info("OCR: {}".format(timer() - start))
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis finished.")
print("layouts:", timer() - start)
logger.info("layouts: {}".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.67, "Table analysis finished.")
self._text_merge()

View File

@ -19,7 +19,7 @@ from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.settings import cron_logger
from api.utils.log_utils import logger
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
@ -41,18 +41,18 @@ class Docx(DocxParser):
try:
image_blob = related_part.image.blob
except UnrecognizedImageError:
print("Unrecognized image format. Skipping image.")
logger.info("Unrecognized image format. Skipping image.")
return None
except UnexpectedEndOfFileError:
print("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
logger.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
return None
except InvalidImageStreamError:
print("The recognized image stream appears to be corrupted. Skipping image.")
logger.info("The recognized image stream appears to be corrupted. Skipping image.")
return None
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
return image
except Exception as e:
except Exception:
return None
def __clean(self, line):
@ -133,7 +133,7 @@ class Pdf(PdfParser):
callback
)
callback(msg="OCR finished")
cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin)
@ -147,7 +147,7 @@ class Pdf(PdfParser):
self._concat_downward()
# self._filter_forpages()
cron_logger.info("layouts: {}".format(timer() - start))
logger.info("layouts cost: {}s".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls
@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -280,7 +280,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
logger.info("naive_merge({}): {}".format(filename, timer() - st))
return res

View File

@ -18,6 +18,7 @@ from deepdoc.parser.utils import get_text
from rag.app import laws
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -37,7 +38,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
print("layouts:", timer() - start)
logger.info("layouts cost: {}s".format(timer() - start))
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.")
self._text_merge()

View File

@ -17,6 +17,7 @@ from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
from api.utils.log_utils import logger
class Pdf(PdfParser):
@ -40,7 +41,7 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.63, "Layout analysis finished")
print("layouts:", timer() - start)
logger.info(f"layouts cost: {timer() - start}s")
self._table_transformer_job(zoomin)
callback(0.68, "Table analysis finished")
self._text_merge()
@ -52,8 +53,8 @@ class Pdf(PdfParser):
# clean mess
if column_width < self.page_images[0].size[0] / zoomin / 2:
print("two_column...................", column_width,
self.page_images[0].size[0] / zoomin / 2)
logger.info("two_column................... {} {}".format(column_width,
self.page_images[0].size[0] / zoomin / 2))
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
@ -114,8 +115,8 @@ class Pdf(PdfParser):
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
print(b["text"], b.get("layoutno"))
print(tbls)
logger.info("{} {}".format(b["text"], b.get("layoutno")))
logger.info("{}".format(tbls))
return {
"title": title,
@ -156,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
print("It's English.....", eng)
logger.info("It's English.....{}".format(eng))
res = tokenize_table(paper["tables"], doc, eng)
@ -183,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
print(lvl, sorted_sections[i][0], most_level, sid)
logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
chunks = []
last_sid = -2

View File

@ -19,7 +19,7 @@ from openpyxl import load_workbook
from deepdoc.parser.utils import get_text
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
from rag.settings import cron_logger
from api.utils.log_utils import logger
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document
from PIL import Image
@ -82,7 +82,7 @@ class Pdf(PdfParser):
callback
)
callback(msg="OCR finished")
cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
@ -94,7 +94,7 @@ class Pdf(PdfParser):
#self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
cron_logger.info("layouts: {}".format(timer() - start))
logger.info("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
q_bull, reg = qbullets_category(sections)

View File

@ -14,14 +14,13 @@ import base64
import datetime
import json
import re
import pandas as pd
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.settings import cron_logger
from api.utils.log_utils import logger
from rag.utils import rmSpace
forbidden_select_fields4resume = [
@ -64,8 +63,8 @@ def remote_call(filename, binary):
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
resume = step_two.parse(resume)
return resume
except Exception as e:
cron_logger.error("Resume parser error: " + str(e))
except Exception:
logger.exception("Resume parser error")
return {}
@ -87,7 +86,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
callback(-1, "Resume is not successfully parsed.")
raise Exception("Resume parser remote call fail!")
callback(0.6, "Done parsing. Chunking...")
print(json.dumps(resume, ensure_ascii=False, indent=2))
logger.info("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
field_map = {
"name_kwd": "姓名/名字",
@ -159,7 +158,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
doc[n] = resume[n]
print(doc)
logger.info("chunked resume to " + str(doc))
KnowledgebaseService.update_parser_config(
kwargs["kb_id"], {"field_map": field_map})
return [doc]
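
Many hunks in this diff, like the `remote_call` change above, swap `error("...: " + str(e))` inside an `except Exception as e` block for `logger.exception(...)` with a bare `except Exception`. With Python's standard `logging` module, `exception()` logs at ERROR level and automatically appends the traceback of the exception currently being handled, so binding the exception as `e` only to stringify it is no longer needed. A standalone illustration (not code from this PR):

```python
import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("demo")

try:
    {}["missing"]  # raises KeyError
except Exception:
    # Emits the message at ERROR level plus the full traceback of the
    # exception being handled -- no need to capture it as `e`.
    log.exception("Resume parser error")
```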

View File

@ -32,6 +32,7 @@ from api.utils.file_utils import get_home_cache_dir
from rag.utils import num_tokens_from_string, truncate
import google.generativeai as genai
import json
from api.utils.log_utils import logger
class Base(ABC):
def __init__(self, key, model_name):
@ -68,7 +69,7 @@ class DefaultEmbedding(Base):
DefaultEmbedding._model = FlagModel(os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())
except Exception as e:
except Exception:
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
local_dir_use_symlinks=False)
@ -189,7 +190,7 @@ class QWenEmbed(Base):
)
return np.array(resp["output"]["embeddings"][0]
["embedding"]), resp["usage"]["total_tokens"]
except Exception as e:
except Exception:
raise Exception("Account abnormal. Please ensure it's on good standing to use QWen's "+self.model_name)
return np.array([]), 0
@ -296,11 +297,11 @@ class YoudaoEmbed(Base):
if not LIGHTEN and not YoudaoEmbed._client:
from BCEmbedding import EmbeddingModel as qanthing
try:
print("LOADING BCE...")
logger.info("LOADING BCE...")
YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
get_home_cache_dir(),
"bce-embedding-base_v1"))
except Exception as e:
except Exception:
YoudaoEmbed._client = qanthing(
model_name_or_path=model_name.replace(
"maidalun1020", "InfiniFlow"))

View File

@ -27,6 +27,7 @@ from api.settings import LIGHTEN
from api.utils.file_utils import get_home_cache_dir
from rag.utils import num_tokens_from_string, truncate
import json
from api.utils.log_utils import logger
def sigmoid(x):
@ -66,7 +67,7 @@ class DefaultRerank(Base):
DefaultRerank._model = FlagReranker(
os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z]+/", "", model_name)),
use_fp16=torch.cuda.is_available())
except Exception as e:
except Exception:
model_dir = snapshot_download(repo_id=model_name,
local_dir=os.path.join(get_home_cache_dir(),
re.sub(r"^[a-zA-Z]+/", "", model_name)),
@ -126,11 +127,11 @@ class YoudaoRerank(DefaultRerank):
with YoudaoRerank._model_lock:
if not YoudaoRerank._model:
try:
print("LOADING BCE...")
logger.info("LOADING BCE...")
YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(
get_home_cache_dir(),
re.sub(r"^[a-zA-Z]+/", "", model_name)))
except Exception as e:
except Exception:
YoudaoRerank._model = RerankerModel(
model_name_or_path=model_name.replace(
"maidalun1020", "InfiniFlow"))

View File

@ -26,6 +26,7 @@ from word2number import w2n
from cn2an import cn2an
from PIL import Image
import json
from api.utils.log_utils import logger
all_codecs = [
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@ -235,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
print("--", ck)
logger.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
try:
@ -254,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
print("--", ck)
logger.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
tokenize(d, ck, eng)
@ -457,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
for i in range(len(cks)):
cks[i] = [sections[j] for j in cks[i][::-1]]
print("--------------\n", "\n* ".join(cks[i]))
logger.info("\n* ".join(cks[i]))
res = [[]]
num = [0]

View File

@ -22,10 +22,10 @@ import re
import string
import sys
from hanziconv import HanziConv
from huggingface_hub import snapshot_download
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class RagTokenizer:
@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
def loadDict_(self, fnm):
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
logger.info(f"[HUQIE]:Build trie {fnm}")
try:
of = open(fnm, "r", encoding='utf-8')
while True:
@ -52,8 +52,9 @@ class RagTokenizer:
self.trie_[self.rkey_(line[0])] = 1
self.trie_.save(fnm + ".trie")
of.close()
except Exception as e:
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)
except Exception:
logger.exception(f"[HUQIE]:Build trie {fnm} failed")
def __init__(self, debug=False):
self.DEBUG = debug
@ -68,8 +69,8 @@ class RagTokenizer:
try:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception as e:
print("[HUQIE]:Build default trie", file=sys.stderr)
except Exception:
logger.exception("[HUQIE]:Build default trie")
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(self.DIR_ + ".txt")
@ -78,7 +79,7 @@ class RagTokenizer:
try:
self.trie_ = datrie.Trie.load(fnm + ".trie")
return
except Exception as e:
except Exception:
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(fnm)
@ -173,8 +174,7 @@ class RagTokenizer:
tks.append(tk)
F /= len(tks)
L /= len(tks)
if self.DEBUG:
print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
return tks, B / len(tks) + L + F
def sortTks_(self, tkslist):
@ -278,8 +278,8 @@ class RagTokenizer:
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
print("[FW]", tks, s)
print("[BW]", tks1, s1)
logger.debug("[FW] {} {}".format(tks, s))
logger.debug("[BW] {} {}".format(tks1, s1))
i, j, _i, _j = 0, 0, 0, 0
same = 0
@ -326,8 +326,7 @@ class RagTokenizer:
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
res = " ".join(self.english_normalize_(res))
if self.DEBUG:
print("[TKS]", self.merge_(res))
logger.debug("[TKS] {}".format(self.merge_(res)))
return self.merge_(res)
def fine_grained_tokenize(self, tks):
@ -418,30 +417,30 @@ if __name__ == '__main__':
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
print(tknzr.fine_grained_tokenize(tks))
logger.info(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
@ -451,5 +450,5 @@ if __name__ == '__main__':
line = of.readline()
if not line:
break
print(tknzr.tokenize(line))
logger.info(tknzr.tokenize(line))
of.close()

View File

@ -19,7 +19,7 @@ import json
from typing import List, Optional, Dict, Union
from dataclasses import dataclass
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag.utils import rmSpace
from rag.nlp import rag_tokenizer, query
import numpy as np
@ -83,7 +83,7 @@ class Dealer:
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
else:
highlightFields = ["content_ltks", "title_tks"] if highlight else []
matchText, keywords = self.qryr.question(qst, min_match=0.3)
@ -91,7 +91,7 @@ class Dealer:
matchExprs = [matchText]
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
else:
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
@ -102,7 +102,7 @@ class Dealer:
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search TOTAL: {}".format(total))
logger.info("Dealer.search TOTAL: {}".format(total))
# If result is empty, try again with lower min_match
if total == 0:
@ -112,7 +112,7 @@ class Dealer:
matchDense.extra_options["similarity"] = 0.17
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
doc_store_logger.info("Dealer.search 2 TOTAL: {}".format(total))
logger.info("Dealer.search 2 TOTAL: {}".format(total))
for k in keywords:
kwds.add(k)
@ -123,7 +123,7 @@ class Dealer:
continue
kwds.add(kk)
doc_store_logger.info(f"TOTAL: {total}")
logger.info(f"TOTAL: {total}")
ids=self.dataStore.getChunkIds(res)
keywords=list(kwds)
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
@ -180,7 +180,7 @@ class Dealer:
continue
idx.append(i)
pieces_.append(t)
doc_store_logger.info("{} => {}".format(answer, pieces_))
logger.info("{} => {}".format(answer, pieces_))
if not pieces_:
return answer, set([])
@ -201,7 +201,7 @@ class Dealer:
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
doc_store_logger.info("{} SIM: {}".format(pieces_[i], mx))
logger.info("{} SIM: {}".format(pieces_[i], mx))
if mx < thr:
continue
cites[idx[i]] = list(

View File

@ -17,10 +17,10 @@
import json
import os
import time
import logging
import re
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@ -32,15 +32,15 @@ class Dealer:
path = os.path.join(get_project_base_directory(), "rag/res", "synonym.json")
try:
self.dictionary = json.load(open(path, 'r'))
except Exception as e:
logging.warn("Missing synonym.json")
except Exception:
logger.warn("Missing synonym.json")
self.dictionary = {}
if not redis:
logging.warning(
logger.warning(
"Realtime synonym is disabled, since no redis connection.")
if not len(self.dictionary.keys()):
logging.warning(f"Fail to load synonym")
logger.warning("Fail to load synonym")
self.redis = redis
self.load()
@ -64,7 +64,7 @@ class Dealer:
d = json.loads(d)
self.dictionary = d
except Exception as e:
logging.error("Fail to load synonym!" + str(e))
logger.error("Fail to load synonym!" + str(e))
def lookup(self, tk):
self.lookup_num += 1

View File

@ -21,6 +21,7 @@ import os
import numpy as np
from rag.nlp import rag_tokenizer
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import logger
class Dealer:
@ -81,12 +82,12 @@ class Dealer:
self.ne, self.df = {}, {}
try:
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
except Exception as e:
print("[WARNING] Load ner.json FAIL!")
except Exception:
logger.warning("Load ner.json FAIL!")
try:
self.df = load_dict(os.path.join(fnm, "term.freq"))
except Exception as e:
print("[WARNING] Load term.freq FAIL!")
except Exception:
logger.warning("Load term.freq FAIL!")
def pretoken(self, txt, num=False, stpwd=True):
patt = [

View File

@ -14,7 +14,6 @@
# limitations under the License.
#
import re
import traceback
from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait
from threading import Lock
from typing import Tuple
@ -22,7 +21,8 @@ import umap
import numpy as np
from sklearn.mixture import GaussianMixture
from rag.utils import num_tokens_from_string, truncate
from rag.utils import truncate
from api.utils.log_utils import logger
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
@ -62,14 +62,13 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
{"temperature": 0.3, "max_tokens": self._max_token}
)
cnt = re.sub("(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)", "", cnt)
print("SUM:", cnt)
logger.info(f"SUM: {cnt}")
embds, _ = self._embd_model.encode([cnt])
with lock:
if not len(embds[0]): return
chunks.append((cnt, embds[0]))
except Exception as e:
print(e, flush=True)
traceback.print_stack(e)
logger.exception("summarize got exception")
return e
labels = []
@ -105,7 +104,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
ck_idx = [i+start for i in range(len(lbls)) if lbls[i] == c]
threads.append(executor.submit(summarize, ck_idx, lock))
wait(threads, return_when=ALL_COMPLETED)
print([t.result() for t in threads])
logger.info(str([t.result() for t in threads]))
assert len(chunks) - end == n_clusters, "{} vs. {}".format(len(chunks) - end, n_clusters)
labels.extend(lbls)

View File

@ -14,15 +14,11 @@
# limitations under the License.
#
import os
import logging
from api.utils import get_base_config, decrypt_database_config
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import LoggerFactory, getLogger
# Server
RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf")
SUBPROCESS_STD_LOG_NAME = "std.log"
ES = get_base_config("es", {})
INFINITY = get_base_config("infinity", {"uri": "infinity:23817"})
@ -36,29 +32,6 @@ except Exception:
pass
DOC_MAXIMUM_SIZE = int(os.environ.get("MAX_CONTENT_LENGTH", 128 * 1024 * 1024))
# Logger
LoggerFactory.set_directory(
os.path.join(
get_project_base_directory(),
"logs",
"rag"))
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
LoggerFactory.LEVEL = 30
doc_store_logger = getLogger("doc_store")
minio_logger = getLogger("minio")
s3_logger = getLogger("s3")
azure_logger = getLogger("azure")
cron_logger = getLogger("cron_logger")
chunk_logger = getLogger("chunk_logger")
database_logger = getLogger("database")
formatter = logging.Formatter("%(asctime)-15s %(levelname)-8s (%(process)d) %(message)s")
for logger in [doc_store_logger, minio_logger, s3_logger, azure_logger, cron_logger, chunk_logger, database_logger]:
logger.setLevel(logging.INFO)
for handler in logger.handlers:
handler.setFormatter(fmt=formatter)
SVR_QUEUE_NAME = "rag_flow_svr_queue"
SVR_QUEUE_RETENTION = 60*60
SVR_QUEUE_MAX_LEN = 1024

View File

@ -13,20 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
import time
import traceback
from api.db.db_models import close_connection
from api.db.services.task_service import TaskService
from rag.settings import cron_logger
from api.utils.log_utils import logger
from rag.utils.storage_factory import STORAGE_IMPL
from rag.utils.redis_conn import REDIS_CONN
def collect():
doc_locations = TaskService.get_ongoing_doc_name()
print(doc_locations)
logger.info(doc_locations)
if len(doc_locations) == 0:
time.sleep(1)
return
@ -35,7 +34,7 @@ def collect():
def main():
locations = collect()
if not locations:return
print("TASKS:", len(locations))
logger.info(f"TASKS: {len(locations)}")
for kb_id, loc in locations:
try:
if REDIS_CONN.is_alive():
@ -44,7 +43,7 @@ def main():
if REDIS_CONN.exist(key):continue
file_bin = STORAGE_IMPL.get(kb_id, loc)
REDIS_CONN.transaction(key, file_bin, 12 * 60)
cron_logger.info("CACHE: {}".format(loc))
logger.info("CACHE: {}".format(loc))
except Exception as e:
traceback.print_stack(e)
except Exception as e:

View File

@ -17,6 +17,7 @@ import discord
import requests
import base64
import asyncio
from api.utils.log_utils import logger
URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk
@ -36,7 +37,7 @@ client = discord.Client(intents=intents)
@client.event
async def on_ready():
print(f'We have logged in as {client.user}')
logger.info(f'We have logged in as {client.user}')
@client.event

View File

@ -22,7 +22,6 @@ import copy
import re
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from io import BytesIO
@ -43,8 +42,8 @@ from api.db.db_models import close_connection
from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
from rag.nlp import search, rag_tokenizer
from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
from rag.settings import database_logger, SVR_QUEUE_NAME
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from api.utils.log_utils import logger, LOG_FILE
from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
from rag.utils import rmSpace, num_tokens_from_string
from rag.utils.redis_conn import REDIS_CONN, Payload
from rag.utils.storage_factory import STORAGE_IMPL
@ -90,8 +89,8 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
d["progress"] = prog
try:
TaskService.update_progress(task_id, d)
except Exception as e:
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
except Exception:
logger.exception(f"set_progress({task_id}) got exception")
close_connection()
if cancel:
@ -110,8 +109,8 @@ def collect():
if not PAYLOAD:
time.sleep(1)
return pd.DataFrame()
except Exception as e:
cron_logger.error("Get task event from queue exception:" + str(e))
except Exception:
logger.exception("Get task event from queue exception")
return pd.DataFrame()
msg = PAYLOAD.get_message()
@ -119,11 +118,11 @@ def collect():
return pd.DataFrame()
if TaskService.do_cancel(msg["id"]):
cron_logger.info("Task {} has been canceled.".format(msg["id"]))
logger.info("Task {} has been canceled.".format(msg["id"]))
return pd.DataFrame()
tasks = TaskService.get_tasks(msg["id"])
if not tasks:
cron_logger.warning("{} empty task!".format(msg["id"]))
logger.warning("{} empty task!".format(msg["id"]))
return []
tasks = pd.DataFrame(tasks)
@ -152,33 +151,29 @@ def build(row):
st = timer()
bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
binary = get_storage_binary(bucket, name)
cron_logger.info(
logger.info(
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
except TimeoutError:
callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
cron_logger.error(
"Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"]))
logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
return
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
else:
callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
traceback.print_exc()
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
return
try:
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
cron_logger.info(
"Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
except Exception as e:
callback(-1, "Internal server error while chunking: %s" %
str(e).replace("'", ""))
cron_logger.error(
"Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
traceback.print_exc()
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
return
docs = []
@ -214,14 +209,13 @@ def build(row):
st = timer()
STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue())
el += timer() - st
except Exception as e:
cron_logger.error(str(e))
traceback.print_exc()
except Exception:
logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
d["img_id"] = "{}-{}".format(row["kb_id"], d["id"])
del d["image"]
docs.append(d)
cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
logger.info("MINIO PUT({}):{}".format(row["name"], el))
if row["parser_config"].get("auto_keywords", 0):
callback(msg="Start to generate keywords for every chunk ...")
@ -347,7 +341,7 @@ def main():
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
except Exception as e:
callback(-1, msg=str(e))
cron_logger.error(str(e))
logger.exception("LLMBundle got exception")
continue
if r.get("task_type", "") == "raptor":
@ -356,12 +350,12 @@ def main():
cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
except Exception as e:
callback(-1, msg=str(e))
cron_logger.error(str(e))
logger.exception("run_raptor got exception")
continue
else:
st = timer()
cks = build(r)
cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
if cks is None:
continue
if not cks:
@ -377,12 +371,12 @@ def main():
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
except Exception as e:
callback(-1, "Embedding error:{}".format(str(e)))
cron_logger.error(str(e))
logger.exception("run_rembedding got exception")
tk_count = 0
cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
# cron_logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
init_kb(r, vector_size)
chunk_count = len(set([c["id"] for c in cks]))
st = timer()
@ -393,11 +387,11 @@ def main():
if b % 128 == 0:
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
if es_r:
callback(-1, "Insert chunk error, detail info please check ragflow-logs/api/cron_logger.log. Please also check ES status!")
callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!")
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
cron_logger.error('Insert chunk error: ' + str(es_r))
logger.error('Insert chunk error: ' + str(es_r))
else:
if TaskService.do_cancel(r["id"]):
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
@ -405,7 +399,7 @@ def main():
callback(1., "Done!")
DocumentService.increment_chunk_num(
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
cron_logger.info(
logger.info(
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
r["id"], tk_count, len(cks), timer() - st))
@ -421,16 +415,16 @@ def report_status():
obj[CONSUMER_NAME].append(timer())
obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
except Exception as e:
print("[Exception]:", str(e))
except Exception:
logger.exception("report_status got exception")
time.sleep(30)
if __name__ == "__main__":
peewee_logger = logging.getLogger('peewee')
peewee_logger.propagate = False
peewee_logger.addHandler(database_logger.handlers[0])
peewee_logger.setLevel(database_logger.level)
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)
exe = ThreadPoolExecutor(max_workers=1)
exe.submit(report_status)
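
The `__main__` block above now attaches peewee's logger to the unified logger's first handler and its level, where it previously reused the dedicated `database_logger` removed from `rag/settings.py`. The same pattern routes any third-party library's logging into the single log file; a small sketch, assuming a shared logger configured roughly as in the earlier sketch:

```python
import logging

# Stand-in for the unified logger from api.utils.log_utils (name and config assumed).
handler = logging.FileHandler("ragflow.log")
handler.setLevel(logging.INFO)
logger = logging.getLogger("ragflow")
logger.setLevel(logging.INFO)
logger.addHandler(handler)

# Mirror the task_executor change: route peewee's output through the same
# handler and level instead of a separate database log file.
peewee_logger = logging.getLogger("peewee")
peewee_logger.propagate = False
peewee_logger.addHandler(logger.handlers[0])
peewee_logger.setLevel(logger.handlers[0].level)
```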

View File

@ -2,7 +2,6 @@ import os
import time
from io import BytesIO
from rag import settings
from rag.settings import azure_logger
from rag.utils import singleton
from azure.storage.blob import ContainerClient
@ -19,14 +18,13 @@ class RAGFlowAzureSasBlob(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass
try:
self.conn = ContainerClient.from_container_url(self.container_url + "?" + self.sas_token)
except Exception as e:
azure_logger.error(
"Fail to connect %s " % self.container_url + str(e))
except Exception:
logger.exception("Fail to connect %s " % self.container_url)
def __close__(self):
del self.conn
@ -40,24 +38,24 @@ class RAGFlowAzureSasBlob(object):
for _ in range(3):
try:
return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
def rm(self, bucket, fnm):
try:
self.conn.delete_blob(fnm)
except Exception as e:
azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
try:
r = self.conn.download_blob(fnm)
return r.read()
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -65,16 +63,16 @@ class RAGFlowAzureSasBlob(object):
def obj_exist(self, bucket, fnm):
try:
return self.conn.get_blob_client(fnm).exists()
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
return False
def get_presigned_url(self, bucket, fnm, expires):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return

View File

@ -1,7 +1,6 @@
import os
import time
from rag import settings
from rag.settings import azure_logger
from rag.utils import singleton
from azure.identity import ClientSecretCredential, AzureAuthorityHosts
from azure.storage.filedatalake import FileSystemClient
@ -22,15 +21,14 @@ class RAGFlowAzureSpnBlob(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass
try:
credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
except Exception as e:
azure_logger.error(
"Fail to connect %s " % self.account_url + str(e))
except Exception:
logger.exception("Fail to connect %s" % self.account_url)
def __close__(self):
del self.conn
@ -48,16 +46,16 @@ class RAGFlowAzureSpnBlob(object):
f = self.conn.create_file(fnm)
f.append_data(binary, offset=0, length=len(binary))
return f.flush_data(len(binary))
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
def rm(self, bucket, fnm):
try:
self.conn.delete_file(fnm)
except Exception as e:
azure_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
@ -65,8 +63,8 @@ class RAGFlowAzureSpnBlob(object):
client = self.conn.get_file_client(fnm)
r = client.download_file()
return r.read()
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -75,16 +73,16 @@ class RAGFlowAzureSpnBlob(object):
try:
client = self.conn.get_file_client(fnm)
return client.exists()
except Exception as e:
azure_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
return False
def get_presigned_url(self, bucket, fnm, expires):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
azure_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return

View File

@ -9,7 +9,7 @@ import copy
from elasticsearch import Elasticsearch
from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag import settings
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
@ -17,7 +17,7 @@ import polars as pl
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, FusionExpr
from rag.nlp import is_english, rag_tokenizer
doc_store_logger.info("Elasticsearch sdk version: "+str(elasticsearch.__version__))
logger.info("Elasticsearch sdk version: "+str(elasticsearch.__version__))
@singleton
@ -34,10 +34,10 @@ class ESConnection(DocStoreConnection):
)
if self.es:
self.info = self.es.info()
doc_store_logger.info("Connect to es.")
logger.info("Connect to es.")
break
except Exception as e:
doc_store_logger.error("Fail to connect to es: " + str(e))
except Exception:
logger.exception("Fail to connect to es")
time.sleep(1)
if not self.es.ping():
raise Exception("Can't connect to ES cluster")
@ -70,14 +70,14 @@ class ESConnection(DocStoreConnection):
return IndicesClient(self.es).create(index=indexName,
settings=self.mapping["settings"],
mappings=self.mapping["mappings"])
except Exception as e:
doc_store_logger.error("ES create index error %s ----%s" % (indexName, str(e)))
except Exception:
logger.exception("ES create index error %s" % (indexName))
def deleteIdx(self, indexName: str, knowledgebaseId: str):
try:
return self.es.indices.delete(indexName, allow_no_indices=True)
except Exception as e:
doc_store_logger.error("ES delete index error %s ----%s" % (indexName, str(e)))
except Exception:
logger.exception("ES delete index error %s" % (indexName))
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
s = Index(indexName, self.es)
@ -85,7 +85,7 @@ class ESConnection(DocStoreConnection):
try:
return s.exists()
except Exception as e:
doc_store_logger.error("ES indexExist: " + str(e))
logger.exception("ES indexExist")
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -159,7 +159,7 @@ class ESConnection(DocStoreConnection):
if limit > 0:
s = s[offset:limit]
q = s.to_dict()
doc_store_logger.info("ESConnection.search [Q]: " + json.dumps(q))
# logger.info("ESConnection.search [Q]: " + json.dumps(q))
for i in range(3):
try:
@ -171,18 +171,14 @@ class ESConnection(DocStoreConnection):
_source=True)
if str(res.get("timed_out", "")).lower() == "true":
raise Exception("Es Timeout.")
doc_store_logger.info("ESConnection.search res: " + str(res))
logger.info("ESConnection.search res: " + str(res))
return res
except Exception as e:
doc_store_logger.error(
"ES search exception: " +
str(e) +
"\n[Q]: " +
str(q))
logger.exception("ES search [Q]: " + str(q))
if str(e).find("Timeout") > 0:
continue
raise e
doc_store_logger.error("ES search timeout for 3 times!")
logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
@ -198,15 +194,11 @@ class ESConnection(DocStoreConnection):
chunk["id"] = chunkId
return chunk
except Exception as e:
doc_store_logger.error(
"ES get exception: " +
str(e) +
"[Q]: " +
chunkId)
logger.exception(f"ES get({chunkId}) got exception")
if str(e).find("Timeout") > 0:
continue
raise e
doc_store_logger.error("ES search timeout for 3 times!")
logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
@ -236,7 +228,7 @@ class ESConnection(DocStoreConnection):
res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
return res
except Exception as e:
doc_store_logger.warning("Fail to bulk: " + str(e))
logger.warning("Fail to bulk: " + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -253,9 +245,7 @@ class ESConnection(DocStoreConnection):
self.es.update(index=indexName, id=chunkId, doc=doc)
return True
except Exception as e:
doc_store_logger.error(
"ES update exception: " + str(e) + " id:" + str(id) +
json.dumps(newValue, ensure_ascii=False))
logger.exception(f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
if str(e).find("Timeout") > 0:
continue
else:
@ -292,8 +282,7 @@ class ESConnection(DocStoreConnection):
_ = ubq.execute()
return True
except Exception as e:
doc_store_logger.error("ES update exception: " +
str(e) + "[Q]:" + str(bqry.to_dict()))
logger.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
continue
return False
@ -315,7 +304,7 @@ class ESConnection(DocStoreConnection):
qry.must.append(Q("term", **{k: v}))
else:
raise Exception("Condition value must be int, str or list.")
doc_store_logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
for _ in range(10):
try:
res = self.es.delete_by_query(
@ -324,7 +313,7 @@ class ESConnection(DocStoreConnection):
refresh=True)
return res["deleted"]
except Exception as e:
doc_store_logger.warning("Fail to delete: " + str(filter) + str(e))
logger.warning("Fail to delete: " + str(filter) + str(e))
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
time.sleep(3)
continue
@ -407,7 +396,7 @@ class ESConnection(DocStoreConnection):
SQL
"""
def sql(self, sql: str, fetch_size: int, format: str):
doc_store_logger.info(f"ESConnection.sql get sql: {sql}")
logger.info(f"ESConnection.sql get sql: {sql}")
sql = re.sub(r"[ `]+", " ", sql)
sql = sql.replace("%", "")
replaces = []
@ -424,17 +413,17 @@ class ESConnection(DocStoreConnection):
for p, r in replaces:
sql = sql.replace(p, r, 1)
doc_store_logger.info(f"ESConnection.sql to es: {sql}")
logger.info(f"ESConnection.sql to es: {sql}")
for i in range(3):
try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout="2s")
return res
except ConnectionTimeout:
doc_store_logger.error("ESConnection.sql timeout [Q]: " + sql)
logger.exception("ESConnection.sql timeout [Q]: " + sql)
continue
except Exception as e:
doc_store_logger.error(f"ESConnection.sql failure: {sql} => " + str(e))
except Exception:
logger.exception("ESConnection.sql got exception [Q]: " + sql)
return None
doc_store_logger.error("ESConnection.sql timeout for 3 times!")
logger.error("ESConnection.sql timeout for 3 times!")
return None

View File

@ -7,7 +7,7 @@ from infinity.common import ConflictType, InfinityException
from infinity.index import IndexInfo, IndexType
from infinity.connection_pool import ConnectionPool
from rag import settings
from rag.settings import doc_store_logger
from api.utils.log_utils import logger
from rag.utils import singleton
import polars as pl
from polars.series.series import Series
@ -22,7 +22,6 @@ from rag.utils.doc_store_conn import (
OrderByExpr,
)
def equivalent_condition_to_str(condition: dict) -> str:
assert "_id" not in condition
cond = list()
@ -56,7 +55,7 @@ class InfinityConnection(DocStoreConnection):
host, port = infinity_uri.split(":")
infinity_uri = infinity.common.NetworkAddress(host, int(port))
self.connPool = ConnectionPool(infinity_uri)
doc_store_logger.info(f"Connected to infinity {infinity_uri}.")
logger.info(f"Connected to infinity {infinity_uri}.")
"""
Database operations
@ -71,7 +70,7 @@ class InfinityConnection(DocStoreConnection):
TODO: Infinity-sdk provides health() to wrap `show global variables` and `show tables`
"""
inf_conn = self.connPool.get_conn()
res = infinity.show_current_node()
res = inf_conn.show_current_node()
self.connPool.release_conn(inf_conn)
color = "green" if res.error_code == 0 else "red"
res2 = {
@ -132,7 +131,7 @@ class InfinityConnection(DocStoreConnection):
)
break
self.connPool.release_conn(inf_conn)
doc_store_logger.info(
logger.info(
f"INFINITY created table {table_name}, vector size {vectorSize}"
)
@ -142,7 +141,7 @@ class InfinityConnection(DocStoreConnection):
db_instance = inf_conn.get_database(self.dbName)
db_instance.drop_table(table_name, ConflictType.Ignore)
self.connPool.release_conn(inf_conn)
doc_store_logger.info(f"INFINITY dropped table {table_name}")
logger.info(f"INFINITY dropped table {table_name}")
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
table_name = f"{indexName}_{knowledgebaseId}"
@ -152,8 +151,8 @@ class InfinityConnection(DocStoreConnection):
_ = db_instance.get_table(table_name)
self.connPool.release_conn(inf_conn)
return True
except Exception as e:
doc_store_logger.error("INFINITY indexExist: " + str(e))
except Exception:
logger.exception("INFINITY indexExist")
return False
"""
@ -263,7 +262,7 @@ class InfinityConnection(DocStoreConnection):
df_list.append(kb_res)
self.connPool.release_conn(inf_conn)
res = pl.concat(df_list)
doc_store_logger.info("INFINITY search tables: " + str(table_list))
logger.info("INFINITY search tables: " + str(table_list))
return res
def get(
@ -318,8 +317,8 @@ class InfinityConnection(DocStoreConnection):
str_filter = f"id IN ({str_ids})"
table_instance.delete(str_filter)
# for doc in documents:
# doc_store_logger.info(f"insert position_list: {doc['position_list']}")
# doc_store_logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
# logger.info(f"insert position_list: {doc['position_list']}")
# logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
table_instance.insert(documents)
self.connPool.release_conn(inf_conn)
doc_store_logger.info(f"inserted into {table_name} {str_ids}.")
@ -329,7 +328,7 @@ class InfinityConnection(DocStoreConnection):
self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
) -> bool:
# if 'position_list' in newValue:
# doc_store_logger.info(f"update position_list: {newValue['position_list']}")
# logger.info(f"upsert position_list: {newValue['position_list']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
table_name = f"{indexName}_{knowledgebaseId}"
@ -350,7 +349,7 @@ class InfinityConnection(DocStoreConnection):
try:
table_instance = db_instance.get_table(table_name)
except Exception:
doc_store_logger.warning(
logger.warning(
f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
)
return 0

View File

@ -1,10 +1,9 @@
import os
import time
from minio import Minio
from io import BytesIO
from rag import settings
from rag.settings import minio_logger
from rag.utils import singleton
from api.utils.log_utils import logger
@singleton
@ -17,7 +16,7 @@ class RAGFlowMinio(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass
try:
@ -26,9 +25,9 @@ class RAGFlowMinio(object):
secret_key=settings.MINIO["password"],
secure=False
)
except Exception as e:
minio_logger.error(
"Fail to connect %s " % settings.MINIO["host"] + str(e))
except Exception:
logger.exception(
"Fail to connect %s " % settings.MINIO["host"])
def __close__(self):
del self.conn
@ -55,24 +54,24 @@ class RAGFlowMinio(object):
len(binary)
)
return r
except Exception as e:
minio_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
def rm(self, bucket, fnm):
try:
self.conn.remove_object(bucket, fnm)
except Exception as e:
minio_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
def get(self, bucket, fnm):
for _ in range(1):
try:
r = self.conn.get_object(bucket, fnm)
return r.read()
except Exception as e:
minio_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
@ -81,8 +80,8 @@ class RAGFlowMinio(object):
try:
if self.conn.stat_object(bucket, fnm):return True
return False
except Exception as e:
minio_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
return False
@ -90,8 +89,8 @@ class RAGFlowMinio(object):
for _ in range(10):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception as e:
minio_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return

View File

@ -110,9 +110,8 @@ class RedisDB:
#pipeline.expire(queue, exp)
pipeline.execute()
return True
except Exception as e:
print(e)
logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
except Exception:
logging.exception("producer" + str(queue) + " got exception")
return False
def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
@ -143,7 +142,7 @@ class RedisDB:
if 'key' in str(e):
pass
else:
logging.warning("[EXCEPTION]consumer: " + str(queue_name) + "||" + str(e))
logging.exception("consumer: " + str(queue_name) + " got exception")
return None
def get_unacked_for(self, consumer_name, queue_name, group_name):
@ -160,7 +159,7 @@ class RedisDB:
except Exception as e:
if 'key' in str(e):
return
logging.warning("[EXCEPTION]xpending_range: " + consumer_name + "||" + str(e))
logging.exception("xpending_range: " + consumer_name + " got exception")
self.__open__()
REDIS_CONN = RedisDB()

View File

@ -4,7 +4,6 @@ from botocore.exceptions import ClientError
from botocore.client import Config
import time
from io import BytesIO
from rag.settings import s3_logger
from rag.utils import singleton
@singleton
@ -21,7 +20,7 @@ class RAGFlowS3(object):
try:
if self.conn:
self.__close__()
except Exception as e:
except Exception:
pass
try:
@ -40,9 +39,9 @@ class RAGFlowS3(object):
aws_secret_access_key=self.secret_key,
config=config
)
except Exception as e:
s3_logger.error(
"Fail to connect %s " % self.endpoint + str(e))
except Exception:
logger.exception(
"Fail to connect %s" % self.endpoint)
def __close__(self):
del self.conn
@ -50,11 +49,11 @@ class RAGFlowS3(object):
def bucket_exists(self, bucket):
try:
s3_logger.error(f"head_bucket bucketname {bucket}")
logger.debug(f"head_bucket bucketname {bucket}")
self.conn.head_bucket(Bucket=bucket)
exists = True
except ClientError as e:
s3_logger.error(f"head_bucket error {bucket}: " + str(e))
except ClientError:
logger.exception(f"head_bucket error {bucket}")
exists = False
return exists
@ -63,7 +62,7 @@ class RAGFlowS3(object):
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
s3_logger.error(f"create bucket {bucket} ********")
logger.debug(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
@ -75,25 +74,25 @@ class RAGFlowS3(object):
return []
def put(self, bucket, fnm, binary):
s3_logger.error(f"bucket name {bucket}; filename :{fnm}:")
logger.debug(f"bucket name {bucket}; filename :{fnm}:")
for _ in range(1):
try:
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
s3_logger.error(f"create bucket {bucket} ********")
logger.info(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
except Exception as e:
s3_logger.error(f"Fail put {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)
def rm(self, bucket, fnm):
try:
self.conn.delete_object(Bucket=bucket, Key=fnm)
except Exception as e:
s3_logger.error(f"Fail rm {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
def get(self, bucket, fnm):
for _ in range(1):
@ -101,8 +100,8 @@ class RAGFlowS3(object):
r = self.conn.get_object(Bucket=bucket, Key=fnm)
object_data = r['Body'].read()
return object_data
except Exception as e:
s3_logger.error(f"fail get {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -128,8 +127,8 @@ class RAGFlowS3(object):
ExpiresIn=expires)
return r
except Exception as e:
s3_logger.error(f"fail get url {bucket}/{fnm}: " + str(e))
except Exception:
logger.exception(f"fail get url {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return