Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Use consistent log file names, introduced initLogger (#3403)
### What problem does this PR solve?

Use consistent log file names and introduce `initRootLogger`, so each service configures logging once and writes to a predictably named log file.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
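The diff below shows the two halves of the refactoring: modules drop the shared `logger` imported from `api.utils.log_utils` in favour of the standard `logging` module, and entry-point scripts such as the task executor call `initRootLogger(inspect.getfile(inspect.currentframe()))` once so the root logger writes to a file named after that script. The implementation of `initRootLogger` itself is not part of this excerpt, so the following is only a minimal sketch of how such a helper could look; the log directory, rotation settings, and format string are assumptions, not the project's actual code.

```python
import logging
import os
from logging.handlers import RotatingFileHandler


def initRootLogger(script_path: str, log_dir: str = "logs") -> None:
    """Hypothetical sketch: configure the root logger to write to a file
    named after the entry-point script, e.g. task_executor.py -> logs/task_executor.log."""
    service_name = os.path.splitext(os.path.basename(script_path))[0]
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, f"{service_name}.log")

    formatter = logging.Formatter(
        "%(asctime)-15s %(levelname)-8s %(process)d %(message)s")

    root = logging.getLogger()
    root.setLevel(logging.INFO)

    # One rotating file per service gives the "consistent log file names".
    file_handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(formatter)
    root.addHandler(file_handler)

    # Keep a console handler so container logs stay visible.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    root.addHandler(console_handler)
```

With the root logger configured once per process, module code only needs plain `logging.debug(...)` / `logging.info(...)` calls, which is exactly the substitution the diff performs; noisy third-party loggers such as `pdfminer` are raised to `WARNING`, and `peewee` has its handlers cleared with `propagate = True` so its records flow through the same root handlers.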
@@ -10,6 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from tika import parser
 import re
 from io import BytesIO
@@ -20,7 +21,6 @@ from rag.nlp import bullets_category, is_english,remove_contents_table, \
     tokenize_chunks
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
-from api.utils.log_utils import logger


 class Pdf(PdfParser):
@@ -39,7 +39,7 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        logger.info("layouts: {}".format(timer() - start))
+        logging.debug("layouts: {}".format(timer() - start))
         self._table_transformer_job(zoomin)
         callback(0.68, "Table analysis finished")
         self._text_merge()

@ -11,6 +11,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
from email import policy
|
||||
from email.parser import BytesParser
|
||||
from rag.app.naive import chunk as naive_chunk
|
||||
@ -18,7 +19,6 @@ import re
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
|
||||
from deepdoc.parser import HtmlParser, TxtParser
|
||||
from timeit import default_timer as timer
|
||||
from api.utils.log_utils import logger
|
||||
import io
|
||||
|
||||
|
||||
@ -86,7 +86,7 @@ def chunk(
|
||||
)
|
||||
|
||||
main_res.extend(tokenize_chunks(chunks, doc, eng, None))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logging.debug("naive_merge({}): {}".format(filename, timer() - st))
|
||||
# get the attachment info
|
||||
for part in msg.iter_attachments():
|
||||
content_disposition = part.get("Content-Disposition")
|
||||
|
||||
@ -21,7 +21,6 @@ from rag.nlp import bullets_category, remove_contents_table, hierarchical_merge,
|
||||
make_colon_as_title, tokenize_chunks, docx_question_level
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Docx(DocxParser):
|
||||
@ -122,7 +121,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.67, "Layout analysis finished")
|
||||
logger.info("layouts:".format(
|
||||
logging.debug("layouts:".format(
|
||||
))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import copy
|
||||
import re
|
||||
|
||||
@ -24,7 +25,6 @@ from rag.utils import num_tokens_from_string
|
||||
from deepdoc.parser import PdfParser, PlainParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -48,11 +48,11 @@ class Pdf(PdfParser):
|
||||
# for bb in self.boxes:
|
||||
# for b in bb:
|
||||
# print(b)
|
||||
logger.info("OCR: {}".format(timer() - start))
|
||||
logging.debug("OCR: {}".format(timer() - start))
|
||||
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.65, "Layout analysis finished.")
|
||||
logger.info("layouts: {}".format(timer() - start))
|
||||
logging.debug("layouts: {}".format(timer() - start))
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.67, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
from tika import parser
|
||||
from io import BytesIO
|
||||
from docx import Document
|
||||
@ -19,7 +20,6 @@ from deepdoc.parser.pdf_parser import PlainParser
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
|
||||
naive_merge_docx, tokenize_chunks_docx
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import num_tokens_from_string
|
||||
from PIL import Image
|
||||
from functools import reduce
|
||||
@ -41,13 +41,13 @@ class Docx(DocxParser):
|
||||
try:
|
||||
image_blob = related_part.image.blob
|
||||
except UnrecognizedImageError:
|
||||
logger.info("Unrecognized image format. Skipping image.")
|
||||
logging.info("Unrecognized image format. Skipping image.")
|
||||
return None
|
||||
except UnexpectedEndOfFileError:
|
||||
logger.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
||||
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
||||
return None
|
||||
except InvalidImageStreamError:
|
||||
logger.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||
return None
|
||||
try:
|
||||
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||
@ -133,7 +133,7 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished")
|
||||
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
logging.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
@ -147,7 +147,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward()
|
||||
# self._filter_forpages()
|
||||
|
||||
logger.info("layouts cost: {}s".format(timer() - start))
|
||||
logging.info("layouts cost: {}s".format(timer() - start))
|
||||
return [(b["text"], self._line_tag(b, zoomin))
|
||||
for b in self.boxes], tbls
|
||||
|
||||
@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return chunks
|
||||
|
||||
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logging.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
@ -280,7 +280,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return chunks
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
logging.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
return res
|
||||
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
from tika import parser
|
||||
from io import BytesIO
|
||||
import re
|
||||
@ -18,7 +19,6 @@ from deepdoc.parser.utils import get_text
|
||||
from rag.app import laws
|
||||
from rag.nlp import rag_tokenizer, tokenize
|
||||
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -38,7 +38,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin, drop=False)
|
||||
callback(0.63, "Layout analysis finished.")
|
||||
logger.info("layouts cost: {}s".format(timer() - start))
|
||||
logging.debug("layouts cost: {}s".format(timer() - start))
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.65, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import copy
|
||||
import re
|
||||
|
||||
@ -17,7 +18,6 @@ from api.db import ParserType
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from deepdoc.parser import PdfParser, PlainParser
|
||||
import numpy as np
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -41,7 +41,7 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.63, "Layout analysis finished")
|
||||
logger.info(f"layouts cost: {timer() - start}s")
|
||||
logging.debug(f"layouts cost: {timer() - start}s")
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.68, "Table analysis finished")
|
||||
self._text_merge()
|
||||
@ -53,7 +53,7 @@ class Pdf(PdfParser):
|
||||
|
||||
# clean mess
|
||||
if column_width < self.page_images[0].size[0] / zoomin / 2:
|
||||
logger.info("two_column................... {} {}".format(column_width,
|
||||
logging.debug("two_column................... {} {}".format(column_width,
|
||||
self.page_images[0].size[0] / zoomin / 2))
|
||||
self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
|
||||
for b in self.boxes:
|
||||
@ -115,8 +115,8 @@ class Pdf(PdfParser):
|
||||
from_page, min(
|
||||
to_page, self.total_page)))
|
||||
for b in self.boxes:
|
||||
logger.info("{} {}".format(b["text"], b.get("layoutno")))
|
||||
logger.info("{}".format(tbls))
|
||||
logging.debug("{} {}".format(b["text"], b.get("layoutno")))
|
||||
logging.debug("{}".format(tbls))
|
||||
|
||||
return {
|
||||
"title": title,
|
||||
@ -157,7 +157,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
logger.info("It's English.....{}".format(eng))
|
||||
logging.debug("It's English.....{}".format(eng))
|
||||
|
||||
res = tokenize_table(paper["tables"], doc, eng)
|
||||
|
||||
@ -184,7 +184,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
|
||||
sid += 1
|
||||
sec_ids.append(sid)
|
||||
logger.info("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
|
||||
logging.debug("{} {} {} {}".format(lvl, sorted_sections[i][0], most_level, sid))
|
||||
|
||||
chunks = []
|
||||
last_sid = -2
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from io import BytesIO
|
||||
@ -19,7 +20,6 @@ from openpyxl import load_workbook
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.nlp import is_english, random_choices, qbullets_category, add_positions, has_qbullet, docx_question_level
|
||||
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
|
||||
from api.utils.log_utils import logger
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
@ -82,7 +82,7 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished")
|
||||
logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
logging.debug("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin, drop=False)
|
||||
callback(0.63, "Layout analysis finished.")
|
||||
@ -94,7 +94,7 @@ class Pdf(PdfParser):
|
||||
#self._naive_vertical_merge()
|
||||
# self._concat_downward()
|
||||
#self._filter_forpages()
|
||||
logger.info("layouts: {}".format(timer() - start))
|
||||
logging.debug("layouts: {}".format(timer() - start))
|
||||
sections = [b["text"] for b in self.boxes]
|
||||
bull_x0_list = []
|
||||
q_bull, reg = qbullets_category(sections)
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import base64
|
||||
import datetime
|
||||
import json
|
||||
@ -20,7 +21,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser.resume import refactor
|
||||
from deepdoc.parser.resume import step_one, step_two
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import rmSpace
|
||||
|
||||
forbidden_select_fields4resume = [
|
||||
@ -64,7 +64,7 @@ def remote_call(filename, binary):
|
||||
resume = step_two.parse(resume)
|
||||
return resume
|
||||
except Exception:
|
||||
logger.exception("Resume parser error")
|
||||
logging.exception("Resume parser error")
|
||||
return {}
|
||||
|
||||
|
||||
@ -86,7 +86,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
callback(-1, "Resume is not successfully parsed.")
|
||||
raise Exception("Resume parser remote call fail!")
|
||||
callback(0.6, "Done parsing. Chunking...")
|
||||
logger.info("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
|
||||
logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2))
|
||||
|
||||
field_map = {
|
||||
"name_kwd": "姓名/名字",
|
||||
@ -158,7 +158,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
|
||||
doc[n] = resume[n]
|
||||
|
||||
logger.info("chunked resume to " + str(doc))
|
||||
logging.debug("chunked resume to " + str(doc))
|
||||
KnowledgebaseService.update_parser_config(
|
||||
kwargs["kb_id"], {"field_map": field_map})
|
||||
return [doc]
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
import threading
|
||||
@ -32,7 +33,6 @@ from api.utils.file_utils import get_home_cache_dir
|
||||
from rag.utils import num_tokens_from_string, truncate
|
||||
import google.generativeai as genai
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
|
||||
@ -297,7 +297,7 @@ class YoudaoEmbed(Base):
|
||||
if not LIGHTEN and not YoudaoEmbed._client:
|
||||
from BCEmbedding import EmbeddingModel as qanthing
|
||||
try:
|
||||
logger.info("LOADING BCE...")
|
||||
logging.info("LOADING BCE...")
|
||||
YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
|
||||
get_home_cache_dir(),
|
||||
"bce-embedding-base_v1"))
|
||||
|
||||
@ -27,7 +27,6 @@ from api.settings import LIGHTEN
|
||||
from api.utils.file_utils import get_home_cache_dir
|
||||
from rag.utils import num_tokens_from_string, truncate
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
def sigmoid(x):
|
||||
@ -127,7 +126,7 @@ class YoudaoRerank(DefaultRerank):
|
||||
with YoudaoRerank._model_lock:
|
||||
if not YoudaoRerank._model:
|
||||
try:
|
||||
logger.info("LOADING BCE...")
|
||||
logging.info("LOADING BCE...")
|
||||
YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(
|
||||
get_home_cache_dir(),
|
||||
re.sub(r"^[a-zA-Z0-9]+/", "", model_name)))
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import random
|
||||
from collections import Counter
|
||||
|
||||
@ -26,7 +27,6 @@ from word2number import w2n
|
||||
from cn2an import cn2an
|
||||
from PIL import Image
|
||||
import json
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
all_codecs = [
|
||||
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
||||
@ -236,7 +236,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
||||
# wrap up as es documents
|
||||
for ck in chunks:
|
||||
if len(ck.strip()) == 0:continue
|
||||
logger.debug("-- {}".format(ck))
|
||||
logging.debug("-- {}".format(ck))
|
||||
d = copy.deepcopy(doc)
|
||||
if pdf_parser:
|
||||
try:
|
||||
@ -255,7 +255,7 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
|
||||
# wrap up as es documents
|
||||
for ck, image in zip(chunks, images):
|
||||
if len(ck.strip()) == 0:continue
|
||||
logger.debug("-- {}".format(ck))
|
||||
logging.debug("-- {}".format(ck))
|
||||
d = copy.deepcopy(doc)
|
||||
d["image"] = image
|
||||
tokenize(d, ck, eng)
|
||||
@ -458,7 +458,7 @@ def hierarchical_merge(bull, sections, depth):
|
||||
|
||||
for i in range(len(cks)):
|
||||
cks[i] = [sections[j] for j in cks[i][::-1]]
|
||||
logger.info("\n* ".join(cks[i]))
|
||||
logging.debug("\n* ".join(cks[i]))
|
||||
|
||||
res = [[]]
|
||||
num = [0]
|
||||
|
||||
@ -14,9 +14,9 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
from rag.utils.doc_store_conn import MatchTextExpr
|
||||
|
||||
from rag.nlp import rag_tokenizer, term_weight, synonym
|
||||
@ -88,7 +88,7 @@ class FulltextQueryer:
|
||||
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
|
||||
syns.append(" ".join(syn))
|
||||
|
||||
q = ["({}^{:.4f}".format(tk, w) + " %s)".format(syn) for (tk, w), syn in zip(tks_w, syns)]
|
||||
q = ["({}^{:.4f}".format(tk, w) + " %s)".format() for (tk, w), syn in zip(tks_w, syns)]
|
||||
for i in range(1, len(tks_w)):
|
||||
q.append(
|
||||
'"%s %s"^%.4f'
|
||||
@ -121,7 +121,7 @@ class FulltextQueryer:
|
||||
twts = self.tw.weights([tt])
|
||||
syns = self.syn.lookup(tt)
|
||||
if syns: keywords.extend(syns)
|
||||
logging.info(json.dumps(twts, ensure_ascii=False))
|
||||
logging.debug(json.dumps(twts, ensure_ascii=False))
|
||||
tms = []
|
||||
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
||||
sm = (
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import copy
|
||||
import datrie
|
||||
import math
|
||||
@ -25,7 +26,6 @@ from hanziconv import HanziConv
|
||||
from nltk import word_tokenize
|
||||
from nltk.stem import PorterStemmer, WordNetLemmatizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class RagTokenizer:
|
||||
@ -36,7 +36,7 @@ class RagTokenizer:
|
||||
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
|
||||
|
||||
def loadDict_(self, fnm):
|
||||
logger.info(f"[HUQIE]:Build trie {fnm}")
|
||||
logging.info(f"[HUQIE]:Build trie {fnm}")
|
||||
try:
|
||||
of = open(fnm, "r", encoding='utf-8')
|
||||
while True:
|
||||
@ -53,7 +53,7 @@ class RagTokenizer:
|
||||
self.trie_.save(fnm + ".trie")
|
||||
of.close()
|
||||
except Exception:
|
||||
logger.exception(f"[HUQIE]:Build trie {fnm} failed")
|
||||
logging.exception(f"[HUQIE]:Build trie {fnm} failed")
|
||||
|
||||
def __init__(self, debug=False):
|
||||
self.DEBUG = debug
|
||||
@ -69,7 +69,7 @@ class RagTokenizer:
|
||||
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
||||
return
|
||||
except Exception:
|
||||
logger.exception("[HUQIE]:Build default trie")
|
||||
logging.exception("[HUQIE]:Build default trie")
|
||||
self.trie_ = datrie.Trie(string.printable)
|
||||
|
||||
self.loadDict_(self.DIR_ + ".txt")
|
||||
@ -173,7 +173,7 @@ class RagTokenizer:
|
||||
tks.append(tk)
|
||||
F /= len(tks)
|
||||
L /= len(tks)
|
||||
logger.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
|
||||
logging.debug("[SC] {} {} {} {} {}".format(tks, len(tks), L, F, B / len(tks) + L + F))
|
||||
return tks, B / len(tks) + L + F
|
||||
|
||||
def sortTks_(self, tkslist):
|
||||
@ -277,8 +277,8 @@ class RagTokenizer:
|
||||
tks, s = self.maxForward_(L)
|
||||
tks1, s1 = self.maxBackward_(L)
|
||||
if self.DEBUG:
|
||||
logger.debug("[FW] {} {}".format(tks, s))
|
||||
logger.debug("[BW] {} {}".format(tks1, s1))
|
||||
logging.debug("[FW] {} {}".format(tks, s))
|
||||
logging.debug("[BW] {} {}".format(tks1, s1))
|
||||
|
||||
i, j, _i, _j = 0, 0, 0, 0
|
||||
same = 0
|
||||
@ -325,7 +325,7 @@ class RagTokenizer:
|
||||
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
||||
|
||||
res = " ".join(self.english_normalize_(res))
|
||||
logger.debug("[TKS] {}".format(self.merge_(res)))
|
||||
logging.debug("[TKS] {}".format(self.merge_(res)))
|
||||
return self.merge_(res)
|
||||
|
||||
def fine_grained_tokenize(self, tks):
|
||||
@ -416,30 +416,30 @@ if __name__ == '__main__':
|
||||
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
|
||||
tks = tknzr.tokenize(
|
||||
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("虽然我不怎么玩")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
|
||||
logger.info(tknzr.fine_grained_tokenize(tks))
|
||||
logging.info(tknzr.fine_grained_tokenize(tks))
|
||||
if len(sys.argv) < 2:
|
||||
sys.exit()
|
||||
tknzr.DEBUG = False
|
||||
@ -449,5 +449,5 @@ if __name__ == '__main__':
|
||||
line = of.readline()
|
||||
if not line:
|
||||
break
|
||||
logger.info(tknzr.tokenize(line))
|
||||
logging.info(tknzr.tokenize(line))
|
||||
of.close()
|
||||
|
||||
@ -14,12 +14,12 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
from typing import List, Optional, Dict, Union
|
||||
from dataclasses import dataclass
|
||||
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils import rmSpace
|
||||
from rag.nlp import rag_tokenizer, query
|
||||
import numpy as np
|
||||
@ -83,7 +83,7 @@ class Dealer:
|
||||
orderBy.desc("create_timestamp_flt")
|
||||
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logging.debug("Dealer.search TOTAL: {}".format(total))
|
||||
else:
|
||||
highlightFields = ["content_ltks", "title_tks"] if highlight else []
|
||||
matchText, keywords = self.qryr.question(qst, min_match=0.3)
|
||||
@ -91,7 +91,7 @@ class Dealer:
|
||||
matchExprs = [matchText]
|
||||
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logging.debug("Dealer.search TOTAL: {}".format(total))
|
||||
else:
|
||||
matchDense = self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
|
||||
q_vec = matchDense.embedding_data
|
||||
@ -102,7 +102,7 @@ class Dealer:
|
||||
|
||||
res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
logger.info("Dealer.search TOTAL: {}".format(total))
|
||||
logging.debug("Dealer.search TOTAL: {}".format(total))
|
||||
|
||||
# If result is empty, try again with lower min_match
|
||||
if total == 0:
|
||||
@ -112,7 +112,7 @@ class Dealer:
|
||||
matchDense.extra_options["similarity"] = 0.17
|
||||
res = self.dataStore.search(src, highlightFields, filters, [matchText, matchDense, fusionExpr], orderBy, offset, limit, idx_names, kb_ids)
|
||||
total=self.dataStore.getTotal(res)
|
||||
logger.info("Dealer.search 2 TOTAL: {}".format(total))
|
||||
logging.debug("Dealer.search 2 TOTAL: {}".format(total))
|
||||
|
||||
for k in keywords:
|
||||
kwds.add(k)
|
||||
@ -123,7 +123,7 @@ class Dealer:
|
||||
continue
|
||||
kwds.add(kk)
|
||||
|
||||
logger.info(f"TOTAL: {total}")
|
||||
logging.debug(f"TOTAL: {total}")
|
||||
ids=self.dataStore.getChunkIds(res)
|
||||
keywords=list(kwds)
|
||||
highlight = self.dataStore.getHighlight(res, keywords, "content_with_weight")
|
||||
@ -180,7 +180,7 @@ class Dealer:
|
||||
continue
|
||||
idx.append(i)
|
||||
pieces_.append(t)
|
||||
logger.info("{} => {}".format(answer, pieces_))
|
||||
logging.debug("{} => {}".format(answer, pieces_))
|
||||
if not pieces_:
|
||||
return answer, set([])
|
||||
|
||||
@ -201,7 +201,7 @@ class Dealer:
|
||||
chunks_tks,
|
||||
tkweight, vtweight)
|
||||
mx = np.max(sim) * 0.99
|
||||
logger.info("{} SIM: {}".format(pieces_[i], mx))
|
||||
logging.debug("{} SIM: {}".format(pieces_[i], mx))
|
||||
if mx < thr:
|
||||
continue
|
||||
cites[idx[i]] = list(
|
||||
|
||||
@ -14,13 +14,13 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
from nltk.corpus import wordnet
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Dealer:
|
||||
@ -33,14 +33,14 @@ class Dealer:
|
||||
try:
|
||||
self.dictionary = json.load(open(path, 'r'))
|
||||
except Exception:
|
||||
logger.warn("Missing synonym.json")
|
||||
logging.warn("Missing synonym.json")
|
||||
self.dictionary = {}
|
||||
|
||||
if not redis:
|
||||
logger.warning(
|
||||
logging.warning(
|
||||
"Realtime synonym is disabled, since no redis connection.")
|
||||
if not len(self.dictionary.keys()):
|
||||
logger.warning("Fail to load synonym")
|
||||
logging.warning("Fail to load synonym")
|
||||
|
||||
self.redis = redis
|
||||
self.load()
|
||||
@ -64,7 +64,7 @@ class Dealer:
|
||||
d = json.loads(d)
|
||||
self.dictionary = d
|
||||
except Exception as e:
|
||||
logger.error("Fail to load synonym!" + str(e))
|
||||
logging.error("Fail to load synonym!" + str(e))
|
||||
|
||||
def lookup(self, tk):
|
||||
if re.match(r"[a-z]+$", tk):
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import logging
|
||||
import math
|
||||
import json
|
||||
import re
|
||||
@ -21,7 +22,6 @@ import os
|
||||
import numpy as np
|
||||
from rag.nlp import rag_tokenizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class Dealer:
|
||||
@ -83,11 +83,11 @@ class Dealer:
|
||||
try:
|
||||
self.ne = json.load(open(os.path.join(fnm, "ner.json"), "r"))
|
||||
except Exception:
|
||||
logger.warning("Load ner.json FAIL!")
|
||||
logging.warning("Load ner.json FAIL!")
|
||||
try:
|
||||
self.df = load_dict(os.path.join(fnm, "term.freq"))
|
||||
except Exception:
|
||||
logger.warning("Load term.freq FAIL!")
|
||||
logging.warning("Load term.freq FAIL!")
|
||||
|
||||
def pretoken(self, txt, num=False, stpwd=True):
|
||||
patt = [
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED, wait
|
||||
from threading import Lock
|
||||
@ -22,7 +23,6 @@ import numpy as np
|
||||
from sklearn.mixture import GaussianMixture
|
||||
|
||||
from rag.utils import truncate
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
|
||||
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
@ -62,13 +62,13 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
{"temperature": 0.3, "max_tokens": self._max_token}
|
||||
)
|
||||
cnt = re.sub("(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)", "", cnt)
|
||||
logger.info(f"SUM: {cnt}")
|
||||
logging.debug(f"SUM: {cnt}")
|
||||
embds, _ = self._embd_model.encode([cnt])
|
||||
with lock:
|
||||
if not len(embds[0]): return
|
||||
chunks.append((cnt, embds[0]))
|
||||
except Exception as e:
|
||||
logger.exception("summarize got exception")
|
||||
logging.exception("summarize got exception")
|
||||
return e
|
||||
|
||||
labels = []
|
||||
@ -104,7 +104,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||
ck_idx = [i+start for i in range(len(lbls)) if lbls[i] == c]
|
||||
threads.append(executor.submit(summarize, ck_idx, lock))
|
||||
wait(threads, return_when=ALL_COMPLETED)
|
||||
logger.info(str([t.result() for t in threads]))
|
||||
logging.debug(str([t.result() for t in threads]))
|
||||
|
||||
assert len(chunks) - end == n_clusters, "{} vs. {}".format(len(chunks) - end, n_clusters)
|
||||
labels.extend(lbls)
|
||||
|
||||
@ -13,19 +13,19 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import time
|
||||
import traceback
|
||||
|
||||
from api.db.db_models import close_connection
|
||||
from api.db.services.task_service import TaskService
|
||||
from api.utils.log_utils import logger
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
|
||||
|
||||
def collect():
|
||||
doc_locations = TaskService.get_ongoing_doc_name()
|
||||
logger.info(doc_locations)
|
||||
logging.debug(doc_locations)
|
||||
if len(doc_locations) == 0:
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -34,7 +34,7 @@ def collect():
|
||||
def main():
|
||||
locations = collect()
|
||||
if not locations:return
|
||||
logger.info(f"TASKS: {len(locations)}")
|
||||
logging.info(f"TASKS: {len(locations)}")
|
||||
for kb_id, loc in locations:
|
||||
try:
|
||||
if REDIS_CONN.is_alive():
|
||||
@ -43,7 +43,7 @@ def main():
|
||||
if REDIS_CONN.exist(key):continue
|
||||
file_bin = STORAGE_IMPL.get(kb_id, loc)
|
||||
REDIS_CONN.transaction(key, file_bin, 12 * 60)
|
||||
logger.info("CACHE: {}".format(loc))
|
||||
logging.info("CACHE: {}".format(loc))
|
||||
except Exception as e:
|
||||
traceback.print_stack(e)
|
||||
except Exception as e:
|
||||
|
||||
@ -13,11 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import discord
|
||||
import requests
|
||||
import base64
|
||||
import asyncio
|
||||
from api.utils.log_utils import logger
|
||||
|
||||
URL = '{YOUR_IP_ADDRESS:PORT}/v1/api/completion_aibotk' # Default: https://demo.ragflow.io/v1/api/completion_aibotk
|
||||
|
||||
@ -37,7 +37,7 @@ client = discord.Client(intents=intents)
|
||||
|
||||
@client.event
|
||||
async def on_ready():
|
||||
logger.info(f'We have logged in as {client.user}')
|
||||
logging.info(f'We have logged in as {client.user}')
|
||||
|
||||
|
||||
@client.event
|
||||
|
||||
@@ -13,9 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
+import inspect
+from api.utils.log_utils import initRootLogger
+initRootLogger(inspect.getfile(inspect.currentframe()))
+for module in ["pdfminer"]:
+    module_logger = logging.getLogger(module)
+    module_logger.setLevel(logging.WARNING)
+for module in ["peewee"]:
+    module_logger = logging.getLogger(module)
+    module_logger.handlers.clear()
+    module_logger.propagate = True
+
 import datetime
 import json
-import logging
 import os
 import hashlib
 import copy
@@ -42,7 +53,6 @@ from api.db.db_models import close_connection
 from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
 from rag.nlp import search, rag_tokenizer
 from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
-from api.utils.log_utils import logger, LOG_FILE
 from rag.settings import DOC_MAXIMUM_SIZE, SVR_QUEUE_NAME
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.redis_conn import REDIS_CONN, Payload
@ -90,7 +100,7 @@ def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing...
|
||||
try:
|
||||
TaskService.update_progress(task_id, d)
|
||||
except Exception:
|
||||
logger.exception(f"set_progress({task_id}) got exception")
|
||||
logging.exception(f"set_progress({task_id}) got exception")
|
||||
|
||||
close_connection()
|
||||
if cancel:
|
||||
@ -110,7 +120,7 @@ def collect():
|
||||
time.sleep(1)
|
||||
return pd.DataFrame()
|
||||
except Exception:
|
||||
logger.exception("Get task event from queue exception")
|
||||
logging.exception("Get task event from queue exception")
|
||||
return pd.DataFrame()
|
||||
|
||||
msg = PAYLOAD.get_message()
|
||||
@ -118,11 +128,11 @@ def collect():
|
||||
return pd.DataFrame()
|
||||
|
||||
if TaskService.do_cancel(msg["id"]):
|
||||
logger.info("Task {} has been canceled.".format(msg["id"]))
|
||||
logging.info("Task {} has been canceled.".format(msg["id"]))
|
||||
return pd.DataFrame()
|
||||
tasks = TaskService.get_tasks(msg["id"])
|
||||
if not tasks:
|
||||
logger.warning("{} empty task!".format(msg["id"]))
|
||||
logging.warning("{} empty task!".format(msg["id"]))
|
||||
return []
|
||||
|
||||
tasks = pd.DataFrame(tasks)
|
||||
@ -151,29 +161,29 @@ def build(row):
|
||||
st = timer()
|
||||
bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
|
||||
binary = get_storage_binary(bucket, name)
|
||||
logger.info(
|
||||
logging.info(
|
||||
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
|
||||
except TimeoutError:
|
||||
callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
|
||||
logger.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
|
||||
logging.exception("Minio {}/{} got timeout: Fetch file from minio timeout.".format(row["location"], row["name"]))
|
||||
return
|
||||
except Exception as e:
|
||||
if re.search("(No such file|not found)", str(e)):
|
||||
callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
|
||||
else:
|
||||
callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
|
||||
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
return
|
||||
|
||||
try:
|
||||
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
|
||||
to_page=row["to_page"], lang=row["language"], callback=callback,
|
||||
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
|
||||
logger.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
|
||||
logging.info("Chunking({}) {}/{} done".format(timer() - st, row["location"], row["name"]))
|
||||
except Exception as e:
|
||||
callback(-1, "Internal server error while chunking: %s" %
|
||||
str(e).replace("'", ""))
|
||||
logger.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
logging.exception("Chunking {}/{} got exception".format(row["location"], row["name"]))
|
||||
return
|
||||
|
||||
docs = []
|
||||
@ -210,12 +220,12 @@ def build(row):
|
||||
STORAGE_IMPL.put(row["kb_id"], d["id"], output_buffer.getvalue())
|
||||
el += timer() - st
|
||||
except Exception:
|
||||
logger.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
|
||||
logging.exception("Saving image of chunk {}/{}/{} got exception".format(row["location"], row["name"], d["_id"]))
|
||||
|
||||
d["img_id"] = "{}-{}".format(row["kb_id"], d["id"])
|
||||
del d["image"]
|
||||
docs.append(d)
|
||||
logger.info("MINIO PUT({}):{}".format(row["name"], el))
|
||||
logging.info("MINIO PUT({}):{}".format(row["name"], el))
|
||||
|
||||
if row["parser_config"].get("auto_keywords", 0):
|
||||
st = timer()
|
||||
@ -345,7 +355,7 @@ def main():
|
||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
||||
except Exception as e:
|
||||
callback(-1, msg=str(e))
|
||||
logger.exception("LLMBundle got exception")
|
||||
logging.exception("LLMBundle got exception")
|
||||
continue
|
||||
|
||||
if r.get("task_type", "") == "raptor":
|
||||
@ -354,12 +364,12 @@ def main():
|
||||
cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
|
||||
except Exception as e:
|
||||
callback(-1, msg=str(e))
|
||||
logger.exception("run_raptor got exception")
|
||||
logging.exception("run_raptor got exception")
|
||||
continue
|
||||
else:
|
||||
st = timer()
|
||||
cks = build(r)
|
||||
logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
|
||||
logging.info("Build chunks({}): {}".format(r["name"], timer() - st))
|
||||
if cks is None:
|
||||
continue
|
||||
if not cks:
|
||||
@ -375,12 +385,12 @@ def main():
|
||||
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
|
||||
except Exception as e:
|
||||
callback(-1, "Embedding error:{}".format(str(e)))
|
||||
logger.exception("run_rembedding got exception")
|
||||
logging.exception("run_rembedding got exception")
|
||||
tk_count = 0
|
||||
logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
|
||||
|
||||
# logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
||||
# logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
||||
init_kb(r, vector_size)
|
||||
chunk_count = len(set([c["id"] for c in cks]))
|
||||
st = timer()
|
||||
@ -391,11 +401,11 @@ def main():
|
||||
if b % 128 == 0:
|
||||
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
|
||||
|
||||
logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||
if es_r:
|
||||
callback(-1, f"Insert chunk error, detail info please check {LOG_FILE}. Please also check ES status!")
|
||||
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
||||
logger.error('Insert chunk error: ' + str(es_r))
|
||||
logging.error('Insert chunk error: ' + str(es_r))
|
||||
else:
|
||||
if TaskService.do_cancel(r["id"]):
|
||||
docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
||||
@ -404,7 +414,7 @@ def main():
|
||||
callback(1., "Done!")
|
||||
DocumentService.increment_chunk_num(
|
||||
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
||||
logger.info(
|
||||
logging.info(
|
||||
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
|
||||
r["id"], tk_count, len(cks), timer() - st))
|
||||
|
||||
@@ -421,16 +431,11 @@ def report_status():
             obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
             REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
         except Exception:
-            logger.exception("report_status got exception")
+            logging.exception("report_status got exception")
         time.sleep(30)


 if __name__ == "__main__":
-    peewee_logger = logging.getLogger('peewee')
-    peewee_logger.propagate = False
-    peewee_logger.addHandler(logger.handlers[0])
-    peewee_logger.setLevel(logger.handlers[0].level)
-
     exe = ThreadPoolExecutor(max_workers=1)
     exe.submit(report_status)

@ -1,3 +1,4 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from io import BytesIO
|
||||
@ -24,7 +25,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
try:
|
||||
self.conn = ContainerClient.from_container_url(self.container_url + "?" + self.sas_token)
|
||||
except Exception:
|
||||
logger.exception("Fail to connect %s " % self.container_url)
|
||||
logging.exception("Fail to connect %s " % self.container_url)
|
||||
|
||||
def __close__(self):
|
||||
del self.conn
|
||||
@ -39,7 +40,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
try:
|
||||
return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))
|
||||
except Exception:
|
||||
logger.exception(f"Fail put {bucket}/{fnm}")
|
||||
logging.exception(f"Fail put {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
|
||||
@ -47,7 +48,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
try:
|
||||
self.conn.delete_blob(fnm)
|
||||
except Exception:
|
||||
logger.exception(f"Fail rm {bucket}/{fnm}")
|
||||
logging.exception(f"Fail rm {bucket}/{fnm}")
|
||||
|
||||
def get(self, bucket, fnm):
|
||||
for _ in range(1):
|
||||
@ -55,7 +56,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
r = self.conn.download_blob(fnm)
|
||||
return r.read()
|
||||
except Exception:
|
||||
logger.exception(f"fail get {bucket}/{fnm}")
|
||||
logging.exception(f"fail get {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -64,7 +65,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
try:
|
||||
return self.conn.get_blob_client(fnm).exists()
|
||||
except Exception:
|
||||
logger.exception(f"Fail put {bucket}/{fnm}")
|
||||
logging.exception(f"Fail put {bucket}/{fnm}")
|
||||
return False
|
||||
|
||||
def get_presigned_url(self, bucket, fnm, expires):
|
||||
@ -72,7 +73,7 @@ class RAGFlowAzureSasBlob(object):
|
||||
try:
|
||||
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
|
||||
except Exception:
|
||||
logger.exception(f"fail get {bucket}/{fnm}")
|
||||
logging.exception(f"fail get {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from rag import settings
|
||||
@ -28,7 +29,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
credentials = ClientSecretCredential(tenant_id=self.tenant_id, client_id=self.client_id, client_secret=self.secret, authority=AzureAuthorityHosts.AZURE_CHINA)
|
||||
self.conn = FileSystemClient(account_url=self.account_url, file_system_name=self.container_name, credential=credentials)
|
||||
except Exception:
|
||||
logger.exception("Fail to connect %s" % self.account_url)
|
||||
logging.exception("Fail to connect %s" % self.account_url)
|
||||
|
||||
def __close__(self):
|
||||
del self.conn
|
||||
@ -47,7 +48,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
f.append_data(binary, offset=0, length=len(binary))
|
||||
return f.flush_data(len(binary))
|
||||
except Exception:
|
||||
logger.exception(f"Fail put {bucket}/{fnm}")
|
||||
logging.exception(f"Fail put {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
|
||||
@ -55,7 +56,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
try:
|
||||
self.conn.delete_file(fnm)
|
||||
except Exception:
|
||||
logger.exception(f"Fail rm {bucket}/{fnm}")
|
||||
logging.exception(f"Fail rm {bucket}/{fnm}")
|
||||
|
||||
def get(self, bucket, fnm):
|
||||
for _ in range(1):
|
||||
@ -64,7 +65,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
r = client.download_file()
|
||||
return r.read()
|
||||
except Exception:
|
||||
logger.exception(f"fail get {bucket}/{fnm}")
|
||||
logging.exception(f"fail get {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -74,7 +75,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
client = self.conn.get_file_client(fnm)
|
||||
return client.exists()
|
||||
except Exception:
|
||||
logger.exception(f"Fail put {bucket}/{fnm}")
|
||||
logging.exception(f"Fail put {bucket}/{fnm}")
|
||||
return False
|
||||
|
||||
def get_presigned_url(self, bucket, fnm, expires):
|
||||
@ -82,7 +83,7 @@ class RAGFlowAzureSpnBlob(object):
|
||||
try:
|
||||
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
|
||||
except Exception:
|
||||
logger.exception(f"fail get {bucket}/{fnm}")
|
||||
logging.exception(f"fail get {bucket}/{fnm}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -1,3 +1,4 @@
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
@ -8,7 +9,6 @@ import copy
|
||||
from elasticsearch import Elasticsearch
|
||||
from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
|
||||
from elastic_transport import ConnectionTimeout
|
||||
from api.utils.log_utils import logger
|
||||
from rag import settings
|
||||
from rag.utils import singleton
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
@ -22,7 +22,7 @@ from rag.nlp import is_english, rag_tokenizer
|
||||
class ESConnection(DocStoreConnection):
|
||||
def __init__(self):
|
||||
self.info = {}
|
||||
logger.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
|
||||
logging.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
|
||||
for _ in range(24):
|
||||
try:
|
||||
self.es = Elasticsearch(
|
||||
@ -36,25 +36,25 @@ class ESConnection(DocStoreConnection):
|
||||
self.info = self.es.info()
|
||||
break
|
||||
except Exception as e:
|
||||
logger.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.")
|
||||
logging.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.")
|
||||
time.sleep(5)
|
||||
if not self.es.ping():
|
||||
msg = f"Elasticsearch {settings.ES['hosts']} didn't become healthy in 120s."
|
||||
logger.error(msg)
|
||||
logging.error(msg)
|
||||
raise Exception(msg)
|
||||
v = self.info.get("version", {"number": "8.11.3"})
|
||||
v = v["number"].split(".")[0]
|
||||
if int(v) < 8:
|
||||
msg = f"Elasticsearch version must be greater than or equal to 8, current version: {v}"
|
||||
logger.error(msg)
|
||||
logging.error(msg)
|
||||
raise Exception(msg)
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", "mapping.json")
|
||||
if not os.path.exists(fp_mapping):
|
||||
msg = f"Elasticsearch mapping file not found at {fp_mapping}"
|
||||
logger.error(msg)
|
||||
logging.error(msg)
|
||||
raise Exception(msg)
|
||||
self.mapping = json.load(open(fp_mapping, "r"))
|
||||
logger.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")
|
||||
logging.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")
|
||||
|
||||
"""
|
||||
Database operations
|
||||
@ -79,13 +79,13 @@ class ESConnection(DocStoreConnection):
|
||||
settings=self.mapping["settings"],
|
||||
mappings=self.mapping["mappings"])
|
||||
except Exception:
|
||||
logger.exception("ES create index error %s" % (indexName))
|
||||
logging.exception("ES create index error %s" % (indexName))
|
||||
|
||||
def deleteIdx(self, indexName: str, knowledgebaseId: str):
|
||||
try:
|
||||
return self.es.indices.delete(indexName, allow_no_indices=True)
|
||||
except Exception:
|
||||
logger.exception("ES delete index error %s" % (indexName))
|
||||
logging.exception("ES delete index error %s" % (indexName))
|
||||
|
||||
def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
|
||||
s = Index(indexName, self.es)
|
||||
@ -93,7 +93,7 @@ class ESConnection(DocStoreConnection):
|
||||
try:
|
||||
return s.exists()
|
||||
except Exception as e:
|
||||
logger.exception("ES indexExist")
|
||||
logging.exception("ES indexExist")
|
||||
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
|
||||
continue
|
||||
return False
|
||||
@ -178,7 +178,7 @@ class ESConnection(DocStoreConnection):
|
||||
s = s[offset:limit]
|
||||
q = s.to_dict()
|
||||
print(json.dumps(q), flush=True)
|
||||
# logger.info("ESConnection.search [Q]: " + json.dumps(q))
|
||||
logging.debug("ESConnection.search [Q]: " + json.dumps(q))
|
||||
|
||||
for i in range(3):
|
||||
try:
|
||||
@ -190,14 +190,14 @@ class ESConnection(DocStoreConnection):
|
||||
_source=True)
|
||||
if str(res.get("timed_out", "")).lower() == "true":
|
||||
raise Exception("Es Timeout.")
|
||||
logger.info("ESConnection.search res: " + str(res))
|
||||
logging.debug("ESConnection.search res: " + str(res))
|
||||
return res
|
||||
except Exception as e:
|
||||
logger.exception("ES search [Q]: " + str(q))
|
||||
logging.exception("ES search [Q]: " + str(q))
|
||||
if str(e).find("Timeout") > 0:
|
||||
continue
|
||||
raise e
|
||||
logger.error("ES search timeout for 3 times!")
|
||||
logging.error("ES search timeout for 3 times!")
|
||||
raise Exception("ES search timeout.")
|
||||
|
||||
def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
|
||||
@ -213,11 +213,11 @@ class ESConnection(DocStoreConnection):
|
||||
chunk["id"] = chunkId
|
||||
return chunk
|
||||
except Exception as e:
|
||||
logger.exception(f"ES get({chunkId}) got exception")
|
||||
logging.exception(f"ES get({chunkId}) got exception")
|
||||
if str(e).find("Timeout") > 0:
|
||||
continue
|
||||
raise e
|
||||
logger.error("ES search timeout for 3 times!")
|
||||
logging.error("ES search timeout for 3 times!")
|
||||
raise Exception("ES search timeout.")
|
||||
|
||||
def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
|
||||
@ -247,7 +247,7 @@ class ESConnection(DocStoreConnection):
|
||||
res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
|
||||
return res
|
||||
except Exception as e:
|
||||
logger.warning("Fail to bulk: " + str(e))
|
||||
logging.warning("Fail to bulk: " + str(e))
|
||||
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
|
||||
time.sleep(3)
|
||||
continue
|
||||
@ -264,7 +264,7 @@ class ESConnection(DocStoreConnection):
|
||||
self.es.update(index=indexName, id=chunkId, doc=doc)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
logging.exception(
|
||||
f"ES failed to update(index={indexName}, id={id}, doc={json.dumps(condition, ensure_ascii=False)})")
|
||||
if str(e).find("Timeout") > 0:
|
||||
continue
|
||||
@ -304,7 +304,7 @@ class ESConnection(DocStoreConnection):
|
||||
_ = ubq.execute()
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
|
||||
logging.error("ES update exception: " + str(e) + "[Q]:" + str(bqry.to_dict()))
|
||||
if str(e).find("Timeout") > 0 or str(e).find("Conflict") > 0:
|
||||
continue
|
||||
return False
|
||||
@ -326,7 +326,7 @@ class ESConnection(DocStoreConnection):
|
||||
qry.must.append(Q("term", **{k: v}))
|
||||
else:
|
||||
raise Exception("Condition value must be int, str or list.")
|
||||
logger.info("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
|
||||
logging.debug("ESConnection.delete [Q]: " + json.dumps(qry.to_dict()))
|
||||
for _ in range(10):
|
||||
try:
|
||||
res = self.es.delete_by_query(
|
||||
@ -335,7 +335,7 @@ class ESConnection(DocStoreConnection):
|
||||
refresh=True)
|
||||
return res["deleted"]
|
||||
except Exception as e:
|
||||
logger.warning("Fail to delete: " + str(filter) + str(e))
|
||||
logging.warning("Fail to delete: " + str(filter) + str(e))
|
||||
if re.search(r"(Timeout|time out)", str(e), re.IGNORECASE):
|
||||
time.sleep(3)
|
||||
continue
|
||||
@ -419,7 +419,7 @@ class ESConnection(DocStoreConnection):
|
||||
"""
|
||||
|
||||
def sql(self, sql: str, fetch_size: int, format: str):
|
||||
logger.info(f"ESConnection.sql get sql: {sql}")
|
||||
logging.debug(f"ESConnection.sql get sql: {sql}")
|
||||
sql = re.sub(r"[ `]+", " ", sql)
|
||||
sql = sql.replace("%", "")
|
||||
replaces = []
|
||||
@ -436,7 +436,7 @@ class ESConnection(DocStoreConnection):
|
||||
|
||||
for p, r in replaces:
|
||||
sql = sql.replace(p, r, 1)
|
||||
logger.info(f"ESConnection.sql to es: {sql}")
|
||||
logging.debug(f"ESConnection.sql to es: {sql}")
|
||||
|
||||
for i in range(3):
|
||||
try:
|
||||
@ -444,10 +444,10 @@ class ESConnection(DocStoreConnection):
|
||||
request_timeout="2s")
|
||||
return res
|
||||
except ConnectionTimeout:
|
||||
logger.exception("ESConnection.sql timeout [Q]: " + sql)
|
||||
logging.exception("ESConnection.sql timeout [Q]: " + sql)
|
||||
continue
|
||||
except Exception:
|
||||
logger.exception("ESConnection.sql got exception [Q]: " + sql)
|
||||
logging.exception("ESConnection.sql got exception [Q]: " + sql)
|
||||
return None
|
||||
logger.error("ESConnection.sql timeout for 3 times!")
|
||||
logging.error("ESConnection.sql timeout for 3 times!")
|
||||
return None
|
||||
|
@ -1,3 +1,4 @@
import logging
import os
import re
import json
@ -7,7 +8,6 @@ import infinity
from infinity.common import ConflictType, InfinityException
from infinity.index import IndexInfo, IndexType
from infinity.connection_pool import ConnectionPool
from api.utils.log_utils import logger
from rag import settings
from rag.utils import singleton
import polars as pl
@ -56,7 +56,7 @@ class InfinityConnection(DocStoreConnection):
host, port = infinity_uri.split(":")
infinity_uri = infinity.common.NetworkAddress(host, int(port))
self.connPool = None
logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
logging.info(f"Use Infinity {infinity_uri} as the doc engine.")
for _ in range(24):
try:
connPool = ConnectionPool(infinity_uri)
@ -66,13 +66,13 @@ class InfinityConnection(DocStoreConnection):
self.connPool = connPool
break
except Exception as e:
logger.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
logging.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
time.sleep(5)
if self.connPool is None:
msg = f"Infinity {infinity_uri} didn't become healthy in 120s."
logger.error(msg)
logging.error(msg)
raise Exception(msg)
logger.info(f"Infinity {infinity_uri} is healthy.")
logging.info(f"Infinity {infinity_uri} is healthy.")

"""
Database operations
@ -148,7 +148,7 @@ class InfinityConnection(DocStoreConnection):
)
break
self.connPool.release_conn(inf_conn)
logger.info(
logging.info(
f"INFINITY created table {table_name}, vector size {vectorSize}"
)

@ -158,7 +158,7 @@ class InfinityConnection(DocStoreConnection):
db_instance = inf_conn.get_database(self.dbName)
db_instance.drop_table(table_name, ConflictType.Ignore)
self.connPool.release_conn(inf_conn)
logger.info(f"INFINITY dropped table {table_name}")
logging.info(f"INFINITY dropped table {table_name}")

def indexExist(self, indexName: str, knowledgebaseId: str) -> bool:
table_name = f"{indexName}_{knowledgebaseId}"
@ -169,7 +169,7 @@ class InfinityConnection(DocStoreConnection):
self.connPool.release_conn(inf_conn)
return True
except Exception as e:
logger.warn(f"INFINITY indexExist {str(e)}")
logging.warn(f"INFINITY indexExist {str(e)}")
return False

"""
@ -216,7 +216,7 @@ class InfinityConnection(DocStoreConnection):
)
if len(filter_cond) != 0:
filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
# logger.info(f"filter_fulltext: {filter_fulltext}")
logging.debug(f"filter_fulltext: {filter_fulltext}")
minimum_should_match = "0%"
if "minimum_should_match" in matchExpr.extra_options:
minimum_should_match = (
@ -279,7 +279,7 @@ class InfinityConnection(DocStoreConnection):
df_list.append(kb_res)
self.connPool.release_conn(inf_conn)
res = pl.concat(df_list)
logger.info("INFINITY search tables: " + str(table_list))
logging.debug("INFINITY search tables: " + str(table_list))
return res

def get(
@ -334,18 +334,18 @@ class InfinityConnection(DocStoreConnection):
str_filter = f"id IN ({str_ids})"
table_instance.delete(str_filter)
# for doc in documents:
# logger.info(f"insert position_list: {doc['position_list']}")
# logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
# logging.info(f"insert position_list: {doc['position_list']}")
# logging.info(f"InfinityConnection.insert {json.dumps(documents)}")
table_instance.insert(documents)
self.connPool.release_conn(inf_conn)
logger.info(f"inserted into {table_name} {str_ids}.")
logging.debug(f"inserted into {table_name} {str_ids}.")
return []

def update(
self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
) -> bool:
# if 'position_list' in newValue:
# logger.info(f"upsert position_list: {newValue['position_list']}")
# logging.info(f"upsert position_list: {newValue['position_list']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
table_name = f"{indexName}_{knowledgebaseId}"
@ -366,7 +366,7 @@ class InfinityConnection(DocStoreConnection):
try:
table_instance = db_instance.get_table(table_name)
except Exception:
logger.warning(
logging.warning(
f"Skipped deleting `{filter}` from table {table_name} since the table doesn't exist."
)
return 0
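One detail worth noting in the InfinityConnection hunks: the old logger.warn(...) calls were carried over as logging.warn(...), which still works but is a deprecated alias for logging.warning(...) in the standard library. A minimal stand-alone sketch of the same wait-until-healthy loop, written with logging.warning, might look like the following; connect, uri, and wait_until_healthy are hypothetical placeholders, not RAGFlow code.

# Illustrative sketch only -- connect(), uri and the function name are placeholders.
import logging
import time


def wait_until_healthy(connect, uri: str, retries: int = 24, delay: int = 5):
    pool = None
    for _ in range(retries):
        try:
            pool = connect(uri)  # e.g. open a connection pool
            break
        except Exception as e:
            # logging.warning is the non-deprecated spelling of logging.warn
            logging.warning(f"{e}. Waiting {uri} to be healthy.")
            time.sleep(delay)
    if pool is None:
        msg = f"{uri} didn't become healthy in {retries * delay}s."
        logging.error(msg)
        raise Exception(msg)
    logging.info(f"{uri} is healthy.")
    return pool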
@ -1,9 +1,9 @@
import logging
import time
from minio import Minio
from io import BytesIO
from rag import settings
from rag.utils import singleton
from api.utils.log_utils import logger


@singleton
@ -26,7 +26,7 @@ class RAGFlowMinio(object):
secure=False
)
except Exception:
logger.exception(
logging.exception(
"Fail to connect %s " % settings.MINIO["host"])

def __close__(self):
@ -55,7 +55,7 @@ class RAGFlowMinio(object):
)
return r
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)

@ -63,7 +63,7 @@ class RAGFlowMinio(object):
try:
self.conn.remove_object(bucket, fnm)
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")

def get(self, bucket, fnm):
for _ in range(1):
@ -71,7 +71,7 @@ class RAGFlowMinio(object):
r = self.conn.get_object(bucket, fnm)
return r.read()
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
@ -81,7 +81,7 @@ class RAGFlowMinio(object):
if self.conn.stat_object(bucket, fnm):return True
return False
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
return False


@ -90,7 +90,7 @@ class RAGFlowMinio(object):
try:
return self.conn.get_presigned_url("GET", bucket, fnm, expires)
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}:")
logging.exception(f"Fail put {bucket}/{fnm}:")
self.__open__()
time.sleep(1)
return
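The MinIO hunks above all follow one shape: try the operation, log the failure and its traceback via logging.exception, re-open the connection, sleep, and retry; note that for _ in range(1) as written allows a single attempt. A stand-alone sketch of that reconnect-and-retry shape follows, with fetch and reconnect as hypothetical placeholders rather than RAGFlow APIs.

# Illustrative sketch only -- fetch() and reconnect() are placeholders, not RAGFlow code.
import logging
import time


def get_with_reconnect(fetch, reconnect, bucket: str, fnm: str, attempts: int = 1):
    for _ in range(attempts):
        try:
            return fetch(bucket, fnm)
        except Exception:
            # the full traceback goes to the log; the caller just gets None back
            logging.exception(f"Fail get {bucket}/{fnm}")
            reconnect()
            time.sleep(1)
    return None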
@ -1,7 +1,7 @@
import logging
import json

import valkey as redis
import logging
from rag import settings
from rag.utils import singleton

@ -1,3 +1,4 @@
import logging
import boto3
import os
from botocore.exceptions import ClientError
@ -40,7 +41,7 @@ class RAGFlowS3(object):
config=config
)
except Exception:
logger.exception(
logging.exception(
"Fail to connect %s" % self.endpoint)

def __close__(self):
@ -49,11 +50,11 @@ class RAGFlowS3(object):

def bucket_exists(self, bucket):
try:
logger.debug(f"head_bucket bucketname {bucket}")
logging.debug(f"head_bucket bucketname {bucket}")
self.conn.head_bucket(Bucket=bucket)
exists = True
except ClientError:
logger.exception(f"head_bucket error {bucket}")
logging.exception(f"head_bucket error {bucket}")
exists = False
return exists

@ -62,7 +63,7 @@ class RAGFlowS3(object):

if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
logger.debug(f"create bucket {bucket} ********")
logging.debug(f"create bucket {bucket} ********")

r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)
return r
@ -74,17 +75,17 @@ class RAGFlowS3(object):
return []

def put(self, bucket, fnm, binary):
logger.debug(f"bucket name {bucket}; filename :{fnm}:")
logging.debug(f"bucket name {bucket}; filename :{fnm}:")
for _ in range(1):
try:
if not self.bucket_exists(bucket):
self.conn.create_bucket(Bucket=bucket)
logger.info(f"create bucket {bucket} ********")
logging.info(f"create bucket {bucket} ********")
r = self.conn.upload_fileobj(BytesIO(binary), bucket, fnm)

return r
except Exception:
logger.exception(f"Fail put {bucket}/{fnm}")
logging.exception(f"Fail put {bucket}/{fnm}")
self.__open__()
time.sleep(1)

@ -92,7 +93,7 @@ class RAGFlowS3(object):
try:
self.conn.delete_object(Bucket=bucket, Key=fnm)
except Exception:
logger.exception(f"Fail rm {bucket}/{fnm}")
logging.exception(f"Fail rm {bucket}/{fnm}")

def get(self, bucket, fnm):
for _ in range(1):
@ -101,7 +102,7 @@ class RAGFlowS3(object):
object_data = r['Body'].read()
return object_data
except Exception:
logger.exception(f"fail get {bucket}/{fnm}")
logging.exception(f"fail get {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
@ -128,7 +129,7 @@ class RAGFlowS3(object):

return r
except Exception:
logger.exception(f"fail get url {bucket}/{fnm}")
logging.exception(f"fail get url {bucket}/{fnm}")
self.__open__()
time.sleep(1)
return
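Because every connector in this diff now logs through the stdlib root logger instead of a module-specific object, a single process-wide configuration decides the level, format, and destination for all of them. The sketch below shows what such a one-time initializer could look like; init_logger, the file path, and the handler choices are assumptions for illustration, not the actual RAGFlow implementation.

# Illustrative sketch only -- init_logger, the path and the formats are assumptions.
import logging
import os
from logging.handlers import RotatingFileHandler


def init_logger(log_path: str = "logs/ragflow.log", level: int = logging.INFO) -> None:
    os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)
    fmt = logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s")

    # rotate the log file so it keeps a bounded size
    file_handler = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024, backupCount=5)
    file_handler.setFormatter(fmt)

    console = logging.StreamHandler()
    console.setFormatter(fmt)

    root = logging.getLogger()
    root.setLevel(level)
    root.addHandler(file_handler)
    root.addHandler(console)

Called once at process start-up, a helper like this makes every logging.debug(...) / logging.info(...) call in the connectors above land in the same rotating file.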