Fix errors detected by Ruff (#3918)
### What problem does this PR solve?

Fix errors detected by Ruff

### Type of change

- [x] Refactoring
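The hunks below are mechanical lint fixes of a few recurring shapes: bodies moved off `if` headers (pycodestyle's E701, multiple statements on one line), the ambiguous single-letter name `l` renamed (E741), and commented-out code deleted (Ruff's eradicate rule, ERA001). A minimal sketch of the first two patterns, using hypothetical names rather than code from this repository:

```python
# Illustrative only; `items` and `counts` are hypothetical names.

def is_empty(items):
    # E701: "if not items: return True" puts two statements on one line;
    # the fix moves the body onto its own indented line.
    if not items:
        return True
    return False

def most_common_level(counts):
    # E741: `l` is ambiguous (it reads like `1` or `I`); a descriptive
    # name such as `level` clears the warning.
    best_level, best_count = None, -1
    for level, count in counts:
        if count > best_count:
            best_level, best_count = level, count
    return best_level
```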
@@ -222,7 +222,8 @@ def bullets_category(sections):
 def is_english(texts):
     eng = 0
-    if not texts: return False
+    if not texts:
+        return False
     for t in texts:
         if re.match(r"[ `a-zA-Z.,':;/\"?<>!\(\)-]", t.strip()):
             eng += 1
@@ -250,7 +251,8 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
     res = []
     # wrap up as es documents
     for ck in chunks:
-        if len(ck.strip()) == 0:continue
+        if len(ck.strip()) == 0:
+            continue
         logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         if pdf_parser:
@@ -269,7 +271,8 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
-        if len(ck.strip()) == 0:continue
+        if len(ck.strip()) == 0:
+            continue
         logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         d["image"] = image
@@ -288,8 +291,10 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d = copy.deepcopy(doc)
             tokenize(d, rows, eng)
             d["content_with_weight"] = rows
-            if img: d["image"] = img
-            if poss: add_positions(d, poss)
+            if img:
+                d["image"] = img
+            if poss:
+                add_positions(d, poss)
             res.append(d)
             continue
         de = "; " if eng else "；"
@@ -387,9 +392,9 @@ def title_frequency(bull, sections):
         if re.search(r"(title|head)", layout) and not not_title(txt.split("@")[0]):
             levels[i] = bullets_size
     most_level = bullets_size+1
-    for l, c in sorted(Counter(levels).items(), key=lambda x:x[1]*-1):
-        if l <= bullets_size:
-            most_level = l
+    for level, c in sorted(Counter(levels).items(), key=lambda x:x[1]*-1):
+        if level <= bullets_size:
+            most_level = level
             break
     return most_level, levels
@@ -504,7 +509,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
     def add_chunk(t, pos):
         nonlocal cks, tk_nums, delimiter
         tnum = num_tokens_from_string(t)
-        if not pos: pos = ""
+        if not pos:
+            pos = ""
         if tnum < 8:
             pos = ""
         # Ensure that the length of the merged chunk does not exceed chunk_token_num
@@ -121,7 +121,8 @@ class FulltextQueryer:
                 keywords.append(tt)
                 twts = self.tw.weights([tt])
                 syns = self.syn.lookup(tt)
-                if syns and len(keywords) < 32: keywords.extend(syns)
+                if syns and len(keywords) < 32:
+                    keywords.extend(syns)
                 logging.debug(json.dumps(twts, ensure_ascii=False))
                 tms = []
                 for tk, w in sorted(twts, key=lambda x: x[1] * -1):
@@ -147,7 +148,8 @@ class FulltextQueryer:
 
                     tk_syns = self.syn.lookup(tk)
                     tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
-                    if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
+                    if len(keywords) < 32:
+                        keywords.extend([s for s in tk_syns if s])
                     tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
                     tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
@@ -104,7 +104,6 @@ class RagTokenizer:
         return HanziConv.toSimplified(line)
 
     def dfs_(self, chars, s, preTks, tkslist):
         MAX_L = 10
         res = s
-        # if s > MAX_L or s>= len(chars):
         if s >= len(chars):
@@ -184,12 +183,6 @@ class RagTokenizer:
         return sorted(res, key=lambda x: x[1], reverse=True)
 
     def merge_(self, tks):
-        patts = [
-            (r"[ ]+", " "),
-            (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
-        ]
-        # for p,s in patts: tks = re.sub(p, s, tks)
-
         # if split chars is part of token
         res = []
         tks = re.sub(r"[ ]+", " ", tks).split()
@@ -284,7 +277,8 @@ class RagTokenizer:
             same = 0
             while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
                 same += 1
-            if same > 0: res.append(" ".join(tks[j: j + same]))
+            if same > 0:
+                res.append(" ".join(tks[j: j + same]))
             _i = i + same
             _j = j + same
             j = _j + 1
@@ -62,10 +62,10 @@ class Dealer:
         res = {}
         f = open(fnm, "r")
         while True:
-            l = f.readline()
-            if not l:
+            line = f.readline()
+            if not line:
                 break
-            arr = l.replace("\n", "").split("\t")
+            arr = line.replace("\n", "").split("\t")
             if len(arr) < 2:
                 res[arr[0]] = 0
             else:
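Beyond the `l` to `line` rename in the last hunk, the `readline()` loop itself could be tightened. A minimal sketch, not part of this PR: `fnm` and the tab-separated format are taken from the hunk above, the path is a placeholder, and the `else` branch is elided just as the diff elides it.

```python
fnm = "dictionary.tsv"  # placeholder path; the real value comes from the caller
res = {}
with open(fnm, "r") as f:   # context manager closes the file on exit
    for line in f:          # iterating the handle replaces readline()/break
        arr = line.replace("\n", "").split("\t")
        if len(arr) < 2:
            res[arr[0]] = 0
        else:
            ...             # remainder as in the original method
```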