mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
apply pep8 formalize (#155)
This commit is contained in:
@ -372,7 +372,8 @@ class PptChunker(HuChunker):
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
if shape.has_text_frame:
|
||||
@ -382,7 +383,8 @@ class PptChunker(HuChunker):
|
||||
texts = []
|
||||
for p in shape.shapes:
|
||||
t = self.__extract(p)
|
||||
if t: texts.append(t)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
def __call__(self, fnm):
|
||||
@ -395,7 +397,8 @@ class PptChunker(HuChunker):
|
||||
texts = []
|
||||
for shape in slide.shapes:
|
||||
txt = self.__extract(shape)
|
||||
if txt: texts.append(txt)
|
||||
if txt:
|
||||
texts.append(txt)
|
||||
txts.append("\n".join(texts))
|
||||
|
||||
import aspose.slides as slides
|
||||
@ -404,9 +407,12 @@ class PptChunker(HuChunker):
|
||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
||||
for slide in presentation.slides:
|
||||
buffered = BytesIO()
|
||||
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
slide.get_thumbnail(
|
||||
0.5, 0.5).save(
|
||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
imgs.append(buffered.getvalue())
|
||||
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
assert len(imgs) == len(
|
||||
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
|
||||
flds = self.Fields()
|
||||
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
@ -445,7 +451,8 @@ class TextChunker(HuChunker):
|
||||
if isinstance(fnm, str):
|
||||
with open(fnm, "r") as f:
|
||||
txt = f.read()
|
||||
else: txt = fnm.decode("utf-8")
|
||||
else:
|
||||
txt = fnm.decode("utf-8")
|
||||
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
@ -149,7 +149,8 @@ class EsQueryer:
|
||||
atks = toDict(atks)
|
||||
btkss = [toDict(tks) for tks in btkss]
|
||||
tksim = [self.similarity(atks, btks) for btks in btkss]
|
||||
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
|
||||
return np.array(sims[0]) * vtweight + \
|
||||
np.array(tksim) * tkweight, tksim, sims[0]
|
||||
|
||||
def similarity(self, qtwt, dtwt):
|
||||
if isinstance(dtwt, type("")):
|
||||
@ -159,11 +160,11 @@ class EsQueryer:
|
||||
s = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
if k in dtwt:
|
||||
s += v# * dtwt[k]
|
||||
s += v # * dtwt[k]
|
||||
q = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
q += v #* v
|
||||
q += v # * v
|
||||
#d = 1e-9
|
||||
#for k, v in dtwt.items():
|
||||
# for k, v in dtwt.items():
|
||||
# d += v * v
|
||||
return s / q #math.sqrt(q) / math.sqrt(d)
|
||||
return s / q # math.sqrt(q) / math.sqrt(d)
|
||||
|
||||
@ -80,14 +80,18 @@ class Dealer:
|
||||
if not req.get("sort"):
|
||||
s = s.sort(
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
{"create_timestamp_flt": {
|
||||
"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
else:
|
||||
s = s.sort(
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float",
|
||||
"mode": "avg", "numeric_type": "double"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float",
|
||||
"mode": "avg", "numeric_type": "double"}},
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
{"create_timestamp_flt": {
|
||||
"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
|
||||
if qst:
|
||||
@ -180,11 +184,13 @@ class Dealer:
|
||||
m = {n: d.get(n) for n in flds if d.get(n) is not None}
|
||||
for n, v in m.items():
|
||||
if isinstance(v, type([])):
|
||||
m[n] = "\t".join([str(vv) if not isinstance(vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
|
||||
m[n] = "\t".join([str(vv) if not isinstance(
|
||||
vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
|
||||
continue
|
||||
if not isinstance(v, type("")):
|
||||
m[n] = str(m[n])
|
||||
if n.find("tks")>0: m[n] = rmSpace(m[n])
|
||||
if n.find("tks") > 0:
|
||||
m[n] = rmSpace(m[n])
|
||||
|
||||
if m:
|
||||
res[d["id"]] = m
|
||||
@ -205,12 +211,16 @@ class Dealer:
|
||||
if pieces[i] == "```":
|
||||
st = i
|
||||
i += 1
|
||||
while i<len(pieces) and pieces[i] != "```":
|
||||
while i < len(pieces) and pieces[i] != "```":
|
||||
i += 1
|
||||
if i < len(pieces): i += 1
|
||||
pieces_.append("".join(pieces[st: i])+"\n")
|
||||
if i < len(pieces):
|
||||
i += 1
|
||||
pieces_.append("".join(pieces[st: i]) + "\n")
|
||||
else:
|
||||
pieces_.extend(re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", pieces[i]))
|
||||
pieces_.extend(
|
||||
re.split(
|
||||
r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])",
|
||||
pieces[i]))
|
||||
i += 1
|
||||
pieces = pieces_
|
||||
else:
|
||||
@ -234,7 +244,8 @@ class Dealer:
|
||||
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
|
||||
len(ans_v[0]), len(chunk_v[0]))
|
||||
|
||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
|
||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
|
||||
for ck in chunks]
|
||||
cites = {}
|
||||
for i, a in enumerate(pieces_):
|
||||
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
||||
@ -258,9 +269,11 @@ class Dealer:
|
||||
continue
|
||||
if i not in cites:
|
||||
continue
|
||||
for c in cites[i]: assert int(c) < len(chunk_v)
|
||||
for c in cites[i]:
|
||||
if c in seted:continue
|
||||
assert int(c) < len(chunk_v)
|
||||
for c in cites[i]:
|
||||
if c in seted:
|
||||
continue
|
||||
res += f" ##{c}$$"
|
||||
seted.add(c)
|
||||
|
||||
@ -343,7 +356,11 @@ class Dealer:
|
||||
if dnm not in ranks["doc_aggs"]:
|
||||
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
||||
ranks["doc_aggs"][dnm]["count"] += 1
|
||||
ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
|
||||
ranks["doc_aggs"] = [{"doc_name": k,
|
||||
"doc_id": v["doc_id"],
|
||||
"count": v["count"]} for k,
|
||||
v in sorted(ranks["doc_aggs"].items(),
|
||||
key=lambda x:x[1]["count"] * -1)]
|
||||
|
||||
return ranks
|
||||
|
||||
@ -354,10 +371,17 @@ class Dealer:
|
||||
replaces = []
|
||||
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
|
||||
fld, v = r.group(1), r.group(3)
|
||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
|
||||
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
|
||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
|
||||
fld, huqie.qieqie(huqie.qie(v)))
|
||||
replaces.append(
|
||||
("{}{}'{}'".format(
|
||||
r.group(1),
|
||||
r.group(2),
|
||||
r.group(3)),
|
||||
match))
|
||||
|
||||
for p, r in replaces: sql = sql.replace(p, r, 1)
|
||||
for p, r in replaces:
|
||||
sql = sql.replace(p, r, 1)
|
||||
chat_logger.info(f"To es: {sql}")
|
||||
|
||||
try:
|
||||
@ -366,4 +390,3 @@ class Dealer:
|
||||
except Exception as e:
|
||||
chat_logger.error(f"SQL failure: {sql} =>" + str(e))
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
@ -150,8 +150,10 @@ class Dealer:
|
||||
return 6
|
||||
|
||||
def ner(t):
|
||||
if re.match(r"[0-9,.]{2,}$", t): return 2
|
||||
if re.match(r"[a-z]{1,2}$", t): return 0.01
|
||||
if re.match(r"[0-9,.]{2,}$", t):
|
||||
return 2
|
||||
if re.match(r"[a-z]{1,2}$", t):
|
||||
return 0.01
|
||||
if not self.ne or t not in self.ne:
|
||||
return 1
|
||||
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
|
||||
|
||||
Reference in New Issue
Block a user