Apply PEP 8 formatting (#155)

This commit is contained in:
KevinHuSh
2024-03-27 11:33:46 +08:00
committed by GitHub
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions

View File

@ -372,7 +372,8 @@ class PptChunker(HuChunker):
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
if shape.has_text_frame:
@ -382,7 +383,8 @@ class PptChunker(HuChunker):
texts = []
for p in shape.shapes:
t = self.__extract(p)
if t: texts.append(t)
if t:
texts.append(t)
return "\n".join(texts)
def __call__(self, fnm):
@ -395,7 +397,8 @@ class PptChunker(HuChunker):
texts = []
for shape in slide.shapes:
txt = self.__extract(shape)
if txt: texts.append(txt)
if txt:
texts.append(txt)
txts.append("\n".join(texts))
import aspose.slides as slides
@ -404,9 +407,12 @@ class PptChunker(HuChunker):
with slides.Presentation(BytesIO(fnm)) as presentation:
for slide in presentation.slides:
buffered = BytesIO()
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
slide.get_thumbnail(
0.5, 0.5).save(
buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(buffered.getvalue())
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
flds = self.Fields()
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
@ -445,7 +451,8 @@ class TextChunker(HuChunker):
if isinstance(fnm, str):
with open(fnm, "r") as f:
txt = f.read()
else: txt = fnm.decode("utf-8")
else:
txt = fnm.decode("utf-8")
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
flds.table_chunks = []
return flds

View File

@ -149,7 +149,8 @@ class EsQueryer:
atks = toDict(atks)
btkss = [toDict(tks) for tks in btkss]
tksim = [self.similarity(atks, btks) for btks in btkss]
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
return np.array(sims[0]) * vtweight + \
np.array(tksim) * tkweight, tksim, sims[0]
def similarity(self, qtwt, dtwt):
if isinstance(dtwt, type("")):
@ -159,11 +160,11 @@ class EsQueryer:
s = 1e-9
for k, v in qtwt.items():
if k in dtwt:
s += v# * dtwt[k]
s += v # * dtwt[k]
q = 1e-9
for k, v in qtwt.items():
q += v #* v
q += v # * v
#d = 1e-9
#for k, v in dtwt.items():
# for k, v in dtwt.items():
# d += v * v
return s / q #math.sqrt(q) / math.sqrt(d)
return s / q # math.sqrt(q) / math.sqrt(d)

View File

@ -80,14 +80,18 @@ class Dealer:
if not req.get("sort"):
s = s.sort(
{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
else:
s = s.sort(
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"page_num_int": {"order": "asc", "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
{"create_timestamp_flt": {
"order": "desc", "unmapped_type": "float"}}
)
if qst:
@ -180,11 +184,13 @@ class Dealer:
m = {n: d.get(n) for n in flds if d.get(n) is not None}
for n, v in m.items():
if isinstance(v, type([])):
m[n] = "\t".join([str(vv) if not isinstance(vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
m[n] = "\t".join([str(vv) if not isinstance(
vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
continue
if not isinstance(v, type("")):
m[n] = str(m[n])
if n.find("tks")>0: m[n] = rmSpace(m[n])
if n.find("tks") > 0:
m[n] = rmSpace(m[n])
if m:
res[d["id"]] = m
@ -205,12 +211,16 @@ class Dealer:
if pieces[i] == "```":
st = i
i += 1
while i<len(pieces) and pieces[i] != "```":
while i < len(pieces) and pieces[i] != "```":
i += 1
if i < len(pieces): i += 1
pieces_.append("".join(pieces[st: i])+"\n")
if i < len(pieces):
i += 1
pieces_.append("".join(pieces[st: i]) + "\n")
else:
pieces_.extend(re.split(r"([^\|][;。?!\n]|[a-z][.?;!][ \n])", pieces[i]))
pieces_.extend(
re.split(
r"([^\|][;。?!\n]|[a-z][.?;!][ \n])",
pieces[i]))
i += 1
pieces = pieces_
else:
@ -234,7 +244,8 @@ class Dealer:
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
len(ans_v[0]), len(chunk_v[0]))
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
for ck in chunks]
cites = {}
for i, a in enumerate(pieces_):
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
@ -258,9 +269,11 @@ class Dealer:
continue
if i not in cites:
continue
for c in cites[i]: assert int(c) < len(chunk_v)
for c in cites[i]:
if c in seted:continue
assert int(c) < len(chunk_v)
for c in cites[i]:
if c in seted:
continue
res += f" ##{c}$$"
seted.add(c)
@ -343,7 +356,11 @@ class Dealer:
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
ranks["doc_aggs"] = [{"doc_name": k,
"doc_id": v["doc_id"],
"count": v["count"]} for k,
v in sorted(ranks["doc_aggs"].items(),
key=lambda x:x[1]["count"] * -1)]
return ranks
@ -354,10 +371,17 @@ class Dealer:
replaces = []
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(3)
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
fld, huqie.qieqie(huqie.qie(v)))
replaces.append(
("{}{}'{}'".format(
r.group(1),
r.group(2),
r.group(3)),
match))
for p, r in replaces: sql = sql.replace(p, r, 1)
for p, r in replaces:
sql = sql.replace(p, r, 1)
chat_logger.info(f"To es: {sql}")
try:
@ -366,4 +390,3 @@ class Dealer:
except Exception as e:
chat_logger.error(f"SQL failure: {sql} =>" + str(e))
return {"error": str(e)}

View File

@ -150,8 +150,10 @@ class Dealer:
return 6
def ner(t):
if re.match(r"[0-9,.]{2,}$", t): return 2
if re.match(r"[a-z]{1,2}$", t): return 0.01
if re.match(r"[0-9,.]{2,}$", t):
return 2
if re.match(r"[a-z]{1,2}$", t):
return 0.01
if not self.ne or t not in self.ne:
return 1
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,