Apply PEP 8 formatting (#155)

2026-02-01 08:05:07 +08:00 · 2024-03-27 11:33:46 +08:00
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions
--- a/rag/nlp/huchunk.py
+++ b/rag/nlp/huchunk.py
@@ -372,7 +372,8 @@ class PptChunker(HuChunker):
            tb = shape.table
            rows = []
            for i in range(1, len(tb.rows)):
-                rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
+                rows.append("; ".join([tb.cell(
+                    0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
            return "\n".join(rows)

        if shape.has_text_frame:
@@ -382,7 +383,8 @@ class PptChunker(HuChunker):
            texts = []
            for p in shape.shapes:
                t = self.__extract(p)
-                if t: texts.append(t)
+                if t:
+                    texts.append(t)
            return "\n".join(texts)

    def __call__(self, fnm):
@@ -395,7 +397,8 @@ class PptChunker(HuChunker):
            texts = []
            for shape in slide.shapes:
                txt = self.__extract(shape)
-                if txt: texts.append(txt)
+                if txt:
+                    texts.append(txt)
            txts.append("\n".join(texts))

        import aspose.slides as slides
@@ -404,9 +407,12 @@ class PptChunker(HuChunker):
        with slides.Presentation(BytesIO(fnm)) as presentation:
            for slide in presentation.slides:
                buffered = BytesIO()
-                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
+                slide.get_thumbnail(
+                    0.5, 0.5).save(
+                    buffered, drawing.imaging.ImageFormat.jpeg)
                imgs.append(buffered.getvalue())
-        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        assert len(imgs) == len(
+            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))

        flds = self.Fields()
        flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
@@ -445,7 +451,8 @@ class TextChunker(HuChunker):
        if isinstance(fnm, str):
            with open(fnm, "r") as f:
                txt = f.read()
-        else: txt = fnm.decode("utf-8")
+        else:
+            txt = fnm.decode("utf-8")
        flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
        flds.table_chunks = []
        return flds
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -149,7 +149,8 @@ class EsQueryer:
        atks = toDict(atks)
        btkss = [toDict(tks) for tks in btkss]
        tksim = [self.similarity(atks, btks) for btks in btkss]
-        return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
+        return np.array(sims[0]) * vtweight + \
+            np.array(tksim) * tkweight, tksim, sims[0]

    def similarity(self, qtwt, dtwt):
        if isinstance(dtwt, type("")):
@@ -159,11 +160,11 @@ class EsQueryer:
        s = 1e-9
        for k, v in qtwt.items():
            if k in dtwt:
-                s += v# * dtwt[k]
+                s += v  # * dtwt[k]
        q = 1e-9
        for k, v in qtwt.items():
-            q += v #* v
+            q += v  # * v
        #d = 1e-9
-        #for k, v in dtwt.items():
+        # for k, v in dtwt.items():
        #    d += v * v
-        return s / q #math.sqrt(q) / math.sqrt(d)
+        return s / q  # math.sqrt(q) / math.sqrt(d)
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -80,14 +80,18 @@ class Dealer:
            if not req.get("sort"):
                s = s.sort(
                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
-                    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
+                    {"create_timestamp_flt": {
+                        "order": "desc", "unmapped_type": "float"}}
                )
            else:
                s = s.sort(
-                    {"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
-                    {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
+                    {"page_num_int": {"order": "asc", "unmapped_type": "float",
+                                      "mode": "avg", "numeric_type": "double"}},
+                    {"top_int": {"order": "asc", "unmapped_type": "float",
+                                 "mode": "avg", "numeric_type": "double"}},
                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
-                    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
+                    {"create_timestamp_flt": {
+                        "order": "desc", "unmapped_type": "float"}}
                )

        if qst:
@@ -180,11 +184,13 @@ class Dealer:
            m = {n: d.get(n) for n in flds if d.get(n) is not None}
            for n, v in m.items():
                if isinstance(v, type([])):
-                    m[n] = "\t".join([str(vv) if not isinstance(vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
+                    m[n] = "\t".join([str(vv) if not isinstance(
+                        vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
                    continue
                if not isinstance(v, type("")):
                    m[n] = str(m[n])
-                if n.find("tks")>0: m[n] = rmSpace(m[n])
+                if n.find("tks") > 0:
+                    m[n] = rmSpace(m[n])

            if m:
                res[d["id"]] = m
@@ -205,12 +211,16 @@ class Dealer:
                if pieces[i] == "```":
                    st = i
                    i += 1
-                    while i<len(pieces) and pieces[i] != "```":
+                    while i < len(pieces) and pieces[i] != "```":
                        i += 1
-                    if i < len(pieces): i += 1
-                    pieces_.append("".join(pieces[st: i])+"\n")
+                    if i < len(pieces):
+                        i += 1
+                    pieces_.append("".join(pieces[st: i]) + "\n")
                else:
-                    pieces_.extend(re.split(r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])", pieces[i]))
+                    pieces_.extend(
+                        re.split(
+                            r"([^\|][；。？!！\n]|[a-z][.?;!][ \n])",
+                            pieces[i]))
                    i += 1
            pieces = pieces_
        else:
@@ -234,7 +244,8 @@ class Dealer:
        assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
            len(ans_v[0]), len(chunk_v[0]))

-        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
+        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
+                      for ck in chunks]
        cites = {}
        for i, a in enumerate(pieces_):
            sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
@@ -258,9 +269,11 @@ class Dealer:
                continue
            if i not in cites:
                continue
-            for c in cites[i]: assert int(c) < len(chunk_v)
            for c in cites[i]:
-                if c in seted:continue
+                assert int(c) < len(chunk_v)
+            for c in cites[i]:
+                if c in seted:
+                    continue
                res += f" ##{c}$$"
                seted.add(c)

@@ -343,7 +356,11 @@ class Dealer:
            if dnm not in ranks["doc_aggs"]:
                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
            ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
+        ranks["doc_aggs"] = [{"doc_name": k,
+                              "doc_id": v["doc_id"],
+                              "count": v["count"]} for k,
+                             v in sorted(ranks["doc_aggs"].items(),
+                                         key=lambda x:x[1]["count"] * -1)]

        return ranks

@@ -354,10 +371,17 @@ class Dealer:
        replaces = []
        for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
            fld, v = r.group(1), r.group(3)
-            match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
-            replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
+            match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
+                fld, huqie.qieqie(huqie.qie(v)))
+            replaces.append(
+                ("{}{}'{}'".format(
+                    r.group(1),
+                    r.group(2),
+                    r.group(3)),
+                    match))

-        for p, r in replaces: sql = sql.replace(p, r, 1)
+        for p, r in replaces:
+            sql = sql.replace(p, r, 1)
        chat_logger.info(f"To es: {sql}")

        try:
@@ -366,4 +390,3 @@ class Dealer:
        except Exception as e:
            chat_logger.error(f"SQL failure: {sql} =>" + str(e))
            return {"error": str(e)}
-
--- a/rag/nlp/term_weight.py
+++ b/rag/nlp/term_weight.py
@@ -150,8 +150,10 @@ class Dealer:
            return 6

        def ner(t):
-            if re.match(r"[0-9,.]{2,}$", t): return 2
-            if re.match(r"[a-z]{1,2}$", t): return 0.01
+            if re.match(r"[0-9,.]{2,}$", t):
+                return 2
+            if re.match(r"[a-z]{1,2}$", t):
+                return 0.01
            if not self.ne or t not in self.ne:
                return 1
            m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,