refine for English corpus (#135)

KevinHuSh
2024-03-20 16:56:16 +08:00
committed by GitHub
parent 78727c8809
commit 6999598101
12 changed files with 216 additions and 125 deletions

rag/nlp/__init__.py

@@ -3,14 +3,9 @@ from collections import Counter
 from rag.utils import num_tokens_from_string
 from . import huqie
-from nltk import word_tokenize
 import re
 import copy
-from nltk.stem import PorterStemmer
-stemmer = PorterStemmer()
 
 BULLET_PATTERN = [[
     r"第[零一二三四五六七八九十百0-9]+(分?编|部分)",
@@ -77,13 +72,8 @@ def is_english(texts):
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
     t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
-    if eng:
-        t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
-        d["content_ltks"] = " ".join([stemmer.stem(w)
-                                      for w in word_tokenize(t)])
-    else:
-        d["content_ltks"] = huqie.qie(t)
-        d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = huqie.qie(t)
+    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 
 
 def tokenize_table(tbls, doc, eng, batch_size=10):
@@ -94,8 +84,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             continue
         if isinstance(rows, str):
             d = copy.deepcopy(doc)
-            r = re.sub(r"<[^<>]{,12}>", "", rows)
-            tokenize(d, r, eng)
+            tokenize(d, rows, eng)
             d["content_with_weight"] = rows
             d["image"] = img
             add_positions(d, poss)
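Net effect of this file: `tokenize` no longer special-cases English itself; every caller now goes through `huqie.qie`, which does its own language detection (see the next file). A minimal sketch of the resulting control flow, with `qie`/`qieqie` stubbed out since the real implementations live in huqie.py:

    import re

    def qie(text):
        # Stub for huqie.qie: the real method detects mostly-English input
        # and tokenizes it with NLTK; Chinese goes through the trie segmenter.
        return " ".join(text.lower().split())

    def qieqie(tokens):
        # Stub for huqie.qieqie (fine-grained re-segmentation).
        return tokens

    def tokenize(d, t, eng):
        # Post-change logic: strip table tags, then always delegate,
        # regardless of the `eng` flag.
        d["content_with_weight"] = t
        t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
        d["content_ltks"] = qie(t)
        d["content_sm_ltks"] = qieqie(d["content_ltks"])

    d = {}
    tokenize(d, "<table><tr><td>GPU memory usage</td></tr></table>", eng=True)
    print(d["content_ltks"])  # gpu memory usage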

rag/nlp/huqie.py

@@ -8,7 +8,8 @@ import re
 import string
 import sys
 from hanziconv import HanziConv
+from nltk import word_tokenize
+from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory
@@ -45,6 +46,9 @@ class Huqie:
         self.trie_ = datrie.Trie(string.printable)
         self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
+        self.stemmer = PorterStemmer()
+        self.lemmatizer = WordNetLemmatizer()
         self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
         try:
             self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
@@ -239,6 +243,10 @@ class Huqie:
     def qie(self, line):
         line = self._strQ2B(line).lower()
         line = self._tradi2simp(line)
+        zh_num = len([1 for c in line if is_chinese(c)])
+        if zh_num < len(line) * 0.2:
+            return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])
         arr = re.split(self.SPLIT_CHAR, line)
         res = []
         for L in arr:
@@ -290,8 +298,12 @@ class Huqie:
         return self.merge_(res)
 
     def qieqie(self, tks):
+        tks = tks.split(" ")
+        zh_num = len([1 for c in tks if c and is_chinese(c[0])])
+        if zh_num < len(tks) * 0.2:return " ".join(tks)
         res = []
-        for tk in tks.split(" "):
+        for tk in tks:
             if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
                 res.append(tk)
                 continue
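The new early exits in `qie` and `qieqie` treat input as English when fewer than 20% of its characters (or token-leading characters) are Chinese, and send it through NLTK's tokenize → lemmatize → stem pipeline instead of the trie segmenter. A hedged, standalone sketch of that English path (requires the NLTK `punkt` and `wordnet` data; `is_chinese` is approximated here with a CJK range check, not copied from the source):

    from nltk import word_tokenize
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    def is_chinese(ch):
        # Approximation: CJK Unified Ideographs block.
        return "\u4e00" <= ch <= "\u9fff"

    def qie_english_path(line):
        line = line.lower()  # the real qie also full-width-normalizes first
        zh_num = len([1 for c in line if is_chinese(c)])
        if zh_num < len(line) * 0.2:
            # lemmatize ("queries" -> "query"), then stem ("query" -> "queri")
            return " ".join(stemmer.stem(lemmatizer.lemmatize(t))
                            for t in word_tokenize(line))
        return None  # mostly Chinese: fall through to the trie segmentation

    print(qie_english_path("Retrieval augmented generation queries"))
    # -> something like: retriev augment gener queri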

rag/nlp/query.py

@@ -4,8 +4,8 @@ import json
 import re
 import logging
 import copy
 import math
-from elasticsearch_dsl import Q, Search
+from elasticsearch_dsl import Q
 from rag.nlp import huqie, term_weight, synonym
@@ -33,12 +33,14 @@ class EsQueryer:
     @staticmethod
     def rmWWW(txt):
-        txt = re.sub(
-            r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*",
-            "",
-            txt)
-        return re.sub(
-            r"(what|who|how|which|where|why|(is|are|were|was) there) (is|are|were|was|to)*", "", txt, re.IGNORECASE)
+        patts = [
+            (r"是*(什么样的|哪家|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", ""),
+            (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
+            (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down)", " ")
+        ]
+        for r, p in patts:
+            txt = re.sub(r, p, txt, flags=re.IGNORECASE)
+        return txt
 
     def question(self, txt, tbl="qa", min_match="60%"):
         txt = re.sub(
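Besides being easier to extend, the table-driven rewrite fixes a real bug: `re.sub(pattern, repl, string, count, flags)` takes `count` as its fourth positional argument, so the old `re.sub(..., txt, re.IGNORECASE)` was silently passing the flag value as a replacement count and never matching case-insensitively. A small demo of the new loop with trimmed patterns (the full stop-word list is in the hunk above):

    import re

    # Trimmed stand-ins for the new English patterns: interrogatives first,
    # then auxiliary/stop words. Each entry is (pattern, replacement).
    patts = [
        (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
        (r"(^| )('s|'re|is|are|the|a|an|of|to)", " "),
    ]

    def rm_www(txt):
        for r, p in patts:
            txt = re.sub(r, p, txt, flags=re.IGNORECASE)
        return txt

    q = "What is the capital of France"
    print(" ".join(rm_www(q).split()))  # -> capital France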
@@ -50,7 +52,7 @@ class EsQueryer:
         txt = EsQueryer.rmWWW(txt)
         if not self.isChinese(txt):
-            tks = [t for t in txt.split(" ") if t.strip()]
+            tks = huqie.qie(txt).split(" ")
             q = tks
             for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
@@ -58,9 +60,9 @@ class EsQueryer:
             q.append(txt)
             return Q("bool",
                      must=Q("query_string", fields=self.flds,
-                            type="best_fields", query=" OR ".join(q),
+                            type="best_fields", query=" ".join(q),
                             boost=1, minimum_should_match=min_match)
-                     ), txt.split(" ")
+                     ), tks
 
         def needQieqie(tk):
             if len(tk) < 4:
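For non-Chinese input, `question` now tokenizes with `huqie.qie` and joins the clauses with spaces, relying on `minimum_should_match` instead of explicit `OR`s; adjacent tokens are still added as `^2`-boosted phrases, and the same `tks` list is now returned as the keywords. A simplified sketch of the query string being built (the real method also mixes in term weights and synonyms):

    def build_query_terms(txt):
        tks = [t for t in txt.split(" ") if t.strip()]
        q = list(tks)                       # the diff aliases q = tks directly
        for i in range(1, len(tks)):
            q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))  # boosted bigram phrase
        q.append(txt)
        return " ".join(q)                  # was " OR ".join(q) before this commit

    print(build_query_terms("vector search engine"))
    # vector search engine "vector search"^2 "search engine"^2 vector search engine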
@@ -160,8 +162,8 @@ class EsQueryer:
                 s += v# * dtwt[k]
         q = 1e-9
         for k, v in qtwt.items():
-            q += v * v
-        d = 1e-9
-        for k, v in dtwt.items():
-            d += v * v
-        return s / q#math.sqrt(q) / math.sqrt(d)
+            q += v #* v
+        #d = 1e-9
+        #for k, v in dtwt.items():
+        #    d += v * v
+        return s / q #math.sqrt(q) / math.sqrt(d)
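With the squared terms and the document-side normalizer commented out, the token similarity reduces to weighted query coverage: the share of total query term weight that also occurs in the document, rather than a cosine. A sketch under that reading (assumes `qtwt`/`dtwt` map term → weight, and that `s` is accumulated only for query terms present in the document, as the surrounding code suggests):

    def token_similarity(dtwt, qtwt):
        s = 1e-9
        for k, v in qtwt.items():
            if k in dtwt:
                s += v          # query weight covered by the document
        q = 1e-9
        for k, v in qtwt.items():
            q += v              # total query weight (no longer squared)
        return s / q

    qtwt = {"vector": 0.6, "search": 0.4}
    dtwt = {"vector": 0.5, "index": 0.5}
    print(round(token_similarity(dtwt, qtwt), 3))  # 0.6: only "vector" matched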

rag/nlp/search.py

@@ -196,7 +196,24 @@ class Dealer:
     def insert_citations(self, answer, chunks, chunk_v,
                          embd_mdl, tkweight=0.7, vtweight=0.3):
         assert len(chunks) == len(chunk_v)
-        pieces = re.split(r"([;。?!\n]|[a-z][.?;!][ \n])", answer)
+        pieces = re.split(r"(```)", answer)
+        if len(pieces) >= 3:
+            i = 0
+            pieces_ = []
+            while i < len(pieces):
+                if pieces[i] == "```":
+                    st = i
+                    i += 1
+                    while i < len(pieces) and pieces[i] != "```":
+                        i += 1
+                    if i < len(pieces): i += 1
+                    pieces_.append("".join(pieces[st: i]) + "\n")
+                else:
+                    pieces_.extend(re.split(r"([^\|][;。?!\n]|[a-z][.?;!][ \n])", pieces[i]))
+                    i += 1
+            pieces = pieces_
+        else:
+            pieces = re.split(r"([^\|][;。?!\n]|[a-z][.?;!][ \n])", answer)
         for i in range(1, len(pieces)):
             if re.match(r"[a-z][.?;!][ \n]", pieces[i]):
                 pieces[i - 1] += pieces[i][0]
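The rewritten splitter first cuts the answer at ``` markers and keeps each fenced region as one piece, so citation markers can no longer be injected into the middle of a code block; ordinary text still splits at sentence-ending punctuation. A standalone version of just that splitting step (same logic, minus the citation scoring):

    import re

    SENT = r"([^\|][;。?!\n]|[a-z][.?;!][ \n])"  # sentence-boundary pattern from the diff

    def split_answer(answer):
        pieces = re.split(r"(```)", answer)
        if len(pieces) < 3:          # no fenced code block present
            return re.split(SENT, answer)
        out, i = [], 0
        while i < len(pieces):
            if pieces[i] == "```":   # opening fence: swallow through the closing one
                st = i
                i += 1
                while i < len(pieces) and pieces[i] != "```":
                    i += 1
                if i < len(pieces):
                    i += 1           # include the closing fence itself
                out.append("".join(pieces[st:i]) + "\n")
            else:
                out.extend(re.split(SENT, pieces[i]))
                i += 1
        return out

    ans = "Install it first. ```pip install ragflow``` Then restart the server.\n"
    for p in split_answer(ans):
        print(repr(p))  # the fenced block survives as a single piece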
@@ -226,7 +243,7 @@ class Dealer:
                                                    chunks_tks,
                                                    tkweight, vtweight)
             mx = np.max(sim) * 0.99
-            if mx < 0.66:
+            if mx < 0.7:
                 continue
             cites[idx[i]] = list(
                 set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
@@ -249,6 +266,7 @@ class Dealer:
     def rerank(self, sres, query, tkweight=0.3,
                vtweight=0.7, cfield="content_ltks"):
+        _, keywords = self.qryr.question(query)
         ins_embd = [
             Dealer.trans2floats(
                 sres.field[i].get("q_%d_vec" % len(sres.query_vector), "\t".join(["0"] * len(sres.query_vector)))) for i in sres.ids]
@@ -258,8 +276,7 @@ class Dealer:
             for i in sres.ids]
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
                                                         ins_embd,
-                                                        huqie.qie(
-                                                            query).split(" "),
+                                                        keywords,
                                                         ins_tw, tkweight, vtweight)
         return sim, tksim, vtsim
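`rerank` now reuses the `keywords` that `EsQueryer.question` already extracted, so reranking scores against the same cleaned term list that retrieval used, instead of re-tokenizing the raw query with `huqie.qie`. For reference, a hedged sketch of the kind of blend `hybrid_similarity` returns; the linear `tkweight`/`vtweight` combination matches the parameter names, but the exact formula is an assumption, not lifted from this diff:

    import numpy as np

    def hybrid_similarity(q_vec, d_vecs, tksim, tkweight=0.3, vtweight=0.7):
        # Cosine similarity between the query vector and each chunk vector,
        # blended linearly with a precomputed token-overlap similarity.
        q = np.asarray(q_vec, dtype=float)
        D = np.asarray(d_vecs, dtype=float)
        vtsim = D @ q / (np.linalg.norm(D, axis=1) * np.linalg.norm(q) + 1e-9)
        return tkweight * np.asarray(tksim) + vtweight * vtsim

    sim = hybrid_similarity([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]], tksim=[0.8, 0.1])
    print(sim.round(2))  # [0.94 0.03]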