# mirror of https://github.com/infiniflow/ragflow.git
import re
from nltk import word_tokenize
from rag.nlp import stemmer, huqie
# Hierarchies of bullet/heading regexes.  Each inner list is one numbering
# style (Chinese legal-document headings, decimal outline, Chinese
# chapter/section headings, English chapter headings);
# bullets_category() picks the style that matches the most sections.
BULLET_PATTERN = [[
    r"第[零一二三四五六七八九十百]+(编|部分)",
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"第[零一二三四五六七八九十百]+条",
    r"[\((][零一二三四五六七八九十百]+[\))]",
], [
    r"[0-9]{,3}[\. 、]",
    r"[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
    r"[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}\.[0-9]{,2}",
], [
    r"第[零一二三四五六七八九十百]+章",
    r"第[零一二三四五六七八九十百]+节",
    r"[零一二三四五六七八九十百]+[ 、]",
    r"[\((][零一二三四五六七八九十百]+[\))]",
    r"[\((][0-9]{,2}[\))]",
], [
    r"PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)",
    r"Chapter (I+V?|VI*|XI|IX|X)",
    r"Section [0-9]+",
    r"Article [0-9]+"
]]


def bullets_category(sections):
    """Return the index into BULLET_PATTERN of the numbering style that
    matches the most sections, or -1 if no style matches at all.

    Args:
        sections: iterable of section-heading strings.

    Returns:
        int: index of the best-matching pattern group (first one wins on
        ties), or -1 when no section matches any group.
    """
    # No `global` needed: BULLET_PATTERN is only read, never rebound.
    hits = [0] * len(BULLET_PATTERN)
    for i, patterns in enumerate(BULLET_PATTERN):
        for sec in sections:
            # A section counts at most once per style, even if several
            # patterns of that style would match.
            if any(re.match(p, sec) for p in patterns):
                hits[i] += 1
    best, maximum = -1, 0
    for i, h in enumerate(hits):
        if h > maximum:
            best, maximum = i, h
    return best
def is_english(texts):
    """Heuristically decide whether *texts* is mostly English.

    A text counts as English when, after stripping, it starts with at
    least two ASCII letters.

    Args:
        texts: list of strings.

    Returns:
        bool: True when more than 80% of the texts look English.
        An empty input yields False (the original raised
        ZeroDivisionError on ``len(texts) == 0``).
    """
    if not texts:
        return False
    eng = sum(1 for t in texts if re.match(r"[a-zA-Z]{2,}", t.strip()))
    return eng / len(texts) > 0.8
def tokenize(d, t, eng):
    """Populate chunk dict *d* with tokenized forms of text *t*, in place.

    Stores the raw text under "content_with_weight", coarse tokens under
    "content_ltks", and fine-grained tokens under "content_sm_ltks".
    *eng* selects English (NLTK word_tokenize + stemming) versus Chinese
    (huqie) tokenization.
    """
    d["content_with_weight"] = t
    if eng:
        # Join hyphenated words (e.g. "re-use" -> "reuse") before tokenizing.
        dehyphenated = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
        stems = (stemmer.stem(w) for w in word_tokenize(dehyphenated))
        coarse = " ".join(stems)
    else:
        coarse = huqie.qie(t)
    d["content_ltks"] = coarse
    d["content_sm_ltks"] = huqie.qieqie(coarse)
def remove_contents_table(sections, eng=False):
    """Strip a table-of-contents (or acknowledgement) region from
    *sections*, in place.

    A section may be a plain string or a (text, ...) tuple; only the text
    part is inspected.  When a heading like "Contents"/"目录" is found,
    that heading and the following TOC entries are dropped, up to the
    first later section that repeats the prefix of the first TOC entry
    (i.e. where the real body resumes).

    Args:
        sections: mutable list of str or tuple; modified in place.
        eng: True for English documents — the TOC-entry prefix is then the
            first two words instead of the first three characters.
    """
    def get(i):
        # Text of the i-th section, whether it is a str or a tuple.
        return (sections[i] if isinstance(sections[i], str) else sections[i][0]).strip()

    i = 0
    while i < len(sections):
        # BUG FIX: the original passed re.IGNORECASE as re.sub's *count*
        # argument (count=2, no flag).  Apply it as a flag on the heading
        # match instead, so "Contents"/"CONTENTS" are recognized too, and
        # strip ALL whitespace runs rather than just the first two.
        heading = re.sub(r"( | |\u3000)+", "", get(i).split("@@")[0])
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                        heading, flags=re.IGNORECASE):
            i += 1
            continue
        sections.pop(i)
        if i >= len(sections):
            break
        # Prefix of the first non-empty TOC entry; used to find where the
        # document body resumes.
        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        while not prefix:
            sections.pop(i)
            if i >= len(sections):
                break
            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
        if i >= len(sections):
            # BUG FIX: the original fell through to pop() here and raised
            # IndexError when the list was exhausted by the loop above.
            break
        sections.pop(i)
        if i >= len(sections) or not prefix:
            break
        # Drop everything up to the section that repeats the prefix
        # (search window capped at 128 entries).
        for j in range(i, min(i + 128, len(sections))):
            if not re.match(prefix, get(j)):
                continue
            for _ in range(i, j):
                sections.pop(i)
            break