Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-02 16:45:08 +08:00 · 2025-01-09 17:07:21 +08:00
parent f892d7d426
commit c5da3cdd97
30 changed files with 736 additions and 202 deletions
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -26,6 +26,7 @@ from docx import Document
 from PIL import Image
 from markdown import markdown

+
 class Excel(ExcelParser):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
@ -58,11 +59,11 @@ class Excel(ExcelParser):
                if len(res) % 999 == 0:
                    callback(len(res) *
                             0.6 /
-                             total, ("Extract Q&A: {}".format(len(res)) +
+                             total, ("Extract pairs: {}".format(len(res)) +
                                     (f"{len(fails)} failure, line: %s..." %
                                      (",".join(fails[:3])) if fails else "")))

-        callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
+        callback(0.6, ("Extract pairs: {}. ".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
@ -269,7 +270,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
    return d


-def beAdocDocx(d, q, a, eng, image):
+def beAdocDocx(d, q, a, eng, image, row_num=-1):
    qprefix = "Question: " if eng else "问题："
    aprefix = "Answer: " if eng else "回答："
    d["content_with_weight"] = "\t".join(
@ -277,16 +278,20 @@ def beAdocDocx(d, q, a, eng, image):
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["image"] = image
+    if row_num >= 0:
+        d["top_int"] = [row_num]
    return d


-def beAdoc(d, q, a, eng):
+def beAdoc(d, q, a, eng, row_num=-1):
    qprefix = "Question: " if eng else "问题："
    aprefix = "Answer: " if eng else "回答："
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    d["content_ltks"] = rag_tokenizer.tokenize(q)
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
+    if row_num >= 0:
+        d["top_int"] = [row_num]
    return d


@ -316,8 +321,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
-        for q, a in excel_parser(filename, binary, callback):
-            res.append(beAdoc(deepcopy(doc), q, a, eng))
+        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
+            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
        return res

    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
@ -344,7 +349,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                    fails.append(str(i+1))
            elif len(arr) == 2:
                if question and answer:
-                    res.append(beAdoc(deepcopy(doc), question, answer, eng))
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
                question, answer = arr
            i += 1
            if len(res) % 999 == 0:
@ -352,7 +357,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        if question:
-            res.append(beAdoc(deepcopy(doc), question, answer, eng))
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(lines)))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@ -378,14 +383,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                    fails.append(str(i + 1))
            elif len(row) == 2:
                if question and answer:
-                    res.append(beAdoc(deepcopy(doc), question, answer, eng))
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng, i))
                question, answer = row
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

        if question:
-            res.append(beAdoc(deepcopy(doc), question, answer, eng))
+            res.append(beAdoc(deepcopy(doc), question, answer, eng, len(reader)))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@ -420,7 +425,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                if last_answer.strip():
                    sum_question = '\n'.join(question_stack)
                    if sum_question:
-                        res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
+                        res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
                    last_answer = ''

                i = question_level
@ -432,7 +437,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        if last_answer.strip():
            sum_question = '\n'.join(question_stack)
            if sum_question:
-                res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
+                res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng, index))
        return res

    elif re.search(r"\.docx$", filename, re.IGNORECASE):
@ -440,8 +445,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        qai_list, tbls = docx_parser(filename, binary,
                                    from_page=0, to_page=10000, callback=callback)
        res = tokenize_table(tbls, doc, eng)
-        for q, a, image in qai_list:
-            res.append(beAdocDocx(deepcopy(doc), q, a, eng, image))
+        for i, (q, a, image) in enumerate(qai_list):
+            res.append(beAdocDocx(deepcopy(doc), q, a, eng, image, i))
        return res

    raise NotImplementedError(
--- a/rag/app/tag.py
+++ b/rag/app/tag.py
@ -0,0 +1,125 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import re
+import csv
+from copy import deepcopy
+
+from deepdoc.parser.utils import get_text
+from rag.app.qa import Excel
+from rag.nlp import rag_tokenizer
+
+
+def beAdoc(d, q, a, eng, row_num=-1):
+    """
+        Fill the chunk dict `d` in place from a (content, tags) pair and return it.
+
+        `q` is stored verbatim as the chunk content and tokenized twice
+        (coarse, then fine-grained on the coarse result).
+        `a` is split on commas; every non-blank piece, stripped of surrounding
+        whitespace, becomes one entry of `tag_kwd`.
+        `eng` is accepted but not used by this function.
+        `row_num` is recorded as `top_int` (a one-element list) only when it is >= 0.
+    """
+    d["content_with_weight"] = q
+    coarse_tokens = rag_tokenizer.tokenize(q)
+    d["content_ltks"] = coarse_tokens
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(coarse_tokens)
+    tags = []
+    for piece in a.split(","):
+        piece = piece.strip()
+        if piece:
+            tags.append(piece)
+    d["tag_kwd"] = tags
+    if row_num < 0:
+        return d
+    d["top_int"] = [row_num]
+    return d
+
+
+def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
+    """
+        Turn a file of (content, tags) pairs into a list of tagged chunks.
+
+        Excel and csv(txt) format files are supported.
+        If the file is in excel format, there should be 2 columns, content and tags, without a header row.
+        The content column comes ahead of the tags column.
+        Multiple sheets are fine as long as every sheet has that column layout.
+
+        If it's in csv(txt) format, it should be UTF-8 encoded. Either TAB or comma may
+        separate content and tags:
+          - .txt: TAB is used unless more lines split into exactly two fields on comma than on TAB.
+          - .csv: TAB is used if any line contains a TAB, otherwise comma.
+        Tags themselves are comma separated.
+
+        A line that does not split into exactly two fields is not dropped: its raw text is
+        accumulated and prepended to the content of the next well-formed pair. Such lines
+        at the end of the file, with no pair after them, produce no chunk.
+        Every pair will be treated as a chunk.
+
+        Args:
+            filename: file name; its extension selects the parser, and it is stored in
+                every chunk as `docnm_kwd` (and, minus the extension, tokenized as `title_tks`).
+            binary: file content handed to the parser; passed through untouched.
+            lang: "english" (case-insensitive) sets the `eng` flag forwarded to `beAdoc`.
+            callback: optional progress reporter called as `callback(progress, message)`.
+                When omitted, progress reports are discarded.
+            **kwargs: accepted for interface compatibility and ignored.
+
+        Returns:
+            A list of chunk dicts built by `beAdoc`.
+
+        Raises:
+            NotImplementedError: if the extension is not .xls/.xlsx, .txt or .csv.
+    """
+    if callback is None:
+        # The signature advertises callback=None, so tolerate it instead of
+        # failing with "'NoneType' object is not callable" on the first report.
+        def callback(prog=None, msg=""):
+            pass
+
+    eng = lang.lower() == "english"
+    res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        excel_parser = Excel()
+        for ii, (q, a) in enumerate(excel_parser(filename, binary, callback)):
+            res.append(beAdoc(deepcopy(doc), q, a, eng, ii))
+        return res
+
+    elif re.search(r"\.(txt)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        # Vote for the delimiter: count the lines each candidate splits into exactly two fields.
+        comma = sum(1 for line in lines if len(line.split(",")) == 2)
+        tab = sum(1 for line in lines if len(line.split("\t")) == 2)
+        delimiter = "\t" if tab >= comma else ","
+
+        # Text of malformed lines waiting to be prepended to the next pair's content.
+        content = ""
+        for i, line in enumerate(lines):
+            arr = line.split(delimiter)
+            if len(arr) == 2:
+                content += "\n" + arr[0]
+                res.append(beAdoc(deepcopy(doc), content, arr[1], eng, i))
+                content = ""
+            else:
+                content += "\n" + line
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), "Extract TAG: {}".format(len(res)))
+
+        callback(0.6, "Extract TAG: {}".format(len(res)))
+
+        return res
+
+    elif re.search(r"\.(csv)$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        txt = get_text(filename, binary)
+        lines = txt.split("\n")
+        delimiter = "\t" if any("\t" in line for line in lines) else ","
+
+        # Text of malformed rows waiting to be prepended to the next pair's content.
+        content = ""
+        reader = csv.reader(lines, delimiter=delimiter)
+
+        for i, row in enumerate(reader):
+            if len(row) == 2:
+                content += "\n" + row[0]
+                res.append(beAdoc(deepcopy(doc), content, row[1], eng, i))
+                content = ""
+            else:
+                # NOTE(review): `i` counts parsed rows, not physical lines; the two can
+                # diverge if a quoted field spans several lines - confirm this is acceptable.
+                content += "\n" + lines[i]
+            if len(res) % 999 == 0:
+                callback(len(res) * 0.6 / len(lines), "Extract Tags: {}".format(len(res)))
+
+        callback(0.6, "Extract TAG : {}".format(len(res)))
+        return res
+
+    raise NotImplementedError(
+        "Excel, csv(txt) format files are supported.")
+
+
+if __name__ == "__main__":
+    # Manual smoke run: chunk the file named by the first command-line argument.
+    import sys
+
+    def dummy(prog=None, msg=""):
+        # Progress sink that discards every report.
+        return None
+
+    chunk(sys.argv[1], callback=dummy, from_page=0, to_page=10)