Add pdf support for QA parser (#1155)

### What problem does this PR solve?

Support extracting question/answer pairs from PDF files. The QA parser now runs OCR and layout analysis on the PDF, detects question bullets, and stores each Q/A pair as a chunk together with its cropped image and page positions.
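A minimal way to try it (a sketch, assuming the parser module lives at `rag.app.qa` as in the default repository layout; the sample file name `faq.pdf` and the progress callback are illustrative):

```python
# Sketch only: exercise the new PDF path of the QA chunker.
from rag.app import qa


def progress(prog=None, msg=""):
    # chunk() reports OCR / layout-analysis / merge progress through this callback.
    print(prog, msg)


# A .pdf filename routes through the new Pdf parser; raw bytes can be passed via binary=.
chunks = qa.chunk("faq.pdf", lang="Chinese", callback=progress)
for c in chunks[:3]:
    # Tab-separated "问题:... / 回答:..." (or "Question:/Answer:" for English documents).
    print(c["content_with_weight"])
```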

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Zhedong Cen, 2024-06-14 15:12:39 +08:00 (committed by GitHub)
parent 7dc39cbfa6
commit 90975460af
5 changed files with 194 additions and 10 deletions

@@ -13,13 +13,13 @@
import re
from copy import deepcopy
from io import BytesIO
+from timeit import default_timer as timer
from nltk import word_tokenize
from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices, find_codec
-from rag.nlp import rag_tokenizer
-from deepdoc.parser import ExcelParser
+from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet
+from rag.nlp import rag_tokenizer, tokenize_table
+from rag.settings import cron_logger
+from deepdoc.parser import PdfParser, ExcelParser
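# The new helpers drive the PDF path: qbullets_category() picks the document's
# question-bullet pattern, has_qbullet() tests whether a box opens a new question,
# add_positions() records page coordinates on a chunk, and tokenize_table()
# turns extracted tables into chunks.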
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None):
if not binary:
@@ -62,12 +62,80 @@ class Excel(ExcelParser):
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
return res
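
# New parser: run OCR and layout analysis on the PDF, then group the resulting
# text boxes into (question, answer) pairs keyed on question bullets.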
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
start = timer()
callback(msg="OCR is running...")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
callback(msg="OCR finished")
cron_logger.info("OCR({}~{}): {}".format(from_page, to_page, timer() - start))
start = timer()
self._layouts_rec(zoomin, drop=False)
callback(0.63, "Layout analysis finished.")
self._table_transformer_job(zoomin)
callback(0.65, "Table analysis finished.")
self._text_merge()
callback(0.67, "Text merging finished")
tbls = self._extract_table_figure(True, zoomin, True, True)
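        # Tables and figures are cropped out of the text flow here; chunk()
        # tokenizes them separately via tokenize_table().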
#self._naive_vertical_merge()
# self._concat_downward()
#self._filter_forpages()
cron_logger.info("layouts: {}".format(timer() - start))
sections = [b["text"] for b in self.boxes]
bull_x0_list = []
q_bull, reg = qbullets_category(sections)
if q_bull == -1:
raise ValueError("Unable to recognize Q&A structure.")
qai_list = []
last_q, last_a, last_tag = '', '', ''
last_index = -1
last_box = {'text':''}
last_bull = None
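        # Walk the boxes in reading order: a box matching the question-bullet
        # pattern starts a new Q/A pair; any other box extends the current answer.
        # Line tags accumulate so crop() can later return the answer's image and
        # its page positions.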
for box in self.boxes:
section, line_tag = box['text'], self._line_tag(box, zoomin)
has_bull, index = has_qbullet(reg, box, last_box, last_index, last_bull, bull_x0_list)
last_box, last_index, last_bull = box, index, has_bull
if not has_bull: # No question bullet
if not last_q:
continue
else:
last_a = f'{last_a}{section}'
last_tag = f'{last_tag}{line_tag}'
else:
if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
last_q, last_a, last_tag = '', '', ''
last_q = has_bull.group()
_, end = has_bull.span()
last_a = section[end:]
last_tag = line_tag
if last_q:
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
return qai_list, tbls
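
# Strip a leading Q/A marker (问题, 答案, Question, Answer, Q, A, ...) from a line.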
def rmPrefix(txt):
return re.sub(
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t: ]+", "", txt.strip(), flags=re.IGNORECASE)
def beAdocPdf(d, q, a, eng, image, poss):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
d["content_with_weight"] = "\t".join(
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
d["content_ltks"] = rag_tokenizer.tokenize(q)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
d["image"] = image
add_positions(d, poss)
return d
def beAdoc(d, q, a, eng):
qprefix = "Question: " if eng else "问题:"
aprefix = "Answer: " if eng else "回答:"
@@ -145,6 +213,19 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
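    # New branch: route .pdf files through the Pdf parser; extracted tables become
    # chunks via tokenize_table(), and each (q, a, image, positions) tuple becomes one chunk.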
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
count = 0
qai_list, tbls = pdf_parser(filename if not binary else binary,
from_page=0, to_page=10000, callback=callback)
res = tokenize_table(tbls, doc, eng)
for q, a, image, poss in qai_list:
count += 1
res.append(beAdocPdf(deepcopy(doc), q, a, eng, image, poss))
return res
raise NotImplementedError(
"Excel and csv(txt) format files are supported.")
@@ -153,6 +234,8 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
if __name__ == "__main__":
import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
        pass
-    chunk(sys.argv[1], callback=dummy)
+    import json
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)