Refine resume parts and fix bugs in retrival using sql (#66)

2026-02-01 08:05:07 +08:00 · 2024-02-19 19:22:17 +08:00
parent 452020d33a
commit a8294f2168
29 changed files with 302 additions and 158 deletions
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):


 def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+        Excel and csv(txt) format files are supported.
+        If the file is in excel format, there should be 2 column question and answer without header.
+        And question column is ahead of answer column.
+        And it's O.K if it has multiple sheets as long as the columns are rightly composed.

+        If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
+
+        All the deformed lines will be ignored.
+        Every pair of Q&A will be treated as a chunk.
+    """
    res = []
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")