From 9a843667b3e4fb8ee4ebc9d5010b00b41f770370 Mon Sep 17 00:00:00 2001
From: KevinHuSh
Date: Tue, 19 Mar 2024 15:31:47 +0800
Subject: [PATCH] fix github account login issue (#132)

---
 api/apps/user_app.py | 2 ++
 deepdoc/parser/pdf_parser.py | 2 +-
 rag/app/manual.py | 5 +----
 rag/app/qa.py | 12 +++++++++---
 rag/nlp/__init__.py | 1 +
 rag/nlp/query.py | 2 +-
 6 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index 7ff2364f6..c3dd68ef2 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -106,7 +106,9 @@ def github_callback():
         stat_logger.exception(e)
         return redirect("/?error=%s"%str(e))
     user = users[0]
+    user.access_token = get_uuid()
     login_user(user)
+    user.save()
     return redirect("/?auth=%s" % user.get_id())
 
 
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 8aaa7dc51..2f6bec07b 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -639,7 +639,7 @@ class HuParser:
             mink = ""
             minv = 1000000000
             for k, bxs in tbls.items():
-                for b in bxs[:10]:
+                for b in bxs:
                     if b.get("layout_type", "").find("caption") >= 0:
                         continue
                     y_dis = self._y_dis(c, b)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 018a3b725..b3dc688ec 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
 
-        # merge chunks with the same bullets
-        self._merge_with_same_bullet()
-
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         bull = bullets_category([b["text"] for b in self.boxes])
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
         sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
-            sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
 
         chunks = []
         last_sid = -2
diff --git a/rag/app/qa.py b/rag/app/qa.py
index e649b4a23..0a546921e 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import re
+from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         All the deformed lines will be ignored.
         Every pair of Q&A will be treated as a chunk.
""" + eng = lang.lower() == "english" res = [] + doc = { + "docnm_kwd": filename, + "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + } if re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = Excel() for q, a in excel_parser(filename, binary, callback): - res.append(beAdoc({}, q, a, excel_parser.is_english)) + res.append(beAdoc(deepcopy(doc), q, a, eng)) return res elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") @@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): break txt += l lines = txt.split("\n") - eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]]) + #is_english([rmPrefix(l) for l in lines[:100]]) fails = [] for i, line in enumerate(lines): arr = [l for l in line.split("\t") if len(l) > 1] if len(arr) != 2: fails.append(str(i)) continue - res.append(beAdoc({}, arr[0], arr[1], eng)) + res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng)) if len(res) % 999 == 0: callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + ( f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 95e8308ea..4afd37624 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -76,6 +76,7 @@ def is_english(texts): def tokenize(d, t, eng): d["content_with_weight"] = t + t = re.sub(r"]{0,12})?>", " ", t) if eng: t = re.sub(r"([a-z])-([a-z])", r"\1\2", t) d["content_ltks"] = " ".join([stemmer.stem(w) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 3d944b3be..608f18f99 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -29,7 +29,7 @@ class EsQueryer: for t in arr: if not re.match(r"[a-zA-Z]+$", t): e += 1 - return e * 1. / len(arr) >= 0.8 + return e * 1. / len(arr) >= 0.7 @staticmethod def rmWWW(txt):