fix github account login issue (#132)

Author: KevinHuSh
Date: 2024-03-19 15:31:47 +08:00
Committed by: GitHub
Commit: 9a843667b3 (parent 9da671b951)

6 changed files with 15 additions and 9 deletions

@@ -106,7 +106,9 @@ def github_callback():
         stat_logger.exception(e)
         return redirect("/?error=%s"%str(e))
     user = users[0]
+    user.access_token = get_uuid()
     login_user(user)
+    user.save()
     return redirect("/?auth=%s" % user.get_id())

@@ -639,7 +639,7 @@ class HuParser:
             mink = ""
             minv = 1000000000
             for k, bxs in tbls.items():
-                for b in bxs[:10]:
+                for b in bxs:
                     if b.get("layout_type", "").find("caption") >= 0:
                         continue
                     y_dis = self._y_dis(c, b)

@@ -62,9 +62,6 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
-        # merge chunks with the same bullets
-        self._merge_with_same_bullet()
-
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         bull = bullets_category([b["text"] for b in self.boxes])
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
         sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
-            sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
         chunks = []
         last_sid = -2
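
The page-offset change matters when parsing only a slice of a PDF: section boxes are numbered relative to from_page, while table positions come back with absolute page numbers, so from_page must be subtracted to keep both in the same coordinate space (the rows guard additionally accepts the table body as a plain string instead of a list). A small sketch under that reading; the (page, left, right, top, bottom) tuple layout is inferred from the hunk, not confirmed:

    from_page = 5
    poss = [(6, 10.0, 200.0, 50.0, 120.0)]   # table located on absolute page 6
    normalized = [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]
    print(normalized)  # [(2, 10.0, 200.0, 50.0, 120.0)]: second page of this slice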

@@ -11,6 +11,7 @@
 #  limitations under the License.
 #
 import re
+from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         All the deformed lines will be ignored.
         Every pair of Q&A will be treated as a chunk.
     """
+    eng = lang.lower() == "english"
     res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
         for q, a in excel_parser(filename, binary, callback):
-            res.append(beAdoc({}, q, a, excel_parser.is_english))
+            res.append(beAdoc(deepcopy(doc), q, a, eng))
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 break
             txt += l
         lines = txt.split("\n")
-        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
+        #is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
             if len(arr) != 2:
                 fails.append(str(i))
                 continue
-            res.append(beAdoc({}, arr[0], arr[1], eng))
+            res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
             if len(res) % 999 == 0:
                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

@@ -76,6 +76,7 @@ def is_english(texts):
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
     if eng:
         t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
         d["content_ltks"] = " ".join([stemmer.stem(w)

@@ -29,7 +29,7 @@ class EsQueryer:
         for t in arr:
             if not re.match(r"[a-zA-Z]+$", t):
                 e += 1
-        return e * 1. / len(arr) >= 0.8
+        return e * 1. / len(arr) >= 0.7

     @staticmethod
     def rmWWW(txt):
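
Worked example of the relaxed threshold: the enclosing method (its name is not shown in the hunk) flags text once enough tokens fail the pure-ASCII-letters match, and lowering the cut-off from 0.8 to 0.7 lets mixed queries like the one below qualify (the sample tokens are illustrative):

    import re

    arr = ["什么", "是", "RAG", "架构"]  # 3 of 4 tokens fail [a-zA-Z]+$
    e = sum(1 for t in arr if not re.match(r"[a-zA-Z]+$", t))
    ratio = e * 1. / len(arr)           # 0.75
    print(ratio >= 0.8, ratio >= 0.7)   # False True: caught only after the fix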