Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Fix GitHub account login issue (#132)
@@ -106,7 +106,9 @@ def github_callback():
             stat_logger.exception(e)
             return redirect("/?error=%s"%str(e))
     user = users[0]
+    user.access_token = get_uuid()
     login_user(user)
+    user.save()
     return redirect("/?auth=%s" % user.get_id())
 
 
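This first hunk is the actual login fix: the GitHub OAuth callback now mints a fresh access_token via get_uuid() and calls user.save() after login_user(user), so the new token is persisted instead of being lost when the request ends. A minimal sketch of the resulting flow (not part of the commit; finish_oauth_login and the peewee-style user.save() model are assumptions, only get_uuid, login_user and the redirects come from the diff):

import uuid

from flask import redirect
from flask_login import login_user


def get_uuid():
    # stand-in for the project's helper of the same name: an opaque per-login token
    return uuid.uuid1().hex


def finish_oauth_login(user):
    # hypothetical wrapper mirroring the changed tail of github_callback()
    user.access_token = get_uuid()   # added: issue a fresh API token on every login
    login_user(user)                 # establish the Flask-Login session
    user.save()                      # added: persist the new token, the missing piece before the fix
    return redirect("/?auth=%s" % user.get_id())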
@@ -639,7 +639,7 @@ class HuParser:
             mink = ""
             minv = 1000000000
             for k, bxs in tbls.items():
-                for b in bxs[:10]:
+                for b in bxs:
                     if b.get("layout_type", "").find("caption") >= 0:
                         continue
                     y_dis = self._y_dis(c, b)
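In HuParser the caption-matching loop previously looked at only the first ten boxes of each table (bxs[:10]); the change scans them all, so the nearest-distance search covers every box of a large table instead of stopping at the tenth. A toy restatement of the selection logic (assuming y_dis(a, b) returns the vertical distance between two layout boxes; the real code is a method on HuParser):

def nearest_table_key(caption, tbls, y_dis):
    # Find the table whose boxes sit closest (vertically) to a caption box.
    mink, minv = "", 1000000000
    for k, bxs in tbls.items():
        for b in bxs:  # after the change: every box is considered, not just the first 10
            if b.get("layout_type", "").find("caption") >= 0:
                continue
            d = y_dis(caption, b)
            if d < minv:
                mink, minv = k, d
    return mink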
@@ -62,9 +62,6 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
 
-        # merge chunks with the same bullets
-        self._merge_with_same_bullet()
-
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
         bull = bullets_category([b["text"] for b in self.boxes])
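This hunk drops the separate _merge_with_same_bullet() pass and relies on the pivot-based merge described by the surviving comments: bullets_category() picks the most frequent title/bullet style across the boxes, and text is then merged between consecutive pivots of that style. A toy illustration of the "most frequent bullet category" idea (the patterns below are stand-ins; the real ones live in bullets_category):

import re

# Hypothetical bullet families standing in for the patterns bullets_category()
# actually knows (chapter/section numbering, "1.", "一、", and so on).
BULLET_PATTERNS = [
    r"^第[0-9一二三四五六七八九十百]+[章节]",   # "第1章 ...", "第三节 ..."
    r"^[0-9]+(\.[0-9]+)*\s",                     # "1 ...", "2.3 ..."
    r"^[一二三四五六七八九十]+、",                # "一、..."
]


def bullets_category_sketch(lines):
    # Return the index of the bullet family that matches the most lines;
    # that family becomes the "pivot" level the merge step works between.
    hits = [0] * len(BULLET_PATTERNS)
    for line in lines:
        for i, pat in enumerate(BULLET_PATTERNS):
            if re.match(pat, line):
                hits[i] += 1
                break
    return max(range(len(hits)), key=lambda i: hits[i])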
@@ -79,7 +76,7 @@ class Pdf(PdfParser):
 
         sections = [(b["text"], sec_ids[i], get_position(b)) for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
-            sections.append((rows[0], -1, [(p[0]+1, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0]+1-from_page, p[1], p[2], p[3], p[4]) for p in poss]))
 
         chunks = []
         last_sid = -2
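The table-section hunk makes two small corrections: rows may arrive either as a ready-made string or as a list of row strings, and the page number stored with each position stays 1-based but is now offset by from_page, so it is relative to the requested page range rather than absolute. A sketch of the same transformation in isolation (assuming positions are (page, left, right, top, bottom) tuples, which is how the surrounding code appears to use them):

def table_section(rows, poss, from_page):
    # Tolerate `rows` being a plain string or a list of row strings, and
    # rebase the 0-based page index onto the requested page range.
    text = rows if isinstance(rows, str) else rows[0]
    positions = [(pg + 1 - from_page, left, right, top, bottom)
                 for pg, left, right, top, bottom in poss]
    return (text, -1, positions)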
@@ -11,6 +11,7 @@
 # limitations under the License.
 #
 import re
+from copy import deepcopy
 from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
@@ -93,12 +94,17 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         All the deformed lines will be ignored.
         Every pair of Q&A will be treated as a chunk.
     """
+    eng = lang.lower() == "english"
     res = []
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
         for q, a in excel_parser(filename, binary, callback):
-            res.append(beAdoc({}, q, a, excel_parser.is_english))
+            res.append(beAdoc(deepcopy(doc), q, a, eng))
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
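For the Q&A chunker, the English flag is now computed once from lang, and every chunk starts from deepcopy(doc) so the shared document metadata (docnm_kwd, title_tks) is carried into each chunk without all chunks aliasing one dict. The same deepcopy pattern replaces the bare {} in the csv/txt branch in the next hunk, where the per-branch eng computation becomes redundant and is reduced to a comment. A small demonstration of why the copy matters (file name and field values are just examples):

from copy import deepcopy

doc = {"docnm_kwd": "faq.xlsx", "title_tks": "faq"}

# Without a copy every chunk aliases the same dict, so fields written for one
# Q&A pair (by beAdoc/tokenize) would leak into all the others:
shared = [doc for _ in range(3)]
shared[0]["content_with_weight"] = "Q1 ... A1 ..."
assert shared[1]["content_with_weight"] == "Q1 ... A1 ..."   # unwanted sharing

# With deepcopy, which the commit switches to, each chunk gets its own dict:
copied = [deepcopy(doc) for _ in range(3)]
copied[0]["content_with_weight"] = "Q1 ... A1 ..."
assert "content_with_weight" not in copied[1]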
@@ -113,14 +119,14 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                     break
                 txt += l
         lines = txt.split("\n")
-        eng = lang.lower() == "english"#is_english([rmPrefix(l) for l in lines[:100]])
+        #is_english([rmPrefix(l) for l in lines[:100]])
         fails = []
         for i, line in enumerate(lines):
             arr = [l for l in line.split("\t") if len(l) > 1]
             if len(arr) != 2:
                 fails.append(str(i))
                 continue
-            res.append(beAdoc({}, arr[0], arr[1], eng))
+            res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
             if len(res) % 999 == 0:
                 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                     f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@@ -76,6 +76,7 @@ def is_english(texts):
 
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
     if eng:
         t = re.sub(r"([a-z])-([a-z])", r"\1\2", t)
         d["content_ltks"] = " ".join([stemmer.stem(w)
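tokenize() now strips <table>, <tr>, <td>, <th> and <caption> tags (with short attribute strings) out of the text before tokenization, while content_with_weight keeps the original markup for display. A quick check of what the added regex does (illustrative input only):

import re

t = "<table><tr><th>term</th><td>definition</td></tr></table>"
cleaned = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
print(cleaned.split())   # ['term', 'definition'] -- only the cell text survives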
@@ -29,7 +29,7 @@ class EsQueryer:
         for t in arr:
             if not re.match(r"[a-zA-Z]+$", t):
                 e += 1
-        return e * 1. / len(arr) >= 0.8
+        return e * 1. / len(arr) >= 0.7
 
     @staticmethod
     def rmWWW(txt):
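The last hunk lowers a threshold in EsQueryer from 0.8 to 0.7: the method counts tokens that are not purely alphabetic and, once their share reaches the threshold, the text is treated as non-English, so that classification now triggers a little earlier. A standalone sketch of the heuristic (the function name is made up; the logic mirrors the lines above):

import re


def mostly_non_alpha(tokens, threshold=0.7):
    # Share of tokens that are not pure ASCII-alphabetic words, compared
    # against the (now lower) threshold.
    if not tokens:
        return False
    non_alpha = sum(1 for t in tokens if not re.match(r"[a-zA-Z]+$", t))
    return non_alpha / len(tokens) >= threshold


print(mostly_non_alpha("what is ragflow".split()))     # False: every token is alphabetic
print(mostly_non_alpha("ragflow 是 什么 ？".split()))    # True: 3 of 4 tokens are not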