# Integration with Infinity (#2894)
### What problem does this PR solve?

Integration with Infinity:

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- Added getHighlight and getAggregation
- Fixed KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py

### Type of change

- [x] Refactoring
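The rename list reads naturally as a storage-agnostic connector surface. Below is a minimal sketch of what that surface could look like; the method names come from the PR description, but the `DocStoreConnection` class name, the signatures, and the docstrings are assumptions for illustration, not ragflow's actual API.

```python
from abc import ABC, abstractmethod

class DocStoreConnection(ABC):
    """Hypothetical storage-agnostic connector (illustrative only)."""

    @abstractmethod
    def delete(self, condition: dict, index_name: str) -> int:
        """Formerly deleteByQuery: remove documents matching a condition."""

    @abstractmethod
    def upsertBulk(self, documents: list[dict], index_name: str) -> list[str]:
        """Formerly bulk: insert-or-update a batch of documents."""

    @abstractmethod
    def getHighlight(self, res, keywords: list[str], field_name: str):
        """Pull highlighted snippets out of a search result."""

    @abstractmethod
    def getAggregation(self, res, field_name: str):
        """Pull aggregation buckets out of a search result."""
```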
```diff
@@ -20,6 +20,7 @@ from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+import json


 class Ppt(PptParser):
```
```diff
@@ -107,9 +108,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         d = copy.deepcopy(doc)
         pn += from_page
         d["image"] = img
-        d["page_num_int"] = [pn + 1]
-        d["top_int"] = [0]
-        d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+        d["page_num_list"] = json.dumps([pn + 1])
+        d["top_list"] = json.dumps([0])
+        d["position_list"] = json.dumps([(pn + 1, 0, img.size[0], 0, img.size[1])])
         tokenize(d, txt, eng)
         res.append(d)
     return res
```
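This hunk swaps the native array fields (`page_num_int`, `top_int`, `position_int`) for string fields (`page_num_list`, `top_list`, `position_list`) holding JSON-encoded values, which is why the first hunk adds `import json`; a plausible reason is that the new backend lacks Elasticsearch-style array columns. A minimal sketch of the round trip (field semantics inferred from the diff; the decode side is an assumption):

```python
import json

# One page-position tuple, as built in the diff: (page, left, right, top, bottom).
position = [(2, 0, 960, 0, 720)]

encoded = json.dumps(position)                      # '[[2, 0, 960, 0, 720]]'
decoded = [tuple(p) for p in json.loads(encoded)]   # JSON turns tuples into lists

assert decoded == position
```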
```diff
@@ -123,10 +124,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pn += from_page
             if img:
                 d["image"] = img
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [
-                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            d["page_num_list"] = json.dumps([pn + 1])
+            d["top_list"] = json.dumps([0])
+            d["position_list"] = json.dumps([
+                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
             tokenize(d, txt, eng)
             res.append(d)
         return res
```
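The second hunk is the same encoding on a path where the page image can be absent; the `img.size[...] if img else 0` guards keep the serialized box well formed (all zeros) instead of raising on `None`. A tiny sketch of that fallback (the bare `img`/`pn` values are stand-ins):

```python
import json

img = None  # e.g. a page that produced no rendered image
pn = 4

# Mirrors the guarded expression in the diff: width/height fall back to 0.
position_list = json.dumps(
    [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
print(position_list)  # [[5, 0, 0, 0, 0]]
```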
```diff
@@ -74,7 +74,7 @@ class Excel(ExcelParser):
 def trans_datatime(s):
     try:
         return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
-    except Exception as e:
+    except Exception:
         pass


```
```diff
@@ -112,7 +112,7 @@ def column_data_type(arr):
             continue
         try:
             arr[i] = trans[ty](str(arr[i]))
-        except Exception as e:
+        except Exception:
             arr[i] = None
     # if ty == "text":
     #     if len(arr) > 128 and uni / len(arr) < 0.1:
```
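Both `except` hunks are the same cleanup: the caught exception was bound to `e` but never read, which linters such as flake8 flag (F841). Dropping the binding changes nothing at runtime; a before/after sketch:

```python
def risky() -> float:
    return 1 / 0  # always raises

# Before: `e` is assigned but never used (flake8 F841).
try:
    risky()
except Exception as e:  # noqa: F841
    pass

# After: identical behavior, no dead binding.
try:
    risky()
except Exception:
    pass
```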
```diff
@@ -182,7 +182,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
              "datetime": "_dt",
              "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "_id", "index", "idx"]:
+        for n in ["id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
```
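The last hunk narrows the list of index-like columns stripped from each parsed DataFrame, dropping `_id` from it. A toy sketch of what the loop does (assumes pandas, as the `df.columns` usage implies):

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "idx": [0, 1], "name": ["a", "b"]})

# The updated list from the diff: these columns are treated as noise and removed.
for n in ["id", "index", "idx"]:
    if n in df.columns:
        del df[n]

print(df.columns.tolist())  # ['name']
```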