Integration with Infinity (#2894)

### What problem does this PR solve?

Integrates the Infinity document store as a backend alongside Elasticsearch:

- Replaced direct `ELASTICSEARCH` access with a `dataStoreConn` abstraction
- Renamed `deleteByQuery` to `delete`
- Renamed `bulk` to `upsertBulk`
- Added `getHighlight` and `getAggregation`
- Fixed `KGSearch.search`
- Moved `Dealer.sql_retrieval` to `es_conn.py`
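
A minimal sketch of what such a backend-agnostic connection interface might look like, for readers new to the refactor. Only the method names come from the list above; the class name, signatures, and return types are illustrative assumptions, not the PR's actual API:

```python
from abc import ABC, abstractmethod


class DocStoreConnection(ABC):
    """Hypothetical abstraction over the document store so callers no
    longer talk to Elasticsearch directly; everything beyond the four
    method names listed in this PR's description is illustrative."""

    @abstractmethod
    def upsertBulk(self, index_name: str, docs: list[dict]) -> list[str]:
        """Insert-or-update a batch of documents; return ids that failed."""

    @abstractmethod
    def delete(self, condition: dict, index_name: str) -> int:
        """Delete documents matching `condition`; return the deleted count."""

    @abstractmethod
    def getHighlight(self, res, keywords: list[str], field_name: str) -> dict:
        """Pull per-document highlight snippets out of a search result."""

    @abstractmethod
    def getAggregation(self, res, field_name: str) -> list:
        """Pull (value, count) aggregation buckets out of a search result."""
```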


### Type of change

- [x] Refactoring
Author: Zhichang Yu
Committed via GitHub on 2024-11-12 14:59:41 +08:00
parent 00b6000b76
commit f4c52371ab
42 changed files with 2647 additions and 1878 deletions


```diff
@@ -20,6 +20,7 @@ from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+import json


 class Ppt(PptParser):
@@ -107,9 +108,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            d["page_num_list"] = json.dumps([pn + 1])
+            d["top_list"] = json.dumps([0])
+            d["position_list"] = json.dumps([(pn + 1, 0, img.size[0], 0, img.size[1])])
             tokenize(d, txt, eng)
             res.append(d)
         return res
@@ -123,10 +124,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pn += from_page
             if img:
                 d["image"] = img
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [
-                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            d["page_num_list"] = json.dumps([pn + 1])
+            d["top_list"] = json.dumps([0])
+            d["position_list"] = json.dumps([
+                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
             tokenize(d, txt, eng)
             res.append(d)
         return res
```
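
The hunks above swap native list fields (`page_num_int`, `top_int`, `position_int`) for JSON-encoded strings (`page_num_list`, etc.), presumably because the new store keeps them as plain string columns. A small self-contained sketch of the round trip; the helper names are hypothetical:

```python
import json

def encode_positions(positions: list[tuple[int, int, int, int, int]]) -> str:
    """Serialize (page, left, right, top, bottom) tuples as a JSON string,
    mirroring the json.dumps(...) calls in the diff above."""
    return json.dumps(positions)

def decode_positions(raw: str) -> list[list[int]]:
    """Recover the positions; json.dumps turns tuples into JSON arrays,
    so they come back as lists rather than tuples."""
    return json.loads(raw)

positions = [(1, 0, 640, 0, 480)]
assert decode_positions(encode_positions(positions)) == [[1, 0, 640, 0, 480]]
```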


```diff
@@ -74,7 +74,7 @@ class Excel(ExcelParser):
 def trans_datatime(s):
     try:
         return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
-    except Exception as e:
+    except Exception:
         pass
@@ -112,7 +112,7 @@ def column_data_type(arr):
             continue
         try:
             arr[i] = trans[ty](str(arr[i]))
-        except Exception as e:
+        except Exception:
             arr[i] = None
     # if ty == "text":
     #     if len(arr) > 128 and uni / len(arr) < 0.1:
@@ -182,7 +182,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
         "datetime": "_dt",
         "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "_id", "index", "idx"]:
+        for n in ["id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
```
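
The last hunk also shows the column-typing pattern this parser relies on: each inferred datatype maps to a field-name suffix (`_dt` for datetime, `_kwd` for bool) so the store can index the column appropriately. A hedged sketch of that pattern; only the two suffix pairs visible in the diff come from the source, and the text default is an assumption:

```python
# Suffixes "_dt" and "_kwd" appear in the diff above; "_tks" is an
# assumed default for tokenized text, not confirmed by this excerpt.
TYPE_SUFFIX = {"datetime": "_dt", "bool": "_kwd"}

def typed_field_name(column: str, inferred_type: str) -> str:
    """Append a type suffix so the document store can pick an index type."""
    return column + TYPE_SUFFIX.get(inferred_type, "_tks")

print(typed_field_name("created", "datetime"))  # -> created_dt
print(typed_field_name("title", "text"))        # -> title_tks (assumed)
```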