Integration with Infinity (#2894)

### What problem does this PR solve?

Integrates the Infinity document store as a backend alongside Elasticsearch:

- Replaced direct `ELASTICSEARCH` access with a `dataStoreConn` abstraction
- Renamed `deleteByQuery` to `delete`
- Renamed `bulk` to `upsertBulk`
- Added `getHighlight` and `getAggregation`
- Fixed `KGSearch.search`
- Moved `Dealer.sql_retrieval` to `es_conn.py`
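
A minimal sketch of what such a backend-agnostic connection interface might look like, for readers new to the refactor. Only the method names come from the list above; the class name, signatures, and return types are illustrative assumptions, not the PR's actual API:

```python
from abc import ABC, abstractmethod


class DocStoreConnection(ABC):
    """Hypothetical abstraction over the document store so callers no
    longer talk to Elasticsearch directly; everything beyond the four
    method names listed in this PR's description is illustrative."""

    @abstractmethod
    def upsertBulk(self, index_name: str, docs: list[dict]) -> list[str]:
        """Insert-or-update a batch of documents; return ids that failed."""

    @abstractmethod
    def delete(self, condition: dict, index_name: str) -> int:
        """Delete documents matching `condition`; return the deleted count."""

    @abstractmethod
    def getHighlight(self, res, keywords: list[str], field_name: str) -> dict:
        """Pull per-document highlight snippets out of a search result."""

    @abstractmethod
    def getAggregation(self, res, field_name: str) -> list:
        """Pull (value, count) aggregation buckets out of a search result."""
```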


### Type of change

- [x] Refactoring
Author: Zhichang Yu
Committed via GitHub on 2024-11-12 14:59:41 +08:00
parent 00b6000b76
commit f4c52371ab
42 changed files with 2647 additions and 1878 deletions


```diff
@@ -20,6 +20,7 @@ from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+import json


 class Ppt(PptParser):
@@ -107,9 +108,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            d["page_num_list"] = json.dumps([pn + 1])
+            d["top_list"] = json.dumps([0])
+            d["position_list"] = json.dumps([(pn + 1, 0, img.size[0], 0, img.size[1])])
             tokenize(d, txt, eng)
             res.append(d)
         return res
@@ -123,10 +124,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pn += from_page
             if img:
                 d["image"] = img
-            d["page_num_int"] = [pn + 1]
-            d["top_int"] = [0]
-            d["position_int"] = [
-                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            d["page_num_list"] = json.dumps([pn + 1])
+            d["top_list"] = json.dumps([0])
+            d["position_list"] = json.dumps([
+                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
             tokenize(d, txt, eng)
             res.append(d)
         return res
```
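
The hunks above swap native list fields (`page_num_int`, `top_int`, `position_int`) for JSON-encoded strings (`page_num_list`, etc.), presumably because the new store keeps them as plain string columns. A small self-contained sketch of the round trip; the helper names are hypothetical:

```python
import json

def encode_positions(positions: list[tuple[int, int, int, int, int]]) -> str:
    """Serialize (page, left, right, top, bottom) tuples as a JSON string,
    mirroring the json.dumps(...) calls in the diff above."""
    return json.dumps(positions)

def decode_positions(raw: str) -> list[list[int]]:
    """Recover the positions; json.dumps turns tuples into JSON arrays,
    so they come back as lists rather than tuples."""
    return json.loads(raw)

positions = [(1, 0, 640, 0, 480)]
assert decode_positions(encode_positions(positions)) == [[1, 0, 640, 0, 480]]
```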


```diff
@@ -74,7 +74,7 @@ class Excel(ExcelParser):
 def trans_datatime(s):
     try:
         return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
-    except Exception as e:
+    except Exception:
         pass
@@ -112,7 +112,7 @@ def column_data_type(arr):
             continue
         try:
             arr[i] = trans[ty](str(arr[i]))
-        except Exception as e:
+        except Exception:
             arr[i] = None
     # if ty == "text":
     #     if len(arr) > 128 and uni / len(arr) < 0.1:
@@ -182,7 +182,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
         "datetime": "_dt",
         "bool": "_kwd"}
     for df in dfs:
-        for n in ["id", "_id", "index", "idx"]:
+        for n in ["id", "index", "idx"]:
             if n in df.columns:
                 del df[n]
         clmns = df.columns.values
```
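
The last hunk also shows the column-typing pattern this parser relies on: each inferred datatype maps to a field-name suffix (`_dt` for datetime, `_kwd` for bool) so the store can index the column appropriately. A hedged sketch of that pattern; only the two suffix pairs visible in the diff come from the source, and the text default is an assumption:

```python
# Suffixes "_dt" and "_kwd" appear in the diff above; "_tks" is an
# assumed default for tokenized text, not confirmed by this excerpt.
TYPE_SUFFIX = {"datetime": "_dt", "bool": "_kwd"}

def typed_field_name(column: str, inferred_type: str) -> str:
    """Append a type suffix so the document store can pick an index type."""
    return column + TYPE_SUFFIX.get(inferred_type, "_tks")

print(typed_field_name("created", "datetime"))  # -> created_dt
print(typed_field_name("title", "text"))        # -> title_tks (assumed)
```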