Added doc for switching elasticsearch to infinity (#3370)

### What problem does this PR solve?

Added documentation describing how to switch the document engine from Elasticsearch to Infinity.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update
Author: Zhichang Yu
Date: 2024-11-14 00:08:55 +08:00
Committer: GitHub
Parent: 83c6b1f308
Commit: 9d395ab74e
15 changed files with 157 additions and 62 deletions

View File

@@ -4,7 +4,6 @@ import time
 import os
 from typing import List, Dict
-import elasticsearch
 import copy
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
@@ -17,14 +16,13 @@ import polars as pl
 from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, FusionExpr
 from rag.nlp import is_english, rag_tokenizer
-logger.info("Elasticsearch sdk version: "+str(elasticsearch.__version__))
 @singleton
 class ESConnection(DocStoreConnection):
     def __init__(self):
         self.info = {}
-        for _ in range(10):
+        logger.info(f"Use Elasticsearch {settings.ES['hosts']} as the doc engine.")
+        for _ in range(24):
             try:
                 self.es = Elasticsearch(
                     settings.ES["hosts"].split(","),
@@ -34,21 +32,27 @@ class ESConnection(DocStoreConnection):
                 )
                 if self.es:
                     self.info = self.es.info()
-                    logger.info("Connect to es.")
                     break
-            except Exception:
-                logger.exception("Fail to connect to es")
-                time.sleep(1)
+            except Exception as e:
+                logger.warn(f"{str(e)}. Waiting Elasticsearch {settings.ES['hosts']} to be healthy.")
+                time.sleep(5)
         if not self.es.ping():
-            raise Exception("Can't connect to ES cluster")
-        v = self.info.get("version", {"number": "5.6"})
+            msg = f"Elasticsearch {settings.ES['hosts']} didn't become healthy in 120s."
+            logger.error(msg)
+            raise Exception(msg)
+        v = self.info.get("version", {"number": "8.11.3"})
         v = v["number"].split(".")[0]
         if int(v) < 8:
-            raise Exception(f"ES version must be greater than or equal to 8, current version: {v}")
+            msg = f"Elasticsearch version must be greater than or equal to 8, current version: {v}"
+            logger.error(msg)
+            raise Exception(msg)
         fp_mapping = os.path.join(get_project_base_directory(), "conf", "mapping.json")
         if not os.path.exists(fp_mapping):
-            raise Exception(f"Mapping file not found at {fp_mapping}")
+            msg = f"Elasticsearch mapping file not found at {fp_mapping}"
+            logger.error(msg)
+            raise Exception(msg)
         self.mapping = json.load(open(fp_mapping, "r"))
+        logger.info(f"Elasticsearch {settings.ES['hosts']} is healthy.")
     """
     Database operations

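Both connectors now share the same bootstrap behaviour: probe the backend up to 24 times with a 5-second pause (roughly the 120 s quoted in the error messages) and only raise once that budget is exhausted. The snippet below is a minimal, generic sketch of that retry pattern, not code from this PR; `ping` stands for whatever health probe the connector uses (`es.info()` here, `show_current_node()` in the Infinity connector below).

```python
import time
import logging

logger = logging.getLogger(__name__)


def wait_until_healthy(ping, attempts: int = 24, delay: float = 5.0):
    """Generic retry loop mirroring the connectors in this commit.

    `ping` is any zero-argument callable that raises while the backend is
    still starting up and returns normally once it is reachable.
    24 attempts x 5 seconds is the ~120 s budget quoted in the error messages.
    """
    for _ in range(attempts):
        try:
            ping()
            return
        except Exception as e:
            logger.warning("%s. Waiting for the doc engine to be healthy.", e)
            time.sleep(delay)
    msg = f"Doc engine didn't become healthy in {int(attempts * delay)}s."
    logger.error(msg)
    raise Exception(msg)
```
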
View File

@@ -1,13 +1,14 @@
 import os
 import re
 import json
+import time
 from typing import List, Dict
 import infinity
 from infinity.common import ConflictType, InfinityException
 from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
-from rag import settings
 from api.utils.log_utils import logger
+from rag import settings
 from rag.utils import singleton
 import polars as pl
 from polars.series.series import Series
@@ -54,8 +55,24 @@ class InfinityConnection(DocStoreConnection):
         if ":" in infinity_uri:
             host, port = infinity_uri.split(":")
             infinity_uri = infinity.common.NetworkAddress(host, int(port))
-        self.connPool = ConnectionPool(infinity_uri)
-        logger.info(f"Connected to infinity {infinity_uri}.")
+        self.connPool = None
+        logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
+        for _ in range(24):
+            try:
+                connPool = ConnectionPool(infinity_uri)
+                inf_conn = connPool.get_conn()
+                _ = inf_conn.show_current_node()
+                connPool.release_conn(inf_conn)
+                self.connPool = connPool
+                break
+            except Exception as e:
+                logger.warn(f"{str(e)}. Waiting Infinity {infinity_uri} to be healthy.")
+                time.sleep(5)
+        if self.connPool is None:
+            msg = f"Infinity {infinity_uri} didn't become healthy in 120s."
+            logger.error(msg)
+            raise Exception(msg)
+        logger.info(f"Infinity {infinity_uri} is healthy.")
     """
     Database operations
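
Note how the health probe, like the other Infinity hunks in this file, borrows a connection with `get_conn()` and hands it back with `release_conn()`. Purely as an illustration (not part of this PR), that pairing can be wrapped in a context manager so the release also happens when an operation raises:

```python
from contextlib import contextmanager


@contextmanager
def pooled_conn(conn_pool):
    """Borrow a connection from the Infinity ConnectionPool and always return it.

    Relies only on the get_conn()/release_conn() calls shown in the diff above.
    """
    conn = conn_pool.get_conn()
    try:
        yield conn
    finally:
        conn_pool.release_conn(conn)


# Hypothetical usage, mirroring the health probe in __init__:
# with pooled_conn(self.connPool) as inf_conn:
#     inf_conn.show_current_node()
```
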
@@ -151,8 +168,8 @@ class InfinityConnection(DocStoreConnection):
             _ = db_instance.get_table(table_name)
             self.connPool.release_conn(inf_conn)
             return True
-        except Exception:
-            logger.exception("INFINITY indexExist")
+        except Exception as e:
+            logger.warn(f"INFINITY indexExist {str(e)}")
         return False
     """
@@ -199,7 +216,7 @@ class InfinityConnection(DocStoreConnection):
                 )
                 if len(filter_cond) != 0:
                     filter_fulltext = f"({filter_cond}) AND {filter_fulltext}"
-                # doc_store_logger.info(f"filter_fulltext: {filter_fulltext}")
+                # logger.info(f"filter_fulltext: {filter_fulltext}")
                 minimum_should_match = "0%"
                 if "minimum_should_match" in matchExpr.extra_options:
                     minimum_should_match = (
@@ -312,7 +329,7 @@ class InfinityConnection(DocStoreConnection):
             for k, v in d.items():
                 if k.endswith("_kwd") and isinstance(v, list):
                     d[k] = " ".join(v)
-        ids = [f"{d['id']}" for d in documents]
+        ids = ["'{}'".format(d["id"]) for d in documents]
         str_ids = ", ".join(ids)
         str_filter = f"id IN ({str_ids})"
         table_instance.delete(str_filter)
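
The quoting change above is the substantive fix in this hunk: each document id is wrapped in single quotes before being joined into the `id IN (...)` filter passed to `table_instance.delete(...)`, so Infinity receives string literals rather than bare tokens. A standalone sketch of the resulting filter construction (the helper name and sample ids are illustrative only):

```python
def build_id_filter(documents: list[dict]) -> str:
    """Build the delete filter the way the patched insert() does:
    quote each id so the IN list contains string literals."""
    ids = ["'{}'".format(d["id"]) for d in documents]
    return f"id IN ({', '.join(ids)})"


# Example:
# build_id_filter([{"id": "doc_1"}, {"id": "doc_2"}])
# -> "id IN ('doc_1', 'doc_2')"
```
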
@@ -321,7 +338,7 @@ class InfinityConnection(DocStoreConnection):
         # logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
         table_instance.insert(documents)
         self.connPool.release_conn(inf_conn)
-        doc_store_logger.info(f"inserted into {table_name} {str_ids}.")
+        logger.info(f"inserted into {table_name} {str_ids}.")
         return []

     def update(