@@ -30,6 +30,7 @@ from rag.settings import PAGERANK_FLD, TAG_FLD
 from rag.utils import singleton
 import pandas as pd
 from api.utils.file_utils import get_project_base_directory
+from rag.nlp import is_english
 from rag.utils.doc_store_conn import (
     DocStoreConnection,
     MatchExpr,
@@ -40,7 +41,8 @@ from rag.utils.doc_store_conn import (
     OrderByExpr,
 )
 
-logger = logging.getLogger('ragflow.infinity_conn')
+logger = logging.getLogger("ragflow.infinity_conn")
+
 
 def field_keyword(field_name: str):
     # The "docnm_kwd" field is always a string, not list.
@@ -48,6 +50,7 @@ def field_keyword(field_name: str):
         return True
     return False
 
+
 def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
     assert "_id" not in condition
     clmns = {}
@@ -115,10 +118,10 @@ def concat_dataframes(df_list: list[pd.DataFrame], selectFields: list[str]) -> pd.DataFrame:
 
     schema = []
     for field_name in selectFields:
-        if field_name == 'score()': # Workaround: fix schema is changed to score()
-            schema.append('SCORE')
-        elif field_name == 'similarity()': # Workaround: fix schema is changed to similarity()
-            schema.append('SIMILARITY')
+        if field_name == "score()":  # Workaround: fix schema is changed to score()
+            schema.append("SCORE")
+        elif field_name == "similarity()":  # Workaround: fix schema is changed to similarity()
+            schema.append("SIMILARITY")
         else:
             schema.append(field_name)
     return pd.DataFrame(columns=schema)
@@ -158,9 +161,7 @@ class InfinityConnection(DocStoreConnection):
 
     def _migrate_db(self, inf_conn):
         inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
-        fp_mapping = os.path.join(
-            get_project_base_directory(), "conf", "infinity_mapping.json"
-        )
+        fp_mapping = os.path.join(get_project_base_directory(), "conf", "infinity_mapping.json")
         if not os.path.exists(fp_mapping):
             raise Exception(f"Mapping file not found at {fp_mapping}")
         schema = json.load(open(fp_mapping))
@@ -178,16 +179,12 @@ class InfinityConnection(DocStoreConnection):
                     continue
                 res = inf_table.add_columns({field_name: field_info})
                 assert res.error_code == infinity.ErrorCode.OK
-                logger.info(
-                    f"INFINITY added following column to table {table_name}: {field_name} {field_info}"
-                )
+                logger.info(f"INFINITY added following column to table {table_name}: {field_name} {field_info}")
                 if field_info["type"] != "varchar" or "analyzer" not in field_info:
                     continue
                 inf_table.create_index(
                     f"text_idx_{field_name}",
-                    IndexInfo(
-                        field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
-                    ),
+                    IndexInfo(field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}),
                     ConflictType.Ignore,
                 )
 
@@ -221,9 +218,7 @@ class InfinityConnection(DocStoreConnection):
         inf_conn = self.connPool.get_conn()
         inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
 
-        fp_mapping = os.path.join(
-            get_project_base_directory(), "conf", "infinity_mapping.json"
-        )
+        fp_mapping = os.path.join(get_project_base_directory(), "conf", "infinity_mapping.json")
         if not os.path.exists(fp_mapping):
             raise Exception(f"Mapping file not found at {fp_mapping}")
         schema = json.load(open(fp_mapping))
@@ -253,15 +248,11 @@ class InfinityConnection(DocStoreConnection):
                 continue
             inf_table.create_index(
                 f"text_idx_{field_name}",
-                IndexInfo(
-                    field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}
-                ),
+                IndexInfo(field_name, IndexType.FullText, {"ANALYZER": field_info["analyzer"]}),
                 ConflictType.Ignore,
             )
         self.connPool.release_conn(inf_conn)
-        logger.info(
-            f"INFINITY created table {table_name}, vector size {vectorSize}"
-        )
+        logger.info(f"INFINITY created table {table_name}, vector size {vectorSize}")
 
     def deleteIdx(self, indexName: str, knowledgebaseId: str):
         table_name = f"{indexName}_{knowledgebaseId}"
@@ -288,7 +279,8 @@ class InfinityConnection(DocStoreConnection):
         """
 
     def search(
-        self, selectFields: list[str],
+        self,
+        selectFields: list[str],
         highlightFields: list[str],
         condition: dict,
         matchExprs: list[MatchExpr],
@@ -298,10 +290,10 @@ class InfinityConnection(DocStoreConnection):
         indexNames: str | list[str],
         knowledgebaseIds: list[str],
         aggFields: list[str] = [],
-        rank_feature: dict | None = None
+        rank_feature: dict | None = None,
     ) -> tuple[pd.DataFrame, int]:
         """
-        TODO: Infinity doesn't provide highlight
+        BUG: Infinity returns empty for a highlight field if the query string doesn't use that field.
         """
         if isinstance(indexNames, str):
             indexNames = indexNames.split(",")
@@ -438,9 +430,7 @@ class InfinityConnection(DocStoreConnection):
                             matchExpr.extra_options.copy(),
                         )
                     elif isinstance(matchExpr, FusionExpr):
-                        builder = builder.fusion(
-                            matchExpr.method, matchExpr.topn, matchExpr.fusion_params
-                        )
+                        builder = builder.fusion(matchExpr.method, matchExpr.topn, matchExpr.fusion_params)
                 else:
                     if filter_cond and len(filter_cond) > 0:
                         builder.filter(filter_cond)
@@ -455,15 +445,13 @@ class InfinityConnection(DocStoreConnection):
         self.connPool.release_conn(inf_conn)
         res = concat_dataframes(df_list, output)
         if matchExprs:
-            res['Sum'] = res[score_column] + res[PAGERANK_FLD]
-            res = res.sort_values(by='Sum', ascending=False).reset_index(drop=True).drop(columns=['Sum'])
+            res["Sum"] = res[score_column] + res[PAGERANK_FLD]
+            res = res.sort_values(by="Sum", ascending=False).reset_index(drop=True).drop(columns=["Sum"])
             res = res.head(limit)
         logger.debug(f"INFINITY search final result: {str(res)}")
         return res, total_hits_count
 
-    def get(
-        self, chunkId: str, indexName: str, knowledgebaseIds: list[str]
-    ) -> dict | None:
+    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]) -> dict | None:
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
         df_list = list()
@@ -476,8 +464,7 @@ class InfinityConnection(DocStoreConnection):
             try:
                 table_instance = db_instance.get_table(table_name)
             except Exception:
-                logger.warning(
-                    f"Table not found: {table_name}, this knowledge base isn't created in Infinity. Maybe it is created in other document engine.")
+                logger.warning(f"Table not found: {table_name}, this knowledge base isn't created in Infinity. Maybe it is created in other document engine.")
                 continue
             kb_res, _ = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_df()
             logger.debug(f"INFINITY get table: {str(table_list)}, result: {str(kb_res)}")
@@ -487,9 +474,7 @@ class InfinityConnection(DocStoreConnection):
         res_fields = self.getFields(res, res.columns.tolist())
         return res_fields.get(chunkId, None)
 
-    def insert(
-        self, documents: list[dict], indexName: str, knowledgebaseId: str = None
-    ) -> list[str]:
+    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
         table_name = f"{indexName}_{knowledgebaseId}"
@@ -532,7 +517,7 @@ class InfinityConnection(DocStoreConnection):
                         d[k] = v
                 elif re.search(r"_feas$", k):
                     d[k] = json.dumps(v)
-                elif k == 'kb_id':
+                elif k == "kb_id":
                     if isinstance(d[k], list):
                         d[k] = d[k][0]  # since d[k] is a list, but we need a str
                 elif k == "position_int":
@@ -561,9 +546,7 @@ class InfinityConnection(DocStoreConnection):
         logger.debug(f"INFINITY inserted into {table_name} {str_ids}.")
         return []
 
-    def update(
-        self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
-    ) -> bool:
+    def update(self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str) -> bool:
         # if 'position_int' in newValue:
         #     logger.info(f"update position_int: {newValue['position_int']}")
         inf_conn = self.connPool.get_conn()
@@ -587,7 +570,7 @@ class InfinityConnection(DocStoreConnection):
                 newValue[k] = v
             elif re.search(r"_feas$", k):
                 newValue[k] = json.dumps(v)
-            elif k == 'kb_id':
+            elif k == "kb_id":
                 if isinstance(newValue[k], list):
                     newValue[k] = newValue[k][0]  # since d[k] is a list, but we need a str
             elif k == "position_int":
@@ -615,7 +598,7 @@ class InfinityConnection(DocStoreConnection):
         remove_opt = {}  # "[k,new_value]": [id_to_update, ...]
         if removeValue:
             col_to_remove = list(removeValue.keys())
-            row_to_opt = table_instance.output(col_to_remove + ['id']).filter(filter).to_df()
+            row_to_opt = table_instance.output(col_to_remove + ["id"]).filter(filter).to_df()
             logger.debug(f"INFINITY search table {str(table_name)}, filter {filter}, result: {str(row_to_opt[0])}")
             row_to_opt = self.getFields(row_to_opt, col_to_remove)
             for id, old_v in row_to_opt.items():
@@ -645,9 +628,7 @@ class InfinityConnection(DocStoreConnection):
         try:
             table_instance = db_instance.get_table(table_name)
         except Exception:
-            logger.warning(
-                f"Skipped deleting from table {table_name} since the table doesn't exist."
-            )
+            logger.warning(f"Skipped deleting from table {table_name} since the table doesn't exist.")
             return 0
         filter = equivalent_condition_to_str(condition, table_instance)
         logger.debug(f"INFINITY delete table {table_name}, filter {filter}.")
@@ -675,14 +656,14 @@ class InfinityConnection(DocStoreConnection):
         if not fields:
             return {}
         fieldsAll = fields.copy()
-        fieldsAll.append('id')
+        fieldsAll.append("id")
         column_map = {col.lower(): col for col in res.columns}
         matched_columns = {column_map[col.lower()]: col for col in set(fieldsAll) if col.lower() in column_map}
         none_columns = [col for col in set(fieldsAll) if col.lower() not in column_map]
 
         res2 = res[matched_columns.keys()]
         res2 = res2.rename(columns=matched_columns)
-        res2.drop_duplicates(subset=['id'], inplace=True)
+        res2.drop_duplicates(subset=["id"], inplace=True)
 
         for column in res2.columns:
             k = column.lower()
@@ -691,16 +672,18 @@ class InfinityConnection(DocStoreConnection):
             elif re.search(r"_feas$", k):
                 res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
             elif k == "position_int":
+
                 def to_position_int(v):
                     if v:
-                        arr = [int(hex_val, 16) for hex_val in v.split('_')]
+                        arr = [int(hex_val, 16) for hex_val in v.split("_")]
                         v = [arr[i : i + 5] for i in range(0, len(arr), 5)]
                     else:
                         v = []
                     return v
+
                 res2[column] = res2[column].apply(to_position_int)
             elif k in ["page_num_int", "top_int"]:
-                res2[column] = res2[column].apply(lambda v:[int(hex_val, 16) for hex_val in v.split('_')] if v else [])
+                res2[column] = res2[column].apply(lambda v: [int(hex_val, 16) for hex_val in v.split("_")] if v else [])
             else:
                 pass
             for column in none_columns:
@@ -719,23 +702,35 @@ class InfinityConnection(DocStoreConnection):
         for i in range(num_rows):
             id = column_id[i]
             txt = res[fieldnm][i]
+            if re.search(r"<em>[^<>]+</em>", txt, flags=re.IGNORECASE | re.MULTILINE):
+                ans[id] = txt
+                continue
             txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE)
             txts = []
             for t in re.split(r"[.?!;\n]", txt):
-                for w in keywords:
-                    t = re.sub(
-                        r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"
-                        % re.escape(w),
-                        r"\1<em>\2</em>\3",
-                        t,
-                        flags=re.IGNORECASE | re.MULTILINE,
-                    )
-                if not re.search(
-                    r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE
-                ):
+                if is_english([t]):
+                    for w in keywords:
+                        t = re.sub(
+                            r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])" % re.escape(w),
+                            r"\1<em>\2</em>\3",
+                            t,
+                            flags=re.IGNORECASE | re.MULTILINE,
+                        )
+                else:
+                    for w in sorted(keywords, key=len, reverse=True):
+                        t = re.sub(
+                            re.escape(w),
+                            f"<em>{w}</em>",
+                            t,
+                            flags=re.IGNORECASE | re.MULTILINE,
+                        )
+                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE | re.MULTILINE):
                     continue
                 txts.append(t)
-            ans[id] = "...".join(txts)
+            if txts:
+                ans[id] = "...".join(txts)
+            else:
+                ans[id] = txt
         return ans
 
     def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):