Added infinity rank_feature support (#9044)
### What problem does this PR solve?

Added Infinity rank_feature support.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 import json
-import os
 
 from flask import request
 from flask_login import login_required, current_user
@@ -106,13 +105,6 @@ def update():
             return get_data_error_result(
                 message="Can't find this knowledgebase!")
 
-        if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity":
-            return get_json_result(
-                data=False,
-                message='The chunking method Tag has not been supported by Infinity yet.',
-                code=settings.RetCode.OPERATING_ERROR
-            )
-
         if req["name"].lower() != kb.name.lower() \
                 and len(
             KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1:
@@ -124,9 +116,6 @@ def update():
             return get_data_error_result()
 
         if kb.pagerank != req.get("pagerank", 0):
-            if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch":
-                return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch")
-
             if req.get("pagerank", 0) > 0:
                 settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]},
                                              search.index_name(kb.tenant_id), kb.id)
@@ -30,7 +30,7 @@
     "knowledge_graph_kwd": {"type": "varchar", "default": ""},
     "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "pagerank_fea": {"type": "integer", "default": 0},
-    "tag_feas": {"type": "varchar", "default": ""},
+    "tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},
 
     "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
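The `rankfeatures` analyzer is the schema half of this feature: `tag_feas` holds a per-chunk JSON object of tag-to-weight pairs (decoded back into a dict on the read path in a later hunk), and the analyzer lets Infinity use those weights for scoring. A minimal sketch of the stored value, with made-up tag names and weights:

```python
import json

# Hypothetical per-chunk tag_feas payload: tag name -> integer weight.
tag_feas = {"finance": 5, "quarterly-report": 2}
stored = json.dumps(tag_feas)  # persisted in the varchar tag_feas column
print(stored)  # {"finance": 5, "quarterly-report": 2}
```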
@@ -77,7 +77,7 @@ services:
     container_name: ragflow-infinity
     profiles:
       - infinity
-    image: infiniflow/infinity:v0.6.0-dev4
+    image: infiniflow/infinity:v0.6.0-dev5
     volumes:
       - infinity_data:/var/infinity
       - ./infinity_conf.toml:/infinity_conf.toml
@@ -17,7 +17,7 @@ log_file_max_size = "100MB"
 log_file_rotate_count = 10
 
 # trace/debug/info/warning/error/critical 6 log levels, default: info
-log_level = "info"
+log_level = "trace"
 
 [storage]
 persistence_dir = "/var/infinity/persistence"
@@ -47,7 +47,7 @@ mem_index_capacity = 65536
 buffer_manager_size = "8GB"
 lru_num = 7
 temp_dir = "/var/infinity/tmp"
-result_cache = "on"
+result_cache = "off"
 memindex_memory_quota = "1GB"
 
 [wal]
@@ -113,7 +113,7 @@ ragflow:
 infinity:
   image:
     repository: infiniflow/infinity
-    tag: v0.6.0-dev4
+    tag: v0.6.0-dev5
   storage:
     className:
     capacity: 5Gi
@@ -274,4 +274,4 @@ class FulltextQueryer:
             keywords.append(f"{tk}^{w}")
 
         return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                             {"minimum_should_match": min(3, len(keywords) / 10)})
+                             {"minimum_should_match": min(3, len(keywords) // 10)})
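The switch to floor division matters because of how the Infinity adapter interprets `minimum_should_match` (see the `InfinityConnection` hunk further down): a float is converted into a percentage string, so true division could silently turn an intended absolute term count into something like "170%". A quick check:

```python
keywords = ["kw"] * 17
print(min(3, len(keywords) / 10))   # 1.7 -> float, later rendered as "170%"
print(min(3, len(keywords) // 10))  # 1   -> absolute number of required terms
```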
@@ -26,7 +26,7 @@ from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
 from infinity.errors import ErrorCode
 from rag import settings
-from rag.settings import PAGERANK_FLD
+from rag.settings import PAGERANK_FLD, TAG_FLD
 from rag.utils import singleton
 import pandas as pd
 from api.utils.file_utils import get_project_base_directory
@@ -311,7 +311,7 @@ class InfinityConnection(DocStoreConnection):
         df_list = list()
         table_list = list()
         output = selectFields.copy()
-        for essential_field in ["id"]:
+        for essential_field in ["id"] + aggFields:
             if essential_field not in output:
                 output.append(essential_field)
         score_func = ""
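Folding `aggFields` into the essential fields guarantees that the columns needed by the client-side aggregation (added further down) are always fetched, whatever the caller selected. Illustratively, with hypothetical field names:

```python
selectFields = ["content_with_weight"]  # hypothetical caller selection
aggFields = ["tag_kwd"]                 # fields getAggregation will count
output = selectFields.copy()
for essential_field in ["id"] + aggFields:
    if essential_field not in output:
        output.append(essential_field)
print(output)  # ['content_with_weight', 'id', 'tag_kwd']
```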
@@ -333,15 +333,29 @@ class InfinityConnection(DocStoreConnection):
         if PAGERANK_FLD not in output:
             output.append(PAGERANK_FLD)
         output = [f for f in output if f != "_score"]
+        if limit <= 0:
+            # ElasticSearch default limit is 10000
+            limit = 10000
 
         # Prepare expressions common to all tables
         filter_cond = None
         filter_fulltext = ""
         if condition:
+            table_found = False
             for indexName in indexNames:
-                table_name = f"{indexName}_{knowledgebaseIds[0]}"
-                filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
-                break
+                for kb_id in knowledgebaseIds:
+                    table_name = f"{indexName}_{kb_id}"
+                    try:
+                        filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
+                        table_found = True
+                        break
+                    except Exception:
+                        pass
+                if table_found:
+                    break
+            if not table_found:
+                logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
+                return pd.DataFrame(), 0
 
         for matchExpr in matchExprs:
             if isinstance(matchExpr, MatchTextExpr):
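The old code assumed `knowledgebaseIds[0]` always had a table for the first index name; the new loop probes every `(indexName, kb_id)` pair and uses the first table that resolves. A condensed sketch of the pattern, assuming a `db_instance.get_table` that raises when the table is missing:

```python
def find_filter_table(db_instance, index_names, kb_ids):
    # Probe index/kb combinations until one table resolves; None if none do.
    for index_name in index_names:
        for kb_id in kb_ids:
            try:
                return db_instance.get_table(f"{index_name}_{kb_id}")
            except Exception:
                continue  # table absent for this combination; keep looking
    return None
```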
@@ -355,6 +369,18 @@ class InfinityConnection(DocStoreConnection):
                 if isinstance(minimum_should_match, float):
                     str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                     matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
+
+                # Add rank_feature support
+                if rank_feature and "rank_features" not in matchExpr.extra_options:
+                    # Convert rank_feature dict to Infinity's rank_features string format
+                    # Format: "field^feature_name^weight,field^feature_name^weight"
+                    rank_features_list = []
+                    for feature_name, weight in rank_feature.items():
+                        # Use TAG_FLD as the field containing rank features
+                        rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
+                    if rank_features_list:
+                        matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)
+
                 for k, v in matchExpr.extra_options.items():
                     if not isinstance(v, str):
                         matchExpr.extra_options[k] = str(v)
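A quick trace of the conversion just added, with hypothetical weights; `TAG_FLD` is the `tag_feas` column imported from `rag.settings` in the earlier import hunk:

```python
TAG_FLD = "tag_feas"                        # mirrors the value from rag.settings
rank_feature = {"finance": 3, "sports": 1}  # hypothetical tag weights
parts = [f"{TAG_FLD}^{name}^{weight}" for name, weight in rank_feature.items()]
print(",".join(parts))  # tag_feas^finance^3,tag_feas^sports^1
```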
@@ -416,7 +442,7 @@ class InfinityConnection(DocStoreConnection):
                     matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                 )
             else:
-                if len(filter_cond) > 0:
+                if filter_cond and len(filter_cond) > 0:
                     builder.filter(filter_cond)
                 if orderBy.fields:
                     builder.sort(order_by_expr_list)
@@ -662,6 +688,8 @@ class InfinityConnection(DocStoreConnection):
             k = column.lower()
             if field_keyword(k):
                 res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
+            elif re.search(r"_feas$", k):
+                res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
             elif k == "position_int":
                 def to_position_int(v):
                     if v:
@@ -712,9 +740,46 @@ class InfinityConnection(DocStoreConnection):
 
     def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
         """
-        TODO: Infinity doesn't provide aggregation
+        Manual aggregation for tag fields since Infinity doesn't provide native aggregation
         """
-        return list()
+        from collections import Counter
+
+        # Extract DataFrame from result
+        if isinstance(res, tuple):
+            df, _ = res
+        else:
+            df = res
+
+        if df.empty or fieldnm not in df.columns:
+            return []
+
+        # Aggregate tag counts
+        tag_counter = Counter()
+
+        for value in df[fieldnm]:
+            if pd.isna(value) or not value:
+                continue
+
+            # Handle different tag formats
+            if isinstance(value, str):
+                # Split by ### for tag_kwd field or comma for other formats
+                if fieldnm == "tag_kwd" and "###" in value:
+                    tags = [tag.strip() for tag in value.split("###") if tag.strip()]
+                else:
+                    # Try comma separation as fallback
+                    tags = [tag.strip() for tag in value.split(",") if tag.strip()]
+
+                for tag in tags:
+                    if tag:  # Only count non-empty tags
+                        tag_counter[tag] += 1
+            elif isinstance(value, list):
+                # Handle list format
+                for tag in value:
+                    if tag and isinstance(tag, str):
+                        tag_counter[tag.strip()] += 1
+
+        # Return as list of [tag, count] pairs, sorted by count descending
+        return [[tag, count] for tag, count in tag_counter.most_common()]
 
     """
     SQL
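A hedged usage sketch of the rewritten method: tags arrive as `###`-joined strings in a `tag_kwd` column, empty and NaN rows are skipped, and the counts come back sorted descending. The DataFrame below is fabricated for illustration:

```python
import pandas as pd

df = pd.DataFrame({"tag_kwd": ["a###b", "b###c", "", None]})
# conn.getAggregation((df, 4), "tag_kwd")
# -> [['b', 2], ['a', 1], ['c', 1]]
```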
@@ -68,8 +68,8 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
+            pytest.param({"page_size": 0}, 100, 0, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
@@ -69,8 +69,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
@@ -50,8 +50,7 @@ class TestChunksList:
         "params, expected_page_size, expected_message",
         [
             ({"page_size": None}, 5, ""),
-            pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 5, ""),
             ({"page_size": 1}, 1, ""),
             ({"page_size": 6}, 5, ""),
             ({"page_size": "1"}, 1, ""),
@@ -68,8 +68,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""),
-            pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"size": 0}, 0, 5, ""),
             ({"size": 1}, 0, 1, ""),
             ({"size": 6}, 0, 5, ""),
             ({"size": "1"}, 0, 1, ""),