Added infinity rank_feature support (#9044)
### What problem does this PR solve?

Added Infinity rank_feature support.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
@@ -14,7 +14,6 @@
 # limitations under the License.
 #
 import json
-import os
 
 from flask import request
 from flask_login import login_required, current_user
@@ -106,13 +105,6 @@ def update():
             return get_data_error_result(
                 message="Can't find this knowledgebase!")
 
-        if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity":
-            return get_json_result(
-                data=False,
-                message='The chunking method Tag has not been supported by Infinity yet.',
-                code=settings.RetCode.OPERATING_ERROR
-            )
-
         if req["name"].lower() != kb.name.lower() \
                 and len(
             KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1:
@@ -124,9 +116,6 @@ def update():
             return get_data_error_result()
 
         if kb.pagerank != req.get("pagerank", 0):
-            if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch":
-                return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch")
-
             if req.get("pagerank", 0) > 0:
                 settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]},
                                              search.index_name(kb.tenant_id), kb.id)
@@ -30,7 +30,7 @@
     "knowledge_graph_kwd": {"type": "varchar", "default": ""},
     "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "pagerank_fea": {"type": "integer", "default": 0},
-    "tag_feas": {"type": "varchar", "default": ""},
+    "tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"},
 
     "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
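The `rankfeatures` analyzer is the schema half of this feature: `tag_feas` holds a per-chunk JSON object of tag-to-weight pairs (decoded back into a dict on the read path in a later hunk), and the analyzer lets Infinity use those weights for scoring. A minimal sketch of the stored value, with made-up tag names and weights:

```python
import json

# Hypothetical per-chunk tag_feas payload: tag name -> integer weight.
tag_feas = {"finance": 5, "quarterly-report": 2}
stored = json.dumps(tag_feas)  # persisted in the varchar tag_feas column
print(stored)  # {"finance": 5, "quarterly-report": 2}
```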
@@ -77,7 +77,7 @@ services:
     container_name: ragflow-infinity
     profiles:
       - infinity
-    image: infiniflow/infinity:v0.6.0-dev4
+    image: infiniflow/infinity:v0.6.0-dev5
     volumes:
       - infinity_data:/var/infinity
       - ./infinity_conf.toml:/infinity_conf.toml
@@ -17,7 +17,7 @@ log_file_max_size = "100MB"
 log_file_rotate_count = 10
 
 # trace/debug/info/warning/error/critical 6 log levels, default: info
-log_level = "info"
+log_level = "trace"
 
 [storage]
 persistence_dir = "/var/infinity/persistence"
@@ -47,7 +47,7 @@ mem_index_capacity = 65536
 buffer_manager_size = "8GB"
 lru_num = 7
 temp_dir = "/var/infinity/tmp"
-result_cache = "on"
+result_cache = "off"
 memindex_memory_quota = "1GB"
 
 [wal]
@@ -113,7 +113,7 @@ ragflow:
 infinity:
   image:
     repository: infiniflow/infinity
-    tag: v0.6.0-dev4
+    tag: v0.6.0-dev5
   storage:
     className:
     capacity: 5Gi
@@ -274,4 +274,4 @@ class FulltextQueryer:
             keywords.append(f"{tk}^{w}")
 
         return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
-                             {"minimum_should_match": min(3, len(keywords) / 10)})
+                             {"minimum_should_match": min(3, len(keywords) // 10)})
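The switch to floor division matters because of how the Infinity adapter interprets `minimum_should_match` (see the `InfinityConnection` hunk further down): a float is converted into a percentage string, so true division could silently turn an intended absolute term count into something like "170%". A quick check:

```python
keywords = ["kw"] * 17
print(min(3, len(keywords) / 10))   # 1.7 -> float, later rendered as "170%"
print(min(3, len(keywords) // 10))  # 1   -> absolute number of required terms
```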
@@ -26,7 +26,7 @@ from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
 from infinity.errors import ErrorCode
 from rag import settings
-from rag.settings import PAGERANK_FLD
+from rag.settings import PAGERANK_FLD, TAG_FLD
 from rag.utils import singleton
 import pandas as pd
 from api.utils.file_utils import get_project_base_directory
@@ -311,7 +311,7 @@ class InfinityConnection(DocStoreConnection):
         df_list = list()
         table_list = list()
         output = selectFields.copy()
-        for essential_field in ["id"]:
+        for essential_field in ["id"] + aggFields:
             if essential_field not in output:
                 output.append(essential_field)
         score_func = ""
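Folding `aggFields` into the essential fields guarantees that the columns needed by the client-side aggregation (added further down) are always fetched, whatever the caller selected. Illustratively, with hypothetical field names:

```python
selectFields = ["content_with_weight"]  # hypothetical caller selection
aggFields = ["tag_kwd"]                 # fields getAggregation will count
output = selectFields.copy()
for essential_field in ["id"] + aggFields:
    if essential_field not in output:
        output.append(essential_field)
print(output)  # ['content_with_weight', 'id', 'tag_kwd']
```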
@@ -333,15 +333,29 @@ class InfinityConnection(DocStoreConnection):
         if PAGERANK_FLD not in output:
             output.append(PAGERANK_FLD)
         output = [f for f in output if f != "_score"]
+        if limit <= 0:
+            # ElasticSearch default limit is 10000
+            limit = 10000
 
         # Prepare expressions common to all tables
         filter_cond = None
         filter_fulltext = ""
         if condition:
+            table_found = False
             for indexName in indexNames:
-                table_name = f"{indexName}_{knowledgebaseIds[0]}"
-                filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
-                break
+                for kb_id in knowledgebaseIds:
+                    table_name = f"{indexName}_{kb_id}"
+                    try:
+                        filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name))
+                        table_found = True
+                        break
+                    except Exception:
+                        pass
+                if table_found:
+                    break
+            if not table_found:
+                logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}")
+                return pd.DataFrame(), 0
 
         for matchExpr in matchExprs:
             if isinstance(matchExpr, MatchTextExpr):
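The old code assumed `knowledgebaseIds[0]` always had a table for the first index name; the new loop probes every `(indexName, kb_id)` pair and uses the first table that resolves. A condensed sketch of the pattern, assuming a `db_instance.get_table` that raises when the table is missing:

```python
def find_filter_table(db_instance, index_names, kb_ids):
    # Probe index/kb combinations until one table resolves; None if none do.
    for index_name in index_names:
        for kb_id in kb_ids:
            try:
                return db_instance.get_table(f"{index_name}_{kb_id}")
            except Exception:
                continue  # table absent for this combination; keep looking
    return None
```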
@@ -355,6 +369,18 @@ class InfinityConnection(DocStoreConnection):
                 if isinstance(minimum_should_match, float):
                     str_minimum_should_match = str(int(minimum_should_match * 100)) + "%"
                     matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match
+
+                # Add rank_feature support
+                if rank_feature and "rank_features" not in matchExpr.extra_options:
+                    # Convert rank_feature dict to Infinity's rank_features string format
+                    # Format: "field^feature_name^weight,field^feature_name^weight"
+                    rank_features_list = []
+                    for feature_name, weight in rank_feature.items():
+                        # Use TAG_FLD as the field containing rank features
+                        rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}")
+                    if rank_features_list:
+                        matchExpr.extra_options["rank_features"] = ",".join(rank_features_list)
+
                 for k, v in matchExpr.extra_options.items():
                     if not isinstance(v, str):
                         matchExpr.extra_options[k] = str(v)
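A quick trace of the conversion just added, with hypothetical weights; `TAG_FLD` is the `tag_feas` column imported from `rag.settings` in the earlier import hunk:

```python
TAG_FLD = "tag_feas"                        # mirrors the value from rag.settings
rank_feature = {"finance": 3, "sports": 1}  # hypothetical tag weights
parts = [f"{TAG_FLD}^{name}^{weight}" for name, weight in rank_feature.items()]
print(",".join(parts))  # tag_feas^finance^3,tag_feas^sports^1
```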
@@ -416,7 +442,7 @@ class InfinityConnection(DocStoreConnection):
                     matchExpr.method, matchExpr.topn, matchExpr.fusion_params
                 )
             else:
-                if len(filter_cond) > 0:
+                if filter_cond and len(filter_cond) > 0:
                     builder.filter(filter_cond)
                 if orderBy.fields:
                     builder.sort(order_by_expr_list)
@@ -662,6 +688,8 @@ class InfinityConnection(DocStoreConnection):
             k = column.lower()
             if field_keyword(k):
                 res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
+            elif re.search(r"_feas$", k):
+                res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
             elif k == "position_int":
                 def to_position_int(v):
                     if v:
@@ -712,9 +740,46 @@ class InfinityConnection(DocStoreConnection):
 
     def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str):
         """
-        TODO: Infinity doesn't provide aggregation
+        Manual aggregation for tag fields since Infinity doesn't provide native aggregation
         """
-        return list()
+        from collections import Counter
+
+        # Extract DataFrame from result
+        if isinstance(res, tuple):
+            df, _ = res
+        else:
+            df = res
+
+        if df.empty or fieldnm not in df.columns:
+            return []
+
+        # Aggregate tag counts
+        tag_counter = Counter()
+
+        for value in df[fieldnm]:
+            if pd.isna(value) or not value:
+                continue
+
+            # Handle different tag formats
+            if isinstance(value, str):
+                # Split by ### for tag_kwd field or comma for other formats
+                if fieldnm == "tag_kwd" and "###" in value:
+                    tags = [tag.strip() for tag in value.split("###") if tag.strip()]
+                else:
+                    # Try comma separation as fallback
+                    tags = [tag.strip() for tag in value.split(",") if tag.strip()]
+
+                for tag in tags:
+                    if tag:  # Only count non-empty tags
+                        tag_counter[tag] += 1
+            elif isinstance(value, list):
+                # Handle list format
+                for tag in value:
+                    if tag and isinstance(tag, str):
+                        tag_counter[tag.strip()] += 1
+
+        # Return as list of [tag, count] pairs, sorted by count descending
+        return [[tag, count] for tag, count in tag_counter.most_common()]
 
     """
     SQL
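A hedged usage sketch of the rewritten method: tags arrive as `###`-joined strings in a `tag_kwd` column, empty and NaN rows are skipped, and the counts come back sorted descending. The DataFrame below is fabricated for illustration:

```python
import pandas as pd

df = pd.DataFrame({"tag_kwd": ["a###b", "b###c", "", None]})
# conn.getAggregation((df, 4), "tag_kwd")
# -> [['b', 2], ['a', 1], ['c', 1]]
```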
@@ -68,8 +68,8 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
+            pytest.param({"page_size": 0}, 100, 0, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
@@ -69,8 +69,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"page_size": None}, 0, 5, ""),
-            pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 0, 5, ""),
             ({"page_size": 1}, 0, 1, ""),
             ({"page_size": 6}, 0, 5, ""),
             ({"page_size": "1"}, 0, 1, ""),
@@ -50,8 +50,7 @@ class TestChunksList:
         "params, expected_page_size, expected_message",
         [
             ({"page_size": None}, 5, ""),
-            pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"page_size": 0}, 5, ""),
             ({"page_size": 1}, 1, ""),
             ({"page_size": 6}, 5, ""),
             ({"page_size": "1"}, 1, ""),
@@ -68,8 +68,7 @@ class TestChunksList:
         "params, expected_code, expected_page_size, expected_message",
         [
             ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""),
-            pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")),
-            pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")),
+            pytest.param({"size": 0}, 0, 5, ""),
             ({"size": 1}, 0, 1, ""),
             ({"size": 6}, 0, 5, ""),
             ({"size": "1"}, 0, 1, ""),