diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index 43e6e4eac..7c535d0ae 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -14,7 +14,6 @@ # limitations under the License. # import json -import os from flask import request from flask_login import login_required, current_user @@ -106,13 +105,6 @@ def update(): return get_data_error_result( message="Can't find this knowledgebase!") - if req.get("parser_id", "") == "tag" and os.environ.get('DOC_ENGINE', "elasticsearch") == "infinity": - return get_json_result( - data=False, - message='The chunking method Tag has not been supported by Infinity yet.', - code=settings.RetCode.OPERATING_ERROR - ) - if req["name"].lower() != kb.name.lower() \ and len( KnowledgebaseService.query(name=req["name"], tenant_id=current_user.id, status=StatusEnum.VALID.value)) >= 1: @@ -124,9 +116,6 @@ def update(): return get_data_error_result() if kb.pagerank != req.get("pagerank", 0): - if os.environ.get("DOC_ENGINE", "elasticsearch") != "elasticsearch": - return get_data_error_result(message="'pagerank' can only be set when doc_engine is elasticsearch") - if req.get("pagerank", 0) > 0: settings.docStoreConn.update({"kb_id": kb.id}, {PAGERANK_FLD: req["pagerank"]}, search.index_name(kb.tenant_id), kb.id) diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index f6772852c..3e39044a7 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -30,7 +30,7 @@ "knowledge_graph_kwd": {"type": "varchar", "default": ""}, "entities_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "pagerank_fea": {"type": "integer", "default": 0}, - "tag_feas": {"type": "varchar", "default": ""}, + "tag_feas": {"type": "varchar", "default": "", "analyzer": "rankfeatures"}, "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, diff --git a/docker/docker-compose-base.yml 
b/docker/docker-compose-base.yml index 73c1d5415..7ded0b10c 100644 --- a/docker/docker-compose-base.yml +++ b/docker/docker-compose-base.yml @@ -77,7 +77,7 @@ services: container_name: ragflow-infinity profiles: - infinity - image: infiniflow/infinity:v0.6.0-dev4 + image: infiniflow/infinity:v0.6.0-dev5 volumes: - infinity_data:/var/infinity - ./infinity_conf.toml:/infinity_conf.toml diff --git a/docker/infinity_conf.toml b/docker/infinity_conf.toml index cc8a0dcde..c332cf6eb 100644 --- a/docker/infinity_conf.toml +++ b/docker/infinity_conf.toml @@ -17,7 +17,7 @@ log_file_max_size = "100MB" log_file_rotate_count = 10 # trace/debug/info/warning/error/critical 6 log levels, default: info -log_level = "info" +log_level = "trace" [storage] persistence_dir = "/var/infinity/persistence" @@ -47,7 +47,7 @@ mem_index_capacity = 65536 buffer_manager_size = "8GB" lru_num = 7 temp_dir = "/var/infinity/tmp" -result_cache = "on" +result_cache = "off" memindex_memory_quota = "1GB" [wal] diff --git a/helm/values.yaml b/helm/values.yaml index 2894a8559..ded5bf5c1 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -113,7 +113,7 @@ ragflow: infinity: image: repository: infiniflow/infinity - tag: v0.6.0-dev4 + tag: v0.6.0-dev5 storage: className: capacity: 5Gi diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 55b4e9d32..b708ff490 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -274,4 +274,4 @@ class FulltextQueryer: keywords.append(f"{tk}^{w}") return MatchTextExpr(self.query_fields, " ".join(keywords), 100, - {"minimum_should_match": min(3, len(keywords) / 10)}) + {"minimum_should_match": min(3, len(keywords) // 10)}) diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 6b8300860..a8229c948 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -111,7 +111,7 @@ class Dealer: q_vec = matchDense.embedding_data src.append(f"q_{len(q_vec)}_vec") - fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05, 0.95"}) + fusionExpr = FusionExpr("weighted_sum", 
topk, {"weights": "0.05,0.95"}) matchExprs = [matchText, matchDense, fusionExpr] res = self.dataStore.search(src, highlightFields, filters, matchExprs, orderBy, offset, limit, diff --git a/rag/utils/infinity_conn.py b/rag/utils/infinity_conn.py index f3f9be3a0..ad76e1ba6 100644 --- a/rag/utils/infinity_conn.py +++ b/rag/utils/infinity_conn.py @@ -26,7 +26,7 @@ from infinity.index import IndexInfo, IndexType from infinity.connection_pool import ConnectionPool from infinity.errors import ErrorCode from rag import settings -from rag.settings import PAGERANK_FLD +from rag.settings import PAGERANK_FLD, TAG_FLD from rag.utils import singleton import pandas as pd from api.utils.file_utils import get_project_base_directory @@ -311,7 +311,7 @@ class InfinityConnection(DocStoreConnection): df_list = list() table_list = list() output = selectFields.copy() - for essential_field in ["id"]: + for essential_field in ["id"] + aggFields: if essential_field not in output: output.append(essential_field) score_func = "" @@ -333,15 +333,29 @@ class InfinityConnection(DocStoreConnection): if PAGERANK_FLD not in output: output.append(PAGERANK_FLD) output = [f for f in output if f != "_score"] + if limit <= 0: + # ElasticSearch default limit is 10000 + limit = 10000 # Prepare expressions common to all tables filter_cond = None filter_fulltext = "" if condition: + table_found = False for indexName in indexNames: - table_name = f"{indexName}_{knowledgebaseIds[0]}" - filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name)) - break + for kb_id in knowledgebaseIds: + table_name = f"{indexName}_{kb_id}" + try: + filter_cond = equivalent_condition_to_str(condition, db_instance.get_table(table_name)) + table_found = True + break + except Exception: + pass + if table_found: + break + if not table_found: + logger.error(f"No valid tables found for indexNames {indexNames} and knowledgebaseIds {knowledgebaseIds}") + return pd.DataFrame(), 0 for matchExpr in matchExprs: 
if isinstance(matchExpr, MatchTextExpr): @@ -355,6 +369,18 @@ class InfinityConnection(DocStoreConnection): if isinstance(minimum_should_match, float): str_minimum_should_match = str(int(minimum_should_match * 100)) + "%" matchExpr.extra_options["minimum_should_match"] = str_minimum_should_match + + # Add rank_feature support + if rank_feature and "rank_features" not in matchExpr.extra_options: + # Convert rank_feature dict to Infinity's rank_features string format + # Format: "field^feature_name^weight,field^feature_name^weight" + rank_features_list = [] + for feature_name, weight in rank_feature.items(): + # Use TAG_FLD as the field containing rank features + rank_features_list.append(f"{TAG_FLD}^{feature_name}^{weight}") + if rank_features_list: + matchExpr.extra_options["rank_features"] = ",".join(rank_features_list) + for k, v in matchExpr.extra_options.items(): if not isinstance(v, str): matchExpr.extra_options[k] = str(v) @@ -416,7 +442,7 @@ class InfinityConnection(DocStoreConnection): matchExpr.method, matchExpr.topn, matchExpr.fusion_params ) else: - if len(filter_cond) > 0: + if filter_cond and len(filter_cond) > 0: builder.filter(filter_cond) if orderBy.fields: builder.sort(order_by_expr_list) @@ -662,6 +688,8 @@ class InfinityConnection(DocStoreConnection): k = column.lower() if field_keyword(k): res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd]) + elif re.search(r"_feas$", k): + res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {}) elif k == "position_int": def to_position_int(v): if v: @@ -712,9 +740,46 @@ class InfinityConnection(DocStoreConnection): def getAggregation(self, res: tuple[pd.DataFrame, int] | pd.DataFrame, fieldnm: str): """ - TODO: Infinity doesn't provide aggregation + Manual aggregation for tag fields since Infinity doesn't provide native aggregation """ - return list() + from collections import Counter + + # Extract DataFrame from result + if isinstance(res, tuple): + df, _ = res + 
else: + df = res + + if df.empty or fieldnm not in df.columns: + return [] + + # Aggregate tag counts + tag_counter = Counter() + + for value in df[fieldnm]: + if pd.isna(value) or not value: + continue + + # Handle different tag formats + if isinstance(value, str): + # Split by ### for tag_kwd field or comma for other formats + if fieldnm == "tag_kwd" and "###" in value: + tags = [tag.strip() for tag in value.split("###") if tag.strip()] + else: + # Try comma separation as fallback + tags = [tag.strip() for tag in value.split(",") if tag.strip()] + + for tag in tags: + if tag: # Only count non-empty tags + tag_counter[tag] += 1 + elif isinstance(value, list): + # Handle list format + for tag in value: + if tag and isinstance(tag, str): + tag_counter[tag.strip()] += 1 + + # Return as list of [tag, count] pairs, sorted by count descending + return [[tag, count] for tag, count in tag_counter.most_common()] """ SQL diff --git a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py index f866d3f09..5508ff306 100644 --- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py +++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py @@ -68,8 +68,7 @@ class TestChunksList: "params, expected_code, expected_page_size, expected_message", [ ({"page_size": None}, 0, 5, ""), - pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), - pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), + pytest.param({"page_size": 0}, 0, 5, ""), ({"page_size": 1}, 0, 1, ""), ({"page_size": 6}, 0, 5, ""), ({"page_size": "1"}, 0, 1, 
""), diff --git a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py index c8134214c..f4059f3be 100644 --- a/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py +++ b/test/testcases/test_http_api/test_chunk_management_within_dataset/test_list_chunks.py @@ -69,8 +69,7 @@ class TestChunksList: "params, expected_code, expected_page_size, expected_message", [ ({"page_size": None}, 0, 5, ""), - pytest.param({"page_size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), - pytest.param({"page_size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), + pytest.param({"page_size": 0}, 0, 5, ""), ({"page_size": 1}, 0, 1, ""), ({"page_size": 6}, 0, 5, ""), ({"page_size": "1"}, 0, 1, ""), diff --git a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_list_chunks.py b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_list_chunks.py index 76f9da5e0..d7663c9a9 100644 --- a/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_list_chunks.py +++ b/test/testcases/test_sdk_api/test_chunk_management_within_dataset/test_list_chunks.py @@ -50,8 +50,7 @@ class TestChunksList: "params, expected_page_size, expected_message", [ ({"page_size": None}, 5, ""), - pytest.param({"page_size": 0}, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), - pytest.param({"page_size": 0}, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), + pytest.param({"page_size": 0}, 5, ""), ({"page_size": 1}, 1, ""), ({"page_size": 6}, 5, ""), 
({"page_size": "1"}, 1, ""), diff --git a/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py b/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py index dd567e01d..3d82ba550 100644 --- a/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py +++ b/test/testcases/test_web_api/test_chunk_app/test_list_chunks.py @@ -68,8 +68,7 @@ class TestChunksList: "params, expected_code, expected_page_size, expected_message", [ ({"size": None}, 100, 0, """TypeError("int() argument must be a string, a bytes-like object or a real number, not 'NoneType'")"""), - pytest.param({"size": 0}, 0, 5, "", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="Infinity does not support page_size=0")), - pytest.param({"size": 0}, 100, 0, "3013", marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "opensearch", "elasticsearch"], reason="Infinity does not support page_size=0")), + pytest.param({"size": 0}, 0, 5, ""), ({"size": 1}, 0, 1, ""), ({"size": 6}, 0, 5, ""), ({"size": "1"}, 0, 1, ""),