### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Kevin Hu authored on 2025-01-09 17:07:21 +08:00, committed by GitHub
commit c5da3cdd97 (parent f892d7d426)
30 changed files with 736 additions and 202 deletions


@@ -71,11 +71,13 @@ def findMaxTm(fnm):
             pass
     return m

+tiktoken_cache_dir = get_project_base_directory()
+os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
 # encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
 encoder = tiktoken.get_encoding("cl100k_base")

 def num_tokens_from_string(string: str) -> int:
     """Returns the number of tokens in a text string."""
     try:
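The two added lines pin tiktoken's download cache to the project directory, so the `cl100k_base` vocabulary is fetched once and reused across restarts. A minimal sketch of the resulting token-counting path, assuming `tiktoken` is installed and substituting a hypothetical cache path for `get_project_base_directory()`:

```python
import os
import tiktoken

# Point the cache at a stable directory before the first get_encoding()
# call, so the BPE file is not re-downloaded on every process start.
os.environ["TIKTOKEN_CACHE_DIR"] = "/tmp/tiktoken_cache"  # hypothetical path
encoder = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    try:
        return len(encoder.encode(string))
    except Exception:
        return 0

print(num_tokens_from_string("How many tokens is this?"))  # e.g. 6
```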


@@ -176,7 +176,17 @@ class DocStoreConnection(ABC):
     @abstractmethod
     def search(
-            self, selectFields: list[str], highlight: list[str], condition: dict, matchExprs: list[MatchExpr], orderBy: OrderByExpr, offset: int, limit: int, indexNames: str|list[str], knowledgebaseIds: list[str]
+            self, selectFields: list[str],
+            highlightFields: list[str],
+            condition: dict,
+            matchExprs: list[MatchExpr],
+            orderBy: OrderByExpr,
+            offset: int,
+            limit: int,
+            indexNames: str|list[str],
+            knowledgebaseIds: list[str],
+            aggFields: list[str] = [],
+            rank_feature: dict | None = None
     ) -> list[dict] | pl.DataFrame:
         """
         Search with given conjunctive equivalent filtering condition and return all fields of matched documents
@@ -191,7 +201,7 @@ class DocStoreConnection(ABC):
         raise NotImplementedError("Not implemented")

     @abstractmethod
-    def insert(self, rows: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
+    def insert(self, rows: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
         """
         Update or insert a bulk of rows
         """


@@ -9,6 +9,7 @@ from elasticsearch import Elasticsearch, NotFoundError
 from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
 from elastic_transport import ConnectionTimeout
 from rag import settings
+from rag.settings import TAG_FLD, PAGERANK_FLD
 from rag.utils import singleton
 from api.utils.file_utils import get_project_base_directory
 import polars as pl
@@ -20,6 +21,7 @@ ATTEMPT_TIME = 2

 logger = logging.getLogger('ragflow.es_conn')

 @singleton
 class ESConnection(DocStoreConnection):
     def __init__(self):
@@ -111,9 +113,19 @@ class ESConnection(DocStoreConnection):
     CRUD operations
     """
-    def search(self, selectFields: list[str], highlightFields: list[str], condition: dict, matchExprs: list[MatchExpr],
-               orderBy: OrderByExpr, offset: int, limit: int, indexNames: str | list[str],
-               knowledgebaseIds: list[str]) -> list[dict] | pl.DataFrame:
+    def search(
+            self, selectFields: list[str],
+            highlightFields: list[str],
+            condition: dict,
+            matchExprs: list[MatchExpr],
+            orderBy: OrderByExpr,
+            offset: int,
+            limit: int,
+            indexNames: str | list[str],
+            knowledgebaseIds: list[str],
+            aggFields: list[str] = [],
+            rank_feature: dict | None = None
+    ) -> list[dict] | pl.DataFrame:
         """
         Refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
         """
@@ -175,8 +187,13 @@ class ESConnection(DocStoreConnection):
                         similarity=similarity,
                     )
+        if bqry and rank_feature:
+            for fld, sc in rank_feature.items():
+                if fld != PAGERANK_FLD:
+                    fld = f"{TAG_FLD}.{fld}"
+                bqry.should.append(Q("rank_feature", field=fld, linear={}, boost=sc))
+
         if bqry:
-            bqry.should.append(Q("rank_feature", field="pagerank_fea", linear={}, boost=10))
             s = s.query(bqry)
         for field in highlightFields:
             s = s.highlight(field)
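Instead of an unconditional hard-coded `pagerank_fea` boost of 10, the query now folds in the caller-supplied `rank_feature` map: pagerank keeps its top-level field, while every other key is treated as a tag feature nested under `TAG_FLD`. A standalone sketch of the clause this loop produces, assuming the `rag.settings` constants are `"tag_feas"` and `"pagerank_fea"`, with a made-up tag weight:

```python
from elasticsearch_dsl import Q

PAGERANK_FLD = "pagerank_fea"  # assumed values of the rag.settings constants
TAG_FLD = "tag_feas"

bqry = Q("bool")
rank_feature = {PAGERANK_FLD: 10, "sports": 2.5}  # hypothetical tag weight
for fld, sc in rank_feature.items():
    if fld != PAGERANK_FLD:
        fld = f"{TAG_FLD}.{fld}"
    # rank_feature clauses only add to the score; they never filter docs out.
    bqry.should.append(Q("rank_feature", field=fld, linear={}, boost=sc))

print(bqry.to_dict())
# {'bool': {'should': [
#     {'rank_feature': {'field': 'pagerank_fea', 'linear': {}, 'boost': 10}},
#     {'rank_feature': {'field': 'tag_feas.sports', 'linear': {}, 'boost': 2.5}}]}}
```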
@@ -187,7 +204,7 @@ class ESConnection(DocStoreConnection):
             order = "asc" if order == 0 else "desc"
             if field in ["page_num_int", "top_int"]:
                 order_info = {"order": order, "unmapped_type": "float",
-                    "mode": "avg", "numeric_type": "double"}
+                              "mode": "avg", "numeric_type": "double"}
             elif field.endswith("_int") or field.endswith("_flt"):
                 order_info = {"order": order, "unmapped_type": "float"}
             else:
@@ -195,8 +212,11 @@ class ESConnection(DocStoreConnection):
             orders.append({field: order_info})
         s = s.sort(*orders)

+        for fld in aggFields:
+            s.aggs.bucket(f'aggs_{fld}', 'terms', field=fld, size=1000000)
+
         if limit > 0:
-            s = s[offset:offset+limit]
+            s = s[offset:offset + limit]
         q = s.to_dict()
         logger.debug(f"ESConnection.search {str(indexNames)} query: " + json.dumps(q))
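Each entry in `aggFields` becomes a terms bucket named `aggs_<field>` with an effectively unbounded size, so callers can read back the full value distribution, for example every tag in a knowledge base. A sketch of the request body this adds, built offline with `elasticsearch_dsl` and a hypothetical field name:

```python
from elasticsearch_dsl import Search

s = Search()
for fld in ["tag_kwd"]:  # hypothetical aggregation field
    s.aggs.bucket(f"aggs_{fld}", "terms", field=fld, size=1000000)

print(s.to_dict())
# {'aggs': {'aggs_tag_kwd': {'terms': {'field': 'tag_kwd', 'size': 1000000}}}}
```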
@@ -240,7 +260,7 @@ class ESConnection(DocStoreConnection):
         logger.error("ESConnection.get timeout for 3 times!")
         raise Exception("ESConnection.get timeout.")

-    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str) -> list[str]:
+    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str = None) -> list[str]:
         # Refers to https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
         operations = []
         for d in documents:
@@ -292,44 +312,57 @@ class ESConnection(DocStoreConnection):
                 if str(e).find("Timeout") > 0:
                     continue
             return False
-        else:
-            # update unspecific maybe-multiple documents
-            bqry = Q("bool")
-            for k, v in condition.items():
-                if not isinstance(k, str) or not v:
-                    continue
-                if k == "exist":
-                    bqry.filter.append(Q("exists", field=v))
-                    continue
-                if isinstance(v, list):
-                    bqry.filter.append(Q("terms", **{k: v}))
-                elif isinstance(v, str) or isinstance(v, int):
-                    bqry.filter.append(Q("term", **{k: v}))
-                else:
-                    raise Exception(
-                        f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
-            scripts = []
-            for k, v in newValue.items():
-                if k == "remove":
-                    scripts.append(f"ctx._source.remove('{v}');")
-                    continue
-                if (not isinstance(k, str) or not v) and k != "available_int":
-                    continue
-                if isinstance(v, str):
-                    scripts.append(f"ctx._source.{k} = '{v}'")
-                elif isinstance(v, int):
-                    scripts.append(f"ctx._source.{k} = {v}")
-                else:
-                    raise Exception(
-                        f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
+        # update unspecific maybe-multiple documents
+        bqry = Q("bool")
+        for k, v in condition.items():
+            if not isinstance(k, str) or not v:
+                continue
+            if k == "exist":
+                bqry.filter.append(Q("exists", field=v))
+                continue
+            if isinstance(v, list):
+                bqry.filter.append(Q("terms", **{k: v}))
+            elif isinstance(v, str) or isinstance(v, int):
+                bqry.filter.append(Q("term", **{k: v}))
+            else:
+                raise Exception(
+                    f"Condition `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str or list.")
+        scripts = []
+        params = {}
+        for k, v in newValue.items():
+            if k == "remove":
+                if isinstance(v, str):
+                    scripts.append(f"ctx._source.remove('{v}');")
+                if isinstance(v, dict):
+                    for kk, vv in v.items():
+                        scripts.append(f"int i=ctx._source.{kk}.indexOf(params.p_{kk});ctx._source.{kk}.remove(i);")
+                        params[f"p_{kk}"] = vv
+                continue
+            if k == "add":
+                if isinstance(v, dict):
+                    for kk, vv in v.items():
+                        scripts.append(f"ctx._source.{kk}.add(params.pp_{kk});")
+                        params[f"pp_{kk}"] = vv.strip()
+                continue
+            if (not isinstance(k, str) or not v) and k != "available_int":
+                continue
+            if isinstance(v, str):
+                scripts.append(f"ctx._source.{k} = '{v}'")
+            elif isinstance(v, int):
+                scripts.append(f"ctx._source.{k} = {v}")
+            else:
+                raise Exception(
+                    f"newValue `{str(k)}={str(v)}` value type is {str(type(v))}, expected to be int, str.")
         ubq = UpdateByQuery(
             index=indexName).using(
             self.es).query(bqry)
-        ubq = ubq.script(source="; ".join(scripts))
+        ubq = ubq.script(source="".join(scripts), params=params)
         ubq = ubq.params(refresh=True)
         ubq = ubq.params(slices=5)
         ubq = ubq.params(conflicts="proceed")
-        for i in range(3):
+        for _ in range(ATTEMPT_TIME):
             try:
                 _ = ubq.execute()
                 return True
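The rewrite drops the `else:` wrapper, adds `add`/`remove` handling for list-valued fields (appending or deleting a single element such as one tag), and passes values through painless `params` instead of interpolating them into the script source. Since each generated statement already ends in `;`, the join separator shrinks from `"; "` to `""`. A sketch of the request this path builds, constructed offline against hypothetical index and field names:

```python
from elasticsearch_dsl import Q, UpdateByQuery

# Append one tag to every chunk of a document via a parameterised script.
ubq = (
    UpdateByQuery(index="ragflow_idx")  # hypothetical index name
    .query(Q("bool", filter=[Q("term", doc_id="doc-1")]))
    .script(source="ctx._source.tag_kwd.add(params.pp_tag_kwd);",
            params={"pp_tag_kwd": "sports"})
)
print(ubq.to_dict())  # inspect the body without an Elasticsearch connection
```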


@@ -10,6 +10,7 @@ from infinity.index import IndexInfo, IndexType
 from infinity.connection_pool import ConnectionPool
 from infinity.errors import ErrorCode
 from rag import settings
+from rag.settings import PAGERANK_FLD
 from rag.utils import singleton
 import polars as pl
 from polars.series.series import Series
@@ -231,8 +232,7 @@ class InfinityConnection(DocStoreConnection):
     """

     def search(
-        self,
-        selectFields: list[str],
+        self, selectFields: list[str],
         highlightFields: list[str],
         condition: dict,
         matchExprs: list[MatchExpr],
@@ -241,7 +241,9 @@ class InfinityConnection(DocStoreConnection):
         limit: int,
         indexNames: str | list[str],
         knowledgebaseIds: list[str],
-    ) -> tuple[pl.DataFrame, int]:
+        aggFields: list[str] = [],
+        rank_feature: dict | None = None
+    ) -> list[dict] | pl.DataFrame:
         """
         TODO: Infinity doesn't provide highlight
         """
@@ -256,7 +258,7 @@ class InfinityConnection(DocStoreConnection):
             if essential_field not in selectFields:
                 selectFields.append(essential_field)
         if matchExprs:
-            for essential_field in ["score()", "pagerank_fea"]:
+            for essential_field in ["score()", PAGERANK_FLD]:
                 selectFields.append(essential_field)

         # Prepare expressions common to all tables
@@ -346,7 +348,7 @@ class InfinityConnection(DocStoreConnection):
         self.connPool.release_conn(inf_conn)
         res = concat_dataframes(df_list, selectFields)
         if matchExprs:
-            res = res.sort(pl.col("SCORE") + pl.col("pagerank_fea"), descending=True, maintain_order=True)
+            res = res.sort(pl.col("SCORE") + pl.col(PAGERANK_FLD), descending=True, maintain_order=True)
         res = res.limit(limit)
         logger.debug(f"INFINITY search final result: {str(res)}")
         return res, total_hits_count
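On the Infinity side the fused score is computed client-side with polars: the stored pagerank column is added to the match score and results are re-sorted stably. A self-contained sketch with toy data (column names follow the diff; `PAGERANK_FLD` assumed to be `"pagerank_fea"`):

```python
import polars as pl

PAGERANK_FLD = "pagerank_fea"  # assumed value of the rag.settings constant

res = pl.DataFrame({
    "id":         ["a", "b", "c"],
    "SCORE":      [1.5, 1.2, 1.2],
    PAGERANK_FLD: [0.0, 0.9, 0.1],
})
# Stable sort on score + pagerank keeps the original order of exact ties.
res = res.sort(pl.col("SCORE") + pl.col(PAGERANK_FLD),
               descending=True, maintain_order=True)
print(res["id"].to_list())  # ['b', 'a', 'c']
```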
@@ -378,7 +380,7 @@ class InfinityConnection(DocStoreConnection):
         return res_fields.get(chunkId, None)

     def insert(
-        self, documents: list[dict], indexName: str, knowledgebaseId: str
+        self, documents: list[dict], indexName: str, knowledgebaseId: str = None
     ) -> list[str]:
         inf_conn = self.connPool.get_conn()
         db_instance = inf_conn.get_database(self.dbName)
@@ -456,7 +458,7 @@ class InfinityConnection(DocStoreConnection):
             elif k in ["page_num_int", "top_int"]:
                 assert isinstance(v, list)
                 newValue[k] = "_".join(f"{num:08x}" for num in v)
-            elif k == "remove" and v in ["pagerank_fea"]:
+            elif k == "remove" and v in [PAGERANK_FLD]:
                 del newValue[k]
                 newValue[v] = 0
         logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
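Infinity rows cannot drop a field per-document the way a painless `ctx._source.remove()` can, so a `{"remove": PAGERANK_FLD}` request is rewritten into setting `pagerank_fea` to 0; list-valued page numbers are likewise packed into a sortable fixed-width hex string. A standalone sketch of that preprocessing, mirroring the diff:

```python
PAGERANK_FLD = "pagerank_fea"  # assumed value of the rag.settings constant

newValue = {"page_num_int": [3, 12], "remove": PAGERANK_FLD}

for k, v in list(newValue.items()):
    if k in ["page_num_int", "top_int"]:
        # Fixed-width hex keeps lexicographic order equal to numeric order.
        newValue[k] = "_".join(f"{num:08x}" for num in v)
    elif k == "remove" and v in [PAGERANK_FLD]:
        del newValue[k]
        newValue[v] = 0

print(newValue)  # {'page_num_int': '00000003_0000000c', 'pagerank_fea': 0}
```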