From 23bdf25a1fd4aaed05e385ca086e7b9a5d003ce3 Mon Sep 17 00:00:00 2001
From: Carve_ <75568342+Rynzie02@users.noreply.github.com>
Date: Sat, 31 Jan 2026 15:11:54 +0800
Subject: [PATCH] feature: Add OceanBase Storage Support for Table Parser (#12923)

### What problem does this PR solve?

Close #12770

This PR adds OceanBase as a storage backend for the Table Parser. It enables dynamic table-schema storage via JSON and implements OceanBase SQL execution for text-to-SQL retrieval.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

### Changes

- The Table Parser stores row data in the `chunk_data` JSON field when the doc engine is OceanBase (`rag/app/table.py`); see the first sketch below.
- The OceanBase table schema gains a `chunk_data` JSON column, with migration of existing tables when needed (`rag/utils/ob_conn.py`).
- Implemented OceanBase `sql()` to execute the SQL produced by text-to-SQL (`rag/utils/ob_conn.py`); see the second sketch below.
- Added a `DOC_ENGINE_OCEANBASE` flag for engine detection (`common/settings.py`).
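To make the first change concrete, here is a minimal sketch of how one spreadsheet row is represented under the two storage paths. This is illustration only, not code from the patch; the `Department`/`Notes` columns and the file name are invented:

```python
# Hypothetical spreadsheet columns "Department" and "Notes", for illustration only.

# DOC_ENGINE=oceanbase (or infinity): the whole row is kept in the JSON
# `chunk_data` column, and field_map keys are the plain (lowercased) column names.
chunk_ob = {
    "docnm_kwd": "employees.xlsx",
    "chunk_data": {"Department": "Sales", "Notes": "remote team"},
}
field_map_ob = {"department": "Department", "notes": "Notes"}

# DOC_ENGINE=elasticsearch/opensearch: each column becomes its own typed field
# (the suffix encodes the type, e.g. _kwd / _tks), and field_map keys use those names.
chunk_es = {
    "docnm_kwd": "employees.xlsx",
    "department_kwd": "Sales",
    "notes_tks": "remote team",
}
field_map_es = {"department_kwd": "Department", "notes_tks": "Notes"}
```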
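For the `sql()` change: before execution, the generated query is normalized from the prompt dialect (`json_extract_string`, `json_extract_isnull`) into OceanBase/MySQL `JSON_EXTRACT` syntax, and a `LIMIT` is appended to plain SELECT/WITH queries that have none. The snippet below is a standalone sketch mirroring that normalization (same regexes as the nested `normalize_sql` in this patch); the table, kb id, and field names in the demo query are invented:

```python
import re


def normalize_for_oceanbase(sql_text: str, fetch_size: int = 1024) -> str:
    """Standalone mirror of the normalize_sql + LIMIT logic added to ob_conn.py."""
    cleaned = sql_text.strip().rstrip(";")
    cleaned = re.sub(r"[`]+", "", cleaned)  # drop backtick quoting
    cleaned = re.sub(
        r"json_extract_string\s*\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)",
        r"JSON_UNQUOTE(JSON_EXTRACT(\1, \2))",
        cleaned,
        flags=re.IGNORECASE,
    )
    cleaned = re.sub(
        r"json_extract_isnull\s*\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)",
        r"(JSON_EXTRACT(\1, \2) IS NULL)",
        cleaned,
        flags=re.IGNORECASE,
    )
    # Cap result size for SELECT/WITH queries that carry no LIMIT of their own.
    lowered = cleaned.lstrip().lower()
    if re.match(r"^(select|with)\b", lowered) and not re.search(r"\blimit\b", lowered):
        cleaned = f"{cleaned} LIMIT {int(fetch_size)}"
    return cleaned


# Invented example of what the LLM is prompted to produce:
generated = (
    "SELECT doc_id, docnm_kwd, "
    "json_extract_string(chunk_data, '$.Department') AS department "
    "FROM ragflow_tenant1 WHERE kb_id = 'kb1';"
)
print(normalize_for_oceanbase(generated))
# -> SELECT doc_id, docnm_kwd, JSON_UNQUOTE(JSON_EXTRACT(chunk_data, '$.Department')) AS department
#    FROM ragflow_tenant1 WHERE kb_id = 'kb1' LIMIT 1024   (printed as one line)
```

The executed rows are then returned as `{"columns": [...], "rows": [...]}`, with an additional `markdown` table rendering when `format="markdown"`.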
### Test

1. Set `DOC_ENGINE=oceanbase` (e.g. in `docker/.env`).
   (screenshot: doc_engine_ob)
2. Upload an Excel file to the Knowledge Base (for testing, we use the spreadsheet shown in the screenshot).
   (screenshot: excel)
3. Choose **Table** as the parsing method.
   (screenshot: parse_excel)
4. Ask a natural-language query in chat.
   (screenshot: query)

---
 api/db/services/dialog_service.py | 40 ++++++++++++++++-
 common/settings.py                |  4 +-
 rag/app/table.py                  | 14 +++---
 rag/utils/ob_conn.py              | 72 ++++++++++++++++++++++++++++---
 4 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index 8a11397e4..3940a8a2f 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -586,7 +586,12 @@ async def use_sql(question, field_map, tenant_id, chat_mdl, quota=True, kb_ids=N
     logging.debug(f"use_sql: Question: {question}")
 
     # Determine which document engine we're using
-    doc_engine = "infinity" if settings.DOC_ENGINE_INFINITY else "es"
+    if settings.DOC_ENGINE_INFINITY:
+        doc_engine = "infinity"
+    elif settings.DOC_ENGINE_OCEANBASE:
+        doc_engine = "oceanbase"
+    else:
+        doc_engine = "es"
 
     # Construct the full table name
     # For Elasticsearch: ragflow_{tenant_id} (kb_id is in WHERE clause)
@@ -633,6 +638,37 @@ Write SQL using json_extract_string() with exact field names. Include doc_id, do
             "\n".join([f" - {field}" for field in json_field_names]),
             question
         )
+    elif doc_engine == "oceanbase":
+        # Build OceanBase prompts with JSON extraction context
+        json_field_names = list(field_map.keys())
+        sys_prompt = """You are a Database Administrator. Write SQL for a table with JSON 'chunk_data' column.
+
+JSON Extraction: json_extract_string(chunk_data, '$.FieldName')
+Numeric Cast: CAST(json_extract_string(chunk_data, '$.FieldName') AS INTEGER/FLOAT)
+NULL Check: json_extract_isnull(chunk_data, '$.FieldName') == false
+
+RULES:
+1. Use EXACT field names (case-sensitive) from the list below
+2. For SELECT: include doc_id, docnm_kwd, and json_extract_string() for requested fields
+3. For COUNT: use COUNT(*) or COUNT(DISTINCT json_extract_string(...))
+4. Add AS alias for extracted field names
+5. DO NOT select 'content' field
+6. Only add NULL check (json_extract_isnull() == false) in WHERE clause when:
+   - Question asks to "show me" or "display" specific columns
+   - Question mentions "not null" or "excluding null"
+   - Add NULL check for count specific column
+   - DO NOT add NULL check for COUNT(*) queries (COUNT(*) counts all rows including nulls)
+7. Output ONLY the SQL, no explanations"""
+        user_prompt = """Table: {}
+Fields (EXACT case): {}
+{}
+Question: {}
+Write SQL using json_extract_string() with exact field names. Include doc_id, docnm_kwd for data queries. Only SQL.""".format(
+            table_name,
+            ", ".join(json_field_names),
+            "\n".join([f" - {field}" for field in json_field_names]),
+            question
+        )
     else:
         # Build ES/OS prompts with direct field access
         sys_prompt = """You are a Database Administrator. Write SQL queries.
@@ -703,7 +739,7 @@ Write SQL using exact field names above. Include doc_id, docnm_kwd for data quer
     except Exception as e:
         logging.warning(f"use_sql: Initial SQL execution FAILED with error: {e}")
         # Build retry prompt with error information
-        if doc_engine == "infinity":
+        if doc_engine in ("infinity", "oceanbase"):
             # Build Infinity error retry prompt
             json_field_names = list(field_map.keys())
             user_prompt = """
diff --git a/common/settings.py b/common/settings.py
index 41ac2f5eb..221e4a909 100644
--- a/common/settings.py
+++ b/common/settings.py
@@ -79,6 +79,7 @@ FEISHU_OAUTH = None
 OAUTH_CONFIG = None
 
 DOC_ENGINE = os.getenv('DOC_ENGINE', 'elasticsearch')
 DOC_ENGINE_INFINITY = (DOC_ENGINE.lower() == "infinity")
+DOC_ENGINE_OCEANBASE = (DOC_ENGINE.lower() == "oceanbase")
 
 docStoreConn = None
@@ -241,9 +242,10 @@ def init_settings():
     FEISHU_OAUTH = get_base_config("oauth", {}).get("feishu")
     OAUTH_CONFIG = get_base_config("oauth", {})
 
-    global DOC_ENGINE, DOC_ENGINE_INFINITY, docStoreConn, ES, OB, OS, INFINITY
+    global DOC_ENGINE, DOC_ENGINE_INFINITY, DOC_ENGINE_OCEANBASE, docStoreConn, ES, OB, OS, INFINITY
     DOC_ENGINE = os.environ.get("DOC_ENGINE", "elasticsearch")
     DOC_ENGINE_INFINITY = (DOC_ENGINE.lower() == "infinity")
+    DOC_ENGINE_OCEANBASE = (DOC_ENGINE.lower() == "oceanbase")
     lower_case_doc_engine = DOC_ENGINE.lower()
     if lower_case_doc_engine == "elasticsearch":
         ES = get_base_config("es", {})
diff --git a/rag/app/table.py b/rag/app/table.py
index 1b49994e5..fad6b7038 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -457,10 +457,10 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
             txts.extend([str(c) for c in cln if c])
         clmns_map = [(py_clmns[i].lower() + fields_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
 
-        # For Infinity: Use original column names as keys since they're stored in chunk_data JSON
+        # For Infinity/OceanBase: Use original column names as keys since they're stored in chunk_data JSON
         # For ES/OS: Use full field names with type suffixes (e.g., url_kwd, body_tks)
-        if settings.DOC_ENGINE_INFINITY:
-            # For Infinity: key = original column name, value = display name
+        if settings.DOC_ENGINE_INFINITY or settings.DOC_ENGINE_OCEANBASE:
+            # For Infinity/OceanBase: key = original column name, value = display name
             field_map = {py_clmns[i].lower(): str(clmns[i]).replace("_", " ") for i in range(len(clmns))}
         else:
             # For ES/OS: key = typed field name, value = display name
@@ -480,9 +480,9 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
                 continue
             if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
                 continue
-            # For Infinity: Store in chunk_data JSON column
+            # For Infinity/OceanBase: Store in chunk_data JSON column
             # For Elasticsearch/OpenSearch: Store as individual fields with type suffixes
-            if settings.DOC_ENGINE_INFINITY:
+            if settings.DOC_ENGINE_INFINITY or settings.DOC_ENGINE_OCEANBASE:
                 data_json[str(clmns[j])] = row[clmns[j]]
             else:
                 fld = clmns_map[j][0]
@@ -490,8 +490,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
                 row_fields.append((clmns[j], row[clmns[j]]))
         if not row_fields:
             continue
-        # Add the data JSON field to the document (for Infinity only)
-        if settings.DOC_ENGINE_INFINITY:
+        # Add the data JSON field to the document (for Infinity/OceanBase)
+        if settings.DOC_ENGINE_INFINITY or settings.DOC_ENGINE_OCEANBASE:
             d["chunk_data"] = data_json
         # Format as a structured text for better LLM comprehension
         # Format each field as "- Field Name: Value" on separate lines
diff --git a/rag/utils/ob_conn.py b/rag/utils/ob_conn.py
index b7c0ead58..45ccb256d 100644
--- a/rag/utils/ob_conn.py
+++ b/rag/utils/ob_conn.py
@@ -49,6 +49,7 @@ logger = logging.getLogger('ragflow.ob_conn')
 column_order_id = Column("_order_id", Integer, nullable=True, comment="chunk order id for maintaining sequence")
 column_group_id = Column("group_id", String(256), nullable=True, comment="group id for external retrieval")
 column_mom_id = Column("mom_id", String(256), nullable=True, comment="parent chunk id")
+column_chunk_data = Column("chunk_data", JSON, nullable=True, comment="table parser row data")
 
 column_definitions: list[Column] = [
     Column("id", String(256), primary_key=True, comment="chunk id"),
@@ -89,6 +90,7 @@ column_definitions: list[Column] = [
     Column("rank_flt", Double, nullable=True, comment="rank of this entity"),
     Column("removed_kwd", String(256), nullable=True, index=True, server_default="'N'",
            comment="whether it has been deleted"),
+    column_chunk_data,
     Column("metadata", JSON, nullable=True, comment="metadata for this chunk"),
     Column("extra", JSON, nullable=True, comment="extra information of non-general chunk"),
     column_order_id,
@@ -754,7 +756,7 @@ class OBConnection(DocStoreConnection):
     Table operations
     """
 
-    def create_idx(self, indexName: str, knowledgebaseId: str, vectorSize: int):
+    def create_idx(self, indexName: str, knowledgebaseId: str, vectorSize: int, parser_id: str = None):
         vector_field_name = f"q_{vectorSize}_vec"
         vector_index_name = f"{vector_field_name}_idx"
 
@@ -793,7 +795,7 @@ class OBConnection(DocStoreConnection):
         )
 
         # new columns migration
-        for column in [column_order_id, column_group_id, column_mom_id]:
+        for column in [column_chunk_data, column_order_id, column_group_id, column_mom_id]:
             _try_with_lock(
                 lock_name=f"ob_add_{column.name}_{indexName}",
                 check_func=lambda: self._column_exist(indexName, column.name),
@@ -1826,6 +1828,66 @@ class OBConnection(DocStoreConnection):
     SQL
     """
 
-    def sql(sql: str, fetch_size: int, format: str):
-        # TODO: execute the sql generated by text-to-sql
-        return None
+    def sql(self, sql: str, fetch_size: int = 1024, format: str = "json"):
+        logger.debug("OBConnection.sql get sql: %s", sql)
+
+        def normalize_sql(sql_text: str) -> str:
+            cleaned = sql_text.strip().rstrip(";")
+            cleaned = re.sub(r"[`]+", "", cleaned)
+            cleaned = re.sub(
+                r"json_extract_string\s*\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)",
+                r"JSON_UNQUOTE(JSON_EXTRACT(\1, \2))",
+                cleaned,
+                flags=re.IGNORECASE,
+            )
+            cleaned = re.sub(
+                r"json_extract_isnull\s*\(\s*([^,]+?)\s*,\s*([^)]+?)\s*\)",
+                r"(JSON_EXTRACT(\1, \2) IS NULL)",
+                cleaned,
+                flags=re.IGNORECASE,
+            )
+            return cleaned
+
+        def coerce_value(value: Any) -> Any:
+            if isinstance(value, np.generic):
+                return value.item()
+            if isinstance(value, bytes):
+                return value.decode("utf-8", errors="ignore")
+            return value
+
+        sql_text = normalize_sql(sql)
+        if fetch_size and fetch_size > 0:
+            sql_lower = sql_text.lstrip().lower()
+            if re.match(r"^(select|with)\b", sql_lower) and not re.search(r"\blimit\b", sql_lower):
+                sql_text = f"{sql_text} LIMIT {int(fetch_size)}"
+
+        logger.debug("OBConnection.sql to ob: %s", sql_text)
+
+        try:
+            res = self.client.perform_raw_text_sql(sql_text)
+        except Exception:
+            logger.exception("OBConnection.sql got exception")
+            raise
+
+        if res is None:
+            return None
+
+        columns = list(res.keys()) if hasattr(res, "keys") else []
+        try:
+            rows = res.fetchmany(fetch_size) if fetch_size and fetch_size > 0 else res.fetchall()
+        except Exception:
+            rows = res.fetchall()
+
+        rows_list = [[coerce_value(v) for v in list(row)] for row in rows]
+        result = {
+            "columns": [{"name": col, "type": "text"} for col in columns],
+            "rows": rows_list,
+        }
+
+        if format == "markdown":
+            header = "|" + "|".join(columns) + "|" if columns else ""
+            separator = "|" + "|".join(["---" for _ in columns]) + "|" if columns else ""
+            body = "\n".join(["|" + "|".join([str(v) for v in row]) + "|" for row in rows_list])
+            result["markdown"] = "\n".join([line for line in [header, separator, body] if line])
+
+        return result