mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-01 08:05:07 +08:00
Add dataset with table parser type for Infinity and answer question in chat using SQL (#12541)
### What problem does this PR solve? 1) Create dataset using table parser for infinity 2) Answer questions in chat using SQL ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -228,15 +228,26 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
Table operations
|
||||
"""
|
||||
|
||||
def create_idx(self, index_name: str, dataset_id: str, vector_size: int):
|
||||
def create_idx(self, index_name: str, dataset_id: str, vector_size: int, parser_id: str = None):
|
||||
table_name = f"{index_name}_{dataset_id}"
|
||||
self.logger.debug(f"CREATE_IDX: Creating table {table_name}, parser_id: {parser_id}")
|
||||
|
||||
inf_conn = self.connPool.get_conn()
|
||||
inf_db = inf_conn.create_database(self.dbName, ConflictType.Ignore)
|
||||
|
||||
# Use configured schema
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", self.mapping_file_name)
|
||||
if not os.path.exists(fp_mapping):
|
||||
raise Exception(f"Mapping file not found at {fp_mapping}")
|
||||
schema = json.load(open(fp_mapping))
|
||||
|
||||
if parser_id is not None:
|
||||
from common.constants import ParserType
|
||||
if parser_id == ParserType.TABLE.value:
|
||||
# Table parser: add chunk_data JSON column to store table-specific fields
|
||||
schema["chunk_data"] = {"type": "json", "default": "{}"}
|
||||
self.logger.info("Added chunk_data column for TABLE parser")
|
||||
|
||||
vector_name = f"q_{vector_size}_vec"
|
||||
schema[vector_name] = {"type": f"vector,{vector_size},float"}
|
||||
inf_table = inf_db.create_table(
|
||||
@ -453,4 +464,198 @@ class InfinityConnectionBase(DocStoreConnection):
|
||||
"""
|
||||
|
||||
def sql(self, sql: str, fetch_size: int, format: str):
|
||||
raise NotImplementedError("Not implemented")
|
||||
"""
|
||||
Execute SQL query on Infinity database via psql command.
|
||||
Transform text-to-sql for Infinity's SQL syntax.
|
||||
"""
|
||||
import subprocess
|
||||
|
||||
try:
|
||||
self.logger.debug(f"InfinityConnection.sql get sql: {sql}")
|
||||
|
||||
# Clean up SQL
|
||||
sql = re.sub(r"[ `]+", " ", sql)
|
||||
sql = sql.replace("%", "")
|
||||
|
||||
# Transform SELECT field aliases to actual stored field names
|
||||
# Build field mapping from infinity_mapping.json comment field
|
||||
field_mapping = {}
|
||||
# Also build reverse mapping for column names in result
|
||||
reverse_mapping = {}
|
||||
fp_mapping = os.path.join(get_project_base_directory(), "conf", self.mapping_file_name)
|
||||
if os.path.exists(fp_mapping):
|
||||
schema = json.load(open(fp_mapping))
|
||||
for field_name, field_info in schema.items():
|
||||
if "comment" in field_info:
|
||||
# Parse comma-separated aliases from comment
|
||||
# e.g., "docnm_kwd, title_tks, title_sm_tks"
|
||||
aliases = [a.strip() for a in field_info["comment"].split(",")]
|
||||
for alias in aliases:
|
||||
field_mapping[alias] = field_name
|
||||
reverse_mapping[field_name] = alias # Store first alias for reverse mapping
|
||||
|
||||
# Replace field names in SELECT clause
|
||||
select_match = re.search(r"(select\s+.*?)(from\s+)", sql, re.IGNORECASE)
|
||||
if select_match:
|
||||
select_clause = select_match.group(1)
|
||||
from_clause = select_match.group(2)
|
||||
|
||||
# Apply field transformations
|
||||
for alias, actual in field_mapping.items():
|
||||
select_clause = re.sub(
|
||||
rf'(^|[, ]){alias}([, ]|$)',
|
||||
rf'\1{actual}\2',
|
||||
select_clause
|
||||
)
|
||||
|
||||
sql = select_clause + from_clause + sql[select_match.end():]
|
||||
|
||||
# Also replace field names in WHERE, ORDER BY, GROUP BY, and HAVING clauses
|
||||
for alias, actual in field_mapping.items():
|
||||
# Transform in WHERE clause
|
||||
sql = re.sub(
|
||||
rf'(\bwhere\s+[^;]*?)(\b){re.escape(alias)}\b',
|
||||
rf'\1{actual}',
|
||||
sql,
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
# Transform in ORDER BY clause
|
||||
sql = re.sub(
|
||||
rf'(\border by\s+[^;]*?)(\b){re.escape(alias)}\b',
|
||||
rf'\1{actual}',
|
||||
sql,
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
# Transform in GROUP BY clause
|
||||
sql = re.sub(
|
||||
rf'(\bgroup by\s+[^;]*?)(\b){re.escape(alias)}\b',
|
||||
rf'\1{actual}',
|
||||
sql,
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
# Transform in HAVING clause
|
||||
sql = re.sub(
|
||||
rf'(\bhaving\s+[^;]*?)(\b){re.escape(alias)}\b',
|
||||
rf'\1{actual}',
|
||||
sql,
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
|
||||
self.logger.debug(f"InfinityConnection.sql to execute: {sql}")
|
||||
|
||||
# Get connection parameters from the Infinity connection pool wrapper
|
||||
# We need to use INFINITY_CONN singleton, not the raw ConnectionPool
|
||||
from common.doc_store.infinity_conn_pool import INFINITY_CONN
|
||||
conn_info = INFINITY_CONN.get_conn_uri()
|
||||
|
||||
# Parse host and port from conn_info
|
||||
if conn_info and "host=" in conn_info:
|
||||
host_match = re.search(r"host=(\S+)", conn_info)
|
||||
if host_match:
|
||||
host = host_match.group(1)
|
||||
else:
|
||||
host = "infinity"
|
||||
else:
|
||||
host = "infinity"
|
||||
|
||||
# Parse port from conn_info, default to 5432 if not found
|
||||
if conn_info and "port=" in conn_info:
|
||||
port_match = re.search(r"port=(\d+)", conn_info)
|
||||
if port_match:
|
||||
port = port_match.group(1)
|
||||
else:
|
||||
port = "5432"
|
||||
else:
|
||||
port = "5432"
|
||||
|
||||
# Use psql command to execute SQL
|
||||
# Use full path to psql to avoid PATH issues
|
||||
psql_path = "/usr/bin/psql"
|
||||
# Check if psql exists at expected location, otherwise try to find it
|
||||
import shutil
|
||||
psql_from_path = shutil.which("psql")
|
||||
if psql_from_path:
|
||||
psql_path = psql_from_path
|
||||
|
||||
# Execute SQL with psql to get both column names and data in one call
|
||||
psql_cmd = [
|
||||
psql_path,
|
||||
"-h", host,
|
||||
"-p", port,
|
||||
"-c", sql,
|
||||
]
|
||||
|
||||
self.logger.debug(f"Executing psql command: {' '.join(psql_cmd)}")
|
||||
|
||||
result = subprocess.run(
|
||||
psql_cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10 # 10 second timeout
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
error_msg = result.stderr.strip()
|
||||
raise Exception(f"psql command failed: {error_msg}\nSQL: {sql}")
|
||||
|
||||
# Parse the output
|
||||
output = result.stdout.strip()
|
||||
if not output:
|
||||
# No results
|
||||
return {
|
||||
"columns": [],
|
||||
"rows": []
|
||||
} if format == "json" else []
|
||||
|
||||
# Parse psql table output which has format:
|
||||
# col1 | col2 | col3
|
||||
# -----+-----+-----
|
||||
# val1 | val2 | val3
|
||||
lines = output.split("\n")
|
||||
|
||||
# Extract column names from first line
|
||||
columns = []
|
||||
rows = []
|
||||
|
||||
if len(lines) >= 1:
|
||||
header_line = lines[0]
|
||||
for col_name in header_line.split("|"):
|
||||
col_name = col_name.strip()
|
||||
if col_name:
|
||||
columns.append({"name": col_name})
|
||||
|
||||
# Data starts after the separator line (line with dashes)
|
||||
data_start = 2 if len(lines) >= 2 and "-" in lines[1] else 1
|
||||
for i in range(data_start, len(lines)):
|
||||
line = lines[i].strip()
|
||||
# Skip empty lines and footer lines like "(1 row)"
|
||||
if not line or re.match(r"^\(\d+ row", line):
|
||||
continue
|
||||
# Split by | and strip each cell
|
||||
row = [cell.strip() for cell in line.split("|")]
|
||||
# Ensure row matches column count
|
||||
if len(row) == len(columns):
|
||||
rows.append(row)
|
||||
elif len(row) > len(columns):
|
||||
# Row has more cells than columns - truncate
|
||||
rows.append(row[:len(columns)])
|
||||
elif len(row) < len(columns):
|
||||
# Row has fewer cells - pad with empty strings
|
||||
rows.append(row + [""] * (len(columns) - len(row)))
|
||||
|
||||
if format == "json":
|
||||
result = {
|
||||
"columns": columns,
|
||||
"rows": rows[:fetch_size] if fetch_size > 0 else rows
|
||||
}
|
||||
else:
|
||||
result = rows[:fetch_size] if fetch_size > 0 else rows
|
||||
|
||||
return result
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
self.logger.exception(f"InfinityConnection.sql timeout. SQL:\n{sql}")
|
||||
raise Exception(f"SQL timeout\n\nSQL: {sql}")
|
||||
except Exception as e:
|
||||
self.logger.exception(f"InfinityConnection.sql got exception. SQL:\n{sql}")
|
||||
raise Exception(f"SQL error: {e}\n\nSQL: {sql}")
|
||||
|
||||
Reference in New Issue
Block a user