Add dataset with table parser type for Infinity and answer questions in chat using SQL (#12541)

### What problem does this PR solve?

1) Create a dataset using the table parser for Infinity
2) Answer questions in chat using SQL

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
qinling0210
2026-01-19 19:35:14 +08:00
committed by GitHub
parent 05da2a5872
commit b40d639fdb
19 changed files with 1003 additions and 101 deletions

View File

@ -33,6 +33,7 @@ from deepdoc.parser.figure_parser import vision_figure_parser_figure_xlsx_wrappe
from deepdoc.parser.utils import get_text
from rag.nlp import rag_tokenizer, tokenize, tokenize_table
from deepdoc.parser import ExcelParser
from common import settings
class Excel(ExcelParser):
@ -431,7 +432,9 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
res = []
PY = Pinyin()
fieds_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
# Field type suffixes for database columns
# Maps data types to their database field suffixes
fields_map = {"text": "_tks", "int": "_long", "keyword": "_kwd", "float": "_flt", "datetime": "_dt", "bool": "_kwd"}
for df in dfs:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
@ -452,13 +455,24 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in
clmns_map = [(py_clmns[i].lower() + fields_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in
range(len(clmns))]
# For Infinity: Use original column names as keys since they're stored in chunk_data JSON
# For ES/OS: Use full field names with type suffixes (e.g., url_kwd, body_tks)
if settings.DOC_ENGINE_INFINITY:
# For Infinity: key = original column name, value = display name
field_map = {py_clmns[i].lower(): str(clmns[i]).replace("_", " ") for i in range(len(clmns))}
else:
# For ES/OS: key = typed field name, value = display name
field_map = {k: v for k, v in clmns_map}
logging.debug(f"Field map: {field_map}")
KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": field_map})
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
d = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
row_txt = []
row_fields = []
data_json = {} # For Infinity: Store all columns in a JSON object
for j in range(len(clmns)):
if row[clmns[j]] is None:
continue
@ -466,17 +480,27 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
continue
if not isinstance(row[clmns[j]], pd.Series) and pd.isna(row[clmns[j]]):
continue
fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
if not row_txt:
# For Infinity: Store in chunk_data JSON column
# For Elasticsearch/OpenSearch: Store as individual fields with type suffixes
if settings.DOC_ENGINE_INFINITY:
data_json[str(clmns[j])] = row[clmns[j]]
else:
fld = clmns_map[j][0]
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(row[clmns[j]])
row_fields.append((clmns[j], row[clmns[j]]))
if not row_fields:
continue
tokenize(d, "; ".join(row_txt), eng)
# Add the data JSON field to the document (for Infinity only)
if settings.DOC_ENGINE_INFINITY:
d["chunk_data"] = data_json
# Format as a structured text for better LLM comprehension
# Format each field as "- Field Name: Value" on separate lines
formatted_text = "\n".join([f"- {field}: {value}" for field, value in row_fields])
tokenize(d, formatted_text, eng)
res.append(d)
if tbls:
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
res.extend(tokenize_table(tbls, doc, is_english))
KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}})
callback(0.35, "")
return res

View File

@ -558,7 +558,8 @@ def build_TOC(task, docs, progress_callback):
def init_kb(row, vector_size: int):
idxnm = search.index_name(row["tenant_id"])
return settings.docStoreConn.create_idx(idxnm, row.get("kb_id", ""), vector_size)
parser_id = row.get("parser_id", None)
return settings.docStoreConn.create_idx(idxnm, row.get("kb_id", ""), vector_size, parser_id)
async def embedding(docs, mdl, parser_config=None, callback=None):
@ -739,7 +740,7 @@ async def run_dataflow(task: dict):
start_ts = timer()
set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
e = await insert_es(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
e = await insert_chunks(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
if not e:
PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id,
task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
@ -833,7 +834,17 @@ async def delete_image(kb_id, chunk_id):
raise
async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
async def insert_chunks(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
"""
Insert chunks into document store (Elasticsearch OR Infinity).
Args:
task_id: Task identifier
task_tenant_id: Tenant ID
task_dataset_id: Dataset/knowledge base ID
chunks: List of chunk dictionaries to insert
progress_callback: Callback function for progress updates
"""
mothers = []
mother_ids = set([])
for ck in chunks:
@ -858,7 +869,7 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
await asyncio.to_thread(settings.docStoreConn.insert, mothers[b:b + settings.DOC_BULK_SIZE],
search.index_name(task_tenant_id), task_dataset_id, )
search.index_name(task_tenant_id), task_dataset_id)
task_canceled = has_canceled(task_id)
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
@ -866,7 +877,7 @@ async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_c
for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
doc_store_result = await asyncio.to_thread(settings.docStoreConn.insert, chunks[b:b + settings.DOC_BULK_SIZE],
search.index_name(task_tenant_id), task_dataset_id, )
search.index_name(task_tenant_id), task_dataset_id)
task_canceled = has_canceled(task_id)
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
@ -932,13 +943,6 @@ async def do_handle_task(task):
# prepare the progress callback function
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
# FIXME: workaround, Infinity doesn't support table parsing method, this check is to notify user
lower_case_doc_engine = settings.DOC_ENGINE.lower()
if lower_case_doc_engine == 'infinity' and task['parser_id'].lower() == 'table':
error_message = "Table parsing method is not supported by Infinity, please use other parsing methods or use Elasticsearch as the document engine."
progress_callback(-1, msg=error_message)
raise Exception(error_message)
task_canceled = has_canceled(task_id)
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
@ -1092,14 +1096,14 @@ async def do_handle_task(task):
chunk_count = len(set([chunk["id"] for chunk in chunks]))
start_ts = timer()
async def _maybe_insert_es(_chunks):
async def _maybe_insert_chunks(_chunks):
if has_canceled(task_id):
return True
insert_result = await insert_es(task_id, task_tenant_id, task_dataset_id, _chunks, progress_callback)
insert_result = await insert_chunks(task_id, task_tenant_id, task_dataset_id, _chunks, progress_callback)
return bool(insert_result)
try:
if not await _maybe_insert_es(chunks):
if not await _maybe_insert_chunks(chunks):
return
logging.info(
@ -1115,7 +1119,7 @@ async def do_handle_task(task):
if toc_thread:
d = toc_thread.result()
if d:
if not await _maybe_insert_es([d]):
if not await _maybe_insert_chunks([d]):
return
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, 0, 1, 0)

View File

@ -317,7 +317,18 @@ class InfinityConnection(InfinityConnectionBase):
break
if vector_size == 0:
raise ValueError("Cannot infer vector size from documents")
self.create_idx(index_name, knowledgebase_id, vector_size)
# Determine parser_id from document structure
# Table parser documents have 'chunk_data' field
parser_id = None
if "chunk_data" in documents[0] and isinstance(documents[0].get("chunk_data"), dict):
from common.constants import ParserType
parser_id = ParserType.TABLE.value
self.logger.debug("Detected TABLE parser from document structure")
# Fallback: Create table with base schema (shouldn't normally happen as init_kb() creates it)
self.logger.debug(f"Fallback: Creating table {table_name} with base schema, parser_id: {parser_id}")
self.create_idx(index_name, knowledgebase_id, vector_size, parser_id)
table_instance = db_instance.get_table(table_name)
# embedding fields can't have a default value....
@ -378,6 +389,12 @@ class InfinityConnection(InfinityConnectionBase):
d[k] = v
elif re.search(r"_feas$", k):
d[k] = json.dumps(v)
elif k == "chunk_data":
# Convert data dict to JSON string for storage
if isinstance(v, dict):
d[k] = json.dumps(v)
else:
d[k] = v
elif k == "kb_id":
if isinstance(d[k], list):
d[k] = d[k][0] # since d[k] is a list, but we need a str
@ -586,6 +603,9 @@ class InfinityConnection(InfinityConnectionBase):
res2[column] = res2[column].apply(lambda v: [kwd for kwd in v.split("###") if kwd])
elif re.search(r"_feas$", k):
res2[column] = res2[column].apply(lambda v: json.loads(v) if v else {})
elif k == "chunk_data":
# Parse JSON data back to dict for table parser fields
res2[column] = res2[column].apply(lambda v: json.loads(v) if v and isinstance(v, str) else v)
elif k == "position_int":
def to_position_int(v):
if v: