Fix: table tag on chunks. (#12126)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu
2025-12-24 09:32:19 +08:00
committed by GitHub
parent 17b8bb62b6
commit c33134ea2c
5 changed files with 26 additions and 17 deletions

View File

@ -56,7 +56,6 @@ class LLMParam(ComponentParamBase):
self.check_nonnegative_number(int(self.max_tokens), "[Agent] Max tokens")
self.check_decimal_float(float(self.top_p), "[Agent] Top P")
self.check_empty(self.llm_id, "[Agent] LLM")
self.check_empty(self.sys_prompt, "[Agent] System prompt")
self.check_empty(self.prompts, "[Agent] User prompt")
def gen_conf(self):

View File

@ -696,10 +696,12 @@ class DocumentService(CommonService):
for k,v in r.meta_fields.items():
if k not in meta:
meta[k] = {}
v = str(v)
if v not in meta[k]:
meta[k][v] = []
meta[k][v].append(doc_id)
if not isinstance(v, list):
v = [v]
for vv in v:
if vv not in meta[k]:
meta[k][vv] = []
meta[k][vv].append(doc_id)
return meta
@classmethod

View File

@ -44,21 +44,27 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
def filter_out(v2docs, operator, value):
ids = []
for input, docids in v2docs.items():
if operator in ["=", "≠", ">", "<", "≥", "≤"]:
try:
if isinstance(input, list):
input = input[0]
input = float(input)
value = float(value)
except Exception:
input = str(input)
value = str(value)
pass
if isinstance(input, str):
input = input.lower()
if isinstance(value, str):
value = value.lower()
for conds in [
(operator == "contains", str(value).lower() in str(input).lower()),
(operator == "not contains", str(value).lower() not in str(input).lower()),
(operator == "in", str(input).lower() in str(value).lower()),
(operator == "not in", str(input).lower() not in str(value).lower()),
(operator == "start with", str(input).lower().startswith(str(value).lower())),
(operator == "end with", str(input).lower().endswith(str(value).lower())),
(operator == "contains", input in value if not isinstance(input, list) else all([i in value for i in input])),
(operator == "not contains", input not in value if not isinstance(input, list) else all([i not in value for i in input])),
(operator == "in", input in value if not isinstance(input, list) else all([i in value for i in input])),
(operator == "not in", input not in value if not isinstance(input, list) else all([i not in value for i in input])),
(operator == "start with", str(input).lower().startswith(str(value).lower()) if not isinstance(input, list) else "".join([str(i).lower() for i in input]).startswith(str(value).lower())),
(operator == "end with", str(input).lower().endswith(str(value).lower()) if not isinstance(input, list) else "".join([str(i).lower() for i in input]).endswith(str(value).lower())),
(operator == "empty", not input),
(operator == "not empty", input),
(operator == "=", input == value),

View File

@ -348,7 +348,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
if d["content_with_weight"].find("<tr>") < 0:
d["doc_type_kwd"] = "image"
if poss:
add_positions(d, poss)
res.append(d)
@ -361,7 +362,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
if d["content_with_weight"].find("<tr>") < 0:
d["doc_type_kwd"] = "image"
add_positions(d, poss)
res.append(d)
return res

View File

@ -395,8 +395,8 @@ async def build_chunks(task, progress_callback):
await asyncio.gather(*tasks, return_exceptions=True)
raise
metadata = {}
for ck in cks:
metadata = update_metadata_to(metadata, ck["metadata_obj"])
for doc in docs:
metadata = update_metadata_to(metadata, doc["metadata_obj"])
del ck["metadata_obj"]
if metadata:
e, doc = DocumentService.get_by_id(task["doc_id"])