mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-26 08:56:47 +08:00
Feat: support filtering documents by empty metadata (#12180)
### What problem does this PR solve? Supports filtering documents by empty metadata. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -234,6 +234,10 @@ async def list_docs():
|
||||
|
||||
req = await get_request_json()
|
||||
|
||||
return_empty_metadata = req.get("return_empty_metadata", False)
|
||||
if isinstance(return_empty_metadata, str):
|
||||
return_empty_metadata = return_empty_metadata.lower() == "true"
|
||||
|
||||
run_status = req.get("run_status", [])
|
||||
if run_status:
|
||||
invalid_status = {s for s in run_status if s not in VALID_TASK_STATUS}
|
||||
@ -248,11 +252,18 @@ async def list_docs():
|
||||
|
||||
suffix = req.get("suffix", [])
|
||||
metadata_condition = req.get("metadata_condition", {}) or {}
|
||||
if metadata_condition and not isinstance(metadata_condition, dict):
|
||||
return get_data_error_result(message="metadata_condition must be an object.")
|
||||
metadata = req.get("metadata", {}) or {}
|
||||
if metadata and not isinstance(metadata, dict):
|
||||
return get_data_error_result(message="metadata must be an object.")
|
||||
if isinstance(metadata, dict) and metadata.get("empty_metadata"):
|
||||
return_empty_metadata = True
|
||||
metadata = {k: v for k, v in metadata.items() if k != "empty_metadata"}
|
||||
if return_empty_metadata:
|
||||
metadata_condition = {}
|
||||
metadata = {}
|
||||
else:
|
||||
if metadata_condition and not isinstance(metadata_condition, dict):
|
||||
return get_data_error_result(message="metadata_condition must be an object.")
|
||||
if metadata and not isinstance(metadata, dict):
|
||||
return get_data_error_result(message="metadata must be an object.")
|
||||
|
||||
doc_ids_filter = None
|
||||
metas = None
|
||||
@ -295,7 +306,19 @@ async def list_docs():
|
||||
doc_ids_filter = list(doc_ids_filter)
|
||||
|
||||
try:
|
||||
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter)
|
||||
docs, tol = DocumentService.get_by_kb_id(
|
||||
kb_id,
|
||||
page_number,
|
||||
items_per_page,
|
||||
orderby,
|
||||
desc,
|
||||
keywords,
|
||||
run_status,
|
||||
types,
|
||||
suffix,
|
||||
doc_ids_filter,
|
||||
return_empty_metadata=return_empty_metadata,
|
||||
)
|
||||
|
||||
if create_time_from or create_time_to:
|
||||
filtered_docs = []
|
||||
|
||||
@ -125,26 +125,26 @@ class DocumentService(CommonService):
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_by_kb_id(cls, kb_id, page_number, items_per_page,
|
||||
orderby, desc, keywords, run_status, types, suffix, doc_ids=None):
|
||||
def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids=None, return_empty_metadata=False):
|
||||
fields = cls.get_cls_model_fields()
|
||||
if keywords:
|
||||
docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
|
||||
.join(File2Document, on=(File2Document.document_id == cls.model.id))\
|
||||
.join(File, on=(File.id == File2Document.file_id))\
|
||||
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
|
||||
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
|
||||
.where(
|
||||
(cls.model.kb_id == kb_id),
|
||||
(fn.LOWER(cls.model.name).contains(keywords.lower()))
|
||||
)
|
||||
docs = (
|
||||
cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])
|
||||
.join(File2Document, on=(File2Document.document_id == cls.model.id))
|
||||
.join(File, on=(File.id == File2Document.file_id))
|
||||
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)
|
||||
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)
|
||||
.where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.name).contains(keywords.lower())))
|
||||
)
|
||||
else:
|
||||
docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
|
||||
.join(File2Document, on=(File2Document.document_id == cls.model.id))\
|
||||
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
|
||||
.join(File, on=(File.id == File2Document.file_id))\
|
||||
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
|
||||
docs = (
|
||||
cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])
|
||||
.join(File2Document, on=(File2Document.document_id == cls.model.id))
|
||||
.join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)
|
||||
.join(File, on=(File.id == File2Document.file_id))
|
||||
.join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)
|
||||
.where(cls.model.kb_id == kb_id)
|
||||
)
|
||||
|
||||
if doc_ids:
|
||||
docs = docs.where(cls.model.id.in_(doc_ids))
|
||||
@ -154,6 +154,8 @@ class DocumentService(CommonService):
|
||||
docs = docs.where(cls.model.type.in_(types))
|
||||
if suffix:
|
||||
docs = docs.where(cls.model.suffix.in_(suffix))
|
||||
if return_empty_metadata:
|
||||
docs = docs.where(fn.COALESCE(fn.JSON_LENGTH(cls.model.meta_fields), 0) == 0)
|
||||
|
||||
count = docs.count()
|
||||
if desc:
|
||||
@ -161,7 +163,6 @@ class DocumentService(CommonService):
|
||||
else:
|
||||
docs = docs.order_by(cls.model.getter_by(orderby).asc())
|
||||
|
||||
|
||||
if page_number and items_per_page:
|
||||
docs = docs.paginate(page_number, items_per_page)
|
||||
|
||||
@ -217,18 +218,16 @@ class DocumentService(CommonService):
|
||||
suffix_counter = {}
|
||||
run_status_counter = {}
|
||||
metadata_counter = {}
|
||||
empty_metadata_count = 0
|
||||
|
||||
for row in rows:
|
||||
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
|
||||
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
|
||||
meta_fields = row.meta_fields or {}
|
||||
if isinstance(meta_fields, str):
|
||||
try:
|
||||
meta_fields = json.loads(meta_fields)
|
||||
except Exception:
|
||||
meta_fields = {}
|
||||
if not isinstance(meta_fields, dict):
|
||||
if not meta_fields:
|
||||
empty_metadata_count += 1
|
||||
continue
|
||||
has_valid_meta = False
|
||||
for key, value in meta_fields.items():
|
||||
values = value if isinstance(value, list) else [value]
|
||||
for vv in values:
|
||||
@ -240,7 +239,11 @@ class DocumentService(CommonService):
|
||||
if key not in metadata_counter:
|
||||
metadata_counter[key] = {}
|
||||
metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1
|
||||
has_valid_meta = True
|
||||
if not has_valid_meta:
|
||||
empty_metadata_count += 1
|
||||
|
||||
metadata_counter["empty_metadata"] = {"true": empty_metadata_count}
|
||||
return {
|
||||
"suffix": suffix_counter,
|
||||
"run_status": run_status_counter,
|
||||
|
||||
Reference in New Issue
Block a user