mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
### What problem does this PR solve? Feat: Support attribute filtering #8703 ### Type of change - [X] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: writinwaters <cai.keith@gmail.com>
This commit is contained in:
@ -470,6 +470,20 @@ def list_docs(dataset_id, tenant_id):
|
|||||||
required: false
|
required: false
|
||||||
default: 0
|
default: 0
|
||||||
description: Unix timestamp for filtering documents created before this time. 0 means no filter.
|
description: Unix timestamp for filtering documents created before this time. 0 means no filter.
|
||||||
|
- in: query
|
||||||
|
name: suffix
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
required: false
|
||||||
|
description: Filter by file suffix (e.g., ["pdf", "txt", "docx"]).
|
||||||
|
- in: query
|
||||||
|
name: run
|
||||||
|
type: array
|
||||||
|
items:
|
||||||
|
type: string
|
||||||
|
required: false
|
||||||
|
description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL").
|
||||||
- in: header
|
- in: header
|
||||||
name: Authorization
|
name: Authorization
|
||||||
type: string
|
type: string
|
||||||
@ -512,63 +526,62 @@ def list_docs(dataset_id, tenant_id):
|
|||||||
description: Processing status.
|
description: Processing status.
|
||||||
"""
|
"""
|
||||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
|
||||||
id = request.args.get("id")
|
|
||||||
name = request.args.get("name")
|
|
||||||
|
|
||||||
if id and not DocumentService.query(id=id, kb_id=dataset_id):
|
q = request.args
|
||||||
return get_error_data_result(message=f"You don't own the document {id}.")
|
document_id = q.get("id")
|
||||||
|
name = q.get("name")
|
||||||
|
|
||||||
|
if document_id and not DocumentService.query(id=document_id, kb_id=dataset_id):
|
||||||
|
return get_error_data_result(message=f"You don't own the document {document_id}.")
|
||||||
if name and not DocumentService.query(name=name, kb_id=dataset_id):
|
if name and not DocumentService.query(name=name, kb_id=dataset_id):
|
||||||
return get_error_data_result(message=f"You don't own the document {name}.")
|
return get_error_data_result(message=f"You don't own the document {name}.")
|
||||||
|
|
||||||
page = int(request.args.get("page", 1))
|
page = int(q.get("page", 1))
|
||||||
keywords = request.args.get("keywords", "")
|
page_size = int(q.get("page_size", 30))
|
||||||
page_size = int(request.args.get("page_size", 30))
|
orderby = q.get("orderby", "create_time")
|
||||||
orderby = request.args.get("orderby", "create_time")
|
desc = str(q.get("desc", "true")).strip().lower() != "false"
|
||||||
if request.args.get("desc") == "False":
|
keywords = q.get("keywords", "")
|
||||||
desc = False
|
|
||||||
else:
|
|
||||||
desc = True
|
|
||||||
docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name)
|
|
||||||
|
|
||||||
create_time_from = int(request.args.get("create_time_from", 0))
|
# filters - align with OpenAPI parameter names
|
||||||
create_time_to = int(request.args.get("create_time_to", 0))
|
suffix = q.getlist("suffix")
|
||||||
|
run_status = q.getlist("run")
|
||||||
|
create_time_from = int(q.get("create_time_from", 0))
|
||||||
|
create_time_to = int(q.get("create_time_to", 0))
|
||||||
|
|
||||||
|
# map run status (accept text or numeric) - align with API parameter
|
||||||
|
run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
|
||||||
|
run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status]
|
||||||
|
|
||||||
|
docs, total = DocumentService.get_list(
|
||||||
|
dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, run_status_converted
|
||||||
|
)
|
||||||
|
|
||||||
|
# time range filter (0 means no bound)
|
||||||
if create_time_from or create_time_to:
|
if create_time_from or create_time_to:
|
||||||
filtered_docs = []
|
docs = [
|
||||||
for doc in docs:
|
d for d in docs
|
||||||
doc_create_time = doc.get("create_time", 0)
|
if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from)
|
||||||
if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
|
and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)
|
||||||
filtered_docs.append(doc)
|
]
|
||||||
docs = filtered_docs
|
|
||||||
|
|
||||||
# rename key's name
|
# rename keys + map run status back to text for output
|
||||||
renamed_doc_list = []
|
|
||||||
key_mapping = {
|
key_mapping = {
|
||||||
"chunk_num": "chunk_count",
|
"chunk_num": "chunk_count",
|
||||||
"kb_id": "dataset_id",
|
"kb_id": "dataset_id",
|
||||||
"token_num": "token_count",
|
"token_num": "token_count",
|
||||||
"parser_id": "chunk_method",
|
"parser_id": "chunk_method",
|
||||||
}
|
}
|
||||||
run_mapping = {
|
run_status_numeric_to_text = {"0": "UNSTART", "1": "RUNNING", "2": "CANCEL", "3": "DONE", "4": "FAIL"}
|
||||||
"0": "UNSTART",
|
|
||||||
"1": "RUNNING",
|
|
||||||
"2": "CANCEL",
|
|
||||||
"3": "DONE",
|
|
||||||
"4": "FAIL",
|
|
||||||
}
|
|
||||||
for doc in docs:
|
|
||||||
renamed_doc = {}
|
|
||||||
for key, value in doc.items():
|
|
||||||
if key == "run":
|
|
||||||
renamed_doc["run"] = run_mapping.get(str(value))
|
|
||||||
new_key = key_mapping.get(key, key)
|
|
||||||
renamed_doc[new_key] = value
|
|
||||||
if key == "run":
|
|
||||||
renamed_doc["run"] = run_mapping.get(value)
|
|
||||||
renamed_doc_list.append(renamed_doc)
|
|
||||||
return get_result(data={"total": tol, "docs": renamed_doc_list})
|
|
||||||
|
|
||||||
|
output_docs = []
|
||||||
|
for d in docs:
|
||||||
|
renamed_doc = {key_mapping.get(k, k): v for k, v in d.items()}
|
||||||
|
if "run" in d:
|
||||||
|
renamed_doc["run"] = run_status_numeric_to_text.get(str(d["run"]), d["run"])
|
||||||
|
output_docs.append(renamed_doc)
|
||||||
|
|
||||||
|
return get_result(data={"total": total, "docs": output_docs})
|
||||||
|
|
||||||
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
|
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
|
||||||
@token_required
|
@token_required
|
||||||
|
|||||||
@ -79,7 +79,7 @@ class DocumentService(CommonService):
|
|||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def get_list(cls, kb_id, page_number, items_per_page,
|
def get_list(cls, kb_id, page_number, items_per_page,
|
||||||
orderby, desc, keywords, id, name):
|
orderby, desc, keywords, id, name, suffix=None, run = None):
|
||||||
fields = cls.get_cls_model_fields()
|
fields = cls.get_cls_model_fields()
|
||||||
docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
|
docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
|
||||||
.join(File, on = (File.id == File2Document.file_id))\
|
.join(File, on = (File.id == File2Document.file_id))\
|
||||||
@ -96,6 +96,10 @@ class DocumentService(CommonService):
|
|||||||
docs = docs.where(
|
docs = docs.where(
|
||||||
fn.LOWER(cls.model.name).contains(keywords.lower())
|
fn.LOWER(cls.model.name).contains(keywords.lower())
|
||||||
)
|
)
|
||||||
|
if suffix:
|
||||||
|
docs = docs.where(cls.model.suffix.in_(suffix))
|
||||||
|
if run:
|
||||||
|
docs = docs.where(cls.model.run.in_(run))
|
||||||
if desc:
|
if desc:
|
||||||
docs = docs.order_by(cls.model.getter_by(orderby).desc())
|
docs = docs.order_by(cls.model.getter_by(orderby).desc())
|
||||||
else:
|
else:
|
||||||
|
|||||||
17
docs/guides/agent/agent_component_reference/chunker_token.md
Normal file
17
docs/guides/agent/agent_component_reference/chunker_token.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
---
|
||||||
|
sidebar_position: 32
|
||||||
|
slug: /chunker_token_component
|
||||||
|
---
|
||||||
|
|
||||||
|
# Parser component
|
||||||
|
|
||||||
|
A component that sets the parsing rules for your dataset.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
A **Parser** component defines how various file types should be parsed, including parsing methods for PDFs, fields to parse for emails, and OCR methods for images.
|
||||||
|
|
||||||
|
|
||||||
|
## Scenario
|
||||||
|
|
||||||
|
A **Parser** component is auto-populated on the ingestion pipeline canvas and required in all ingestion pipeline workflows.
|
||||||
@ -1198,23 +1198,24 @@ Failure:
|
|||||||
|
|
||||||
### List documents
|
### List documents
|
||||||
|
|
||||||
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
|
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
|
||||||
|
|
||||||
Lists documents in a specified dataset.
|
Lists documents in a specified dataset.
|
||||||
|
|
||||||
#### Request
|
#### Request
|
||||||
|
|
||||||
- Method: GET
|
- Method: GET
|
||||||
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
|
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
|
||||||
- Headers:
|
- Headers:
|
||||||
- `'content-Type: application/json'`
|
- `'content-Type: application/json'`
|
||||||
- `'Authorization: Bearer <YOUR_API_KEY>'`
|
- `'Authorization: Bearer <YOUR_API_KEY>'`
|
||||||
|
|
||||||
##### Request example
|
##### Request examples
|
||||||
|
|
||||||
|
**A basic request with pagination:**
|
||||||
```bash
|
```bash
|
||||||
curl --request GET \
|
curl --request GET \
|
||||||
--url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp} \
|
--url 'http://{address}/api/v1/datasets/{dataset_id}/documents?page=1&page_size=10' \
|
||||||
--header 'Authorization: Bearer <YOUR_API_KEY>'
|
--header 'Authorization: Bearer <YOUR_API_KEY>'
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -1236,10 +1237,34 @@ curl --request GET \
|
|||||||
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
|
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
|
||||||
- `id`: (*Filter parameter*), `string`
|
- `id`: (*Filter parameter*), `string`
|
||||||
The ID of the document to retrieve.
|
The ID of the document to retrieve.
|
||||||
- `create_time_from`: (*Filter parameter*), `integer`
|
- `create_time_from`: (*Filter parameter*), `integer`
|
||||||
Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to `0`.
|
Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to `0`.
|
||||||
- `create_time_to`: (*Filter parameter*), `integer`
|
- `create_time_to`: (*Filter parameter*), `integer`
|
||||||
Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to `0`.
|
Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to `0`.
|
||||||
|
- `suffix`: (*Filter parameter*), `array[string]`
|
||||||
|
Filter by file suffix. Supports multiple values, e.g., `pdf`, `txt`, and `docx`. Defaults to all suffixes.
|
||||||
|
- `run`: (*Filter parameter*), `array[string]`
|
||||||
|
Filter by document processing status. Supports numeric, text, and mixed formats:
|
||||||
|
- Numeric format: `["0", "1", "2", "3", "4"]`
|
||||||
|
- Text format: `["UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"]`
|
||||||
|
- Mixed format: `["UNSTART", "1", "DONE"]` (mixing numeric and text formats)
|
||||||
|
- Status mapping:
|
||||||
|
- `0` / `UNSTART`: Document not yet processed
|
||||||
|
- `1` / `RUNNING`: Document is currently being processed
|
||||||
|
- `2` / `CANCEL`: Document processing was cancelled
|
||||||
|
- `3` / `DONE`: Document processing completed successfully
|
||||||
|
- `4` / `FAIL`: Document processing failed
|
||||||
|
Defaults to all statuses.
|
||||||
|
|
||||||
|
##### Usage examples
|
||||||
|
|
||||||
|
**A request with multiple filtering parameters**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl --request GET \
|
||||||
|
--url 'http://{address}/api/v1/datasets/{dataset_id}/documents?suffix=pdf&run=DONE&page=1&page_size=10' \
|
||||||
|
--header 'Authorization: Bearer <YOUR_API_KEY>'
|
||||||
|
```
|
||||||
|
|
||||||
#### Response
|
#### Response
|
||||||
|
|
||||||
@ -1270,7 +1295,7 @@ Success:
|
|||||||
"process_duration": 0.0,
|
"process_duration": 0.0,
|
||||||
"progress": 0.0,
|
"progress": 0.0,
|
||||||
"progress_msg": "",
|
"progress_msg": "",
|
||||||
"run": "0",
|
"run": "UNSTART",
|
||||||
"size": 7,
|
"size": 7,
|
||||||
"source_type": "local",
|
"source_type": "local",
|
||||||
"status": "1",
|
"status": "1",
|
||||||
|
|||||||
Reference in New Issue
Block a user