diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 0c6405c14..ec9387e5f 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -470,6 +470,20 @@ def list_docs(dataset_id, tenant_id):
         required: false
         default: 0
         description: Unix timestamp for filtering documents created before this time. 0 means no filter.
+      - in: query
+        name: suffix
+        type: array
+        items:
+          type: string
+        required: false
+        description: Filter by file suffix (e.g., ["pdf", "txt", "docx"]).
+      - in: query
+        name: run
+        type: array
+        items:
+          type: string
+        required: false
+        description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL").
       - in: header
         name: Authorization
         type: string
@@ -512,63 +526,62 @@ def list_docs(dataset_id, tenant_id):
           description: Processing status.
     """
     if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
-        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
-    id = request.args.get("id")
-    name = request.args.get("name")
+        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
 
-    if id and not DocumentService.query(id=id, kb_id=dataset_id):
-        return get_error_data_result(message=f"You don't own the document {id}.")
+    q = request.args
+    document_id = q.get("id")
+    name = q.get("name")
+
+    if document_id and not DocumentService.query(id=document_id, kb_id=dataset_id):
+        return get_error_data_result(message=f"You don't own the document {document_id}.")
     if name and not DocumentService.query(name=name, kb_id=dataset_id):
         return get_error_data_result(message=f"You don't own the document {name}.")
-    page = int(request.args.get("page", 1))
-    keywords = request.args.get("keywords", "")
-    page_size = int(request.args.get("page_size", 30))
-    orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc") == "False":
-        desc = False
-    else:
-        desc = True
-    docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name)
+    page = int(q.get("page", 1))
+    page_size = int(q.get("page_size", 30))
+    orderby = q.get("orderby", "create_time")
+    desc = str(q.get("desc", "true")).strip().lower() != "false"
+    keywords = q.get("keywords", "")
 
-    create_time_from = int(request.args.get("create_time_from", 0))
-    create_time_to = int(request.args.get("create_time_to", 0))
+    # filters - align with OpenAPI parameter names
+    suffix = q.getlist("suffix")
+    run_status = q.getlist("run")
+    create_time_from = int(q.get("create_time_from", 0))
+    create_time_to = int(q.get("create_time_to", 0))
+
+    # map run status (accept text or numeric) - align with API parameter
+    run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
+    run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status]
+
+    docs, total = DocumentService.get_list(
+        dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, run_status_converted
+    )
+
+    # time range filter (0 means no bound)
     if create_time_from or create_time_to:
-        filtered_docs = []
-        for doc in docs:
-            doc_create_time = doc.get("create_time", 0)
-            if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
-                filtered_docs.append(doc)
-        docs = filtered_docs
+        docs = [
+            d for d in docs
+            if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from)
+            and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)
+        ]
 
-    # rename key's name
-    renamed_doc_list = []
+    # rename keys + map run status back to text for output
     key_mapping = {
         "chunk_num": "chunk_count",
-        "kb_id": "dataset_id",
+        "kb_id": "dataset_id",
         "token_num": "token_count",
         "parser_id": "chunk_method",
     }
-    run_mapping = {
-        "0": "UNSTART",
-        "1": "RUNNING",
-        "2": "CANCEL",
-        "3": "DONE",
-        "4": "FAIL",
-    }
-    for doc in docs:
-        renamed_doc = {}
-        for key, value in doc.items():
-            if key == "run":
-                renamed_doc["run"] = run_mapping.get(str(value))
-            new_key = key_mapping.get(key, key)
-            renamed_doc[new_key] = value
-            if key == "run":
-                renamed_doc["run"] = run_mapping.get(value)
-        renamed_doc_list.append(renamed_doc)
-    return get_result(data={"total": tol, "docs": renamed_doc_list})
+    run_status_numeric_to_text = {"0": "UNSTART", "1": "RUNNING", "2": "CANCEL", "3": "DONE", "4": "FAIL"}
+    output_docs = []
+    for d in docs:
+        renamed_doc = {key_mapping.get(k, k): v for k, v in d.items()}
+        if "run" in d:
+            renamed_doc["run"] = run_status_numeric_to_text.get(str(d["run"]), d["run"])
+        output_docs.append(renamed_doc)
+
+    return get_result(data={"total": total, "docs": output_docs})
 
 
 @manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])  # noqa: F821
 @token_required
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index af2b08f24..33676a6b3 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -79,7 +79,7 @@ class DocumentService(CommonService):
     @classmethod
     @DB.connection_context()
     def get_list(cls, kb_id, page_number, items_per_page,
-                 orderby, desc, keywords, id, name):
+                 orderby, desc, keywords, id, name, suffix=None, run=None):
         fields = cls.get_cls_model_fields()
         docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
             .join(File, on = (File.id == File2Document.file_id))\
@@ -96,6 +96,10 @@ class DocumentService(CommonService):
             docs = docs.where(
                 fn.LOWER(cls.model.name).contains(keywords.lower())
             )
+        if suffix:
+            docs = docs.where(cls.model.suffix.in_(suffix))
+        if run:
+            docs = docs.where(cls.model.run.in_(run))
         if desc:
             docs = docs.order_by(cls.model.getter_by(orderby).desc())
         else:
diff --git a/docs/guides/agent/agent_component_reference/chunker_token.md b/docs/guides/agent/agent_component_reference/chunker_token.md
new file mode 100644
index 000000000..8d29d4fa6
--- /dev/null
+++ b/docs/guides/agent/agent_component_reference/chunker_token.md
@@ -0,0 +1,17 @@
+---
+sidebar_position: 32
+slug: /chunker_token_component
+---
+
+# Parser component
+
+A component that sets the parsing rules for your dataset.
+
+---
+
+A **Parser** component defines how various file types should be parsed, including parsing methods for PDFs, fields to parse for Emails, and OCR methods for images.
+
+
+## Scenario
+
+A **Parser** component is auto-populated on the ingestion pipeline canvas and is required in all ingestion pipeline workflows.
\ No newline at end of file
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index 286aaa5a0..51e2cc06a 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -1198,23 +1198,24 @@ Failure:
 
 ### List documents
 
-**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
+**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
 
 Lists documents in a specified dataset.
 
 #### Request
 
 - Method: GET
-- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
+- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
 - Headers:
   - `'content-Type: application/json'`
   - `'Authorization: Bearer <YOUR_API_KEY>'`
 
-##### Request example
+##### Request examples
 
+**A basic request with pagination:**
 ```bash
 curl --request GET \
-     --url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp} \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents?page=1&page_size=10 \
      --header 'Authorization: Bearer <YOUR_API_KEY>'
 ```
 
@@ -1236,10 +1237,34 @@ curl --request GET \
   Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
 - `id`: (*Filter parameter*), `string`
   The ID of the document to retrieve.
-- `create_time_from`: (*Filter parameter*), `integer`
+- `create_time_from`: (*Filter parameter*), `integer`
   Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to `0`.
-- `create_time_to`: (*Filter parameter*), `integer`
+- `create_time_to`: (*Filter parameter*), `integer`
   Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to `0`.
+- `suffix`: (*Filter parameter*), `array[string]`
+  Filter by file suffix. Supports multiple values, e.g., `pdf`, `txt`, and `docx`. Defaults to all suffixes.
+- `run`: (*Filter parameter*), `array[string]`
+  Filter by document processing status. Supports numeric, text, and mixed formats:
+  - Numeric format: `["0", "1", "2", "3", "4"]`
+  - Text format: `[UNSTART, RUNNING, CANCEL, DONE, FAIL]`
+  - Mixed format: `[UNSTART, 1, DONE]` (mixing numeric and text formats)
+  - Status mapping:
+    - `0` / `UNSTART`: Document not yet processed
+    - `1` / `RUNNING`: Document is currently being processed
+    - `2` / `CANCEL`: Document processing was cancelled
+    - `3` / `DONE`: Document processing completed successfully
+    - `4` / `FAIL`: Document processing failed
+  Defaults to all statuses.
+
+##### Usage examples
+
+**A request with multiple filtering parameters:**
+
+```bash
+curl --request GET \
+     --url 'http://{address}/api/v1/datasets/{dataset_id}/documents?suffix=pdf&run=DONE&page=1&page_size=10' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+```
 
 #### Response
 
@@ -1270,7 +1295,7 @@ Success:
             "process_duration": 0.0,
             "progress": 0.0,
             "progress_msg": "",
-            "run": "0",
+            "run": "UNSTART",
             "size": 7,
             "source_type": "local",
             "status": "1",
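
Because the endpoint reads the new filters with `request.args.getlist("suffix")` and `request.args.getlist("run")`, clients pass multiple values as repeated query parameters. Below is a minimal client-side sketch, assuming a locally running server; the address, API key, and dataset ID are placeholders, not values taken from this patch.

```python
import requests

# Placeholders - substitute a real server address, API key, and dataset ID.
ADDRESS = "http://localhost:9380"
API_KEY = "YOUR_API_KEY"
DATASET_ID = "YOUR_DATASET_ID"

resp = requests.get(
    f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/documents",
    headers={"Authorization": f"Bearer {API_KEY}"},
    # Lists become repeated query parameters (?suffix=pdf&suffix=docx&run=DONE&run=1),
    # which is what request.args.getlist() picks up on the server side.
    params={
        "page": 1,
        "page_size": 10,
        "suffix": ["pdf", "docx"],   # filter by file suffix
        "run": ["DONE", "1"],        # mixed text/numeric run statuses are accepted
    },
    timeout=30,
)
resp.raise_for_status()
data = resp.json()["data"]
print(data["total"])
for doc in data["docs"]:
    # "run" is returned in text form, e.g. "DONE" or "UNSTART".
    print(doc["name"], doc["run"])
```

Passing `run=DONE` together with `run=1` in the same request exercises the mixed-format handling documented above.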
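
The run-status handling itself is just two plain dictionaries: incoming text or numeric strings are normalized to the numeric form used for the database query, and stored values are mapped back to text for the response. A standalone sketch of that round trip follows; the helper names are illustrative only and do not exist in the codebase.

```python
# Mirrors the two mappings in list_docs(); helper names are hypothetical.
RUN_TEXT_TO_NUMERIC = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
RUN_NUMERIC_TO_TEXT = {v: k for k, v in RUN_TEXT_TO_NUMERIC.items()}


def normalize_run_filter(values):
    """Map mixed text/numeric run filters to the numeric strings stored in the DB."""
    return [RUN_TEXT_TO_NUMERIC.get(v, v) for v in values]


def run_for_output(stored_value):
    """Map a stored run value back to its text form for the API response."""
    return RUN_NUMERIC_TO_TEXT.get(str(stored_value), stored_value)


assert normalize_run_filter(["UNSTART", "1", "DONE"]) == ["0", "1", "3"]
assert run_for_output("3") == "DONE"
assert run_for_output(0) == "UNSTART"
```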