Compare commits

...

4 Commits

Author SHA1 Message Date
70a0f081f6 Minor tweaks (#11249)
### What problem does this PR solve?

Fix some IDE warnings

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2025-11-13 16:11:07 +08:00
93422fa8cc Fix: Law parser (#11246)
### What problem does this PR solve?

Fix: Law parser

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-13 15:19:02 +08:00
bfc84ba95b Test: handle duplicate names by appending "(1)" (#11244)
### What problem does this PR solve?

- Updated tests to reflect new behavior of handling duplicate dataset
names
- Instead of returning an error, the system now appends "(1)" to
duplicate names
- This problem was introduced by PR #10960

### Type of change

- [x] Testcase update
2025-11-13 15:18:32 +08:00
871055b0fc Feat: support API for generating knowledge graph and raptor (#11229)
### What problem does this PR solve?

Issue: [#11195](https://github.com/infiniflow/ragflow/issues/11195)

Change: support API for generating knowledge graph and RAPTOR

### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update
2025-11-13 15:17:52 +08:00
13 changed files with 423 additions and 38 deletions

View File

@@ -96,12 +96,12 @@ login_manager.init_app(app)
 commands.register_commands(app)

-def search_pages_path(pages_dir):
+def search_pages_path(page_path):
     app_path_list = [
-        path for path in pages_dir.glob("*_app.py") if not path.name.startswith(".")
+        path for path in page_path.glob("*_app.py") if not path.name.startswith(".")
     ]
     api_path_list = [
-        path for path in pages_dir.glob("*sdk/*.py") if not path.name.startswith(".")
+        path for path in page_path.glob("*sdk/*.py") if not path.name.startswith(".")
     ]
     app_path_list.extend(api_path_list)
     return app_path_list
@@ -138,7 +138,7 @@ pages_dir = [
 ]

 client_urls_prefix = [
-    register_page(path) for dir in pages_dir for path in search_pages_path(dir)
+    register_page(path) for directory in pages_dir for path in search_pages_path(directory)
 ]
@@ -177,5 +177,7 @@ def load_user(web_request):
 @app.teardown_request
-def _db_close(exc):
+def _db_close(exception):
+    if exception:
+        logging.exception(f"Request failed: {exception}")
     close_connection()

View File

@@ -426,7 +426,6 @@ def test_db_connect():
     try:
         import trino
        import os
-        from trino.auth import BasicAuthentication
     except Exception as e:
         return server_error_response(f"Missing dependency 'trino'. Please install: pip install trino, detail: {e}")
@@ -438,7 +437,7 @@ def test_db_connect():
         auth = None
         if http_scheme == "https" and req.get("password"):
-            auth = BasicAuthentication(req.get("username") or "ragflow", req["password"])
+            auth = trino.BasicAuthentication(req.get("username") or "ragflow", req["password"])

         conn = trino.dbapi.connect(
             host=req["host"],
@@ -471,8 +470,8 @@ def test_db_connect():
 @login_required
 def getlistversion(canvas_id):
     try:
-        list =sorted([c.to_dict() for c in UserCanvasVersionService.list_by_canvas_id(canvas_id)], key=lambda x: x["update_time"]*-1)
-        return get_json_result(data=list)
+        versions =sorted([c.to_dict() for c in UserCanvasVersionService.list_by_canvas_id(canvas_id)], key=lambda x: x["update_time"]*-1)
+        return get_json_result(data=versions)
     except Exception as e:
         return get_data_error_result(message=f"Error getting history files: {e}")

View File

@@ -55,7 +55,6 @@ def set_connector():
         "timeout_secs": int(req.get("timeout_secs", 60 * 29)),
         "status": TaskStatus.SCHEDULE,
     }
-    conn["status"] = TaskStatus.SCHEDULE
     ConnectorService.save(**conn)
     time.sleep(1)

View File

@@ -85,7 +85,6 @@ def get():
         if not e:
             return get_data_error_result(message="Conversation not found!")
         tenants = UserTenantService.query(user_id=current_user.id)
-        avatar = None
         for tenant in tenants:
             dialog = DialogService.query(tenant_id=tenant.tenant_id, id=conv.dialog_id)
             if dialog and len(dialog) > 0:

View File

@@ -154,15 +154,15 @@ def get_kb_names(kb_ids):
 @login_required
 def list_dialogs():
     try:
-        diags = DialogService.query(
+        conversations = DialogService.query(
             tenant_id=current_user.id,
             status=StatusEnum.VALID.value,
             reverse=True,
             order_by=DialogService.model.create_time)
-        diags = [d.to_dict() for d in diags]
-        for d in diags:
-            d["kb_ids"], d["kb_names"] = get_kb_names(d["kb_ids"])
-        return get_json_result(data=diags)
+        conversations = [d.to_dict() for d in conversations]
+        for conversation in conversations:
+            conversation["kb_ids"], conversation["kb_names"] = get_kb_names(conversation["kb_ids"])
+        return get_json_result(data=conversations)
     except Exception as e:
         return server_error_response(e)

View File

@@ -308,7 +308,7 @@ def get_filter():
 @manager.route("/infos", methods=["POST"])  # noqa: F821
 @login_required
-def docinfos():
+def doc_infos():
     req = request.json
     doc_ids = req["doc_ids"]
     for doc_id in doc_ids:
@@ -544,6 +544,7 @@ def change_parser():
         return get_data_error_result(message="Tenant not found!")
     if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
         settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
+        return None
     try:
         if "pipeline_id" in req and req["pipeline_id"] != "":

View File

@@ -246,8 +246,8 @@ def rm():
         try:
             if file.location:
                 settings.STORAGE_IMPL.rm(file.parent_id, file.location)
-        except Exception:
-            logging.exception(f"Fail to remove object: {file.parent_id}/{file.location}")
+        except Exception as e:
+            logging.exception(f"Fail to remove object: {file.parent_id}/{file.location}, error: {e}")
         informs = File2DocumentService.get_by_file_id(file.id)
         for inform in informs:

View File

@@ -731,6 +731,8 @@ def delete_kb_task():
     def cancel_task(task_id):
         REDIS_CONN.set(f"{task_id}-cancel", "x")
+        kb_task_id_field: str = ""
+        kb_task_finish_at: str = ""
         match pipeline_task_type:
             case PipelineTaskType.GRAPH_RAG:
                 kb_task_id_field = "graphrag_task_id"

View File

@@ -21,10 +21,11 @@ import json
 from flask import request
 from peewee import OperationalError

 from api.db.db_models import File
-from api.db.services.document_service import DocumentService
+from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID, TaskService
 from api.db.services.user_service import TenantService
 from common.constants import RetCode, FileSource, StatusEnum
 from api.utils.api_utils import (
@@ -118,7 +119,6 @@ def create(tenant_id):
     req, err = validate_and_parse_json_request(request, CreateDatasetReq)
     if err is not None:
         return get_error_argument_result(err)
-
     req = KnowledgebaseService.create_with_name(
         name = req.pop("name", None),
         tenant_id = tenant_id,
@@ -144,7 +144,6 @@ def create(tenant_id):
         ok, k = KnowledgebaseService.get_by_id(req["id"])
         if not ok:
             return get_error_data_result(message="Dataset created failed")
-
         response_data = remap_dictionary_keys(k.to_dict())
         return get_result(data=response_data)
     except Exception as e:
@@ -532,3 +531,157 @@ def delete_knowledge_graph(tenant_id, dataset_id):
                                 search.index_name(kb.tenant_id), dataset_id)
     return get_result(data=True)
+
+
+@manager.route("/datasets/<dataset_id>/run_graphrag", methods=["POST"])  # noqa: F821
+@token_required
+def run_graphrag(tenant_id, dataset_id):
+    if not dataset_id:
+        return get_error_data_result(message='Lack of "Dataset ID"')
+    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
+        return get_result(
+            data=False,
+            message='No authorization.',
+            code=RetCode.AUTHENTICATION_ERROR
+        )
+    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if not ok:
+        return get_error_data_result(message="Invalid Dataset ID")
+    task_id = kb.graphrag_task_id
+    if task_id:
+        ok, task = TaskService.get_by_id(task_id)
+        if not ok:
+            logging.warning(f"A valid GraphRAG task id is expected for Dataset {dataset_id}")
+        if task and task.progress not in [-1, 1]:
+            return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A Graph Task is already running.")
+    documents, _ = DocumentService.get_by_kb_id(
+        kb_id=dataset_id,
+        page_number=0,
+        items_per_page=0,
+        orderby="create_time",
+        desc=False,
+        keywords="",
+        run_status=[],
+        types=[],
+        suffix=[],
+    )
+    if not documents:
+        return get_error_data_result(message=f"No documents in Dataset {dataset_id}")
+    sample_document = documents[0]
+    document_ids = [document["id"] for document in documents]
+    task_id = queue_raptor_o_graphrag_tasks(sample_doc_id=sample_document, ty="graphrag", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))
+    if not KnowledgebaseService.update_by_id(kb.id, {"graphrag_task_id": task_id}):
+        logging.warning(f"Cannot save graphrag_task_id for Dataset {dataset_id}")
+    return get_result(data={"graphrag_task_id": task_id})
+
+
+@manager.route("/datasets/<dataset_id>/trace_graphrag", methods=["GET"])  # noqa: F821
+@token_required
+def trace_graphrag(tenant_id, dataset_id):
+    if not dataset_id:
+        return get_error_data_result(message='Lack of "Dataset ID"')
+    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
+        return get_result(
+            data=False,
+            message='No authorization.',
+            code=RetCode.AUTHENTICATION_ERROR
+        )
+    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if not ok:
+        return get_error_data_result(message="Invalid Dataset ID")
+    task_id = kb.graphrag_task_id
+    if not task_id:
+        return get_result(data={})
+    ok, task = TaskService.get_by_id(task_id)
+    if not ok:
+        return get_result(data={})
+    return get_result(data=task.to_dict())
+
+
+@manager.route("/datasets/<dataset_id>/run_raptor", methods=["POST"])  # noqa: F821
+@token_required
+def run_raptor(tenant_id, dataset_id):
+    if not dataset_id:
+        return get_error_data_result(message='Lack of "Dataset ID"')
+    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
+        return get_result(
+            data=False,
+            message='No authorization.',
+            code=RetCode.AUTHENTICATION_ERROR
+        )
+    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if not ok:
+        return get_error_data_result(message="Invalid Dataset ID")
+    task_id = kb.raptor_task_id
+    if task_id:
+        ok, task = TaskService.get_by_id(task_id)
+        if not ok:
+            logging.warning(f"A valid RAPTOR task id is expected for Dataset {dataset_id}")
+        if task and task.progress not in [-1, 1]:
+            return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A RAPTOR Task is already running.")
+    documents, _ = DocumentService.get_by_kb_id(
+        kb_id=dataset_id,
+        page_number=0,
+        items_per_page=0,
+        orderby="create_time",
+        desc=False,
+        keywords="",
+        run_status=[],
+        types=[],
+        suffix=[],
+    )
+    if not documents:
+        return get_error_data_result(message=f"No documents in Dataset {dataset_id}")
+    sample_document = documents[0]
+    document_ids = [document["id"] for document in documents]
+    task_id = queue_raptor_o_graphrag_tasks(sample_doc_id=sample_document, ty="raptor", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))
+    if not KnowledgebaseService.update_by_id(kb.id, {"raptor_task_id": task_id}):
+        logging.warning(f"Cannot save raptor_task_id for Dataset {dataset_id}")
+    return get_result(data={"raptor_task_id": task_id})
+
+
+@manager.route("/datasets/<dataset_id>/trace_raptor", methods=["GET"])  # noqa: F821
+@token_required
+def trace_raptor(tenant_id, dataset_id):
+    if not dataset_id:
+        return get_error_data_result(message='Lack of "Dataset ID"')
+    if not KnowledgebaseService.accessible(dataset_id, tenant_id):
+        return get_result(
+            data=False,
+            message='No authorization.',
+            code=RetCode.AUTHENTICATION_ERROR
+        )
+    ok, kb = KnowledgebaseService.get_by_id(dataset_id)
+    if not ok:
+        return get_error_data_result(message="Invalid Dataset ID")
+    task_id = kb.raptor_task_id
+    if not task_id:
+        return get_result(data={})
+    ok, task = TaskService.get_by_id(task_id)
+    if not ok:
+        return get_error_data_result(message="RAPTOR Task Not Found or Error Occurred")
+    return get_result(data=task.to_dict())

View File

@@ -974,6 +974,237 @@ Failure:

---

### Construct knowledge graph

**POST** `/api/v1/datasets/{dataset_id}/run_graphrag`

Constructs a knowledge graph from a specified dataset.

#### Request

- Method: POST
- URL: `/api/v1/datasets/{dataset_id}/run_graphrag`
- Headers:
  - `'Authorization: Bearer <YOUR_API_KEY>'`

##### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/datasets/{dataset_id}/run_graphrag \
     --header 'Authorization: Bearer <YOUR_API_KEY>'
```

##### Request parameters

- `dataset_id`: (*Path parameter*)
  The ID of the target dataset.

#### Response

Success:

```json
{
    "code": 0,
    "data": {
        "graphrag_task_id": "e498de54bfbb11f0ba028f704583b57b"
    }
}
```

Failure:

```json
{
    "code": 102,
    "message": "Invalid Dataset ID"
}
```
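
For quick scripting, the same call can be made from Python. A minimal sketch using `requests` (the address, dataset ID, and API key below are placeholders):

```python
# Trigger knowledge graph construction for one dataset, then print the task id.
import requests

ADDRESS = "<address>"        # placeholder, e.g. "127.0.0.1:9380"
DATASET_ID = "<dataset_id>"  # placeholder
API_KEY = "<YOUR_API_KEY>"   # placeholder

resp = requests.post(
    f"http://{ADDRESS}/api/v1/datasets/{DATASET_ID}/run_graphrag",
    headers={"Authorization": f"Bearer {API_KEY}"},
)
body = resp.json()
if body["code"] == 0:
    print("Queued GraphRAG task:", body["data"]["graphrag_task_id"])
else:
    print("Request failed:", body["message"])
```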
---

### Get knowledge graph construction status

**GET** `/api/v1/datasets/{dataset_id}/trace_graphrag`

Retrieves the knowledge graph construction status for a specified dataset.

#### Request

- Method: GET
- URL: `/api/v1/datasets/{dataset_id}/trace_graphrag`
- Headers:
  - `'Authorization: Bearer <YOUR_API_KEY>'`

##### Request example

```bash
curl --request GET \
     --url http://{address}/api/v1/datasets/{dataset_id}/trace_graphrag \
     --header 'Authorization: Bearer <YOUR_API_KEY>'
```

##### Request parameters

- `dataset_id`: (*Path parameter*)
  The ID of the target dataset.

#### Response

Success:

```json
{
    "code": 0,
    "data": {
        "begin_at": "Wed, 12 Nov 2025 19:36:56 GMT",
        "chunk_ids": "",
        "create_date": "Wed, 12 Nov 2025 19:36:56 GMT",
        "create_time": 1762947416350,
        "digest": "39e43572e3dcd84f",
        "doc_id": "44661c10bde211f0bc93c164a47ffc40",
        "from_page": 100000000,
        "id": "e498de54bfbb11f0ba028f704583b57b",
        "priority": 0,
        "process_duration": 2.45419,
        "progress": 1.0,
        "progress_msg": "19:36:56 created task graphrag\n19:36:57 Task has been received.\n19:36:58 [GraphRAG] doc:083661febe2411f0bc79456921e5745f has no available chunks, skip generation.\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 start (chunks=1, timeout=10000000000s)\n19:36:58 Graph already contains 44661c10bde211f0bc93c164a47ffc40\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 empty\n19:36:58 [GraphRAG] kb:33137ed0bde211f0bc93c164a47ffc40 no subgraphs generated successfully, end.\n19:36:58 Knowledge Graph done (0.72s)",
        "retry_count": 1,
        "task_type": "graphrag",
        "to_page": 100000000,
        "update_date": "Wed, 12 Nov 2025 19:36:58 GMT",
        "update_time": 1762947418454
    }
}
```

Failure:

```json
{
    "code": 102,
    "message": "Invalid Dataset ID"
}
```
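
Construction runs asynchronously, so clients typically poll this endpoint until `progress` reaches `1` (done) or `-1` (failed), the same terminal values the server checks before queuing a new task. A minimal polling sketch in Python (the 5-second interval is an arbitrary choice):

```python
# Poll trace_graphrag until the task reaches a terminal progress value.
import time

import requests

def wait_for_graphrag(address: str, dataset_id: str, api_key: str, interval: float = 5.0) -> dict:
    url = f"http://{address}/api/v1/datasets/{dataset_id}/trace_graphrag"
    headers = {"Authorization": f"Bearer {api_key}"}
    while True:
        task = requests.get(url, headers=headers).json().get("data") or {}
        if not task:
            raise RuntimeError("No GraphRAG task is recorded for this dataset")
        progress = task.get("progress", 0.0)
        if progress == 1:   # finished
            return task
        if progress == -1:  # failed; details are in progress_msg
            raise RuntimeError(task.get("progress_msg", "GraphRAG task failed"))
        time.sleep(interval)
```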
---

### Construct RAPTOR

**POST** `/api/v1/datasets/{dataset_id}/run_raptor`

Constructs a RAPTOR tree from a specified dataset.

#### Request

- Method: POST
- URL: `/api/v1/datasets/{dataset_id}/run_raptor`
- Headers:
  - `'Authorization: Bearer <YOUR_API_KEY>'`

##### Request example

```bash
curl --request POST \
     --url http://{address}/api/v1/datasets/{dataset_id}/run_raptor \
     --header 'Authorization: Bearer <YOUR_API_KEY>'
```

##### Request parameters

- `dataset_id`: (*Path parameter*)
  The ID of the target dataset.

#### Response

Success:

```json
{
    "code": 0,
    "data": {
        "raptor_task_id": "50d3c31cbfbd11f0ba028f704583b57b"
    }
}
```

Failure:

```json
{
    "code": 102,
    "message": "Invalid Dataset ID"
}
```
---

### Get RAPTOR construction status

**GET** `/api/v1/datasets/{dataset_id}/trace_raptor`

Retrieves the RAPTOR construction status for a specified dataset.

#### Request

- Method: GET
- URL: `/api/v1/datasets/{dataset_id}/trace_raptor`
- Headers:
  - `'Authorization: Bearer <YOUR_API_KEY>'`

##### Request example

```bash
curl --request GET \
     --url http://{address}/api/v1/datasets/{dataset_id}/trace_raptor \
     --header 'Authorization: Bearer <YOUR_API_KEY>'
```

##### Request parameters

- `dataset_id`: (*Path parameter*)
  The ID of the target dataset.

#### Response

Success:

```json
{
    "code": 0,
    "data": {
        "begin_at": "Wed, 12 Nov 2025 19:47:07 GMT",
        "chunk_ids": "",
        "create_date": "Wed, 12 Nov 2025 19:47:07 GMT",
        "create_time": 1762948027427,
        "digest": "8b279a6248cb8fc6",
        "doc_id": "44661c10bde211f0bc93c164a47ffc40",
        "from_page": 100000000,
        "id": "50d3c31cbfbd11f0ba028f704583b57b",
        "priority": 0,
        "process_duration": 0.948244,
        "progress": 1.0,
        "progress_msg": "19:47:07 created task raptor\n19:47:07 Task has been received.\n19:47:07 Processing...\n19:47:07 Processing...\n19:47:07 Indexing done (0.01s).\n19:47:07 Task done (0.29s)",
        "retry_count": 1,
        "task_type": "raptor",
        "to_page": 100000000,
        "update_date": "Wed, 12 Nov 2025 19:47:07 GMT",
        "update_time": 1762948027948
    }
}
```

Failure:

```json
{
    "code": 102,
    "message": "Invalid Dataset ID"
}
```
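
The RAPTOR endpoints mirror the GraphRAG pair, so the same client pattern applies. A combined sketch that queues the task and fetches its trace once (placeholders as before):

```python
# Queue RAPTOR construction, then read back the task record once.
import requests

def run_raptor_once(address: str, dataset_id: str, api_key: str) -> dict:
    headers = {"Authorization": f"Bearer {api_key}"}
    base = f"http://{address}/api/v1/datasets/{dataset_id}"

    run = requests.post(f"{base}/run_raptor", headers=headers).json()
    if run["code"] != 0:
        raise RuntimeError(run["message"])  # e.g. "Invalid Dataset ID"
    print("Queued RAPTOR task:", run["data"]["raptor_task_id"])

    trace = requests.get(f"{base}/trace_raptor", headers=headers).json()
    return trace.get("data") or {}          # the task record, or {} if none is visible yet
```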
---
## FILE MANAGEMENT WITHIN DATASET
---

View File

@@ -482,7 +482,7 @@ def tree_merge(bull, sections, depth):
     root = Node(level=0, depth=target_level, texts=[])
     root.build_tree(lines)
-    return [("\n").join(element) for element in root.get_tree() if element]
+    return [element for element in root.get_tree() if element]


 def hierarchical_merge(bull, sections, depth):
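
The one-line fix changes only the return shape of `tree_merge`: each tree element is now returned as-is instead of being flattened into a single newline-joined string. A toy illustration (the sample data is hypothetical; `get_tree()` is assumed to yield lists of text lines, as the removed `join` implies):

```python
# Each element is one merged section as a list of text lines; empties are dropped.
tree = [["Chapter 1", "Article 1 text"], [], ["Chapter 2", "Article 2 text"]]

old_result = ["\n".join(element) for element in tree if element]  # before: flat strings
new_result = [element for element in tree if element]             # after: lists of lines

assert old_result == ["Chapter 1\nArticle 1 text", "Chapter 2\nArticle 2 text"]
assert new_result == [["Chapter 1", "Article 1 text"], ["Chapter 2", "Article 2 text"]]
```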

View File

@@ -16,14 +16,15 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
+from common import create_dataset
-from configs import DATASET_NAME_LIMIT, INVALID_API_TOKEN
+from configs import DATASET_NAME_LIMIT, DEFAULT_PARSER_CONFIG, INVALID_API_TOKEN
 from hypothesis import example, given, settings
 from libs.auth import RAGFlowHttpApiAuth
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-from configs import DEFAULT_PARSER_CONFIG
-from common import create_dataset


 @pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
@@ -125,8 +126,8 @@ class TestDatasetCreate:
         assert res["code"] == 0, res

         res = create_dataset(HttpApiAuth, payload)
-        assert res["code"] == 103, res
-        assert res["message"] == f"Dataset name '{name}' already exists", res
+        assert res["code"] == 0, res
+        assert res["data"]["name"] == name + "(1)", res

     @pytest.mark.p3
     def test_name_case_insensitive(self, HttpApiAuth):
@@ -137,8 +138,8 @@ class TestDatasetCreate:
         payload = {"name": name.lower()}
         res = create_dataset(HttpApiAuth, payload)
-        assert res["code"] == 103, res
-        assert res["message"] == f"Dataset name '{name.lower()}' already exists", res
+        assert res["code"] == 0, res
+        assert res["data"]["name"] == name.lower() + "(1)", res

     @pytest.mark.p2
     def test_avatar(self, HttpApiAuth, tmp_path):

View File

@@ -17,13 +17,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from operator import attrgetter

 import pytest
-from configs import DATASET_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN
+from configs import DATASET_NAME_LIMIT, DEFAULT_PARSER_CONFIG, HOST_ADDRESS, INVALID_API_TOKEN
 from hypothesis import example, given, settings
 from ragflow_sdk import DataSet, RAGFlow
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-from configs import DEFAULT_PARSER_CONFIG


 @pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
@@ -95,9 +95,8 @@ class TestDatasetCreate:
         payload = {"name": name}
         client.create_dataset(**payload)
-        with pytest.raises(Exception) as excinfo:
-            client.create_dataset(**payload)
-        assert str(excinfo.value) == f"Dataset name '{name}' already exists", str(excinfo.value)
+        dataset = client.create_dataset(**payload)
+        assert dataset.name == name + "(1)", str(dataset)

     @pytest.mark.p3
     def test_name_case_insensitive(self, client):
@@ -106,9 +105,8 @@ class TestDatasetCreate:
         client.create_dataset(**payload)
         payload = {"name": name.lower()}
-        with pytest.raises(Exception) as excinfo:
-            client.create_dataset(**payload)
-        assert str(excinfo.value) == f"Dataset name '{name.lower()}' already exists", str(excinfo.value)
+        dataset = client.create_dataset(**payload)
+        assert dataset.name == name.lower() + "(1)", str(dataset)

     @pytest.mark.p2
     def test_avatar(self, client, tmp_path):
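
For context, a minimal sketch of the behavior these updated tests assert, via the `ragflow_sdk` client (the base URL and key are placeholders):

```python
from ragflow_sdk import RAGFlow

client = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<address>:9380")

first = client.create_dataset(name="reports")
second = client.create_dataset(name="reports")  # a duplicate name no longer raises

assert first.name == "reports"
assert second.name == "reports(1)"              # the server appends "(1)" instead
```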