TEST: Added test cases for Upload Documents HTTP API (#5991)

### What problem does this PR solve?

Covers the document upload endpoints of the HTTP API (see the sketch below).

### Type of change

- [x] Add test cases
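
For context, the endpoint these tests exercise is `POST /api/v1/datasets/{dataset_id}/documents`, which accepts one or more multipart form fields named `file`. A minimal sketch of a direct call (the host, API key, and dataset ID below are placeholders; the Bearer scheme is what `RAGFlowHttpApiAuth` is assumed to apply):

```python
import requests

HOST_ADDRESS = "http://127.0.0.1:9380"  # same default the test suite uses
API_KEY = "YOUR_API_KEY"        # placeholder; issue a real key from your RAGFlow instance
DATASET_ID = "YOUR_DATASET_ID"  # placeholder; returned when creating a dataset

with open("ragflow_test.txt", "rb") as f:
    res = requests.post(
        f"{HOST_ADDRESS}/api/v1/datasets/{DATASET_ID}/documents",
        headers={"Authorization": f"Bearer {API_KEY}"},
        files={"file": ("ragflow_test.txt", f)},  # multipart field named "file", as in the tests
    )
print(res.json())  # {"code": 0, "data": [{"name": "ragflow_test.txt", ...}]} on success
```
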
liu an
2025-03-12 19:38:52 +08:00
committed by GitHub
parent 7cd37c37cd
commit bd5eb47441
12 changed files with 782 additions and 123 deletions


@@ -0,0 +1,101 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from pathlib import Path

import requests
from requests_toolbelt import MultipartEncoder

HEADERS = {"Content-Type": "application/json"}
HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9380")
DATASETS_API_URL = "/api/v1/datasets"
FILE_API_URL = "/api/v1/datasets/{dataset_id}/documents"
INVALID_API_TOKEN = "invalid_key_123"
DATASET_NAME_LIMIT = 128
DOCUMENT_NAME_LIMIT = 128


# DATASET MANAGEMENT
def create_dataset(auth, payload):
    res = requests.post(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def list_dataset(auth, params=None):
    res = requests.get(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        params=params,
    )
    return res.json()


def update_dataset(auth, dataset_id, payload):
    res = requests.put(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def delete_dataset(auth, payload=None):
    res = requests.delete(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def create_datasets(auth, num):
    ids = []
    for i in range(num):
        res = create_dataset(auth, {"name": f"dataset_{i}"})
        ids.append(res["data"]["id"])
    return ids


# FILE MANAGEMENT WITHIN DATASET
def upload_documents(auth, dataset_id, files_path=None):
    # FILE_API_URL keeps a literal "{dataset_id}" placeholder: the f-string only
    # expands HOST_ADDRESS, then .format() fills in the dataset ID.
    url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=dataset_id)
    if files_path is None:
        files_path = []

    # Each file becomes a repeated multipart "file" field; MultipartEncoder
    # streams the bodies instead of buffering them in memory.
    fields = []
    for fp in files_path:
        p = Path(fp)
        fields.append(("file", (p.name, p.open("rb"))))
    m = MultipartEncoder(fields=fields)

    res = requests.post(
        url=url,
        headers={"Content-Type": m.content_type},
        auth=auth,
        data=m,
    )
    return res.json()


@@ -0,0 +1,73 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
from common import delete_dataset
from libs.utils.file_utils import (
    create_docx_file,
    create_eml_file,
    create_excel_file,
    create_html_file,
    create_image_file,
    create_json_file,
    create_md_file,
    create_pdf_file,
    create_ppt_file,
    create_txt_file,
)


@pytest.fixture(scope="function", autouse=True)
def clear_datasets(get_http_api_auth):
    yield
    delete_dataset(get_http_api_auth)


@pytest.fixture
def generate_test_files(tmp_path):
    files = {}
    files["docx"] = tmp_path / "ragflow_test.docx"
    create_docx_file(files["docx"])
    files["excel"] = tmp_path / "ragflow_test.xlsx"
    create_excel_file(files["excel"])
    files["ppt"] = tmp_path / "ragflow_test.pptx"
    create_ppt_file(files["ppt"])
    files["image"] = tmp_path / "ragflow_test.png"
    create_image_file(files["image"])
    files["pdf"] = tmp_path / "ragflow_test.pdf"
    create_pdf_file(files["pdf"])
    files["txt"] = tmp_path / "ragflow_test.txt"
    create_txt_file(files["txt"])
    files["md"] = tmp_path / "ragflow_test.md"
    create_md_file(files["md"])
    files["json"] = tmp_path / "ragflow_test.json"
    create_json_file(files["json"])
    files["eml"] = tmp_path / "ragflow_test.eml"
    create_eml_file(files["eml"])
    files["html"] = tmp_path / "ragflow_test.html"
    create_html_file(files["html"])
    return files


@@ -1,57 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

import requests

HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9380")
API_URL = f"{HOST_ADDRESS}/api/v1/datasets"
HEADERS = {"Content-Type": "application/json"}
INVALID_API_TOKEN = "invalid_key_123"
DATASET_NAME_LIMIT = 128


def create_dataset(auth, payload):
    res = requests.post(url=API_URL, headers=HEADERS, auth=auth, json=payload)
    return res.json()


def list_dataset(auth, params=None):
    res = requests.get(url=API_URL, headers=HEADERS, auth=auth, params=params)
    return res.json()


def update_dataset(auth, dataset_id, payload):
    res = requests.put(
        url=f"{API_URL}/{dataset_id}", headers=HEADERS, auth=auth, json=payload
    )
    return res.json()


def delete_dataset(auth, payload=None):
    res = requests.delete(url=API_URL, headers=HEADERS, auth=auth, json=payload)
    return res.json()


def create_datasets(auth, num):
    ids = []
    for i in range(num):
        res = create_dataset(auth, {"name": f"dataset_{i}"})
        ids.append(res["data"]["id"])
    return ids


@@ -1,24 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
from common import delete_dataset


@pytest.fixture(scope="function", autouse=True)
def clear_datasets(get_http_api_auth):
    yield
    delete_dataset(get_http_api_auth)


@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import base64
-from pathlib import Path
 import pytest
 from common import DATASET_NAME_LIMIT, INVALID_API_TOKEN, create_dataset
 from libs.auth import RAGFlowHttpApiAuth
+from libs.utils import encode_avatar
+from libs.utils.file_utils import create_image_file
 class TestAuthorization:
@@ -75,18 +75,11 @@ class TestDatasetCreation:
 class TestAdvancedConfigurations:
-    def test_avatar(self, get_http_api_auth, request):
-        def encode_avatar(image_path):
-            with Path.open(image_path, "rb") as file:
-                binary_data = file.read()
-            base64_encoded = base64.b64encode(binary_data).decode("utf-8")
-            return base64_encoded
+    def test_avatar(self, get_http_api_auth, tmp_path):
+        fn = create_image_file(tmp_path / "ragflow_test.png")
         payload = {
             "name": "avatar_test",
-            "avatar": encode_avatar(
-                Path(request.config.rootdir) / "test/data/logo.svg"
-            ),
+            "avatar": encode_avatar(fn),
         }
         res = create_dataset(get_http_api_auth, payload)
         assert res["code"] == 0


@@ -13,9 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import base64
 from concurrent.futures import ThreadPoolExecutor
-from pathlib import Path
 import pytest
 from common import (
@@ -26,6 +24,8 @@ from common import (
     update_dataset,
 )
 from libs.auth import RAGFlowHttpApiAuth
+from libs.utils import encode_avatar
+from libs.utils.file_utils import create_image_file
 # TODO: Missing scenario for updating embedding_model with chunk_count != 0
@@ -171,19 +171,10 @@ class TestDatasetUpdate:
         else:
             assert res["message"] == expected_message
-    def test_avatar(self, get_http_api_auth, request):
-        def encode_avatar(image_path):
-            with Path.open(image_path, "rb") as file:
-                binary_data = file.read()
-            base64_encoded = base64.b64encode(binary_data).decode("utf-8")
-            return base64_encoded
+    def test_avatar(self, get_http_api_auth, tmp_path):
         ids = create_datasets(get_http_api_auth, 1)
-        payload = {
-            "avatar": encode_avatar(
-                Path(request.config.rootdir) / "test/data/logo.svg"
-            ),
-        }
+        fn = create_image_file(tmp_path / "ragflow_test.png")
+        payload = {"avatar": encode_avatar(fn)}
         res = update_dataset(get_http_api_auth, ids[0], payload)
         assert res["code"] == 0


@@ -0,0 +1,230 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import string
from concurrent.futures import ThreadPoolExecutor

import pytest
import requests
from common import (
    DOCUMENT_NAME_LIMIT,
    FILE_API_URL,
    HOST_ADDRESS,
    INVALID_API_TOKEN,
    create_datasets,
    list_dataset,
    upload_documents,
)
from libs.auth import RAGFlowHttpApiAuth
from libs.utils.file_utils import create_txt_file
from requests_toolbelt import MultipartEncoder


class TestAuthorization:
    @pytest.mark.parametrize(
        "auth, expected_code, expected_message",
        [
            (None, 0, "`Authorization` can't be empty"),
            (
                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
                109,
                "Authentication error: API key is invalid!",
            ),
        ],
    )
    def test_invalid_auth(
        self, get_http_api_auth, auth, expected_code, expected_message
    ):
        ids = create_datasets(get_http_api_auth, 1)
        res = upload_documents(auth, ids[0])
        assert res["code"] == expected_code
        assert res["message"] == expected_message


class TestUploadDocuments:
    def test_valid_single_upload(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    @pytest.mark.parametrize(
        "file_type",
        [
            "docx",
            "excel",
            "ppt",
            "image",
            "pdf",
            "txt",
            "md",
            "json",
            "eml",
            "html",
        ],
    )
    def test_file_type_validation(
        self, get_http_api_auth, generate_test_files, file_type
    ):
        ids = create_datasets(get_http_api_auth, 1)
        fp = generate_test_files[file_type]
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    @pytest.mark.parametrize(
        "file_type",
        ["exe", "unknown"],
    )
    def test_unsupported_file_type(self, get_http_api_auth, tmp_path, file_type):
        ids = create_datasets(get_http_api_auth, 1)
        fp = tmp_path / f"ragflow_test.{file_type}"
        fp.touch()
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 500
        assert (
            res["message"]
            == f"ragflow_test.{file_type}: This type of file has not been supported yet!"
        )

    def test_missing_file(self, get_http_api_auth):
        ids = create_datasets(get_http_api_auth, 1)
        res = upload_documents(get_http_api_auth, ids[0])
        assert res["code"] == 101
        assert res["message"] == "No file part!"

    def test_empty_file(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = tmp_path / "empty.txt"
        fp.touch()
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["size"] == 0

    def test_filename_empty(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=ids[0])
        fields = (("file", ("", fp.open("rb"))),)
        m = MultipartEncoder(fields=fields)
        res = requests.post(
            url=url,
            headers={"Content-Type": m.content_type},
            auth=get_http_api_auth,
            data=m,
        )
        assert res.json()["code"] == 101
        assert res.json()["message"] == "No file selected!"

    def test_filename_exceeds_max_length(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        # 125 'a' characters + ".txt" -> 129 characters, one over DOCUMENT_NAME_LIMIT (128)
        fp = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 500
        assert (
            res["message"]
            == f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt: Exceed the maximum length of file name!"
        )

    def test_invalid_dataset_id(self, get_http_api_auth, tmp_path):
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, "invalid_dataset_id", [fp])
        assert res["code"] == 100
        assert (
            res["message"]
            == """LookupError("Can't find the dataset with ID invalid_dataset_id!")"""
        )

    def test_duplicate_files(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp, fp])
        assert res["code"] == 0
        assert len(res["data"]) == 2
        for i in range(len(res["data"])):
            assert res["data"][i]["dataset_id"] == ids[0]
            expected_name = fp.name
            if i != 0:
                expected_name = f"{fp.stem}({i}){fp.suffix}"
            assert res["data"][i]["name"] == expected_name

    def test_same_file_repeat(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        for i in range(10):
            res = upload_documents(get_http_api_auth, ids[0], [fp])
            assert res["code"] == 0
            assert len(res["data"]) == 1
            assert res["data"][0]["dataset_id"] == ids[0]
            expected_name = fp.name
            if i != 0:
                expected_name = f"{fp.stem}({i}){fp.suffix}"
            assert res["data"][0]["name"] == expected_name

    def test_filename_special_characters(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        illegal_chars = '<>:"/\\|?*'
        translation_table = str.maketrans({char: "_" for char in illegal_chars})
        safe_filename = string.punctuation.translate(translation_table)
        fp = tmp_path / f"{safe_filename}.txt"
        fp.write_text("Sample text content")

        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert len(res["data"]) == 1
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    def test_multiple_files(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        expected_document_count = 20
        fps = []
        for i in range(expected_document_count):
            fp = create_txt_file(tmp_path / f"ragflow_test_{i}.txt")
            fps.append(fp)
        res = upload_documents(get_http_api_auth, ids[0], fps)
        assert res["code"] == 0

        res = list_dataset(get_http_api_auth, {"id": ids[0]})
        assert res["data"][0]["document_count"] == expected_document_count

    @pytest.mark.xfail
    def test_concurrent_upload(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        expected_document_count = 20
        fps = []
        for i in range(expected_document_count):
            fp = create_txt_file(tmp_path / f"ragflow_test_{i}.txt")
            fps.append(fp)

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(
                    upload_documents, get_http_api_auth, ids[0], fps[i : i + 1]
                )
                for i in range(expected_document_count)
            ]
        responses = [f.result() for f in futures]
        assert all(r["code"] == 0 for r in responses)

        res = list_dataset(get_http_api_auth, {"id": ids[0]})
        assert res["data"][0]["document_count"] == expected_document_count