Refactor Document API (#2833)

### What problem does this PR solve?

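Refactor the Document API of the Python SDK: document upload, listing, and deletion move onto `DataSet` (`upload_documents`, `list_documents`, `delete_documents`), `Document.save` becomes `Document.update`, and `RAGFlow.create_document` / `RAGFlow.get_document` are removed.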

### Type of change


- [x] Refactoring

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
This commit is contained in:
liuhua
2024-10-14 20:03:33 +08:00
committed by GitHub
parent df223eddf3
commit 6329427ad5
11 changed files with 393 additions and 418 deletions

View File

@ -18,8 +18,8 @@ class Base(object):
pr[name] = value
return pr
def post(self, path, json=None, stream=False, files=None):
    """Forward a POST request to the owning client object (``self.rag``).

    Args:
        path: Request path, passed through unchanged.
        json: Optional JSON payload, passed through positionally.
        stream: Passed through as the ``stream`` keyword.
        files: Optional multipart file entries, passed through as ``files``.

    Returns:
        Whatever ``self.rag.post`` returns.
    """
    # Single definition only: the older three-argument variant was shadowed
    # by this one and could not accept ``files``.
    return self.rag.post(path, json, stream=stream, files=files)
def get(self, path, params):

View File

@ -1,5 +1,7 @@
from typing import Optional, List
from transformers.models.bloom.modeling_bloom import bloom_gelu_back
from .document import Document
from .base import Base
@ -39,39 +41,27 @@ class DataSet(Base):
if res.get("code") != 0:
raise Exception(res["message"])
def upload_documents(self, document_list: List[dict]):
    """Upload the given documents to this dataset in one multipart POST.

    Args:
        document_list: Dicts that each provide a ``"name"`` and a ``"blob"``
            entry; every dict becomes one ``"file"`` part of the request.

    Raises:
        Exception: If the response ``"code"`` is not 0; the exception
            carries the response ``"message"``.
    """
    endpoint = f"/dataset/{self.id}/document"
    parts = []
    for entry in document_list:
        parts.append(("file", (entry["name"], entry["blob"])))
    reply = self.post(path=endpoint, json=None, files=parts).json()
    if reply.get("code") == 0:
        return
    raise Exception(reply.get("message"))
def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
"""
List the documents in the dataset, optionally filtering by keywords, with pagination support.
Args:
keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
offset (int): The starting point for pagination. Defaults to 0.
limit (int): The maximum number of documents to return. Defaults to -1 (no limit).
Returns:
List[Document]: A list of Document objects.
"""
# Construct the request payload for listing documents
payload = {
"knowledgebase_id": self.id,
"keywords": keywords,
"offset": offset,
"limit": limit
}
# Send the request to the server to list documents
res = self.get(f'/doc/dataset/{self.id}/documents', payload)
res_json = res.json()
# Handle response and error checking
if res_json.get("retmsg") != "success":
raise Exception(res_json.get("retmsg"))
# Parse the document data from the response
def list_documents(self, id: str = None, keywords: str = None, offset: int = 1, limit: int = 1024,
                   orderby: str = "create_time", desc: bool = True):
    """List the documents of this dataset.

    All arguments are forwarded unchanged as query parameters of
    ``GET /dataset/{self.id}/info``; how they are interpreted is up to the
    server (presumably filtering, paging and ordering, going by the names).

    Returns:
        A list of ``Document`` objects, one per entry of ``data.docs`` in
        the response; empty when the response has no documents.

    Raises:
        Exception: If the response ``"code"`` is not 0; the exception
            carries the response ``"message"``.
    """
    query = {
        "id": id,
        "keywords": keywords,
        "offset": offset,
        "limit": limit,
        "orderby": orderby,
        "desc": desc,
    }
    res = self.get(f"/dataset/{self.id}/info", params=query).json()
    if res.get("code") != 0:
        raise Exception(res["message"])
    # A missing or null "docs" entry means "no documents", not an error.
    docs = res["data"].get("docs") or []
    return [Document(self.rag, doc) for doc in docs]
def delete_documents(self, ids: List[str] = None):
    """Delete documents from this dataset.

    Args:
        ids: Document ids, sent as ``{"ids": ids}`` to
            ``/dataset/{self.id}/document`` via ``self.rm``.

    Raises:
        Exception: If the response ``"code"`` is not 0; the exception
            carries the response ``"message"``.
    """
    res = self.rm(f"/dataset/{self.id}/document", {"ids": ids}).json()
    if res.get("code") != 0:
        raise Exception(res["message"])
    # Nothing to return on success; the former trailing `return documents`
    # referenced a name that is never defined in this method.

View File

@ -29,18 +29,14 @@ class Document(Base):
res_dict.pop(k)
super().__init__(rag, res_dict)
def save(self) -> bool:
def update(self, update_message: dict) -> None:
    """Send an update for this document to the server.

    Args:
        update_message: Fields to change, posted as the JSON body to
            ``/dataset/{self.knowledgebase_id}/info/{self.id}``.

    Returns:
        None on success.

    Raises:
        Exception: If the response ``"code"`` is not 0; the exception
            carries the response ``"message"``.
    """
    res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}', update_message)
    res = res.json()
    if res.get("code") != 0:
        raise Exception(res["message"])
def delete(self) -> bool:
"""
@ -60,8 +56,7 @@ class Document(Base):
:return: The downloaded document content in bytes.
"""
# Construct the URL for the API request using the document ID and knowledge base ID
res = self.get(f"/doc/{self.id}",
{"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")
# Check the response status code to ensure the request was successful
if res.status_code == 200:

View File

@ -32,12 +32,12 @@ class RAGFlow:
self.api_url = f"{base_url}/api/{version}"
self.authorization_header = {"Authorization": "{} {}".format("Bearer", self.user_key)}
def post(self, path, json=None, stream=False, files=None):
    """POST to ``self.api_url + path`` with the client's authorization header.

    Args:
        path: Path appended to ``self.api_url``.
        json: Optional JSON body.
        stream: Passed to ``requests.post`` as ``stream``.
        files: Optional multipart file entries passed to ``requests.post``.

    Returns:
        The response object returned by ``requests.post``.
    """
    # Single definition only: the older variant without ``files`` was
    # shadowed by this one.
    return requests.post(
        url=self.api_url + path,
        json=json,
        headers=self.authorization_header,
        stream=stream,
        files=files,
    )
def get(self, path, params=None, json=None):
    """GET ``self.api_url + path`` with the client's authorization header.

    Args:
        path: Path appended to ``self.api_url``.
        params: Optional query parameters.
        json: Optional JSON body passed to ``requests.get``.

    Returns:
        The response object returned by ``requests.get``.
    """
    # Single definition only: the older variant without ``json`` was
    # shadowed by this one.
    return requests.get(
        url=self.api_url + path,
        params=params,
        headers=self.authorization_header,
        json=json,
    )
def delete(self, path, json):
@ -151,31 +151,7 @@ class RAGFlow:
return result_list
raise Exception(res["message"])
def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
url = f"/doc/dataset/{ds.id}/documents/upload"
files = {
'file': (name, blob)
}
headers = {
'Authorization': f"Bearer {ds.rag.user_key}"
}
response = requests.post(self.api_url + url, files=files,
headers=headers)
if response.status_code == 200 and response.json().get('retmsg') == 'success':
return True
else:
raise Exception(f"Upload failed: {response.json().get('retmsg')}")
return False
def get_document(self, id: str = None, name: str = None) -> Document:
res = self.get("/doc/infos", {"id": id, "name": name})
res = res.json()
if res.get("retmsg") == "success":
return Document(self, res['data'])
raise Exception(res["retmsg"])
def async_parse_documents(self, doc_ids):
"""

View File

@ -21,22 +21,16 @@ class TestDocument(TestSdk):
# Step 2: Create a new document
# The blob is the actual file content or a placeholder in this case
name = "TestDocument.txt"
blob = b"Sample document content for ingestion test."
res = rag.create_document(ds, name=name, blob=blob)
blob_2 = b"test_2."
list_1 = []
list_1.append({"name":"Test_1.txt",
"blob":blob})
list_1.append({"name":"Test_2.txt",
"blob":blob_2})
res = ds.upload_documents(list_1)
# Ensure document ingestion was successful
assert res is True, f"Failed to create document, error: {res}"
def test_get_detail_document_with_success(self):
"""
Test getting a document's detail with success
"""
rag = RAGFlow(API_KEY, HOST_ADDRESS)
doc = rag.get_document(name="TestDocument.txt")
assert isinstance(doc, Document), f"Failed to get dataset, error: {doc}."
assert doc.name == "TestDocument.txt", "Name does not match"
assert res is None, f"Failed to create document, error: {res}"
def test_update_document_with_success(self):
"""
@ -44,12 +38,13 @@ class TestDocument(TestSdk):
Update name or parser_method are supported
"""
rag = RAGFlow(API_KEY, HOST_ADDRESS)
doc = rag.get_document(name="TestDocument.txt")
ds = rag.list_datasets(name="God")
ds = ds[0]
doc = ds.list_documents()
doc = doc[0]
if isinstance(doc, Document):
doc.parser_method = "manual"
doc.name = "manual.txt"
res = doc.save()
assert res is True, f"Failed to update document, error: {res}"
res = doc.update({"parser_method":"manual","name":"manual.txt"})
assert res is None, f"Failed to update document, error: {res}"
else:
assert False, f"Failed to get document, error: {doc}"
@ -61,8 +56,10 @@ class TestDocument(TestSdk):
rag = RAGFlow(API_KEY, HOST_ADDRESS)
# Retrieve a document
doc = rag.get_document(name="manual.txt")
ds = rag.list_datasets(name="God")
ds = ds[0]
doc = ds.list_documents(name="manual.txt")
doc = doc[0]
# Check if the retrieved document is of type Document
if isinstance(doc, Document):
# Download the document content and save it to a file
@ -81,7 +78,7 @@ class TestDocument(TestSdk):
# If the document retrieval fails, assert failure
assert False, f"Failed to get document, error: {doc}"
def test_list_all_documents_in_dataset_with_success(self):
def test_list_documents_in_dataset_with_success(self):
"""
Test list all documents into a dataset with success.
"""
@ -101,12 +98,10 @@ class TestDocument(TestSdk):
blob1 = b"Sample document content for ingestion test111."
name2 = "Test Document222.txt"
blob2 = b"Sample document content for ingestion test222."
rag.create_document(ds, name=name1, blob=blob1)
rag.create_document(ds, name=name2, blob=blob2)
list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
ds.upload_documents(list_1)
for d in ds.list_docs(keywords="test", offset=0, limit=12):
assert isinstance(d, Document)
print(d)
assert isinstance(d, Document), "Failed to upload documents"
def test_delete_documents_in_dataset_with_success(self):
"""