Mirror of https://github.com/infiniflow/ragflow.git
Synced 2026-02-05 18:15:06 +08:00

Compare commits: 74ec734d69 ... aaae938f54 (5 commits)

- aaae938f54
- 9e73f799b2
- 21a62130c8
- 68e47c81d4
- f11d8af936
@@ -188,6 +188,9 @@ def detail():
             return get_data_error_result(
                 message="Can't find this knowledgebase!")
         kb["size"] = DocumentService.get_total_size_by_kb_id(kb_id=kb["id"],keywords="", run_status=[], types=[])
+        for key in ["graphrag_task_finish_at", "raptor_task_finish_at", "mindmap_task_finish_at"]:
+            if finish_at := kb.get(key):
+                kb[key] = finish_at.strftime("%Y-%m-%d %H:%M:%S")
         return get_json_result(data=kb)
     except Exception as e:
         return server_error_response(e)
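Viewed in isolation, the three added lines only normalize the optional `*_task_finish_at` datetimes into plain strings before the knowledgebase record is returned as JSON. A minimal standalone sketch of that behavior (the sample record is made up; only the key names and the format string come from the hunk above):

```python
from datetime import datetime

# Hypothetical knowledgebase dict; only the key names and format string are from the diff.
kb = {"graphrag_task_finish_at": datetime(2025, 1, 1, 12, 30, 0), "raptor_task_finish_at": None}

for key in ["graphrag_task_finish_at", "raptor_task_finish_at", "mindmap_task_finish_at"]:
    if finish_at := kb.get(key):  # walrus operator: format only the timestamps that are actually set
        kb[key] = finish_at.strftime("%Y-%m-%d %H:%M:%S")

print(kb)  # {'graphrag_task_finish_at': '2025-01-01 12:30:00', 'raptor_task_finish_at': None}
```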
@@ -803,6 +803,12 @@
                 "tags": "TEXT EMBEDDING",
                 "max_tokens": 512,
                 "model_type": "embedding"
+            },
+            {
+                "llm_name": "glm-asr",
+                "tags": "SPEECH2TEXT",
+                "max_tokens": 4096,
+                "model_type": "speech2text"
             }
         ]
     },
@@ -5140,4 +5146,4 @@
             ]
         }
     ]
 }
@@ -37,9 +37,12 @@ OPENSEARCH_PASSWORD=infini_rag_flow_OS_01
 
 # The port used to expose the Kibana service to the host machine,
 # allowing EXTERNAL access to the service running inside the Docker container.
+# To enable kibana, you need to:
+# 1. Ensure that COMPOSE_PROFILES includes kibana, for example: COMPOSE_PROFILES=${DOC_ENGINE},kibana
+# 2. Comment out or delete the following configurations of the es service in docker-compose-base.yml: xpack.security.enabled、xpack.security.http.ssl.enabled、xpack.security.transport.ssl.enabled (for details: https://www.elastic.co/docs/deploy-manage/security/self-auto-setup#stack-existing-settings-detected)
+# 3. Adjust the es.hosts in conf/service_config.yaml or docker/service_conf.yaml.template to 'https://localhost:1200'
+# 4. After the startup is successful, in the es container, execute the command to generate the kibana token: `bin/elasticsearch-create-enrollment-token -s kibana`, then you can use kibana normally
 KIBANA_PORT=6601
-KIBANA_USER=rag_flow
-KIBANA_PASSWORD=infini_rag_flow
 
 # The maximum amount of the memory, in bytes, that a specific Docker container can use while running.
 # Update it according to the available memory in the host machine.
@@ -207,6 +207,30 @@ services:
       start_period: 10s
 
 
+  kibana:
+    container_name: ragflow-kibana
+    profiles:
+      - kibana
+    image: kibana:${STACK_VERSION}
+    ports:
+      - ${KIBANA_PORT-5601}:5601
+    env_file: .env
+    environment:
+      - TZ=${TIMEZONE}
+    volumes:
+      - kibana_data:/usr/share/kibana/data
+    depends_on:
+      es01:
+        condition: service_started
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5601/api/status"]
+      interval: 10s
+      timeout: 10s
+      retries: 120
+    networks:
+      - ragflow
+    restart: on-failure
+
 
 volumes:
   esdata01:
@@ -221,6 +245,8 @@ volumes:
     driver: local
   redis_data:
     driver: local
+  kibana_data:
+    driver: local
 
 networks:
   ragflow:
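The healthcheck above probes Kibana's status endpoint from inside the container on port 5601; combined with the `.env` hunk earlier, the same endpoint should be reachable from the host on `KIBANA_PORT` once the `kibana` profile is enabled. A minimal host-side sketch of that probe (the port value comes from the `.env` hunk, but the response field path is an assumption, not taken from this diff):

```python
import requests

# Host-side version of the compose healthcheck above.
# KIBANA_PORT=6601 comes from the .env hunk; the JSON field path is an assumption.
KIBANA_URL = "http://localhost:6601/api/status"

try:
    resp = requests.get(KIBANA_URL, timeout=10)
    resp.raise_for_status()
    level = resp.json().get("status", {}).get("overall", {}).get("level", "unknown")
    print("Kibana is up, overall status:", level)
except requests.RequestException as exc:
    print("Kibana is not reachable yet:", exc)
```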
@@ -698,6 +698,58 @@ print("Async bulk parsing initiated.")
 
 ---
 
+### Parse documents (with document status)
+
+```python
+DataSet.parse_documents(document_ids: list[str]) -> list[tuple[str, str, int, int]]
+```
+
+Parses documents **synchronously** in the current dataset.
+This method wraps `async_parse_documents()` and automatically waits for all parsing tasks to complete.
+It returns detailed parsing results, including the status and statistics for each document.
+If interrupted by the user (e.g. `Ctrl+C`), all pending parsing jobs will be cancelled gracefully.
+
+#### Parameters
+
+##### document_ids: `list[str]`, *Required*
+
+The IDs of the documents to parse.
+
+#### Returns
+
+A list of tuples with detailed parsing results:
+```python
+[
+    (document_id: str, status: str, chunk_count: int, token_count: int),
+    ...
+]
+```
+- **status** — Final parsing state (`success`, `failed`, `cancelled`, etc.)
+- **chunk_count** — Number of content chunks created for the document.
+- **token_count** — Total number of tokens processed.
+
+---
+
+#### Example
+
+```python
+rag_object = RAGFlow(api_key="<YOUR_API_KEY>", base_url="http://<YOUR_BASE_URL>:9380")
+dataset = rag_object.create_dataset(name="dataset_name")
+documents = dataset.list_documents(keywords="test")
+ids = [doc.id for doc in documents]
+
+try:
+    finished = dataset.parse_documents(ids)
+    for doc_id, status, chunk_count, token_count in finished:
+        print(f"Document {doc_id} parsing finished with status: {status}, chunks: {chunk_count}, tokens: {token_count}")
+except KeyboardInterrupt:
+    print("\nParsing interrupted by user. All pending tasks have been cancelled.")
+except Exception as e:
+    print(f"Parsing failed: {e}")
+```
+
+---
+
 ### Stop parsing documents
 
 ```python
@@ -234,8 +234,8 @@ class DeepInfraSeq2txt(Base):
 
         self.client = OpenAI(api_key=key, base_url=base_url)
         self.model_name = model_name
 
 
 class CometAPISeq2txt(Base):
     _FACTORY_NAME = "CometAPI"
 
@@ -244,7 +244,8 @@ class CometAPISeq2txt(Base):
         base_url = "https://api.cometapi.com/v1"
         self.client = OpenAI(api_key=key, base_url=base_url)
         self.model_name = model_name
 
 
 class DeerAPISeq2txt(Base):
     _FACTORY_NAME = "DeerAPI"
 
@@ -253,3 +254,44 @@ class DeerAPISeq2txt(Base):
         base_url = "https://api.deerapi.com/v1"
         self.client = OpenAI(api_key=key, base_url=base_url)
         self.model_name = model_name
+
+
+class ZhipuSeq2txt(Base):
+    _FACTORY_NAME = "ZHIPU-AI"
+
+    def __init__(self, key, model_name="glm-asr", base_url="https://open.bigmodel.cn/api/paas/v4", **kwargs):
+        if not base_url:
+            base_url = "https://open.bigmodel.cn/api/paas/v4"
+        self.base_url = base_url
+        self.api_key = key
+        self.model_name = model_name
+        self.gen_conf = kwargs.get("gen_conf", {})
+        self.stream = kwargs.get("stream", False)
+
+    def transcription(self, audio_path):
+        payload = {
+            "model": self.model_name,
+            "temperature": str(self.gen_conf.get("temperature", 0.75)) or "0.75",
+            "stream": self.stream,
+        }
+
+        headers = {"Authorization": f"Bearer {self.api_key}"}
+        with open(audio_path, "rb") as audio_file:
+            files = {"file": audio_file}
+
+            try:
+                response = requests.post(
+                    url=f"{self.base_url}/audio/transcriptions",
+                    data=payload,
+                    files=files,
+                    headers=headers,
+                )
+                body = response.json()
+                if response.status_code == 200:
+                    full_content = body["text"]
+                    return full_content, num_tokens_from_string(full_content)
+                else:
+                    error = body["error"]
+                    return f"**ERROR**: code: {error['code']}, message: {error['message']}", 0
+            except Exception as e:
+                return "**ERROR**: " + str(e), 0
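For orientation, a minimal usage sketch of the new `ZhipuSeq2txt` wrapper on its own (the import path, API key, and audio file are placeholders or assumptions; in RAGFlow the class is presumably instantiated through the model factory rather than directly):

```python
# Minimal usage sketch of the new wrapper; not taken from the diff.
# The import path is an assumption; "<ZHIPU_API_KEY>" and "sample.wav" are placeholders.
from rag.llm.sequence2txt_model import ZhipuSeq2txt

model = ZhipuSeq2txt(key="<ZHIPU_API_KEY>", model_name="glm-asr")
text, token_count = model.transcription("sample.wav")

if text.startswith("**ERROR**"):
    print("Transcription failed:", text)
else:
    print(f"Transcribed ~{token_count} tokens:", text)
```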
@@ -124,7 +124,7 @@ def kb_prompt(kbinfos, max_tokens, hash_id=False):
 
     knowledges = []
     for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
-        cnt = "\nID: {}".format(i if not hash_id else hash_str2int(get_value(ck, "id", "chunk_id"), 100))
+        cnt = "\nID: {}".format(i if not hash_id else hash_str2int(get_value(ck, "id", "chunk_id"), 500))
         cnt += draw_node("Title", get_value(ck, "docnm_kwd", "document_name"))
         cnt += draw_node("URL", ck['url']) if "url" in ck else ""
         for k, v in docs.get(get_value(ck, "doc_id", "document_id"), {}).items():
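The only change in this hunk is the modulus passed to `hash_str2int`, which, judging by its name and use here, maps a chunk ID string into a bounded integer range for display when `hash_id` is enabled; raising it from 100 to 500 leaves more headroom before two different chunks collide on the same short ID. A small illustrative sketch of the effect (the helper below is a stand-in, not RAGFlow's `hash_str2int`):

```python
import hashlib

def bucket_id(chunk_id: str, mod: int) -> int:
    """Illustrative stand-in for hash_str2int: map a string ID into [0, mod)."""
    digest = hashlib.md5(chunk_id.encode("utf-8")).hexdigest()
    return int(digest, 16) % mod

ids = [f"chunk-{i}" for i in range(300)]
for mod in (100, 500):
    distinct = {bucket_id(cid, mod) for cid in ids}
    print(f"mod={mod}: {len(ids) - len(distinct)} of {len(ids)} IDs share a bucket with an earlier one")
```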
@@ -100,12 +100,51 @@ class DataSet(Base):
         res = res.json()
         if res.get("code") != 0:
             raise Exception(res["message"])
 
+
+    def _get_documents_status(self, document_ids):
+        import time
+        terminal_states = {"DONE", "FAIL", "CANCEL"}
+        interval_sec = 1
+        pending = set(document_ids)
+        finished = []
+        while pending:
+            for doc_id in list(pending):
+                def fetch_doc(doc_id: str) -> Document | None:
+                    try:
+                        docs = self.list_documents(id=doc_id)
+                        return docs[0] if docs else None
+                    except Exception:
+                        return None
+                doc = fetch_doc(doc_id)
+                if doc is None:
+                    continue
+                if isinstance(doc.run, str) and doc.run.upper() in terminal_states:
+                    finished.append((doc_id, doc.run, doc.chunk_count, doc.token_count))
+                    pending.discard(doc_id)
+                elif float(doc.progress or 0.0) >= 1.0:
+                    finished.append((doc_id, "DONE", doc.chunk_count, doc.token_count))
+                    pending.discard(doc_id)
+            if pending:
+                time.sleep(interval_sec)
+        return finished
+
     def async_parse_documents(self, document_ids):
         res = self.post(f"/datasets/{self.id}/chunks", {"document_ids": document_ids})
         res = res.json()
         if res.get("code") != 0:
             raise Exception(res.get("message"))
 
+
+    def parse_documents(self, document_ids):
+        try:
+            self.async_parse_documents(document_ids)
+            self._get_documents_status(document_ids)
+        except KeyboardInterrupt:
+            self.async_cancel_parse_documents(document_ids)
+
+        return self._get_documents_status(document_ids)
+
+
     def async_cancel_parse_documents(self, document_ids):
         res = self.rm(f"/datasets/{self.id}/chunks", {"document_ids": document_ids})