Compare commits
117 Commits
| SHA1 |
|---|
| 6c32f80bc9 | |||
| 7e74546b73 | |||
| 25781113f9 | |||
| 16fa7db737 | |||
| a12fcf9156 | |||
| c27c02ea67 | |||
| 71068895ae | |||
| 93b35f4e58 | |||
| 9a01d1b876 | |||
| a7bd427116 | |||
| 2b36283712 | |||
| 6683179d6a | |||
| 673a28e492 | |||
| 2bfacd0469 | |||
| b3c923da6b | |||
| a1586e0af9 | |||
| f6a599461f | |||
| 081f922ee6 | |||
| 9f0f5b45cc | |||
| a2a6a35e94 | |||
| 9e5d501e83 | |||
| 4ca176bd41 | |||
| c3bc72dfd9 | |||
| 2dd705fe68 | |||
| d1614107e2 | |||
| 05fa3aeb08 | |||
| e73ce39b66 | |||
| d54d1375a5 | |||
| c6c9dbde64 | |||
| 95f809187e | |||
| d6772f5dd7 | |||
| 63ca15c595 | |||
| 7b144cc086 | |||
| 1c4e92ed35 | |||
| 10e83f26dc | |||
| 6ff63ee2ba | |||
| 12b4c5668c | |||
| baad35df30 | |||
| 5effbfac80 | |||
| 4d47b2b459 | |||
| d8c080ee52 | |||
| 626ace8639 | |||
| 1e923f1c90 | |||
| 234afb25d8 | |||
| aa1c915d6e | |||
| 77b1520b66 | |||
| 6b06ccead4 | |||
| 282f0857a3 | |||
| d7744f5870 | |||
| 9b21b66f23 | |||
| aa03dfa453 | |||
| 69b7c61498 | |||
| 8769619bb1 | |||
| ffe5737f7d | |||
| 04a9e95161 | |||
| 91b4a18c47 | |||
| 33eaf6fa2e | |||
| d65ba3e4d7 | |||
| bef1bbdf3e | |||
| 6b36f31f92 | |||
| 648a2baaa9 | |||
| 9392b8bc8f | |||
| 4153a36683 | |||
| bca63ad571 | |||
| 793e29f23a | |||
| 99be226c7c | |||
| 7ddb2f19be | |||
| c28f7b5d38 | |||
| 48607c3cfb | |||
| d15ba37313 | |||
| a553dc8dbd | |||
| eb27a4309e | |||
| 48e1534bf4 | |||
| e9d19c4684 | |||
| 8d6d7f6887 | |||
| a6e4b74d94 | |||
| a5aed2412f | |||
| 2810c60757 | |||
| 62afcf5ac8 | |||
| a74c755d83 | |||
| 7013d7f620 | |||
| de839fc3f0 | |||
| c6b6c748ae | |||
| ca5acc151a | |||
| 385dbe5ab5 | |||
| 3050a8cb07 | |||
| 9c77d367d0 | |||
| 5f03a4de11 | |||
| 290e5d958d | |||
| 9703633a57 | |||
| 7d3b68bb1e | |||
| c89f3c3cdb | |||
| 5d7f573379 | |||
| cab274f560 | |||
| 7059ec2298 | |||
| 674b3aeafd | |||
| 4c1476032d | |||
| 2af74cc494 | |||
| 38f0cc016f | |||
| 6874c6f3a7 | |||
| 8acc01a227 | |||
| 8c07992b6c | |||
| aee8b48d2f | |||
| daf215d266 | |||
| cdcc779705 | |||
| d589b0f568 | |||
| 9d60a84958 | |||
| aadb9cbec8 | |||
| 038822f3bd | |||
| ae501c58fa | |||
| 944776f207 | |||
| f1c98aad6b | |||
| ab06f502d7 | |||
| 6329339a32 | |||
| 84b39c60f6 | |||
| eb62c669ae | |||
| f69ff39fa0 |
**.gitignore** (vendored, 3 changes)

```diff
@@ -27,3 +27,6 @@ Cargo.lock
 # Exclude the log folder
 docker/ragflow-logs/
+/flask_session
+/logs
+rag/res/deepdoc
```

**Dockerfile changes** (the exact file names were not captured in this view):

```diff
@@ -1,10 +1,10 @@
-FROM swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow-base:v1.0
+FROM infiniflow/ragflow-base:v2.0
 USER root
 
 WORKDIR /ragflow
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf
@@ -15,6 +15,7 @@ ENV PYTHONPATH=/ragflow/
 ENV HF_ENDPOINT=https://hf-mirror.com
 
 ADD docker/entrypoint.sh ./entrypoint.sh
+ADD docker/.env ./
 RUN chmod +x ./entrypoint.sh
 
 ENTRYPOINT ["./entrypoint.sh"]
@@ -1,4 +1,4 @@
-FROM swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow-base:v1.0
+FROM FROM infiniflow/ragflow-base:v2.0
 USER root
 
 WORKDIR /ragflow
@@ -9,7 +9,7 @@ RUN /root/miniconda3/envs/py11/bin/pip install onnxruntime-gpu --extra-index-url
 
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf
@@ -34,7 +34,7 @@ ADD ./requirements.txt ./requirements.txt
 RUN apt install openmpi-bin openmpi-common libopenmpi-dev
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
 
 RUN apt-get update && \
@@ -35,7 +35,7 @@ RUN dnf install -y openmpi openmpi-devel python3-openmpi
 ENV C_INCLUDE_PATH /usr/include/openmpi-x86_64:$C_INCLUDE_PATH
 ENV LD_LIBRARY_PATH /usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install $(grep -ivE "mpi4py" ./requirements.txt) # without mpi4py==3.1.5
 RUN conda run -n py11 pip install redis
 
```

**README.md** (122 changes)

````diff
@@ -15,18 +15,31 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.6.0-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+alt="docker pull infiniflow/ragflow:v0.6.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 
 ## 💡 What is RAGFlow?
 
-[RAGFlow](https://demo.ragflow.io) is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. It offers a streamlined RAG workflow for businesses of any scale, combining LLM (Large Language Models) to provide truthful question-answering capabilities, backed by well-founded citations from various complex formatted data.
+[RAGFlow](https://ragflow.io/) is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. It offers a streamlined RAG workflow for businesses of any scale, combining LLM (Large Language Models) to provide truthful question-answering capabilities, backed by well-founded citations from various complex formatted data.
 
+## 📌 Latest Updates
+
+- 2024-05-21 Supports streaming output and text chunk retrieval API.
+- 2024-05-15 Integrates OpenAI GPT-4o.
+- 2024-05-08 Integrates LLM DeepSeek-V2.
+- 2024-04-26 Adds file management.
+- 2024-04-19 Supports conversation API ([detail](./docs/conversation_api.md)).
+- 2024-04-16 Integrates an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding), and [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
+- 2024-04-11 Supports [Xinference](./docs/xinference.md) for local LLM deployment.
+- 2024-04-10 Adds a new layout recognition model for analyzing legal documents.
+- 2024-04-08 Supports [Ollama](./docs/ollama.md) for local LLM deployment.
+- 2024-04-07 Supports Chinese UI.
+
 ## 🌟 Key Features
 
@@ -56,16 +69,6 @@
 - Multiple recall paired with fused re-ranking.
 - Intuitive APIs for seamless integration with business.
 
-## 📌 Latest Features
-
-- 2024-04-19 Support conversation API ([detail](./docs/conversation_api.md)).
-- 2024-04-16 Add an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding).
-- 2024-04-16 Add [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
-- 2024-04-11 Support [Xinference](./docs/xinference.md) for local LLM deployment.
-- 2024-04-10 Add a new layout recognization model for analyzing Laws documentation.
-- 2024-04-08 Support [Ollama](./docs/ollama.md) for local LLM deployment.
-- 2024-04-07 Support Chinese UI.
-
 ## 🔎 System Architecture
 
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
@@ -113,11 +116,14 @@
 
 3. Build the pre-built Docker images and start up the server:
 
+> Running the following commands automatically downloads the *dev* version RAGFlow Docker image. To download and run a specified Docker version, update `RAGFLOW_VERSION` in **docker/.env** to the intended version, for example `RAGFLOW_VERSION=v0.6.0`, before running the following commands.
+
 ```bash
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+
 > The core image is about 9 GB in size and may take a while to load.
 
@@ -179,14 +185,98 @@ To build the Docker images from source:
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:dev .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ Launch Service from Source
+
+To launch the service from source, please follow these steps:
+
+1. Clone the repository
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. Create a virtual environment (ensure Anaconda or Miniconda is installed)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+If CUDA version is greater than 12.0, execute the following additional commands:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. Copy the entry script and configure environment variables
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+Use the following commands to obtain the Python path and the ragflow project path:
+```bash
+$ which python
+$ pwd
+```
+
+Set the output of `which python` as the value for `PY` and the output of `pwd` as the value for `PYTHONPATH`.
+
+If `LD_LIBRARY_PATH` is already configured, it can be commented out.
+
+```bash
+# Adjust configurations according to your actual situation; the two export commands are newly added.
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# Optional: Add Hugging Face mirror
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. Start the base services
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. Check the configuration files
+Ensure that the settings in **docker/.env** match those in **conf/service_conf.yaml**. The IP addresses and ports for related services in **service_conf.yaml** should be changed to the local machine IP and ports exposed by the container.
+
+6. Launch the service
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+
+7. Start the WebUI service
+```bash
+$ cd web
+$ npm install --registry=https://registry.npmmirror.com --force
+$ vim .umirc.ts
+# Modify proxy.target to 127.0.0.1:9380
+$ npm run dev
+```
+
+8. Deploy the WebUI service
+```bash
+$ cd web
+$ npm install --registry=https://registry.npmmirror.com --force
+$ umi build
+$ mkdir -p /ragflow/web
+$ cp -r dist /ragflow/web
+$ apt install nginx -y
+$ cp ../docker/nginx/proxy.conf /etc/nginx
+$ cp ../docker/nginx/nginx.conf /etc/nginx
+$ cp ../docker/nginx/ragflow.conf /etc/nginx/conf.d
+$ systemctl start nginx
+```
+
 ## 📚 Documentation
 
+- [Quickstart](./docs/quickstart.md)
 - [FAQ](./docs/faq.md)
 
 ## 📜 Roadmap
````

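The new "Launch Service from Source" section asks readers to paste the output of `which python` and `pwd` into **entrypoint.sh** as `PY` and `PYTHONPATH`. As a convenience sketch only (not part of the change), the same two values can be printed from Python itself when the conda environment is active:

```python
# Convenience sketch: print the two values the README asks you to put into
# entrypoint.sh. Run it from the ragflow/ checkout with the ragflow env activated.
import os
import sys

print(f"PY={sys.executable}")               # same value as `which python`
print(f"export PYTHONPATH={os.getcwd()}")   # same value as `pwd`
```
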
**README_ja.md** (102 changes)

````diff
@@ -15,18 +15,33 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.6.0-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+alt="docker pull infiniflow/ragflow:v0.6.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 
 ## 💡 RAGFlow とは?
 
-[RAGFlow](https://demo.ragflow.io) は、深い文書理解に基づいたオープンソースの RAG (Retrieval-Augmented Generation) エンジンである。LLM(大規模言語モデル)を組み合わせることで、様々な複雑なフォーマットのデータから根拠のある引用に裏打ちされた、信頼できる質問応答機能を実現し、あらゆる規模のビジネスに適した RAG ワークフローを提供します。
+[RAGFlow](https://ragflow.io/) は、深い文書理解に基づいたオープンソースの RAG (Retrieval-Augmented Generation) エンジンである。LLM(大規模言語モデル)を組み合わせることで、様々な複雑なフォーマットのデータから根拠のある引用に裏打ちされた、信頼できる質問応答機能を実現し、あらゆる規模のビジネスに適した RAG ワークフローを提供します。
 
+## 📌 最新情報
+
+- 2024-05-21 ストリーミング出力とテキストチャンク取得APIをサポート。
+- 2024-05-15 OpenAI GPT-4oを統合しました。
+- 2024-05-08 LLM DeepSeek-V2を統合しました。
+- 2024-04-26 「ファイル管理」機能を追加しました。
+- 2024-04-19 会話 API をサポートします ([詳細](./docs/conversation_api.md))。
+- 2024-04-16 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) から埋め込みモデル「bce-embedding-base_v1」を追加します。
+- 2024-04-16 [FastEmbed](https://github.com/qdrant/fastembed) は、軽量かつ高速な埋め込み用に設計されています。
+- 2024-04-11 ローカル LLM デプロイメント用に [Xinference](./docs/xinference.md) をサポートします。
+- 2024-04-10 メソッド「Laws」に新しいレイアウト認識モデルを追加します。
+- 2024-04-08 [Ollama](./docs/ollama.md) を使用した大規模モデルのローカライズされたデプロイメントをサポートします。
+- 2024-04-07 中国語インターフェースをサポートします。
+
+
 ## 🌟 主な特徴
 
@@ -56,16 +71,6 @@
 - 複数の想起と融合された再ランク付け。
 - 直感的な API によってビジネスとの統合がシームレスに。
 
-## 📌 最新の機能
-
-- 2024-04-19 会話 API をサポートします ([詳細](./docs/conversation_api.md))。
-- 2024-04-16 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) から埋め込みモデル「bce-embedding-base_v1」を追加します。
-- 2024-04-16 [FastEmbed](https://github.com/qdrant/fastembed) は、軽量かつ高速な埋め込み用に設計されています。
-- 2024-04-11 ローカル LLM デプロイメント用に [Xinference](./docs/xinference.md) をサポートします。
-- 2024-04-10 メソッド「Laws」に新しいレイアウト認識モデルを追加します。
-- 2024-04-08 [Ollama](./docs/ollama.md) を使用した大規模モデルのローカライズされたデプロイメントをサポートします。
-- 2024-04-07 中国語インターフェースをサポートします。
-
 ## 🔎 システム構成
 
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
@@ -119,7 +124,9 @@
 $ docker compose up -d
 ```
 
-> コアイメージのサイズは約 15 GB で、ロードに時間がかかる場合があります。
+> 上記のコマンドを実行すると、RAGFlowの開発版dockerイメージが自動的にダウンロードされます。 特定のバージョンのDockerイメージをダウンロードして実行したい場合は、docker/.envファイルのRAGFLOW_VERSION変数を見つけて、対応するバージョンに変更してください。 例えば、RAGFLOW_VERSION=v0.6.0として、上記のコマンドを実行してください。
+
+> コアイメージのサイズは約 9 GB で、ロードに時間がかかる場合があります。
 
 4. サーバーを立ち上げた後、サーバーの状態を確認する:
 
@@ -179,14 +186,75 @@
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:v0.6.0 .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ ソースコードからサービスを起動する方法
+
+ソースコードからサービスを起動する場合は、以下の手順に従ってください:
+
+1. リポジトリをクローンします
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. 仮想環境を作成します(AnacondaまたはMinicondaがインストールされていることを確認してください)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+CUDAのバージョンが12.0以上の場合、以下の追加コマンドを実行してください:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. エントリースクリプトをコピーし、環境変数を設定します
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+以下のコマンドでPythonのパスとragflowプロジェクトのパスを取得します:
+```bash
+$ which python
+$ pwd
+```
+
+`which python`の出力を`PY`の値として、`pwd`の出力を`PYTHONPATH`の値として設定します。
+
+`LD_LIBRARY_PATH`が既に設定されている場合は、コメントアウトできます。
+
+```bash
+# 実際の状況に応じて設定を調整してください。以下の二つのexportは新たに追加された設定です
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# オプション:Hugging Faceミラーを追加
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. 基本サービスを起動します
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. 設定ファイルを確認します
+**docker/.env**内の設定が**conf/service_conf.yaml**内の設定と一致していることを確認してください。**service_conf.yaml**内の関連サービスのIPアドレスとポートは、ローカルマシンのIPアドレスとコンテナが公開するポートに変更する必要があります。
+
+6. サービスを起動します
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+
 ## 📚 ドキュメンテーション
 
+- [Quickstart](./docs/quickstart.md)
 - [FAQ](./docs/faq.md)
 
 ## 📜 ロードマップ
````

**README_zh.md** (121 changes)

````diff
@@ -15,18 +15,31 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.6.0-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+alt="docker pull infiniflow/ragflow:v0.6.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 
 ## 💡 RAGFlow 是什么?
 
-[RAGFlow](https://demo.ragflow.io) 是一款基于深度文档理解构建的开源 RAG(Retrieval-Augmented Generation)引擎。RAGFlow 可以为各种规模的企业及个人提供一套精简的 RAG 工作流程,结合大语言模型(LLM)针对用户各类不同的复杂格式数据提供可靠的问答以及有理有据的引用。
+[RAGFlow](https://ragflow.io/) 是一款基于深度文档理解构建的开源 RAG(Retrieval-Augmented Generation)引擎。RAGFlow 可以为各种规模的企业及个人提供一套精简的 RAG 工作流程,结合大语言模型(LLM)针对用户各类不同的复杂格式数据提供可靠的问答以及有理有据的引用。
 
+## 📌 近期更新
+
+- 2024-05-21 支持流式结果输出和文本块获取API。
+- 2024-05-15 集成大模型 OpenAI GPT-4o。
+- 2024-05-08 集成大模型 DeepSeek。
+- 2024-04-26 增添了'文件管理'功能。
+- 2024-04-19 支持对话 API ([更多](./docs/conversation_api.md))。
+- 2024-04-16 集成嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 和 专为轻型和高速嵌入而设计的 [FastEmbed](https://github.com/qdrant/fastembed)。
+- 2024-04-11 支持用 [Xinference](./docs/xinference.md) 本地化部署大模型。
+- 2024-04-10 为‘Laws’版面分析增加了底层模型。
+- 2024-04-08 支持用 [Ollama](./docs/ollama.md) 本地化部署大模型。
+- 2024-04-07 支持中文界面。
+
 ## 🌟 主要功能
 
@@ -56,16 +69,6 @@
 - 基于多路召回、融合重排序。
 - 提供易用的 API,可以轻松集成到各类企业系统。
 
-## 📌 新增功能
-
-- 2024-04-19 支持对话 API ([更多](./docs/conversation_api.md)).
-- 2024-04-16 添加嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 。
-- 2024-04-16 添加 [FastEmbed](https://github.com/qdrant/fastembed) 专为轻型和高速嵌入而设计。
-- 2024-04-11 支持用 [Xinference](./docs/xinference.md) 本地化部署大模型。
-- 2024-04-10 为‘Laws’版面分析增加了底层模型。
-- 2024-04-08 支持用 [Ollama](./docs/ollama.md) 本地化部署大模型。
-- 2024-04-07 支持中文界面。
-
 ## 🔎 系统架构
 
 <div align="center" style="margin-top:20px;margin-bottom:20px;">
@@ -119,7 +122,9 @@
 $ docker compose -f docker-compose-CN.yml up -d
 ```
 
-> 核心镜像文件大约 15 GB,可能需要一定时间拉取。请耐心等待。
+> 请注意,运行上述命令会自动下载 RAGFlow 的开发版本 docker 镜像。如果你想下载并运行特定版本的 docker 镜像,请在 docker/.env 文件中找到 RAGFLOW_VERSION 变量,将其改为对应版本。例如 RAGFLOW_VERSION=v0.6.0,然后运行上述命令。
+
+> 核心镜像文件大约 9 GB,可能需要一定时间拉取。请耐心等待。
 
 4. 服务器启动成功后再次确认服务器状态:
 
@@ -179,14 +184,96 @@
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:v0.6.0 .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ 源码启动服务
+
+如需从源码启动服务,请参考以下步骤:
+
+1. 克隆仓库
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. 创建虚拟环境(确保已安装 Anaconda 或 Miniconda)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+如果cuda > 12.0,需额外执行以下命令:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. 拷贝入口脚本并配置环境变量
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+使用以下命令获取python路径及ragflow项目路径:
+```bash
+$ which python
+$ pwd
+```
+
+将上述`which python`的输出作为`PY`的值,将`pwd`的输出作为`PYTHONPATH`的值。
+
+`LD_LIBRARY_PATH`如果环境已经配置好,可以注释掉。
+
+```bash
+# 此处配置需要按照实际情况调整,两个export为新增配置
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# 可选:添加Hugging Face镜像
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. 启动基础服务
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. 检查配置文件
+确保**docker/.env**中的配置与**conf/service_conf.yaml**中配置一致, **service_conf.yaml**中相关服务的IP地址与端口应该改成本机IP地址及容器映射出来的端口。
+
+6. 启动服务
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+7. 启动WebUI服务
+```bash
+$ cd web
+$ npm install --registry=https://registry.npmmirror.com --force
+$ vim .umirc.ts
+# 修改proxy.target为127.0.0.1:9380
+$ npm run dev
+```
+
+8. 部署WebUI服务
+```bash
+$ cd web
+$ npm install --registry=https://registry.npmmirror.com --force
+$ umi build
+$ mkdir -p /ragflow/web
+$ cp -r dist /ragflow/web
+$ apt install nginx -y
+$ cp ../docker/nginx/proxy.conf /etc/nginx
+$ cp ../docker/nginx/nginx.conf /etc/nginx
+$ cp ../docker/nginx/ragflow.conf /etc/nginx/conf.d
+$ systemctl start nginx
+```
 ## 📚 技术文档
 
+- [Quickstart](./docs/quickstart.md)
 - [FAQ](./docs/faq.md)
 
 ## 📜 路线图
````

**Python API-token routes** (file name not shown in this view) — the completion endpoint gains streaming output, upload can queue a parsing task, and a `/list_chunks` endpoint is added. Leading indentation was lost in this view and is not reconstructed here.

```diff
@@ -13,19 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import json
 import os
 import re
 from datetime import datetime, timedelta
-from flask import request
+from flask import request, Response
 from flask_login import login_required, current_user
 
 from api.db import FileType, ParserType
-from api.db.db_models import APIToken, API4Conversation
+from api.db.db_models import APIToken, API4Conversation, Task
 from api.db.services import duplicate_name
 from api.db.services.api_service import APITokenService, API4ConversationService
 from api.db.services.dialog_service import DialogService, chat
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.task_service import queue_tasks, TaskService
 from api.db.services.user_service import UserTenantService
 from api.settings import RetCode
 from api.utils import get_uuid, current_timestamp, datetime_format
@@ -33,8 +37,11 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
 from itsdangerous import URLSafeTimedSerializer
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.nlp import search
+from elasticsearch_dsl import Q
 
 def generate_confirmation_token(tenent_id):
 serializer = URLSafeTimedSerializer(tenent_id)
@@ -164,6 +171,7 @@ def completion():
 e, conv = API4ConversationService.get_by_id(req["conversation_id"])
 if not e:
 return get_data_error_result(retmsg="Conversation not found!")
+if "quote" not in req: req["quote"] = False
 
 msg = []
 for m in req["messages"]:
@@ -180,13 +188,48 @@ def completion():
 return get_data_error_result(retmsg="Dialog not found!")
 del req["conversation_id"]
 del req["messages"]
-ans = chat(dia, msg, **req)
 if not conv.reference:
 conv.reference = []
-conv.reference.append(ans["reference"])
+conv.message.append({"role": "assistant", "content": ""})
-conv.message.append({"role": "assistant", "content": ans["answer"]})
+conv.reference.append({"chunks": [], "doc_aggs": []})
-API4ConversationService.append_message(conv.id, conv.to_dict())
-return get_json_result(data=ans)
+def fillin_conv(ans):
+nonlocal conv
+if not conv.reference:
+conv.reference.append(ans["reference"])
+else: conv.reference[-1] = ans["reference"]
+conv.message[-1] = {"role": "assistant", "content": ans["answer"]}
+
+def stream():
+nonlocal dia, msg, req, conv
+try:
+for ans in chat(dia, msg, True, **req):
+fillin_conv(ans)
+yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": ans}, ensure_ascii=False) + "\n\n"
+API4ConversationService.append_message(conv.id, conv.to_dict())
+except Exception as e:
+yield "data:" + json.dumps({"retcode": 500, "retmsg": str(e),
+"data": {"answer": "**ERROR**: "+str(e), "reference": []}},
+ensure_ascii=False) + "\n\n"
+yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": True}, ensure_ascii=False) + "\n\n"
+
+if req.get("stream", True):
+resp = Response(stream(), mimetype="text/event-stream")
+resp.headers.add_header("Cache-control", "no-cache")
+resp.headers.add_header("Connection", "keep-alive")
+resp.headers.add_header("X-Accel-Buffering", "no")
+resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8")
+return resp
+
+else:
+answer = None
+for ans in chat(dia, msg, **req):
+answer = ans
+fillin_conv(ans)
+API4ConversationService.append_message(conv.id, conv.to_dict())
+break
+return get_json_result(data=answer)
+
 except Exception as e:
 return server_error_response(e)
 
@@ -233,6 +276,13 @@ def upload():
 if file.filename == '':
 return get_json_result(
 data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
+
+root_folder = FileService.get_root_folder(tenant_id)
+pf_id = root_folder["id"]
+FileService.init_knowledgebase_docs(pf_id, tenant_id)
+kb_root_folder = FileService.get_kb_folder(tenant_id)
+kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
+
 try:
 if DocumentService.get_doc_count(kb.tenant_id) >= int(os.environ.get('MAX_FILE_NUM_PER_USER', 8192)):
 return get_data_error_result(
@@ -264,11 +314,82 @@ def upload():
 "size": len(blob),
 "thumbnail": thumbnail(filename, blob)
 }
+
+form_data=request.form
+if "parser_id" in form_data.keys():
+if request.form.get("parser_id").strip() in list(vars(ParserType).values())[1:-3]:
+doc["parser_id"] = request.form.get("parser_id").strip()
 if doc["type"] == FileType.VISUAL:
 doc["parser_id"] = ParserType.PICTURE.value
 if re.search(r"\.(ppt|pptx|pages)$", filename):
 doc["parser_id"] = ParserType.PRESENTATION.value
-doc = DocumentService.insert(doc)
-return get_json_result(data=doc.to_json())
+doc_result = DocumentService.insert(doc)
+FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
 except Exception as e:
 return server_error_response(e)
+
+if "run" in form_data.keys():
+if request.form.get("run").strip() == "1":
+try:
+info = {"run": 1, "progress": 0}
+info["progress_msg"] = ""
+info["chunk_num"] = 0
+info["token_num"] = 0
+DocumentService.update_by_id(doc["id"], info)
+# if str(req["run"]) == TaskStatus.CANCEL.value:
+tenant_id = DocumentService.get_tenant_id(doc["id"])
+if not tenant_id:
+return get_data_error_result(retmsg="Tenant not found!")
+
+#e, doc = DocumentService.get_by_id(doc["id"])
+TaskService.filter_delete([Task.doc_id == doc["id"]])
+e, doc = DocumentService.get_by_id(doc["id"])
+doc = doc.to_dict()
+doc["tenant_id"] = tenant_id
+bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+queue_tasks(doc, bucket, name)
+except Exception as e:
+return server_error_response(e)
+
+return get_json_result(data=doc_result.to_json())
+
+
+@manager.route('/list_chunks', methods=['POST'])
+# @login_required
+def list_chunks():
+token = request.headers.get('Authorization').split()[1]
+objs = APIToken.query(token=token)
+if not objs:
+return get_json_result(
+data=False, retmsg='Token is not valid!"', retcode=RetCode.AUTHENTICATION_ERROR)
+
+form_data = request.form
+
+try:
+if "doc_name" in form_data.keys():
+tenant_id = DocumentService.get_tenant_id_by_name(form_data['doc_name'])
+q = Q("match", docnm_kwd=form_data['doc_name'])
+
+elif "doc_id" in form_data.keys():
+tenant_id = DocumentService.get_tenant_id(form_data['doc_id'])
+q = Q("match", doc_id=form_data['doc_id'])
+else:
+return get_json_result(
+data=False,retmsg="Can't find doc_name or doc_id"
+)
+
+res_es_search = ELASTICSEARCH.search(q,idxnm=search.index_name(tenant_id),timeout="600s")
+
+res = [{} for _ in range(len(res_es_search['hits']['hits']))]
+
+for index , chunk in enumerate(res_es_search['hits']['hits']):
+res[index]['doc_name'] = chunk['_source']['docnm_kwd']
+res[index]['content'] = chunk['_source']['content_with_weight']
+if 'img_id' in chunk['_source'].keys():
+res[index]['img_id'] = chunk['_source']['img_id']
+
+except Exception as e:
+return server_error_response(e)
+
+return get_json_result(data=res)
```

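When `stream` is true, the completion handler above emits server-sent events: each event is a `data:` line carrying JSON with `retcode`, `retmsg`, and `data`, and the stream ends with a final event whose `data` is `true`. Below is a minimal client sketch for consuming that stream; the URL and the `Bearer` token header are placeholders, since the route decorator and authentication setup are outside the hunks shown here.

```python
# Minimal sketch of a client for the streaming completion API.
# Assumptions: API_URL and the Authorization header value are hypothetical;
# the event format ("data:" + JSON with retcode/retmsg/data) is taken from the diff above.
import json
import requests

API_URL = "http://127.0.0.1:9380/v1/api/completion"  # hypothetical host and path
TOKEN = "YOUR_API_TOKEN"                              # hypothetical token

payload = {
    "conversation_id": "CONVERSATION_ID",
    "messages": [{"role": "user", "content": "What is RAGFlow?"}],
    "stream": True,
}
with requests.post(API_URL, json=payload, stream=True,
                   headers={"Authorization": f"Bearer {TOKEN}"}) as resp:
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data:"):
            continue                         # skip blank keep-alive lines
        event = json.loads(raw[len("data:"):])
        if event["data"] is True:            # final sentinel event closes the stream
            break
        print(event["data"]["answer"])       # answer text carried by this event
```
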
**Chunk routes** (file name not shown) — `huqie` is replaced by `rag_tokenizer` and `list()` is renamed to `list_chunk()`:

```diff
@@ -20,8 +20,9 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, huqie
+from rag.nlp import search, rag_tokenizer
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
@@ -37,7 +38,7 @@ import re
 @manager.route('/list', methods=['POST'])
 @login_required
 @validate_request("doc_id")
-def list():
+def list_chunk():
 req = request.json
 doc_id = req["doc_id"]
 page = int(req.get("page", 1))
@@ -124,10 +125,10 @@ def set():
 d = {
 "id": req["chunk_id"],
 "content_with_weight": req["content_with_weight"]}
-d["content_ltks"] = huqie.qie(req["content_with_weight"])
+d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 d["important_kwd"] = req["important_kwd"]
-d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
+d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
 if "available_int" in req:
 d["available_int"] = req["available_int"]
 
@@ -151,7 +152,7 @@ def set():
 retmsg="Q&A must be separated by TAB/ENTER key.")
 q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
 d = beAdoc(d, arr[0], arr[1], not any(
-[huqie.is_chinese(t) for t in q + a]))
+[rag_tokenizer.is_chinese(t) for t in q + a]))
 
 v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
 v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -201,11 +202,11 @@ def create():
 md5 = hashlib.md5()
 md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
 chunck_id = md5.hexdigest()
-d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
 "content_with_weight": req["content_with_weight"]}
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 d["important_kwd"] = req.get("important_kwd", [])
-d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
+d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
 d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
 d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
```

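The rename visible in this diff maps `huqie.qie` to `rag_tokenizer.tokenize`, `huqie.qieqie` to `rag_tokenizer.fine_grained_tokenize`, and `huqie.is_chinese` to `rag_tokenizer.is_chinese`. A small sketch of the migrated call pattern, mirroring how the chunk fields are built above (field names and calls are taken from the diff; the sample text is illustrative):

```python
# Sketch of the renamed tokenizer calls as used when building a chunk document.
from rag.nlp import rag_tokenizer

content = "RAGFlow is an open-source RAG engine."
d = {"content_with_weight": content}
d["content_ltks"] = rag_tokenizer.tokenize(content)                             # was huqie.qie(...)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])   # was huqie.qieqie(...)
d["important_tks"] = rag_tokenizer.tokenize(" ".join(["RAG", "engine"]))        # was huqie.qie(...)
```
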
**Conversation routes** (file name not shown) — the completion endpoint gains a streaming mode:

```diff
@@ -13,12 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from flask import request
+from flask import request, Response, jsonify
 from flask_login import login_required
 from api.db.services.dialog_service import DialogService, ConversationService, chat
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
 from api.utils.api_utils import get_json_result
+import json
 
 
 @manager.route('/set', methods=['POST'])
@@ -103,9 +104,12 @@ def list_convsersation():
 
 @manager.route('/completion', methods=['POST'])
 @login_required
-@validate_request("conversation_id", "messages")
+#@validate_request("conversation_id", "messages")
 def completion():
 req = request.json
+#req = {"conversation_id": "9aaaca4c11d311efa461fa163e197198", "messages": [
+# {"role": "user", "content": "上海有吗?"}
+#]}
 msg = []
 for m in req["messages"]:
 if m["role"] == "system":
@@ -123,13 +127,48 @@ def completion():
 return get_data_error_result(retmsg="Dialog not found!")
 del req["conversation_id"]
 del req["messages"]
-ans = chat(dia, msg, **req)
 if not conv.reference:
 conv.reference = []
-conv.reference.append(ans["reference"])
+conv.message.append({"role": "assistant", "content": ""})
-conv.message.append({"role": "assistant", "content": ans["answer"]})
+conv.reference.append({"chunks": [], "doc_aggs": []})
-ConversationService.update_by_id(conv.id, conv.to_dict())
-return get_json_result(data=ans)
+def fillin_conv(ans):
+nonlocal conv
+if not conv.reference:
+conv.reference.append(ans["reference"])
+else: conv.reference[-1] = ans["reference"]
+conv.message[-1] = {"role": "assistant", "content": ans["answer"]}
+
+def stream():
+nonlocal dia, msg, req, conv
+try:
+for ans in chat(dia, msg, True, **req):
+fillin_conv(ans)
+yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": ans}, ensure_ascii=False) + "\n\n"
+ConversationService.update_by_id(conv.id, conv.to_dict())
+except Exception as e:
+yield "data:" + json.dumps({"retcode": 500, "retmsg": str(e),
+"data": {"answer": "**ERROR**: "+str(e), "reference": []}},
+ensure_ascii=False) + "\n\n"
+yield "data:"+json.dumps({"retcode": 0, "retmsg": "", "data": True}, ensure_ascii=False) + "\n\n"
+
+if req.get("stream", True):
+resp = Response(stream(), mimetype="text/event-stream")
+resp.headers.add_header("Cache-control", "no-cache")
+resp.headers.add_header("Connection", "keep-alive")
+resp.headers.add_header("X-Accel-Buffering", "no")
+resp.headers.add_header("Content-Type", "text/event-stream; charset=utf-8")
+return resp
+
+else:
+answer = None
+for ans in chat(dia, msg, **req):
+answer = ans
+fillin_conv(ans)
+ConversationService.update_by_id(conv.id, conv.to_dict())
+break
+return get_json_result(data=answer)
 except Exception as e:
 return server_error_response(e)
 
```

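Both completion endpoints now use the same Flask pattern: wrap a generator in `Response(..., mimetype="text/event-stream")` and set `X-Accel-Buffering: no` so a fronting nginx does not buffer the stream. A stripped-down, self-contained sketch of that pattern (illustrative only; RAGFlow's real handlers wrap their `chat()` generator instead of this dummy loop):

```python
# Self-contained sketch of the SSE response pattern used in the handlers above.
import json
import time

from flask import Flask, Response

app = Flask(__name__)

@app.route("/demo_stream")
def demo_stream():
    def stream():
        for i in range(3):
            # Each SSE event is "data:" + JSON payload + a blank line.
            yield "data:" + json.dumps(
                {"retcode": 0, "retmsg": "", "data": {"answer": f"token {i}"}},
                ensure_ascii=False) + "\n\n"
            time.sleep(0.1)
        # Final sentinel event, mirroring the diff's closing "data": True message.
        yield "data:" + json.dumps({"retcode": 0, "retmsg": "", "data": True},
                                   ensure_ascii=False) + "\n\n"

    resp = Response(stream(), mimetype="text/event-stream")
    resp.headers.add_header("Cache-control", "no-cache")
    resp.headers.add_header("Connection", "keep-alive")
    resp.headers.add_header("X-Accel-Buffering", "no")  # tell nginx not to buffer the stream
    return resp

if __name__ == "__main__":
    app.run(port=5000)
```
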
**Dialog routes** (file name not shown) — the hard-coded `llm_setting` defaults are dropped and `list()` is renamed to `list_dialogs()`:

```diff
@@ -35,13 +35,7 @@ def set_dialog():
 top_n = req.get("top_n", 6)
 similarity_threshold = req.get("similarity_threshold", 0.1)
 vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
-llm_setting = req.get("llm_setting", {
+llm_setting = req.get("llm_setting", {})
-"temperature": 0.1,
-"top_p": 0.3,
-"frequency_penalty": 0.7,
-"presence_penalty": 0.4,
-"max_tokens": 215
-})
 default_prompt = {
 "system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。
 以下是知识库:
@@ -142,7 +136,7 @@ def get_kb_names(kb_ids):
 
 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_dialogs():
 try:
 diags = DialogService.query(
 tenant_id=current_user.id,
```

@ -14,7 +14,6 @@
|
|||||||
# limitations under the License
|
# limitations under the License
|
||||||
#
|
#
|
||||||
|
|
||||||
import base64
|
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
@ -23,13 +22,18 @@ import flask
|
|||||||
from elasticsearch_dsl import Q
|
from elasticsearch_dsl import Q
|
||||||
from flask import request
|
from flask import request
|
||||||
from flask_login import login_required, current_user
|
from flask_login import login_required, current_user
|
||||||
|
|
||||||
|
from api.db.db_models import Task, File
|
||||||
|
from api.db.services.file2document_service import File2DocumentService
|
||||||
|
from api.db.services.file_service import FileService
|
||||||
|
from api.db.services.task_service import TaskService, queue_tasks
|
||||||
from rag.nlp import search
|
from rag.nlp import search
|
||||||
from rag.utils import ELASTICSEARCH
|
from rag.utils.es_conn import ELASTICSEARCH
|
||||||
from api.db.services import duplicate_name
|
from api.db.services import duplicate_name
|
||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||||
from api.utils import get_uuid
|
from api.utils import get_uuid
|
||||||
from api.db import FileType, TaskStatus, ParserType
|
from api.db import FileType, TaskStatus, ParserType, FileSource
|
||||||
from api.db.services.document_service import DocumentService
|
from api.db.services.document_service import DocumentService
|
||||||
from api.settings import RetCode
|
from api.settings import RetCode
|
||||||
from api.utils.api_utils import get_json_result
|
from api.utils.api_utils import get_json_result
|
||||||
@ -48,55 +52,68 @@ def upload():
|
|||||||
if 'file' not in request.files:
|
if 'file' not in request.files:
|
||||||
return get_json_result(
|
return get_json_result(
|
||||||
data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
|
data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
|
||||||
file = request.files['file']
|
|
||||||
if file.filename == '':
|
file_objs = request.files.getlist('file')
|
||||||
|
for file_obj in file_objs:
|
||||||
|
if file_obj.filename == '':
|
||||||
|
return get_json_result(
|
||||||
|
                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
-    try:
-        e, kb = KnowledgebaseService.get_by_id(kb_id)
-        if not e:
-            return get_data_error_result(
-                retmsg="Can't find this knowledgebase!")
-        MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
-        if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
-            return get_data_error_result(
-                retmsg="Exceed the maximum file number of a free user!")
-
-        filename = duplicate_name(
-            DocumentService.query,
-            name=file.filename,
-            kb_id=kb.id)
-        filetype = filename_type(filename)
-        if not filetype:
-            return get_data_error_result(
-                retmsg="This type of file has not been supported yet!")
-
-        location = filename
-        while MINIO.obj_exist(kb_id, location):
-            location += "_"
-        blob = request.files['file'].read()
-        MINIO.put(kb_id, location, blob)
-        doc = {
-            "id": get_uuid(),
-            "kb_id": kb.id,
-            "parser_id": kb.parser_id,
-            "parser_config": kb.parser_config,
-            "created_by": current_user.id,
-            "type": filetype,
-            "name": filename,
-            "location": location,
-            "size": len(blob),
-            "thumbnail": thumbnail(filename, blob)
-        }
-        if doc["type"] == FileType.VISUAL:
-            doc["parser_id"] = ParserType.PICTURE.value
-        if re.search(r"\.(ppt|pptx|pages)$", filename):
-            doc["parser_id"] = ParserType.PRESENTATION.value
-        doc = DocumentService.insert(doc)
-        return get_json_result(data=doc.to_json())
-    except Exception as e:
-        return server_error_response(e)
+    e, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not e:
+        raise LookupError("Can't find this knowledgebase!")
+
+    root_folder = FileService.get_root_folder(current_user.id)
+    pf_id = root_folder["id"]
+    FileService.init_knowledgebase_docs(pf_id, current_user.id)
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
+
+    err = []
+    for file in file_objs:
+        try:
+            MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                raise RuntimeError("Exceed the maximum file number of a free user!")
+
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=kb.id)
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                raise RuntimeError("This type of file has not been supported yet!")
+
+            location = filename
+            while MINIO.obj_exist(kb_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(kb_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": kb.id,
+                "parser_id": kb.parser_id,
+                "parser_config": kb.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+    if err:
+        return get_json_result(
+            data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
+    return get_json_result(data=True)
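The reworked upload handler above reads request.files.getlist('file') and collects per-file failures into err instead of aborting on the first error, so one request can carry several documents into a knowledge base. A minimal client sketch, assuming a logged-in session; the base URL, file names and kb_id are placeholders, not values taken from this change:

import requests

# Hedged sketch: several files in a single /document/upload request.
session = requests.Session()                      # assumed to hold a logged-in session cookie
files = [
    ("file", ("a.pdf", open("a.pdf", "rb"))),
    ("file", ("b.docx", open("b.docx", "rb"))),
]
resp = session.post(
    "http://127.0.0.1:9380/v1/document/upload",   # host/port/path are assumptions
    data={"kb_id": "<knowledgebase-id>"},
    files=files,
)
print(resp.json())   # {"data": true} on success; per-file errors are joined into retmsg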
 @manager.route('/create', methods=['POST'])

@@ -137,7 +154,7 @@ def create():

 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_docs():
     kb_id = request.args.get("kb_id")
     if not kb_id:
         return get_json_result(
@@ -218,26 +235,39 @@ def change_status():
 @validate_request("doc_id")
 def rm():
     req = request.json
-    try:
-        e, doc = DocumentService.get_by_id(req["doc_id"])
-        if not e:
-            return get_data_error_result(retmsg="Document not found!")
-        tenant_id = DocumentService.get_tenant_id(req["doc_id"])
-        if not tenant_id:
-            return get_data_error_result(retmsg="Tenant not found!")
-        ELASTICSEARCH.deleteByQuery(
-            Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
-        DocumentService.increment_chunk_num(
-            doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
-        if not DocumentService.delete(doc):
-            return get_data_error_result(
-                retmsg="Database error (Document removal)!")
-        MINIO.rm(doc.kb_id, doc.location)
-        return get_json_result(data=True)
-    except Exception as e:
-        return server_error_response(e)
+    doc_ids = req["doc_id"]
+    if isinstance(doc_ids, str): doc_ids = [doc_ids]
+    root_folder = FileService.get_root_folder(current_user.id)
+    pf_id = root_folder["id"]
+    FileService.init_knowledgebase_docs(pf_id, current_user.id)
+    errors = ""
+    for doc_id in doc_ids:
+        try:
+            e, doc = DocumentService.get_by_id(doc_id)
+            if not e:
+                return get_data_error_result(retmsg="Document not found!")
+            tenant_id = DocumentService.get_tenant_id(doc_id)
+            if not tenant_id:
+                return get_data_error_result(retmsg="Tenant not found!")
+
+            b, n = File2DocumentService.get_minio_address(doc_id=doc_id)
+
+            if not DocumentService.remove_document(doc, tenant_id):
+                return get_data_error_result(
+                    retmsg="Database error (Document removal)!")
+
+            f2d = File2DocumentService.get_by_document_id(doc_id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc_id)
+
+            MINIO.rm(b, n)
+        except Exception as e:
+            errors += str(e)
+
+    if errors:
+        return get_json_result(data=False, retmsg=errors, retcode=RetCode.SERVER_ERROR)
+
+    return get_json_result(data=True)
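With the change above, /document/rm accepts "doc_id" either as a single id or as a list, removing each document, its file link and its MinIO object while accumulating any error messages. Both payload shapes below are placeholders for illustration:

import requests

# Hedged sketch: the two accepted payload shapes for the batched rm endpoint.
base = "http://127.0.0.1:9380/v1/document/rm"                    # URL is an assumption
requests.post(base, json={"doc_id": "doc-id-1"})                 # single id (string)
requests.post(base, json={"doc_id": ["doc-id-1", "doc-id-2"]})   # batch (list of ids)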
 @manager.route('/run', methods=['POST'])

@@ -259,6 +289,14 @@ def run():
                 return get_data_error_result(retmsg="Tenant not found!")
             ELASTICSEARCH.deleteByQuery(
                 Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+            if str(req["run"]) == TaskStatus.RUNNING.value:
+                TaskService.filter_delete([Task.doc_id == id])
+                e, doc = DocumentService.get_by_id(id)
+                doc = doc.to_dict()
+                doc["tenant_id"] = tenant_id
+                bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+                queue_tasks(doc, bucket, name)
+
         return get_json_result(data=True)
     except Exception as e:

@@ -280,15 +318,21 @@ def rename():
                 data=False,
                 retmsg="The extension of file can't be changed",
                 retcode=RetCode.ARGUMENT_ERROR)
-        if DocumentService.query(name=req["name"], kb_id=doc.kb_id):
-            return get_data_error_result(
-                retmsg="Duplicated document name in the same knowledgebase.")
+        for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
+            if d.name == req["name"]:
+                return get_data_error_result(
+                    retmsg="Duplicated document name in the same knowledgebase.")
+
         if not DocumentService.update_by_id(
                 req["doc_id"], {"name": req["name"]}):
             return get_data_error_result(
                 retmsg="Database error (Document rename)!")
+
+        informs = File2DocumentService.get_by_document_id(req["doc_id"])
+        if informs:
+            e, file = FileService.get_by_id(informs[0].file_id)
+            FileService.update_by_id(file.id, {"name": req["name"]})
+
         return get_json_result(data=True)
     except Exception as e:
         return server_error_response(e)

@@ -302,7 +346,9 @@ def get(doc_id):
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
+
+        b,n = File2DocumentService.get_minio_address(doc_id=doc_id)
+        response = flask.make_response(MINIO.get(b, n))
+
         ext = re.search(r"\.([^.]+)$", doc.name)
         if ext:
             if doc.type == FileType.VISUAL.value:

@@ -338,7 +384,8 @@ def change_parser():
             return get_data_error_result(retmsg="Not supported yet!")

         e = DocumentService.update_by_id(doc.id,
-                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
+                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+                                          "run": TaskStatus.UNSTART.value})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         if "parser_config" in req:
api/apps/file2document_app.py (new file)
@@ -0,0 +1,129 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
from elasticsearch_dsl import Q

from api.db.db_models import File2Document
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService

from flask import request
from flask_login import login_required, current_user
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType
from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH


@manager.route('/convert', methods=['POST'])
@login_required
@validate_request("file_ids", "kb_ids")
def convert():
    req = request.json
    kb_ids = req["kb_ids"]
    file_ids = req["file_ids"]
    file2documents = []

    try:
        for file_id in file_ids:
            e, file = FileService.get_by_id(file_id)
            file_ids_list = [file_id]
            if file.type == FileType.FOLDER.value:
                file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
            for id in file_ids_list:
                informs = File2DocumentService.get_by_file_id(id)
                # delete
                for inform in informs:
                    doc_id = inform.document_id
                    e, doc = DocumentService.get_by_id(doc_id)
                    if not e:
                        return get_data_error_result(retmsg="Document not found!")
                    tenant_id = DocumentService.get_tenant_id(doc_id)
                    if not tenant_id:
                        return get_data_error_result(retmsg="Tenant not found!")
                    if not DocumentService.remove_document(doc, tenant_id):
                        return get_data_error_result(
                            retmsg="Database error (Document removal)!")
                File2DocumentService.delete_by_file_id(id)

                # insert
                for kb_id in kb_ids:
                    e, kb = KnowledgebaseService.get_by_id(kb_id)
                    if not e:
                        return get_data_error_result(
                            retmsg="Can't find this knowledgebase!")
                    e, file = FileService.get_by_id(id)
                    if not e:
                        return get_data_error_result(
                            retmsg="Can't find this file!")

                    doc = DocumentService.insert({
                        "id": get_uuid(),
                        "kb_id": kb.id,
                        "parser_id": kb.parser_id,
                        "parser_config": kb.parser_config,
                        "created_by": current_user.id,
                        "type": file.type,
                        "name": file.name,
                        "location": file.location,
                        "size": file.size
                    })
                    file2document = File2DocumentService.insert({
                        "id": get_uuid(),
                        "file_id": id,
                        "document_id": doc.id,
                    })
                    file2documents.append(file2document.to_json())
        return get_json_result(data=file2documents)
    except Exception as e:
        return server_error_response(e)


@manager.route('/rm', methods=['POST'])
@login_required
@validate_request("file_ids")
def rm():
    req = request.json
    file_ids = req["file_ids"]
    if not file_ids:
        return get_json_result(
            data=False, retmsg='Lack of "Files ID"', retcode=RetCode.ARGUMENT_ERROR)
    try:
        for file_id in file_ids:
            informs = File2DocumentService.get_by_file_id(file_id)
            if not informs:
                return get_data_error_result(retmsg="Inform not found!")
            for inform in informs:
                if not inform:
                    return get_data_error_result(retmsg="Inform not found!")
                File2DocumentService.delete_by_file_id(file_id)
                doc_id = inform.document_id
                e, doc = DocumentService.get_by_id(doc_id)
                if not e:
                    return get_data_error_result(retmsg="Document not found!")
                tenant_id = DocumentService.get_tenant_id(doc_id)
                if not tenant_id:
                    return get_data_error_result(retmsg="Tenant not found!")
                if not DocumentService.remove_document(doc, tenant_id):
                    return get_data_error_result(
                        retmsg="Database error (Document removal)!")
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)
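The /convert endpoint above links selected files (folders are expanded to their innermost files) to one or more knowledge bases: any existing document for a file is removed, then a fresh Document plus a File2Document mapping is created per (file, knowledge base) pair. A hedged request sketch, with ids and URL as placeholders:

import requests

payload = {
    "file_ids": ["<file-or-folder-id>"],
    "kb_ids": ["<knowledgebase-id>"],
}
resp = requests.post("http://127.0.0.1:9380/v1/file2document/convert", json=payload)
print(resp.json())   # the created file2document records on success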
api/apps/file_app.py (new file)
@@ -0,0 +1,347 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
import os
import pathlib
import re

import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user

from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, FileSource
from api.db.services import duplicate_name
from api.db.services.file_service import FileService
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from api.utils.file_utils import filename_type
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO


@manager.route('/upload', methods=['POST'])
@login_required
# @validate_request("parent_id")
def upload():
    pf_id = request.form.get("parent_id")

    if not pf_id:
        root_folder = FileService.get_root_folder(current_user.id)
        pf_id = root_folder["id"]

    if 'file' not in request.files:
        return get_json_result(
            data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
    file_objs = request.files.getlist('file')

    for file_obj in file_objs:
        if file_obj.filename == '':
            return get_json_result(
                data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
    file_res = []
    try:
        for file_obj in file_objs:
            e, file = FileService.get_by_id(pf_id)
            if not e:
                return get_data_error_result(
                    retmsg="Can't find this folder!")
            MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(current_user.id) >= MAX_FILE_NUM_PER_USER:
                return get_data_error_result(
                    retmsg="Exceed the maximum file number of a free user!")

            # split file name path
            if not file_obj.filename:
                e, file = FileService.get_by_id(pf_id)
                file_obj_names = [file.name, file_obj.filename]
            else:
                full_path = '/' + file_obj.filename
                file_obj_names = full_path.split('/')
            file_len = len(file_obj_names)

            # get folder
            file_id_list = FileService.get_id_list_by_id(pf_id, file_obj_names, 1, [pf_id])
            len_id_list = len(file_id_list)

            # create folder
            if file_len != len_id_list:
                e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
                if not e:
                    return get_data_error_result(retmsg="Folder not found!")
                last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
                                                        len_id_list)
            else:
                e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
                if not e:
                    return get_data_error_result(retmsg="Folder not found!")
                last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
                                                        len_id_list)

            # file type
            filetype = filename_type(file_obj_names[file_len - 1])
            location = file_obj_names[file_len - 1]
            while MINIO.obj_exist(last_folder.id, location):
                location += "_"
            blob = file_obj.read()
            filename = duplicate_name(
                FileService.query,
                name=file_obj_names[file_len - 1],
                parent_id=last_folder.id)
            file = {
                "id": get_uuid(),
                "parent_id": last_folder.id,
                "tenant_id": current_user.id,
                "created_by": current_user.id,
                "type": filetype,
                "name": filename,
                "location": location,
                "size": len(blob),
            }
            file = FileService.insert(file)
            MINIO.put(last_folder.id, location, blob)
            file_res.append(file.to_json())
        return get_json_result(data=file_res)
    except Exception as e:
        return server_error_response(e)


@manager.route('/create', methods=['POST'])
@login_required
@validate_request("name")
def create():
    req = request.json
    pf_id = request.json.get("parent_id")
    input_file_type = request.json.get("type")
    if not pf_id:
        root_folder = FileService.get_root_folder(current_user.id)
        pf_id = root_folder["id"]

    try:
        if not FileService.is_parent_folder_exist(pf_id):
            return get_json_result(
                data=False, retmsg="Parent Folder Doesn't Exist!", retcode=RetCode.OPERATING_ERROR)
        if FileService.query(name=req["name"], parent_id=pf_id):
            return get_data_error_result(
                retmsg="Duplicated folder name in the same folder.")

        if input_file_type == FileType.FOLDER.value:
            file_type = FileType.FOLDER.value
        else:
            file_type = FileType.VIRTUAL.value

        file = FileService.insert({
            "id": get_uuid(),
            "parent_id": pf_id,
            "tenant_id": current_user.id,
            "created_by": current_user.id,
            "name": req["name"],
            "location": "",
            "size": 0,
            "type": file_type
        })

        return get_json_result(data=file.to_json())
    except Exception as e:
        return server_error_response(e)


@manager.route('/list', methods=['GET'])
@login_required
def list_files():
    pf_id = request.args.get("parent_id")

    keywords = request.args.get("keywords", "")

    page_number = int(request.args.get("page", 1))
    items_per_page = int(request.args.get("page_size", 15))
    orderby = request.args.get("orderby", "create_time")
    desc = request.args.get("desc", True)
    if not pf_id:
        root_folder = FileService.get_root_folder(current_user.id)
        pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, current_user.id)
    try:
        e, file = FileService.get_by_id(pf_id)
        if not e:
            return get_data_error_result(retmsg="Folder not found!")

        files, total = FileService.get_by_pf_id(
            current_user.id, pf_id, page_number, items_per_page, orderby, desc, keywords)

        parent_folder = FileService.get_parent_folder(pf_id)
        if not FileService.get_parent_folder(pf_id):
            return get_json_result(retmsg="File not found!")

        return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
    except Exception as e:
        return server_error_response(e)


@manager.route('/root_folder', methods=['GET'])
@login_required
def get_root_folder():
    try:
        root_folder = FileService.get_root_folder(current_user.id)
        return get_json_result(data={"root_folder": root_folder})
    except Exception as e:
        return server_error_response(e)


@manager.route('/parent_folder', methods=['GET'])
@login_required
def get_parent_folder():
    file_id = request.args.get("file_id")
    try:
        e, file = FileService.get_by_id(file_id)
        if not e:
            return get_data_error_result(retmsg="Folder not found!")

        parent_folder = FileService.get_parent_folder(file_id)
        return get_json_result(data={"parent_folder": parent_folder.to_json()})
    except Exception as e:
        return server_error_response(e)


@manager.route('/all_parent_folder', methods=['GET'])
@login_required
def get_all_parent_folders():
    file_id = request.args.get("file_id")
    try:
        e, file = FileService.get_by_id(file_id)
        if not e:
            return get_data_error_result(retmsg="Folder not found!")

        parent_folders = FileService.get_all_parent_folders(file_id)
        parent_folders_res = []
        for parent_folder in parent_folders:
            parent_folders_res.append(parent_folder.to_json())
        return get_json_result(data={"parent_folders": parent_folders_res})
    except Exception as e:
        return server_error_response(e)


@manager.route('/rm', methods=['POST'])
@login_required
@validate_request("file_ids")
def rm():
    req = request.json
    file_ids = req["file_ids"]
    try:
        for file_id in file_ids:
            e, file = FileService.get_by_id(file_id)
            if not e:
                return get_data_error_result(retmsg="File or Folder not found!")
            if not file.tenant_id:
                return get_data_error_result(retmsg="Tenant not found!")
            if file.source_type == FileSource.KNOWLEDGEBASE:
                continue

            if file.type == FileType.FOLDER.value:
                file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
                for inner_file_id in file_id_list:
                    e, file = FileService.get_by_id(inner_file_id)
                    if not e:
                        return get_data_error_result(retmsg="File not found!")
                    MINIO.rm(file.parent_id, file.location)
                FileService.delete_folder_by_pf_id(current_user.id, file_id)
            else:
                if not FileService.delete(file):
                    return get_data_error_result(
                        retmsg="Database error (File removal)!")

            # delete file2document
            informs = File2DocumentService.get_by_file_id(file_id)
            for inform in informs:
                doc_id = inform.document_id
                e, doc = DocumentService.get_by_id(doc_id)
                if not e:
                    return get_data_error_result(retmsg="Document not found!")
                tenant_id = DocumentService.get_tenant_id(doc_id)
                if not tenant_id:
                    return get_data_error_result(retmsg="Tenant not found!")
                if not DocumentService.remove_document(doc, tenant_id):
                    return get_data_error_result(
                        retmsg="Database error (Document removal)!")
            File2DocumentService.delete_by_file_id(file_id)

        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)


@manager.route('/rename', methods=['POST'])
@login_required
@validate_request("file_id", "name")
def rename():
    req = request.json
    try:
        e, file = FileService.get_by_id(req["file_id"])
        if not e:
            return get_data_error_result(retmsg="File not found!")
        if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
                file.name.lower()).suffix:
            return get_json_result(
                data=False,
                retmsg="The extension of file can't be changed",
                retcode=RetCode.ARGUMENT_ERROR)
        for file in FileService.query(name=req["name"], pf_id=file.parent_id):
            if file.name == req["name"]:
                return get_data_error_result(
                    retmsg="Duplicated file name in the same folder.")

        if not FileService.update_by_id(
                req["file_id"], {"name": req["name"]}):
            return get_data_error_result(
                retmsg="Database error (File rename)!")

        informs = File2DocumentService.get_by_file_id(req["file_id"])
        if informs:
            if not DocumentService.update_by_id(
                    informs[0].document_id, {"name": req["name"]}):
                return get_data_error_result(
                    retmsg="Database error (Document rename)!")

        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)


@manager.route('/get/<file_id>', methods=['GET'])
# @login_required
def get(file_id):
    try:
        e, file = FileService.get_by_id(file_id)
        if not e:
            return get_data_error_result(retmsg="Document not found!")

        response = flask.make_response(MINIO.get(file.parent_id, file.location))
        ext = re.search(r"\.([^.]+)$", file.name)
        if ext:
            if file.type == FileType.VISUAL.value:
                response.headers.set('Content-Type', 'image/%s' % ext.group(1))
            else:
                response.headers.set(
                    'Content-Type',
                    'application/%s' %
                    ext.group(1))
        return response
    except Exception as e:
        return server_error_response(e)
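Note that the file-manager upload() above prefixes the incoming filename with "/" and splits it on "/", creating any missing folders along the way, so a client can upload into a nested path in one call. A sketch under that assumption (URL and parent_id are placeholders):

import requests

with open("report.pdf", "rb") as fh:
    resp = requests.post(
        "http://127.0.0.1:9380/v1/file/upload",            # path is an assumption
        data={"parent_id": ""},                            # empty -> the user's root folder
        files={"file": ("projects/2024/report.pdf", fh)},  # "projects/2024/" is created if missing
    )
print(resp.json())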
@@ -19,16 +19,18 @@ from flask_login import login_required, current_user

 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
 from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
-from api.db import StatusEnum, UserTenantRole
+from api.db import StatusEnum, UserTenantRole, FileSource
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.db_models import Knowledgebase
+from api.db.db_models import Knowledgebase, File
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH


 @manager.route('/create', methods=['post'])

@@ -109,9 +111,9 @@ def detail():

 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_kbs():
     page_number = request.args.get("page", 1)
-    items_per_page = request.args.get("page_size", 15)
+    items_per_page = request.args.get("page_size", 150)
     orderby = request.args.get("orderby", "create_time")
     desc = request.args.get("desc", True)
     try:

@@ -136,17 +138,14 @@ def rm():
             data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.', retcode=RetCode.OPERATING_ERROR)

         for doc in DocumentService.query(kb_id=req["kb_id"]):
-            ELASTICSEARCH.deleteByQuery(
-                Q("match", doc_id=doc.id), idxnm=search.index_name(kbs[0].tenant_id))
-            DocumentService.increment_chunk_num(
-                doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
-            if not DocumentService.delete(doc):
+            if not DocumentService.remove_document(doc, kbs[0].tenant_id):
                 return get_data_error_result(
                     retmsg="Database error (Document removal)!")
+            f2d = File2DocumentService.get_by_document_id(doc.id)
+            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            File2DocumentService.delete_by_document_id(doc.id)

-        if not KnowledgebaseService.update_by_id(
-                req["kb_id"], {"status": StatusEnum.INVALID.value}):
+        if not KnowledgebaseService.delete_by_id(req["kb_id"]):
             return get_data_error_result(
                 retmsg="Database error (Knowledgebase removal)!")
         return get_json_result(data=True)
@@ -142,6 +142,16 @@ def add_llm():
     return get_json_result(data=True)


+@manager.route('/delete_llm', methods=['POST'])
+@login_required
+@validate_request("llm_factory", "llm_name")
+def delete_llm():
+    req = request.json
+    TenantLLMService.filter_delete(
+        [TenantLLM.tenant_id == current_user.id, TenantLLM.llm_factory == req["llm_factory"], TenantLLM.llm_name == req["llm_name"]])
+    return get_json_result(data=True)
+
+
 @manager.route('/my_llms', methods=['GET'])
 @login_required
 def my_llms():
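The new /delete_llm route above removes a single configured model for the current tenant, matched on both llm_factory and llm_name. A hedged client sketch (URL and names are placeholders):

import requests

requests.post(
    "http://127.0.0.1:9380/v1/llm/delete_llm",     # path is an assumption
    json={"llm_factory": "OpenAI", "llm_name": "gpt-3.5-turbo"},
)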
@@ -165,7 +175,7 @@ def my_llms():

 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_app():
     model_type = request.args.get("model_type")
     try:
         objs = TenantLLMService.query(tenant_id=current_user.id)

@@ -184,7 +194,7 @@ def list():

         res = {}
         for m in llms:
-            if model_type and m["model_type"] != model_type:
+            if model_type and m["model_type"].find(model_type)<0:
                 continue
             if m["fid"] not in res:
                 res[m["fid"]] = []
api/apps/system_app.py (new file)
@@ -0,0 +1,67 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
from flask_login import login_required

from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import get_json_result
from api.versions import get_rag_version
from rag.settings import SVR_QUEUE_NAME
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
from timeit import default_timer as timer

from rag.utils.redis_conn import REDIS_CONN


@manager.route('/version', methods=['GET'])
@login_required
def version():
    return get_json_result(data=get_rag_version())


@manager.route('/status', methods=['GET'])
@login_required
def status():
    res = {}
    st = timer()
    try:
        res["es"] = ELASTICSEARCH.health()
        res["es"]["elapsed"] = "{:.1f}".format((timer() - st)*1000.)
    except Exception as e:
        res["es"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}

    st = timer()
    try:
        MINIO.health()
        res["minio"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
    except Exception as e:
        res["minio"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}

    st = timer()
    try:
        KnowledgebaseService.get_by_id("x")
        res["mysql"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.)}
    except Exception as e:
        res["mysql"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}

    st = timer()
    try:
        qinfo = REDIS_CONN.health(SVR_QUEUE_NAME)
        res["redis"] = {"status": "green", "elapsed": "{:.1f}".format((timer() - st)*1000.), "pending": qinfo["pending"]}
    except Exception as e:
        res["redis"] = {"status": "red", "elapsed": "{:.1f}".format((timer() - st)*1000.), "error": str(e)}

    return get_json_result(data=res)
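status() above probes Elasticsearch, MinIO, MySQL and Redis in turn and reports a status plus elapsed milliseconds for each, adding an error field when a probe fails. An illustrative payload and a way a monitoring script might read it; the values below are invented, only the shape follows the handler:

example = {
    "es":    {"status": "green", "elapsed": "12.3"},
    "minio": {"status": "green", "elapsed": "3.1"},
    "mysql": {"status": "green", "elapsed": "1.8"},
    "redis": {"status": "red",   "elapsed": "2.0", "error": "connection refused"},
}
for name, info in example.items():
    print(f"{name}: {info['status']} ({info['elapsed']} ms)")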
@@ -24,10 +24,11 @@ from api.db.db_models import TenantLLM
 from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
 from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
-from api.db import UserTenantRole, LLMType
+from api.db import UserTenantRole, LLMType, FileType
 from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
     LLM_FACTORY, LLM_BASE_URL
 from api.db.services.user_service import UserService, TenantService, UserTenantService
+from api.db.services.file_service import FileService
 from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, cors_reponse

@@ -121,6 +122,79 @@ def github_callback():
     return redirect("/?auth=%s" % user.get_id())


+@manager.route('/feishu_callback', methods=['GET'])
+def feishu_callback():
+    import requests
+    app_access_token_res = requests.post(FEISHU_OAUTH.get("app_access_token_url"), data=json.dumps({
+        "app_id": FEISHU_OAUTH.get("app_id"),
+        "app_secret": FEISHU_OAUTH.get("app_secret")
+    }), headers={"Content-Type": "application/json; charset=utf-8"})
+    app_access_token_res = app_access_token_res.json()
+    if app_access_token_res['code'] != 0:
+        return redirect("/?error=%s" % app_access_token_res)
+
+    res = requests.post(FEISHU_OAUTH.get("user_access_token_url"), data=json.dumps({
+        "grant_type": FEISHU_OAUTH.get("grant_type"),
+        "code": request.args.get('code')
+    }), headers={"Content-Type": "application/json; charset=utf-8",
+                 'Authorization': f"Bearer {app_access_token_res['app_access_token']}"})
+    res = res.json()
+    if res['code'] != 0:
+        return redirect("/?error=%s" % res["message"])
+
+    if "contact:user.email:readonly" not in res["data"]["scope"].split(" "):
+        return redirect("/?error=contact:user.email:readonly not in scope")
+    session["access_token"] = res["data"]["access_token"]
+    session["access_token_from"] = "feishu"
+    userinfo = user_info_from_feishu(session["access_token"])
+    users = UserService.query(email=userinfo["email"])
+    user_id = get_uuid()
+    if not users:
+        try:
+            try:
+                avatar = download_img(userinfo["avatar_url"])
+            except Exception as e:
+                stat_logger.exception(e)
+                avatar = ""
+            users = user_register(user_id, {
+                "access_token": session["access_token"],
+                "email": userinfo["email"],
+                "avatar": avatar,
+                "nickname": userinfo["en_name"],
+                "login_channel": "feishu",
+                "last_login_time": get_format_time(),
+                "is_superuser": False,
+            })
+            if not users:
+                raise Exception('Register user failure.')
+            if len(users) > 1:
+                raise Exception('Same E-mail exist!')
+            user = users[0]
+            login_user(user)
+            return redirect("/?auth=%s" % user.get_id())
+        except Exception as e:
+            rollback_user_registration(user_id)
+            stat_logger.exception(e)
+            return redirect("/?error=%s" % str(e))
+    user = users[0]
+    user.access_token = get_uuid()
+    login_user(user)
+    user.save()
+    return redirect("/?auth=%s" % user.get_id())
+
+
+def user_info_from_feishu(access_token):
+    import requests
+    headers = {"Content-Type": "application/json; charset=utf-8",
+               'Authorization': f"Bearer {access_token}"}
+    res = requests.get(
+        f"https://open.feishu.cn/open-apis/authen/v1/user_info",
+        headers=headers)
+    user_info = res.json()["data"]
+    user_info["email"] = None if user_info.get("email") == "" else user_info["email"]
+    return user_info
+
+
 def user_info_from_github(access_token):
     import requests
     headers = {"Accept": "application/json",
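feishu_callback() reads its endpoints and credentials from a FEISHU_OAUTH mapping in the settings; the mapping itself is not part of this change, so the sketch below only mirrors the keys the callback accesses, with placeholder values throughout:

# Assumed shape of FEISHU_OAUTH, inferred from the keys read above; all values are placeholders.
FEISHU_OAUTH = {
    "app_id": "<feishu-app-id>",
    "app_secret": "<feishu-app-secret>",
    "app_access_token_url": "<feishu app_access_token endpoint>",
    "user_access_token_url": "<feishu user_access_token endpoint>",
    "grant_type": "<grant type expected by the token endpoint>",
}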
@@ -199,7 +273,7 @@ def rollback_user_registration(user_id):
     except Exception as e:
         pass
     try:
-        TenantLLM.delete().where(TenantLLM.tenant_id == user_id).excute()
+        TenantLLM.delete().where(TenantLLM.tenant_id == user_id).execute()
     except Exception as e:
         pass


@@ -221,6 +295,17 @@ def user_register(user_id, user):
         "invited_by": user_id,
         "role": UserTenantRole.OWNER
     }
+    file_id = get_uuid()
+    file = {
+        "id": file_id,
+        "parent_id": file_id,
+        "tenant_id": user_id,
+        "created_by": user_id,
+        "name": "/",
+        "type": FileType.FOLDER.value,
+        "size": 0,
+        "location": "",
+    }
     tenant_llm = []
     for llm in LLMService.query(fid=LLM_FACTORY):
         tenant_llm.append({"tenant_id": user_id,

@@ -236,6 +321,7 @@ def user_register(user_id, user):
     TenantService.insert(**tenant)
     UserTenantService.insert(**usr_tenant)
     TenantLLMService.insert_many(tenant_llm)
+    FileService.insert(file)
     return UserService.query(email=user["email"])


@@ -45,6 +45,8 @@ class FileType(StrEnum):
     VISUAL = 'visual'
     AURAL = 'aural'
     VIRTUAL = 'virtual'
+    FOLDER = 'folder'
+    OTHER = "other"


 class LLMType(StrEnum):

@@ -62,6 +64,7 @@ class ChatStyle(StrEnum):


 class TaskStatus(StrEnum):
+    UNSTART = "0"
     RUNNING = "1"
     CANCEL = "2"
     DONE = "3"

@@ -80,3 +83,11 @@ class ParserType(StrEnum):
     NAIVE = "naive"
     PICTURE = "picture"
     ONE = "one"
+
+
+class FileSource(StrEnum):
+    LOCAL = ""
+    KNOWLEDGEBASE = "knowledgebase"
+    S3 = "s3"
+
+KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"

@@ -21,14 +21,13 @@ import operator
 from functools import wraps
 from itsdangerous.url_safe import URLSafeTimedSerializer as Serializer
 from flask_login import UserMixin
+from playhouse.migrate import MySQLMigrator, migrate
 from peewee import (
-    BigAutoField, BigIntegerField, BooleanField, CharField,
-    CompositeKey, Insert, IntegerField, TextField, FloatField, DateTimeField,
+    BigIntegerField, BooleanField, CharField,
+    CompositeKey, IntegerField, TextField, FloatField, DateTimeField,
     Field, Model, Metadata
 )
 from playhouse.pool import PooledMySQLDatabase
-
 from api.db import SerializedType, ParserType
 from api.settings import DATABASE, stat_logger, SECRET_KEY
 from api.utils.log_utils import getLogger

@@ -344,7 +343,7 @@ class DataBaseModel(BaseModel):


 @DB.connection_context()
-def init_database_tables():
+def init_database_tables(alter_fields=[]):
     members = inspect.getmembers(sys.modules[__name__], inspect.isclass)
     table_objs = []
     create_failed_list = []

@@ -361,6 +360,7 @@ def init_database_tables():
     if create_failed_list:
         LOGGER.info(f"create tables failed: {create_failed_list}")
         raise Exception(f"create tables failed: {create_failed_list}")
+    migrate_db()


 def fill_db_model_object(model_object, human_model_dict):

@@ -386,7 +386,7 @@ class User(DataBaseModel, UserMixin):
         max_length=32,
         null=True,
         help_text="English|Chinese",
-        default="English")
+        default="Chinese" if "zh_CN" in os.getenv("LANG", "") else "English")
     color_schema = CharField(
         max_length=32,
         null=True,

@@ -578,7 +578,7 @@ class Knowledgebase(DataBaseModel):
     language = CharField(
         max_length=32,
         null=True,
-        default="English",
+        default="Chinese" if "zh_CN" in os.getenv("LANG", "") else "English",
         help_text="English|Chinese")
     description = TextField(null=True, help_text="KB description")
     embd_id = CharField(
@@ -669,6 +669,66 @@ class Document(DataBaseModel):
         db_table = "document"


+class File(DataBaseModel):
+    id = CharField(
+        max_length=32,
+        primary_key=True,
+    )
+    parent_id = CharField(
+        max_length=32,
+        null=False,
+        help_text="parent folder id",
+        index=True)
+    tenant_id = CharField(
+        max_length=32,
+        null=False,
+        help_text="tenant id",
+        index=True)
+    created_by = CharField(
+        max_length=32,
+        null=False,
+        help_text="who created it")
+    name = CharField(
+        max_length=255,
+        null=False,
+        help_text="file name or folder name",
+        index=True)
+    location = CharField(
+        max_length=255,
+        null=True,
+        help_text="where dose it store")
+    size = IntegerField(default=0)
+    type = CharField(max_length=32, null=False, help_text="file extension")
+    source_type = CharField(
+        max_length=128,
+        null=False,
+        default="",
+        help_text="where dose this document come from")
+
+    class Meta:
+        db_table = "file"
+
+
+class File2Document(DataBaseModel):
+    id = CharField(
+        max_length=32,
+        primary_key=True,
+    )
+    file_id = CharField(
+        max_length=32,
+        null=True,
+        help_text="file id",
+        index=True)
+    document_id = CharField(
+        max_length=32,
+        null=True,
+        help_text="document id",
+        index=True)
+
+    class Meta:
+        db_table = "file2document"
+
+
 class Task(DataBaseModel):
     id = CharField(max_length=32, primary_key=True)
     doc_id = CharField(max_length=32, null=False, index=True)
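The File and File2Document tables above back the new file manager: File rows form a per-tenant folder tree, and File2Document rows link a file to the Document it was converted into. A minimal sketch of creating one of each through the service wrappers that appear elsewhere in this change; the ids are placeholders:

from api.utils import get_uuid
from api.db.services.file_service import FileService
from api.db.services.file2document_service import File2DocumentService

# Create a file record under an existing folder (ids are placeholders).
file = FileService.insert({
    "id": get_uuid(),
    "parent_id": "<folder-id>",
    "tenant_id": "<tenant-id>",
    "created_by": "<user-id>",
    "name": "report.pdf",
    "type": "pdf",
    "location": "report.pdf",
    "size": 0,
})
# Link it to an already-parsed document.
link = File2DocumentService.insert({
    "id": get_uuid(),
    "file_id": file.id,
    "document_id": "<document-id>",
})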
@@ -695,11 +755,11 @@ class Dialog(DataBaseModel):
     language = CharField(
         max_length=32,
         null=True,
-        default="Chinese",
+        default="Chinese" if "zh_CN" in os.getenv("LANG", "") else "English",
         help_text="English|Chinese")
     llm_id = CharField(max_length=128, null=False, help_text="default llm ID")
     llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
-                                                 "presence_penalty": 0.4, "max_tokens": 215})
+                                                 "presence_penalty": 0.4, "max_tokens": 512})
     prompt_type = CharField(
         max_length=16,
         null=False,

@@ -762,3 +822,14 @@ class API4Conversation(DataBaseModel):

     class Meta:
         db_table = "api_4_conversation"
+
+
+def migrate_db():
+    try:
+        with DB.transaction():
+            migrator = MySQLMigrator(DB)
+            migrate(
+                migrator.add_column('file', 'source_type', CharField(max_length=128, null=False, default="", help_text="where dose this document come from"))
+            )
+    except Exception as e:
+        pass
@@ -16,10 +16,13 @@
 import os
 import time
 import uuid
+from copy import deepcopy

 from api.db import LLMType, UserTenantRole
 from api.db.db_models import init_database_tables as init_web_db, LLMFactories, LLM, TenantLLM
 from api.db.services import UserService
+from api.db.services.document_service import DocumentService
+from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
 from api.db.services.user_service import TenantService, UserTenantService
 from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL

@@ -123,7 +126,12 @@ factory_infos = [{
     "name": "Youdao",
     "logo": "",
     "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
     "status": "1",
+},{
+    "name": "DeepSeek",
+    "logo": "",
+    "tags": "LLM",
+    "status": "1",
 },
 # {
 #     "name": "文心一言",

@@ -138,6 +146,12 @@ def init_llm_factory():
     llm_infos = [
         # ---------------------- OpenAI ------------------------
         {
+            "fid": factory_infos[0]["name"],
+            "llm_name": "gpt-4o",
+            "tags": "LLM,CHAT,128K",
+            "max_tokens": 128000,
+            "model_type": LLMType.CHAT.value + "," + LLMType.IMAGE2TEXT.value
+        }, {
             "fid": factory_infos[0]["name"],
             "llm_name": "gpt-3.5-turbo",
             "tags": "LLM,CHAT,4K",

@@ -155,6 +169,18 @@ def init_llm_factory():
             "tags": "TEXT EMBEDDING,8K",
             "max_tokens": 8191,
             "model_type": LLMType.EMBEDDING.value
+        }, {
+            "fid": factory_infos[0]["name"],
+            "llm_name": "text-embedding-3-small",
+            "tags": "TEXT EMBEDDING,8K",
+            "max_tokens": 8191,
+            "model_type": LLMType.EMBEDDING.value
+        }, {
+            "fid": factory_infos[0]["name"],
+            "llm_name": "text-embedding-3-large",
+            "tags": "TEXT EMBEDDING,8K",
+            "max_tokens": 8191,
+            "model_type": LLMType.EMBEDDING.value
         }, {
             "fid": factory_infos[0]["name"],
             "llm_name": "whisper-1",

@@ -331,6 +357,21 @@ def init_llm_factory():
             "max_tokens": 512,
             "model_type": LLMType.EMBEDDING.value
         },
+        # ------------------------ DeepSeek -----------------------
+        {
+            "fid": factory_infos[8]["name"],
+            "llm_name": "deepseek-chat",
+            "tags": "LLM,CHAT,",
+            "max_tokens": 32768,
+            "model_type": LLMType.CHAT.value
+        },
+        {
+            "fid": factory_infos[8]["name"],
+            "llm_name": "deepseek-coder",
+            "tags": "LLM,CHAT,",
+            "max_tokens": 16385,
+            "model_type": LLMType.CHAT.value
+        },
     ]
     for info in factory_infos:
         try:

@@ -350,6 +391,25 @@ def init_llm_factory():
     LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    ## insert openai two embedding models to the current openai user.
+    print("Start to insert 2 OpenAI embedding models...")
+    tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
+    for tid in tenant_ids:
+        for row in TenantLLMService.query(llm_factory="OpenAI", tenant_id=tid):
+            row = row.to_dict()
+            row["model_type"] = LLMType.EMBEDDING.value
+            row["llm_name"] = "text-embedding-3-small"
+            row["used_tokens"] = 0
+            try:
+                TenantLLMService.save(**row)
+                row = deepcopy(row)
+                row["llm_name"] = "text-embedding-3-large"
+                TenantLLMService.save(**row)
+            except Exception as e:
+                pass
+            break
+    for kb_id in KnowledgebaseService.get_all_ids():
+        KnowledgebaseService.update_by_id(kb_id, {"doc_num": DocumentService.get_kb_doc_count(kb_id)})
     """
     drop table llm;
     drop table llm_factories;
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import re
+from copy import deepcopy

 from api.db import LLMType
 from api.db.db_models import Dialog, Conversation

@@ -71,7 +72,7 @@ def message_fit_in(msg, max_length=4000):
     return max_length, msg


-def chat(dialog, messages, **kwargs):
+def chat(dialog, messages, stream=True, **kwargs):
     assert messages[-1]["role"] == "user", "The last content of this conversation is not from user."
     llm = LLMService.query(llm_name=dialog.llm_id)
     if not llm:

@@ -82,7 +83,9 @@ def chat(dialog, messages, **kwargs):
     else: max_tokens = llm[0].max_tokens
     kbs = KnowledgebaseService.get_by_ids(dialog.kb_ids)
     embd_nms = list(set([kb.embd_id for kb in kbs]))
-    assert len(embd_nms) == 1, "Knowledge bases use different embedding models."
+    if len(embd_nms) != 1:
+        yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
+        return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}

     questions = [m["content"] for m in messages if m["role"] == "user"]
     embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])

@@ -94,7 +97,9 @@ def chat(dialog, messages, **kwargs):
     if field_map:
         chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
         ans = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl, prompt_config.get("quote", True))
-        if ans: return ans
+        if ans:
+            yield ans
+            return

     for p in prompt_config["parameters"]:
         if p["key"] == "knowledge":
@ -112,14 +117,16 @@ def chat(dialog, messages, **kwargs):
|
|||||||
else:
|
else:
|
||||||
kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
|
kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
|
||||||
dialog.similarity_threshold,
|
dialog.similarity_threshold,
|
||||||
dialog.vector_similarity_weight, top=1024, aggs=False)
|
dialog.vector_similarity_weight,
|
||||||
|
doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
|
||||||
|
top=1024, aggs=False)
|
||||||
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
|
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
|
||||||
chat_logger.info(
|
chat_logger.info(
|
||||||
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
|
"{}->{}".format(" ".join(questions), "\n->".join(knowledges)))
|
||||||
|
|
||||||
if not knowledges and prompt_config.get("empty_response"):
|
if not knowledges and prompt_config.get("empty_response"):
|
||||||
return {
|
yield {"answer": prompt_config["empty_response"], "reference": kbinfos}
|
||||||
"answer": prompt_config["empty_response"], "reference": kbinfos}
|
return {"answer": prompt_config["empty_response"], "reference": kbinfos}
|
||||||
|
|
||||||
kwargs["knowledge"] = "\n".join(knowledges)
|
kwargs["knowledge"] = "\n".join(knowledges)
|
||||||
gen_conf = dialog.llm_setting
|
gen_conf = dialog.llm_setting
|
||||||
@ -130,33 +137,45 @@ def chat(dialog, messages, **kwargs):
|
|||||||
gen_conf["max_tokens"] = min(
|
gen_conf["max_tokens"] = min(
|
||||||
gen_conf["max_tokens"],
|
gen_conf["max_tokens"],
|
||||||
max_tokens - used_token_count)
|
max_tokens - used_token_count)
|
||||||
answer = chat_mdl.chat(
|
|
||||||
prompt_config["system"].format(
|
|
||||||
**kwargs), msg, gen_conf)
|
|
||||||
chat_logger.info("User: {}|Assistant: {}".format(
|
|
||||||
msg[-1]["content"], answer))
|
|
||||||
|
|
||||||
if knowledges and prompt_config.get("quote", True):
|
def decorate_answer(answer):
|
||||||
answer, idx = retrievaler.insert_citations(answer,
|
nonlocal prompt_config, knowledges, kwargs, kbinfos
|
||||||
[ck["content_ltks"]
|
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
|
||||||
for ck in kbinfos["chunks"]],
|
answer, idx = retrievaler.insert_citations(answer,
|
||||||
[ck["vector"]
|
[ck["content_ltks"]
|
||||||
for ck in kbinfos["chunks"]],
|
for ck in kbinfos["chunks"]],
|
||||||
embd_mdl,
|
[ck["vector"]
|
||||||
tkweight=1 - dialog.vector_similarity_weight,
|
for ck in kbinfos["chunks"]],
|
||||||
vtweight=dialog.vector_similarity_weight)
|
embd_mdl,
|
||||||
idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
|
tkweight=1 - dialog.vector_similarity_weight,
|
||||||
recall_docs = [
|
vtweight=dialog.vector_similarity_weight)
|
||||||
d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
|
idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
|
||||||
if not recall_docs: recall_docs = kbinfos["doc_aggs"]
|
recall_docs = [
|
||||||
kbinfos["doc_aggs"] = recall_docs
|
d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
|
||||||
|
if not recall_docs: recall_docs = kbinfos["doc_aggs"]
|
||||||
|
kbinfos["doc_aggs"] = recall_docs
|
||||||
|
|
||||||
for c in kbinfos["chunks"]:
|
refs = deepcopy(kbinfos)
|
||||||
if c.get("vector"):
|
for c in refs["chunks"]:
|
||||||
del c["vector"]
|
if c.get("vector"):
|
||||||
if answer.lower().find("invalid key") >= 0 or answer.lower().find("invalid api")>=0:
|
del c["vector"]
|
||||||
answer += " Please set LLM API-Key in 'User Setting -> Model Providers -> API-Key'"
|
if answer.lower().find("invalid key") >= 0 or answer.lower().find("invalid api")>=0:
|
||||||
return {"answer": answer, "reference": kbinfos}
|
answer += " Please set LLM API-Key in 'User Setting -> Model Providers -> API-Key'"
|
||||||
|
return {"answer": answer, "reference": refs}
|
||||||
|
|
||||||
|
if stream:
|
||||||
|
answer = ""
|
||||||
|
for ans in chat_mdl.chat_streamly(prompt_config["system"].format(**kwargs), msg, gen_conf):
|
||||||
|
answer = ans
|
||||||
|
yield {"answer": answer, "reference": {}}
|
||||||
|
yield decorate_answer(answer)
|
||||||
|
else:
|
||||||
|
answer = chat_mdl.chat(
|
||||||
|
prompt_config["system"].format(
|
||||||
|
**kwargs), msg, gen_conf)
|
||||||
|
chat_logger.info("User: {}|Assistant: {}".format(
|
||||||
|
msg[-1]["content"], answer))
|
||||||
|
yield decorate_answer(answer)
|
||||||
|
|
||||||
|
|
||||||
def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
|
def use_sql(question, field_map, tenant_id, chat_mdl, quota=True):
|
||||||
|
|||||||
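Note: with the new `stream=True` default, `chat()` above is a generator — it yields partial `{"answer", "reference"}` dicts while tokens arrive and finishes with the citation-decorated answer. A minimal consumption sketch (the caller below is hypothetical, not part of this changeset):

```python
# Hypothetical caller for the generator-style chat() shown above.
def answer_stream(dialog, messages):
    final = None
    for delta in chat(dialog, messages, stream=True):
        # Every yield carries the answer text produced so far; the last
        # yield also carries the resolved citation references.
        print(delta["answer"], end="\r")
        final = delta
    return final  # {"answer": "...", "reference": {...}}
```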
@@ -13,10 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-from peewee import Expression
+import random
+from datetime import datetime
+from elasticsearch_dsl import Q
+from peewee import fn
+
+from api.settings import stat_logger
+from api.utils import current_timestamp, get_format_time
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils.minio_conn import MINIO
+from rag.nlp import search

from api.db import FileType, TaskStatus
-from api.db.db_models import DB, Knowledgebase, Tenant
+from api.db.db_models import DB, Knowledgebase, Tenant, Task
from api.db.db_models import Document
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -32,8 +41,9 @@ class DocumentService(CommonService):
                      orderby, desc, keywords):
        if keywords:
            docs = cls.model.select().where(
-                cls.model.kb_id == kb_id,
-                cls.model.name.like(f"%%{keywords}%%"))
+                (cls.model.kb_id == kb_id),
+                (fn.LOWER(cls.model.name).contains(keywords.lower()))
+            )
        else:
            docs = cls.model.select().where(cls.model.kb_id == kb_id)
        count = docs.count()
@@ -62,16 +72,15 @@ class DocumentService(CommonService):

    @classmethod
    @DB.connection_context()
-    def delete(cls, doc):
-        e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
-        if not KnowledgebaseService.update_by_id(
-                kb.id, {"doc_num": kb.doc_num - 1}):
-            raise RuntimeError("Database error (Knowledgebase)!")
+    def remove_document(cls, doc, tenant_id):
+        ELASTICSEARCH.deleteByQuery(
+            Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+        cls.clear_chunk_num(doc.id)
        return cls.delete_by_id(doc.id)

    @classmethod
    @DB.connection_context()
-    def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
+    def get_newly_uploaded(cls):
        fields = [
            cls.model.id,
            cls.model.kb_id,
@@ -93,11 +102,9 @@ class DocumentService(CommonService):
            cls.model.status == StatusEnum.VALID.value,
            ~(cls.model.type == FileType.VIRTUAL.value),
            cls.model.progress == 0,
-            cls.model.update_time >= tm,
-            cls.model.run == TaskStatus.RUNNING.value,
-            (Expression(cls.model.create_time, "%%", comm) == mod))\
-            .order_by(cls.model.update_time.asc())\
-            .paginate(1, items_per_page)
+            cls.model.update_time >= current_timestamp() - 1000 * 600,
+            cls.model.run == TaskStatus.RUNNING.value)\
+            .order_by(cls.model.update_time.asc())
        return list(docs.dicts())

    @classmethod
@@ -130,6 +137,22 @@ class DocumentService(CommonService):
            Knowledgebase.id == kb_id).execute()
        return num

+    @classmethod
+    @DB.connection_context()
+    def clear_chunk_num(cls, doc_id):
+        doc = cls.model.get_by_id(doc_id)
+        assert doc, "Can't fine document in database."
+
+        num = Knowledgebase.update(
+            token_num=Knowledgebase.token_num -
+            doc.token_num,
+            chunk_num=Knowledgebase.chunk_num -
+            doc.chunk_num,
+            doc_num=Knowledgebase.doc_num-1
+        ).where(
+            Knowledgebase.id == doc.kb_id).execute()
+        return num
+
    @classmethod
    @DB.connection_context()
    def get_tenant_id(cls, doc_id):
@@ -143,6 +166,19 @@ class DocumentService(CommonService):
            return
        return docs[0]["tenant_id"]

+    @classmethod
+    @DB.connection_context()
+    def get_tenant_id_by_name(cls, name):
+        docs = cls.model.select(
+            Knowledgebase.tenant_id).join(
+            Knowledgebase, on=(
+                Knowledgebase.id == cls.model.kb_id)).where(
+            cls.model.name == name, Knowledgebase.status == StatusEnum.VALID.value)
+        docs = docs.dicts()
+        if not docs:
+            return
+        return docs[0]["tenant_id"]
+
    @classmethod
    @DB.connection_context()
    def get_thumbnails(cls, docids):
@@ -177,3 +213,61 @@ class DocumentService(CommonService):
            on=(Knowledgebase.id == cls.model.kb_id)).where(
            Knowledgebase.tenant_id == tenant_id)
        return len(docs)
+
+    @classmethod
+    @DB.connection_context()
+    def begin2parse(cls, docid):
+        cls.update_by_id(
+            docid, {"progress": random.random() * 1 / 100.,
+                    "progress_msg": "Task dispatched...",
+                    "process_begin_at": get_format_time()
+                    })
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress(cls):
+        docs = cls.get_unfinished_docs()
+        for d in docs:
+            try:
+                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
+                if not tsks:
+                    continue
+                msg = []
+                prg = 0
+                finished = True
+                bad = 0
+                status = TaskStatus.RUNNING.value
+                for t in tsks:
+                    if 0 <= t.progress < 1:
+                        finished = False
+                    prg += t.progress if t.progress >= 0 else 0
+                    msg.append(t.progress_msg)
+                    if t.progress == -1:
+                        bad += 1
+                prg /= len(tsks)
+                if finished and bad:
+                    prg = -1
+                    status = TaskStatus.FAIL.value
+                elif finished:
+                    status = TaskStatus.DONE.value
+
+                msg = "\n".join(msg)
+                info = {
+                    "process_duation": datetime.timestamp(
+                        datetime.now()) -
+                    d["process_begin_at"].timestamp(),
+                    "run": status}
+                if prg != 0:
+                    info["progress"] = prg
+                if msg:
+                    info["progress_msg"] = msg
+                cls.update_by_id(d["id"], info)
+            except Exception as e:
+                stat_logger.error("fetch task exception:" + str(e))
+
+    @classmethod
+    @DB.connection_context()
+    def get_kb_doc_count(cls, kb_id):
+        return len(cls.model.select(cls.model.id).where(
+            cls.model.kb_id == kb_id).dicts())
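The `update_progress` method added above folds per-task progress into a single document status: average the non-negative progress values, treat `-1` as a failed task, and mark the document FAIL only when every task has finished and at least one failed. A small sketch of just that rule on plain tuples (hypothetical helper, not part of the changeset):

```python
# Sketch of the aggregation rule used by DocumentService.update_progress above,
# operating on plain (progress, message) tuples instead of Task rows.
def aggregate_progress(tasks):
    finished, bad, prg, msgs = True, 0, 0.0, []
    for p, m in tasks:
        if 0 <= p < 1:
            finished = False          # at least one task still running
        prg += p if p >= 0 else 0     # failed tasks contribute nothing
        msgs.append(m)
        if p == -1:
            bad += 1
    prg /= len(tasks)
    if finished and bad:
        return -1, "FAIL", "\n".join(msgs)
    return prg, ("DONE" if finished else "RUNNING"), "\n".join(msgs)

# aggregate_progress([(1.0, "ok"), (0.5, "parsing...")]) -> (0.75, "RUNNING", ...)
```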
85 api/db/services/file2document_service.py (new file)
@@ -0,0 +1,85 @@
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from datetime import datetime

from api.db import FileSource
from api.db.db_models import DB
from api.db.db_models import File, File2Document
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.utils import current_timestamp, datetime_format, get_uuid


class File2DocumentService(CommonService):
    model = File2Document

    @classmethod
    @DB.connection_context()
    def get_by_file_id(cls, file_id):
        objs = cls.model.select().where(cls.model.file_id == file_id)
        return objs

    @classmethod
    @DB.connection_context()
    def get_by_document_id(cls, document_id):
        objs = cls.model.select().where(cls.model.document_id == document_id)
        return objs

    @classmethod
    @DB.connection_context()
    def insert(cls, obj):
        if not cls.save(**obj):
            raise RuntimeError("Database error (File)!")
        e, obj = cls.get_by_id(obj["id"])
        if not e:
            raise RuntimeError("Database error (File retrieval)!")
        return obj

    @classmethod
    @DB.connection_context()
    def delete_by_file_id(cls, file_id):
        return cls.model.delete().where(cls.model.file_id == file_id).execute()

    @classmethod
    @DB.connection_context()
    def delete_by_document_id(cls, doc_id):
        return cls.model.delete().where(cls.model.document_id == doc_id).execute()

    @classmethod
    @DB.connection_context()
    def update_by_file_id(cls, file_id, obj):
        obj["update_time"] = current_timestamp()
        obj["update_date"] = datetime_format(datetime.now())
        num = cls.model.update(obj).where(cls.model.id == file_id).execute()
        e, obj = cls.get_by_id(cls.model.id)
        return obj

    @classmethod
    @DB.connection_context()
    def get_minio_address(cls, doc_id=None, file_id=None):
        if doc_id:
            f2d = cls.get_by_document_id(doc_id)
        else:
            f2d = cls.get_by_file_id(file_id)
        if f2d:
            file = File.get_by_id(f2d[0].file_id)
            if file.source_type == FileSource.LOCAL:
                return file.parent_id, file.location
            doc_id = f2d[0].document_id

        assert doc_id, "please specify doc_id"
        e, doc = DocumentService.get_by_id(doc_id)
        return doc.kb_id, doc.location
307 api/db/services/file_service.py (new file)
@@ -0,0 +1,307 @@
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from flask_login import current_user
from peewee import fn

from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource
from api.db.db_models import DB, File2Document, Knowledgebase
from api.db.db_models import File, Document
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid


class FileService(CommonService):
    model = File

    @classmethod
    @DB.connection_context()
    def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page,
                     orderby, desc, keywords):
        if keywords:
            files = cls.model.select().where(
                (cls.model.tenant_id == tenant_id),
                (cls.model.parent_id == pf_id),
                (fn.LOWER(cls.model.name).contains(keywords.lower())),
                ~(cls.model.id == pf_id)
            )
        else:
            files = cls.model.select().where((cls.model.tenant_id == tenant_id),
                                             (cls.model.parent_id == pf_id),
                                             ~(cls.model.id == pf_id)
                                             )
        count = files.count()
        if desc:
            files = files.order_by(cls.model.getter_by(orderby).desc())
        else:
            files = files.order_by(cls.model.getter_by(orderby).asc())

        files = files.paginate(page_number, items_per_page)

        res_files = list(files.dicts())
        for file in res_files:
            if file["type"] == FileType.FOLDER.value:
                file["size"] = cls.get_folder_size(file["id"])
                file['kbs_info'] = []
                continue
            kbs_info = cls.get_kb_id_by_file_id(file['id'])
            file['kbs_info'] = kbs_info

        return res_files, count

    @classmethod
    @DB.connection_context()
    def get_kb_id_by_file_id(cls, file_id):
        kbs = (cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
               .join(File2Document, on=(File2Document.file_id == file_id))
               .join(Document, on=(File2Document.document_id == Document.id))
               .join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
               .where(cls.model.id == file_id))
        if not kbs: return []
        kbs_info_list = []
        for kb in list(kbs.dicts()):
            kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
        return kbs_info_list

    @classmethod
    @DB.connection_context()
    def get_by_pf_id_name(cls, id, name):
        file = cls.model.select().where((cls.model.parent_id == id) & (cls.model.name == name))
        if file.count():
            e, file = cls.get_by_id(file[0].id)
            if not e:
                raise RuntimeError("Database error (File retrieval)!")
            return file
        return None

    @classmethod
    @DB.connection_context()
    def get_id_list_by_id(cls, id, name, count, res):
        if count < len(name):
            file = cls.get_by_pf_id_name(id, name[count])
            if file:
                res.append(file.id)
                return cls.get_id_list_by_id(file.id, name, count + 1, res)
            else:
                return res
        else:
            return res

    @classmethod
    @DB.connection_context()
    def get_all_innermost_file_ids(cls, folder_id, result_ids):
        subfolders = cls.model.select().where(cls.model.parent_id == folder_id)
        if subfolders.exists():
            for subfolder in subfolders:
                cls.get_all_innermost_file_ids(subfolder.id, result_ids)
        else:
            result_ids.append(folder_id)
        return result_ids

    @classmethod
    @DB.connection_context()
    def create_folder(cls, file, parent_id, name, count):
        if count > len(name) - 2:
            return file
        else:
            file = cls.insert({
                "id": get_uuid(),
                "parent_id": parent_id,
                "tenant_id": current_user.id,
                "created_by": current_user.id,
                "name": name[count],
                "location": "",
                "size": 0,
                "type": FileType.FOLDER.value
            })
            return cls.create_folder(file, file.id, name, count + 1)

    @classmethod
    @DB.connection_context()
    def is_parent_folder_exist(cls, parent_id):
        parent_files = cls.model.select().where(cls.model.id == parent_id)
        if parent_files.count():
            return True
        cls.delete_folder_by_pf_id(parent_id)
        return False

    @classmethod
    @DB.connection_context()
    def get_root_folder(cls, tenant_id):
        for file in cls.model.select().where((cls.model.tenant_id == tenant_id),
                                             (cls.model.parent_id == cls.model.id)
                                             ):
            return file.to_dict()

        file_id = get_uuid()
        file = {
            "id": file_id,
            "parent_id": file_id,
            "tenant_id": tenant_id,
            "created_by": tenant_id,
            "name": "/",
            "type": FileType.FOLDER.value,
            "size": 0,
            "location": "",
        }
        cls.save(**file)
        return file

    @classmethod
    @DB.connection_context()
    def get_kb_folder(cls, tenant_id):
        for root in cls.model.select().where(cls.model.tenant_id == tenant_id and
                                             cls.model.parent_id == cls.model.id):
            for folder in cls.model.select().where(cls.model.tenant_id == tenant_id and
                                                   cls.model.parent_id == root.id and
                                                   cls.model.name == KNOWLEDGEBASE_FOLDER_NAME
                                                   ):
                return folder.to_dict()
        assert False, "Can't find the KB folder. Database init error."

    @classmethod
    @DB.connection_context()
    def new_a_file_from_kb(cls, tenant_id, name, parent_id, ty=FileType.FOLDER.value, size=0, location=""):
        for file in cls.query(tenant_id=tenant_id, parent_id=parent_id, name=name):
            return file.to_dict()
        file = {
            "id": get_uuid(),
            "parent_id": parent_id,
            "tenant_id": tenant_id,
            "created_by": tenant_id,
            "name": name,
            "type": ty,
            "size": size,
            "location": location,
            "source_type": FileSource.KNOWLEDGEBASE
        }
        cls.save(**file)
        return file

    @classmethod
    @DB.connection_context()
    def init_knowledgebase_docs(cls, root_id, tenant_id):
        for _ in cls.model.select().where((cls.model.name == KNOWLEDGEBASE_FOLDER_NAME)\
                                          & (cls.model.parent_id == root_id)):
            return
        folder = cls.new_a_file_from_kb(tenant_id, KNOWLEDGEBASE_FOLDER_NAME, root_id)

        for kb in Knowledgebase.select(*[Knowledgebase.id, Knowledgebase.name]).where(Knowledgebase.tenant_id==tenant_id):
            kb_folder = cls.new_a_file_from_kb(tenant_id, kb.name, folder["id"])
            for doc in DocumentService.query(kb_id=kb.id):
                FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], tenant_id)

    @classmethod
    @DB.connection_context()
    def get_parent_folder(cls, file_id):
        file = cls.model.select().where(cls.model.id == file_id)
        if file.count():
            e, file = cls.get_by_id(file[0].parent_id)
            if not e:
                raise RuntimeError("Database error (File retrieval)!")
        else:
            raise RuntimeError("Database error (File doesn't exist)!")
        return file

    @classmethod
    @DB.connection_context()
    def get_all_parent_folders(cls, start_id):
        parent_folders = []
        current_id = start_id
        while current_id:
            e, file = cls.get_by_id(current_id)
            if file.parent_id != file.id and e:
                parent_folders.append(file)
                current_id = file.parent_id
            else:
                parent_folders.append(file)
                break
        return parent_folders

    @classmethod
    @DB.connection_context()
    def insert(cls, file):
        if not cls.save(**file):
            raise RuntimeError("Database error (File)!")
        e, file = cls.get_by_id(file["id"])
        if not e:
            raise RuntimeError("Database error (File retrieval)!")
        return file

    @classmethod
    @DB.connection_context()
    def delete(cls, file):
        return cls.delete_by_id(file.id)

    @classmethod
    @DB.connection_context()
    def delete_by_pf_id(cls, folder_id):
        return cls.model.delete().where(cls.model.parent_id == folder_id).execute()

    @classmethod
    @DB.connection_context()
    def delete_folder_by_pf_id(cls, user_id, folder_id):
        try:
            files = cls.model.select().where((cls.model.tenant_id == user_id)
                                             & (cls.model.parent_id == folder_id))
            for file in files:
                cls.delete_folder_by_pf_id(user_id, file.id)
            return cls.model.delete().where((cls.model.tenant_id == user_id)
                                            & (cls.model.id == folder_id)).execute(),
        except Exception as e:
            print(e)
            raise RuntimeError("Database error (File retrieval)!")

    @classmethod
    @DB.connection_context()
    def get_file_count(cls, tenant_id):
        files = cls.model.select(cls.model.id).where(cls.model.tenant_id == tenant_id)
        return len(files)

    @classmethod
    @DB.connection_context()
    def get_folder_size(cls, folder_id):
        size = 0

        def dfs(parent_id):
            nonlocal size
            for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(
                    cls.model.parent_id == parent_id, cls.model.id != parent_id):
                size += f.size
                if f.type == FileType.FOLDER.value:
                    dfs(f.id)

        dfs(folder_id)
        return size

    @classmethod
    @DB.connection_context()
    def add_file_from_kb(cls, doc, kb_folder_id, tenant_id):
        for _ in File2DocumentService.get_by_document_id(doc["id"]): return
        file = {
            "id": get_uuid(),
            "parent_id": kb_folder_id,
            "tenant_id": tenant_id,
            "created_by": tenant_id,
            "name": doc["name"],
            "type": doc["type"],
            "size": doc["size"],
            "location": doc["location"],
            "source_type": FileSource.KNOWLEDGEBASE
        }
        cls.save(**file)
        File2DocumentService.save(**{"id": get_uuid(), "file_id": file["id"], "document_id": doc["id"]})
@@ -1,67 +0,0 @@ (all 67 lines removed)
-#
-#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-from api.db import TenantPermission
-from api.db.db_models import DB, Tenant
-from api.db.db_models import Knowledgebase
-from api.db.services.common_service import CommonService
-from api.db import StatusEnum
-
-
-class KnowledgebaseService(CommonService):
-    model = Knowledgebase
-
-    @classmethod
-    @DB.connection_context()
-    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
-                          page_number, items_per_page, orderby, desc):
-        kbs = cls.model.select().where(
-            ((cls.model.tenant_id.in_(joined_tenant_ids) & (cls.model.permission ==
-                TenantPermission.TEAM.value)) | (cls.model.tenant_id == user_id))
-            & (cls.model.status == StatusEnum.VALID.value)
-        )
-        if desc:
-            kbs = kbs.order_by(cls.model.getter_by(orderby).desc())
-        else:
-            kbs = kbs.order_by(cls.model.getter_by(orderby).asc())
-
-        kbs = kbs.paginate(page_number, items_per_page)
-
-        return list(kbs.dicts())
-
-    @classmethod
-    @DB.connection_context()
-    def get_detail(cls, kb_id):
-        fields = [
-            cls.model.id,
-            Tenant.embd_id,
-            cls.model.avatar,
-            cls.model.name,
-            cls.model.description,
-            cls.model.permission,
-            cls.model.doc_num,
-            cls.model.token_num,
-            cls.model.chunk_num,
-            cls.model.parser_id]
-        kbs = cls.model.select(*fields).join(Tenant, on=((Tenant.id == cls.model.tenant_id)&(Tenant.status== StatusEnum.VALID.value))).where(
-            (cls.model.id == kb_id),
-            (cls.model.status == StatusEnum.VALID.value)
-        )
-        if not kbs:
-            return
-        d = kbs[0].to_dict()
-        d["embd_id"] = kbs[0].tenant.embd_id
-        return d
@@ -112,3 +112,8 @@ class KnowledgebaseService(CommonService):
        if kb:
            return True, kb[0]
        return False, None
+
+    @classmethod
+    @DB.connection_context()
+    def get_all_ids(cls):
+        return [m["id"] for m in cls.model.select(cls.model.id).dicts()]
@@ -81,7 +81,7 @@ class TenantLLMService(CommonService):
        if not model_config:
            if llm_type == LLMType.EMBEDDING.value:
                llm = LLMService.query(llm_name=llm_name)
-                if llm and llm[0].fid in ["Youdao", "FastEmbed"]:
+                if llm and llm[0].fid in ["Youdao", "FastEmbed", "DeepSeek"]:
                    model_config = {"llm_factory": llm[0].fid, "api_key":"", "llm_name": llm_name, "api_base": ""}
        if not model_config:
            if llm_name == "flag-embedding":
@@ -128,11 +128,23 @@ class TenantLLMService(CommonService):
        else:
            assert False, "LLM type error"

-        num = cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)\
-            .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == mdlnm)\
-            .execute()
+        num = 0
+        for u in cls.query(tenant_id = tenant_id, llm_name=mdlnm):
+            num += cls.model.update(used_tokens = u.used_tokens + used_tokens)\
+                .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == mdlnm)\
+                .execute()
        return num

+    @classmethod
+    @DB.connection_context()
+    def get_openai_models(cls):
+        objs = cls.model.select().where(
+            (cls.model.llm_factory == "OpenAI"),
+            ~(cls.model.llm_name == "text-embedding-3-small"),
+            ~(cls.model.llm_name == "text-embedding-3-large")
+        ).dicts()
+        return list(objs)
+

class LLMBundle(object):
    def __init__(self, tenant_id, llm_type, llm_name=None, lang="Chinese"):
@@ -170,8 +182,18 @@ class LLMBundle(object):

    def chat(self, system, history, gen_conf):
        txt, used_tokens = self.mdl.chat(system, history, gen_conf)
-        if TenantLLMService.increase_usage(
+        if not TenantLLMService.increase_usage(
                self.tenant_id, self.llm_type, used_tokens, self.llm_name):
            database_logger.error(
                "Can't update token usage for {}/CHAT".format(self.tenant_id))
        return txt

+    def chat_streamly(self, system, history, gen_conf):
+        for txt in self.mdl.chat_streamly(system, history, gen_conf):
+            if isinstance(txt, int):
+                if not TenantLLMService.increase_usage(
+                        self.tenant_id, self.llm_type, txt, self.llm_name):
+                    database_logger.error(
+                        "Can't update token usage for {}/CHAT".format(self.tenant_id))
+                return
+            yield txt
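`chat_streamly` above assumes the underlying model generator yields text pieces and, as its final item, an `int` carrying the tokens consumed; the bundle books that usage and stops instead of yielding it. A hedged sketch of a consumer (bundle construction is assumed, not shown in this changeset):

```python
# Hypothetical consumer of LLMBundle.chat_streamly above; the int token
# sentinel never reaches this loop because the bundle returns after
# recording usage.
def stream_answer(bundle, system, history, gen_conf):
    answer = ""
    for piece in bundle.chat_streamly(system, history, gen_conf):
        answer = piece  # callers in this changeset assign rather than append
    return answer
```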
@@ -15,13 +15,19 @@
#
import random

-from peewee import Expression
-from api.db.db_models import DB
+from api.db.db_utils import bulk_insert_into_db
+from deepdoc.parser import PdfParser
+from peewee import JOIN
+from api.db.db_models import DB, File2Document, File
from api.db import StatusEnum, FileType, TaskStatus
from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
-from api.utils import current_timestamp
+from api.utils import current_timestamp, get_uuid
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
+from rag.settings import SVR_QUEUE_NAME
+from rag.utils.minio_conn import MINIO
+from rag.utils.redis_conn import REDIS_CONN


class TaskService(CommonService):
@@ -29,7 +35,7 @@ class TaskService(CommonService):

    @classmethod
    @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
+    def get_tasks(cls, task_id):
        fields = [
            cls.model.id,
            cls.model.doc_id,
@@ -48,47 +54,38 @@ class TaskService(CommonService):
            Tenant.img2txt_id,
            Tenant.asr_id,
            cls.model.update_time]
-        with DB.lock("get_task", -1):
-            docs = cls.model.select(*fields) \
-                .join(Document, on=(cls.model.doc_id == Document.id)) \
-                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
-                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
-                .where(
-                    Document.status == StatusEnum.VALID.value,
-                    Document.run == TaskStatus.RUNNING.value,
-                    ~(Document.type == FileType.VIRTUAL.value),
-                    cls.model.progress == 0,
-                    #cls.model.update_time >= tm,
-                    #(Expression(cls.model.create_time, "%%", comm) == mod)
-                )\
-                .order_by(cls.model.update_time.asc())\
-                .paginate(0, items_per_page)
-            docs = list(docs.dicts())
-            if not docs: return []
-            if not takeit: return docs
-
-            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
-                cls.model.id == docs[0]["id"]).execute()
-            return docs
+        docs = cls.model.select(*fields) \
+            .join(Document, on=(cls.model.doc_id == Document.id)) \
+            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
+            .where(cls.model.id == task_id)
+        docs = list(docs.dicts())
+        if not docs: return []
+
+        cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
+                         progress=random.random() / 10.).where(
+            cls.model.id == docs[0]["id"]).execute()
+        return docs

    @classmethod
    @DB.connection_context()
    def get_ongoing_doc_name(cls):
        with DB.lock("get_task", -1):
-            docs = cls.model.select(*[Document.kb_id, Document.location]) \
+            docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
                .join(Document, on=(cls.model.doc_id == Document.id)) \
+                .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
+                .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
                .where(
                    Document.status == StatusEnum.VALID.value,
                    Document.run == TaskStatus.RUNNING.value,
                    ~(Document.type == FileType.VIRTUAL.value),
-                    cls.model.progress >= 0,
                    cls.model.progress < 1,
-                    cls.model.create_time >= current_timestamp() - 180000
+                    cls.model.create_time >= current_timestamp() - 1000 * 600
                )
            docs = list(docs.dicts())
            if not docs: return []

-            return list(set([(d["kb_id"], d["location"]) for d in docs]))
+            return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs]))

    @classmethod
    @DB.connection_context()
@@ -99,7 +96,7 @@ class TaskService(CommonService):
            return doc.run == TaskStatus.CANCEL.value or doc.progress < 0
        except Exception as e:
            pass
-        return True
+        return False

    @classmethod
    @DB.connection_context()
@@ -111,3 +108,55 @@ class TaskService(CommonService):
        if "progress" in info:
            cls.model.update(progress=info["progress"]).where(
                cls.model.id == id).execute()
+
+
+def queue_tasks(doc, bucket, name):
+    def new_task():
+        nonlocal doc
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"]
+        }
+    tsks = []
+
+    if doc["type"] == FileType.PDF.value:
+        file_bin = MINIO.get(bucket, name)
+        do_layout = doc["parser_config"].get("layout_recognize", True)
+        pages = PdfParser.total_page_number(doc["name"], file_bin)
+        page_size = doc["parser_config"].get("task_page_size", 12)
+        if doc["parser_id"] == "paper":
+            page_size = doc["parser_config"].get("task_page_size", 22)
+        if doc["parser_id"] == "one":
+            page_size = 1000000000
+        if not do_layout:
+            page_size = 1000000000
+        page_ranges = doc["parser_config"].get("pages")
+        if not page_ranges:
+            page_ranges = [(1, 100000)]
+        for s, e in page_ranges:
+            s -= 1
+            s = max(0, s)
+            e = min(e - 1, pages)
+            for p in range(s, e, page_size):
+                task = new_task()
+                task["from_page"] = p
+                task["to_page"] = min(p + page_size, e)
+                tsks.append(task)
+
+    elif doc["parser_id"] == "table":
+        file_bin = MINIO.get(bucket, name)
+        rn = RAGFlowExcelParser.row_number(
+            doc["name"], file_bin)
+        for i in range(0, rn, 3000):
+            task = new_task()
+            task["from_page"] = i
+            task["to_page"] = min(i + 3000, rn)
+            tsks.append(task)
+    else:
+        tsks.append(new_task())
+
+    bulk_insert_into_db(Task, tsks, True)
+    DocumentService.begin2parse(doc["id"])
+
+    for t in tsks:
+        assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t), "Can't access Redis. Please check the Redis' status."
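`queue_tasks` above shards a PDF into page-range tasks of `task_page_size` pages (and spreadsheets into 3000-row slices) before pushing them to Redis. The splitting arithmetic in isolation, as a sketch with the storage and queue calls left out:

```python
# Sketch of the page-range splitting performed by queue_tasks above
# (pure arithmetic; MinIO, Redis and the Task model are omitted).
def split_pages(total_pages, page_ranges=None, page_size=12):
    if not page_ranges:
        page_ranges = [(1, 100000)]
    tasks = []
    for s, e in page_ranges:
        s = max(0, s - 1)          # configured ranges are 1-based
        e = min(e - 1, total_pages)
        for p in range(s, e, page_size):
            tasks.append({"from_page": p, "to_page": min(p + page_size, e)})
    return tasks

# split_pages(30) -> [{'from_page': 0, 'to_page': 12},
#                     {'from_page': 12, 'to_page': 24},
#                     {'from_page': 24, 'to_page': 30}]
```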
@@ -18,10 +18,14 @@ import logging
import os
import signal
import sys
+import time
import traceback
+from concurrent.futures import ThreadPoolExecutor

from werkzeug.serving import run_simple
from api.apps import app
from api.db.runtime_config import RuntimeConfig
+from api.db.services.document_service import DocumentService
from api.settings import (
    HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
)
@@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
from api.db.init_data import init_web_data
from api.versions import get_versions


+def update_progress():
+    while True:
+        time.sleep(1)
+        try:
+            DocumentService.update_progress()
+        except Exception as e:
+            stat_logger.error("update_progress exception:" + str(e))
+
+
if __name__ == '__main__':
    print("""
        ____ ______ __
@@ -71,6 +85,9 @@ if __name__ == '__main__':
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

+    thr = ThreadPoolExecutor(max_workers=1)
+    thr.submit(update_progress)
+
    # start http server
    try:
        stat_logger.info("RAG Flow http server start...")
@@ -32,7 +32,7 @@ access_logger = getLogger("access")
database_logger = getLogger("database")
chat_logger = getLogger("chat")

-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
from rag.nlp import search
from api.utils import get_base_config, decrypt_database_config

@@ -86,6 +86,12 @@ default_llm = {
        "embedding_model": "",
        "image2text_model": "",
        "asr_model": "",
+    },
+    "DeepSeek": {
+        "chat_model": "deepseek-chat",
+        "embedding_model": "BAAI/bge-large-zh-v1.5",
+        "image2text_model": "",
+        "asr_model": "",
    }
}
LLM = get_base_config("user_default_llm", {})
@@ -152,6 +158,7 @@ CLIENT_AUTHENTICATION = AUTHENTICATION_CONF.get(
    "switch", False)
HTTP_APP_KEY = AUTHENTICATION_CONF.get("client", {}).get("http_app_key")
GITHUB_OAUTH = get_base_config("oauth", {}).get("github")
+FEISHU_OAUTH = get_base_config("oauth", {}).get("feishu")
WECHAT_OAUTH = get_base_config("oauth", {}).get("wechat")

# site
@@ -25,7 +25,6 @@ from flask import (
from werkzeug.http import HTTP_STATUS_CODES

from api.utils import json_dumps
-from api.versions import get_rag_version
from api.settings import RetCode
from api.settings import (
    REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC,
@@ -84,9 +83,6 @@ def request(**kwargs):
    return sess.send(prepped, stream=stream, timeout=timeout)


-rag_version = get_rag_version() or ''
-
-
def get_exponential_backoff_interval(retries, full_jitter=False):
    """Calculate the exponential backoff wait time."""
    # Will be zero if factor equals 0
@@ -19,7 +19,7 @@ import os
import re
from io import BytesIO

-import fitz
+import pdfplumber
from PIL import Image
from cachetools import LRUCache, cached
from ruamel.yaml import YAML
@@ -66,6 +66,15 @@ def get_rag_python_directory(*args):
    return get_rag_directory("python", *args)


+def get_home_cache_dir():
+    dir = os.path.join(os.path.expanduser('~'), ".ragflow")
+    try:
+        os.mkdir(dir)
+    except OSError as error:
+        pass
+    return dir
+
+
@cached(cache=LRUCache(maxsize=10))
def load_json_conf(conf_path):
    if os.path.isabs(conf_path):
@@ -147,7 +156,7 @@ def filename_type(filename):
        return FileType.PDF.value

    if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md)$", filename):
+            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename):
        return FileType.DOC.value

    if re.match(
@@ -155,17 +164,17 @@ def filename_type(filename):
        return FileType.AURAL.value

    if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
-        return FileType.VISUAL
+        return FileType.VISUAL.value
+
+    return FileType.OTHER.value


def thumbnail(filename, blob):
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
-        pdf = fitz.open(stream=blob, filetype="pdf")
-        pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
+        pdf = pdfplumber.open(BytesIO(blob))
        buffered = BytesIO()
-        Image.frombytes("RGB", [pix.width, pix.height],
-                        pix.samples).save(buffered, format="png")
+        pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
        return "data:image/png;base64," + \
            base64.b64encode(buffered.getvalue()).decode("utf-8")
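For reference, the reworked helpers above can be exercised directly; the sample path below is a placeholder:

```python
# Hedged usage sketch for the pdfplumber-based thumbnail() and the extended
# filename_type() above; "sample.pdf" is a placeholder path.
from api.utils.file_utils import filename_type, thumbnail

with open("sample.pdf", "rb") as f:
    blob = f.read()

print(filename_type("sample.pdf"))         # FileType.PDF.value
print(thumbnail("sample.pdf", blob)[:40])  # "data:image/png;base64,..." inline preview
```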
@@ -14,17 +14,15 @@
# limitations under the License.
#
import os

import dotenv
import typing

from api.utils.file_utils import get_project_base_directory


def get_versions() -> typing.Mapping[str, typing.Any]:
-    return dotenv.dotenv_values(
-        dotenv_path=os.path.join(get_project_base_directory(), "rag.env")
-    )
+    dotenv.load_dotenv(dotenv.find_dotenv())
+    return dotenv.dotenv_values()


def get_rag_version() -> typing.Optional[str]:
-    return get_versions().get("RAG")
+    return get_versions().get("RAGFLOW_VERSION", "dev")
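With the change above, `get_versions()` simply returns whatever key/value pairs python-dotenv finds in the nearest `.env` file, and `get_rag_version()` falls back to `"dev"` when `RAGFLOW_VERSION` is absent. A quick sketch (the version string is a made-up example):

```python
# Assuming a .env file reachable by dotenv that contains, for example:
#   RAGFLOW_VERSION=v0.3.0   (example value, not asserted by this changeset)
from api.versions import get_versions, get_rag_version

print(get_versions())     # full mapping parsed from the .env file
print(get_rag_version())  # "v0.3.0", or "dev" if the key is missing
```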
@@ -13,12 +13,12 @@ minio:
  user: 'rag_flow'
  password: 'infini_rag_flow'
  host: 'minio:9000'
+es:
+  hosts: 'http://es01:9200'
redis:
  db: 1
  password: 'infini_rag_flow'
  host: 'redis:6379'
-es:
-  hosts: 'http://es01:9200'
user_default_llm:
  factory: 'Tongyi-Qianwen'
  api_key: 'sk-xxxxxxxxxxxxx'
@@ -28,6 +28,12 @@ oauth:
    client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
    secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxx
    url: https://github.com/login/oauth/access_token
+  feishu:
+    app_id: cli_xxxxxxxxxxxxxxxxxxx
+    app_secret: xxxxxxxxxxxxxxxxxxxxxxxxxxxx
+    app_access_token_url: https://open.feishu.cn/open-apis/auth/v3/app_access_token/internal
+    user_access_token_url: https://open.feishu.cn/open-apis/authen/v1/oidc/access_token
+    grant_type: 'authorization_code'
authentication:
  client:
    switch: false
@@ -38,4 +44,4 @@ authentication:
  permission:
    switch: false
-    component: false
+    dataset: false
@@ -1,6 +1,6 @@


-from .pdf_parser import HuParser as PdfParser, PlainParser
-from .docx_parser import HuDocxParser as DocxParser
-from .excel_parser import HuExcelParser as ExcelParser
-from .ppt_parser import HuPptParser as PptParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
+from .docx_parser import RAGFlowDocxParser as DocxParser
+from .excel_parser import RAGFlowExcelParser as ExcelParser
+from .ppt_parser import RAGFlowPptParser as PptParser
@@ -3,11 +3,11 @@ from docx import Document
import re
import pandas as pd
from collections import Counter
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
from io import BytesIO


-class HuDocxParser:
+class RAGFlowDocxParser:

    def __extract_table_content(self, tb):
        df = []
@@ -35,14 +35,14 @@ class HuDocxParser:
            for p, n in patt:
                if re.search(p, b):
                    return n
-            tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
+            tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

-            if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"
@ -6,31 +6,40 @@ from io import BytesIO
|
|||||||
from rag.nlp import find_codec
|
from rag.nlp import find_codec
|
||||||
|
|
||||||
|
|
||||||
class HuExcelParser:
|
class RAGFlowExcelParser:
|
||||||
def html(self, fnm):
|
def html(self, fnm, chunk_rows=256):
|
||||||
if isinstance(fnm, str):
|
if isinstance(fnm, str):
|
||||||
wb = load_workbook(fnm)
|
wb = load_workbook(fnm)
|
||||||
else:
|
else:
|
||||||
wb = load_workbook(BytesIO(fnm))
|
wb = load_workbook(BytesIO(fnm))
|
||||||
tb = ""
|
|
||||||
|
tb_chunks = []
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
rows = list(ws.rows)
|
rows = list(ws.rows)
|
||||||
if not rows:continue
|
if not rows: continue
|
||||||
tb += f"<table><caption>{sheetname}</caption><tr>"
|
|
||||||
|
tb_rows_0 = "<tr>"
|
||||||
for t in list(rows[0]):
|
for t in list(rows[0]):
|
||||||
tb += f"<th>{t.value}</th>"
|
tb_rows_0 += f"<th>{t.value}</th>"
|
||||||
tb += "</tr>"
|
tb_rows_0 += "</tr>"
|
||||||
for r in list(rows[1:]):
|
|
||||||
tb += "<tr>"
|
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
|
||||||
for i, c in enumerate(r):
|
tb = ""
|
||||||
if c.value is None:
|
tb += f"<table><caption>{sheetname}</caption>"
|
||||||
tb += "<td></td>"
|
tb += tb_rows_0
|
||||||
else:
|
for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
|
||||||
tb += f"<td>{c.value}</td>"
|
tb += "<tr>"
|
||||||
tb += "</tr>"
|
for i, c in enumerate(r):
|
||||||
tb += "</table>\n"
|
if c.value is None:
|
||||||
return tb
|
tb += "<td></td>"
|
||||||
|
else:
|
||||||
|
tb += f"<td>{c.value}</td>"
|
||||||
|
tb += "</tr>"
|
||||||
|
tb += "</table>\n"
|
||||||
|
tb_chunks.append(tb)
|
||||||
|
|
||||||
|
return tb_chunks
|
||||||
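The rewritten `html()` above splits each sheet's data rows into blocks of `chunk_rows` (default 256), repeats the sheet caption and header row at the top of every block, and returns a list of HTML tables rather than one long string. A standalone sketch of the partition arithmetic (not the parser itself):

```python
# Standalone sketch of the row partitioning used by the new html(): everything
# after the header row is grouped into blocks of chunk_rows rows.
def partition_rows(n_rows, chunk_rows=256):
    # n_rows counts all rows, including the header row at index 0.
    n_chunks = (n_rows - 1) // chunk_rows + 1
    return [range(1 + i * chunk_rows, min(1 + (i + 1) * chunk_rows, n_rows))
            for i in range(n_chunks)]

# A sheet with a header row plus 600 data rows (601 rows in total) yields three
# chunks covering rows 1-256, 257-512, and 513-600.
print([(r.start, r.stop - 1) for r in partition_rows(601)])
```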
|
|
||||||
def __call__(self, fnm):
|
def __call__(self, fnm):
|
||||||
if isinstance(fnm, str):
|
if isinstance(fnm, str):
|
||||||
@ -69,10 +78,10 @@ class HuExcelParser:
|
|||||||
|
|
||||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
return len(txt.split("\n"))
|
return len(txt.split("\n"))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
psr = HuExcelParser()
|
psr = RAGFlowExcelParser()
|
||||||
psr(sys.argv[1])
|
psr(sys.argv[1])
|
||||||
|
|||||||
@ -2,7 +2,6 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import fitz
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import torch
|
import torch
|
||||||
@ -16,14 +15,14 @@ from PyPDF2 import PdfReader as pdf2_read
|
|||||||
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
class HuParser:
|
class RAGFlowPdfParser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.ocr = OCR()
|
self.ocr = OCR()
|
||||||
if hasattr(self, "model_speciess"):
|
if hasattr(self, "model_speciess"):
|
||||||
@ -95,13 +94,13 @@ class HuParser:
|
|||||||
h = max(self.__height(up), self.__height(down))
|
h = max(self.__height(up), self.__height(down))
|
||||||
y_dis = self._y_dis(up, down)
|
y_dis = self._y_dis(up, down)
|
||||||
LEN = 6
|
LEN = 6
|
||||||
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
|
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
|
||||||
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
|
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
|
||||||
tks_all = up["text"][-LEN:].strip() \
|
tks_all = up["text"][-LEN:].strip() \
|
||||||
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
||||||
up["text"][-1] + down["text"][0]) else "") \
|
up["text"][-1] + down["text"][0]) else "") \
|
||||||
+ down["text"][:LEN].strip()
|
+ down["text"][:LEN].strip()
|
||||||
tks_all = huqie.qie(tks_all).split(" ")
|
tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
|
||||||
fea = [
|
fea = [
|
||||||
up.get("R", -1) == down.get("R", -1),
|
up.get("R", -1) == down.get("R", -1),
|
||||||
y_dis / h,
|
y_dis / h,
|
||||||
@ -142,8 +141,8 @@ class HuParser:
|
|||||||
tks_down[-1] == tks_up[-1],
|
tks_down[-1] == tks_up[-1],
|
||||||
max(down["in_row"], up["in_row"]),
|
max(down["in_row"], up["in_row"]),
|
||||||
abs(down["in_row"] - up["in_row"]),
|
abs(down["in_row"] - up["in_row"]),
|
||||||
len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
|
len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
|
||||||
len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
|
len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
|
||||||
]
|
]
|
||||||
return fea
|
return fea
|
||||||
|
|
||||||
@ -470,7 +469,8 @@ class HuParser:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
|
if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
|
||||||
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
|
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
|
||||||
|
or not down["text"].strip():
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -598,7 +598,7 @@ class HuParser:
|
|||||||
|
|
||||||
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
||||||
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
||||||
or huqie.is_chinese(b["text"].strip()[0]) \
|
or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
|
||||||
or b["top"] > b_["bottom"]:
|
or b["top"] > b_["bottom"]:
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
@ -749,6 +749,7 @@ class HuParser:
|
|||||||
"layoutno", "")))
|
"layoutno", "")))
|
||||||
|
|
||||||
left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
|
left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
|
||||||
|
if right < left: right = left + 1
|
||||||
poss.append((pn + self.page_from, left, right, top, bott))
|
poss.append((pn + self.page_from, left, right, top, bott))
|
||||||
return self.page_images[pn] \
|
return self.page_images[pn] \
|
||||||
.crop((left * ZM, top * ZM,
|
.crop((left * ZM, top * ZM,
|
||||||
@ -921,9 +922,7 @@ class HuParser:
|
|||||||
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
||||||
return len(pdf.pages)
|
return len(pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pdf = fitz.open(fnm) if not binary else fitz.open(
|
logging.error(str(e))
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
return len(pdf)
|
|
||||||
|
|
||||||
def __images__(self, fnm, zoomin=3, page_from=0,
|
def __images__(self, fnm, zoomin=3, page_from=0,
|
||||||
page_to=299, callback=None):
|
page_to=299, callback=None):
|
||||||
@ -945,23 +944,7 @@ class HuParser:
|
|||||||
self.pdf.pages[page_from:page_to]]
|
self.pdf.pages[page_from:page_to]]
|
||||||
self.total_page = len(self.pdf.pages)
|
self.total_page = len(self.pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.pdf = fitz.open(fnm) if isinstance(
|
logging.error(str(e))
|
||||||
fnm, str) else fitz.open(
|
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
self.page_images = []
|
|
||||||
self.page_chars = []
|
|
||||||
mat = fitz.Matrix(zoomin, zoomin)
|
|
||||||
self.total_page = len(self.pdf)
|
|
||||||
for i, page in enumerate(self.pdf):
|
|
||||||
if i < page_from:
|
|
||||||
continue
|
|
||||||
if i >= page_to:
|
|
||||||
break
|
|
||||||
pix = page.get_pixmap(matrix=mat)
|
|
||||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
|
||||||
pix.samples)
|
|
||||||
self.page_images.append(img)
|
|
||||||
self.page_chars.append([])
|
|
||||||
|
|
||||||
self.outlines = []
|
self.outlines = []
|
||||||
try:
|
try:
|
||||||
|
|||||||
@ -14,7 +14,7 @@ from io import BytesIO
|
|||||||
from pptx import Presentation
|
from pptx import Presentation
|
||||||
|
|
||||||
|
|
||||||
class HuPptParser(object):
|
class RAGFlowPptParser(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
import re,json,os
|
import re,json,os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from . import regions
|
from . import regions
|
||||||
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
||||||
@ -22,14 +22,14 @@ def baike(cid, default_v=0):
|
|||||||
def corpNorm(nm, add_region=True):
|
def corpNorm(nm, add_region=True):
|
||||||
global CORP_TKS
|
global CORP_TKS
|
||||||
if not nm or type(nm)!=type(""):return ""
|
if not nm or type(nm)!=type(""):return ""
|
||||||
nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
|
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
|
||||||
nm = re.sub(r"&", "&", nm)
|
nm = re.sub(r"&", "&", nm)
|
||||||
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
||||||
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
||||||
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
||||||
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
||||||
|
|
||||||
tks = huqie.qie(nm).split(" ")
|
tks = rag_tokenizer.tokenize(nm).split(" ")
|
||||||
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
||||||
nm = ""
|
nm = ""
|
||||||
for t in tks:
|
for t in tks:
|
||||||
|
|||||||
@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
|
|||||||
traceback, signal
|
traceback, signal
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||||
from rag.nlp import huqie, surname
|
from rag.nlp import rag_tokenizer, surname
|
||||||
from xpinyin import Pinyin
|
from xpinyin import Pinyin
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ def forEdu(cv):
|
|||||||
if n.get("school_name") and isinstance(n["school_name"], str):
|
if n.get("school_name") and isinstance(n["school_name"], str):
|
||||||
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
||||||
e["sch_nm_kwd"] = sch[-1]
|
e["sch_nm_kwd"] = sch[-1]
|
||||||
fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
|
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
|
||||||
|
|
||||||
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
||||||
maj.append(n["discipline_name"])
|
maj.append(n["discipline_name"])
|
||||||
@ -166,10 +166,10 @@ def forEdu(cv):
|
|||||||
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||||
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
||||||
|
|
||||||
if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
|
if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
|
||||||
if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
|
if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
|
||||||
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
|
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
|
||||||
if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
|
if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
|
||||||
|
|
||||||
return cv
|
return cv
|
||||||
|
|
||||||
@ -187,11 +187,11 @@ def forProj(cv):
|
|||||||
if n.get("achivement"): desc.append(str(n["achivement"]))
|
if n.get("achivement"): desc.append(str(n["achivement"]))
|
||||||
|
|
||||||
if pro_nms:
|
if pro_nms:
|
||||||
# cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
|
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
|
||||||
cv["project_name_tks"] = huqie.qie(pro_nms[0])
|
cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
|
||||||
if desc:
|
if desc:
|
||||||
cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
|
cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
|
||||||
cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
|
cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
|
||||||
|
|
||||||
return cv
|
return cv
|
||||||
|
|
||||||
@ -280,25 +280,25 @@ def forWork(cv):
|
|||||||
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
||||||
|
|
||||||
if fea["position_name"]:
|
if fea["position_name"]:
|
||||||
cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
|
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
|
||||||
cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
|
cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
|
||||||
cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
|
cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
|
||||||
|
|
||||||
if fea["industry_name"]:
|
if fea["industry_name"]:
|
||||||
cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
|
cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
|
||||||
cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
|
cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
|
||||||
cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
|
cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
|
||||||
|
|
||||||
if fea["corporation_name"]:
|
if fea["corporation_name"]:
|
||||||
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
||||||
cv["corp_nm_kwd"] = fea["corporation_name"]
|
cv["corp_nm_kwd"] = fea["corporation_name"]
|
||||||
cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
|
cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
|
||||||
cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
|
cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
|
||||||
cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
|
cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
|
||||||
|
|
||||||
if fea["responsibilities"]:
|
if fea["responsibilities"]:
|
||||||
cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
|
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
|
||||||
cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
|
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
|
||||||
|
|
||||||
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
||||||
re.match(r"[^0-9]+$", str(i))]
|
re.match(r"[^0-9]+$", str(i))]
|
||||||
@ -444,15 +444,15 @@ def parse(cv):
|
|||||||
if nms:
|
if nms:
|
||||||
t = k[:-4]
|
t = k[:-4]
|
||||||
cv[f"{t}_kwd"] = nms
|
cv[f"{t}_kwd"] = nms
|
||||||
cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
|
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
||||||
cv[k] = []
|
cv[k] = []
|
||||||
|
|
||||||
# tokenize fields
|
# tokenize fields
|
||||||
if k in tks_fld:
|
if k in tks_fld:
|
||||||
cv[f"{k}_tks"] = huqie.qie(cv[k])
|
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
|
||||||
if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
|
if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
|
||||||
|
|
||||||
# keyword fields
|
# keyword fields
|
||||||
if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
|
if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
|
||||||
@ -492,7 +492,7 @@ def parse(cv):
|
|||||||
cv["name_kwd"] = name
|
cv["name_kwd"] = name
|
||||||
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
||||||
cv["name_tks"] = (
|
cv["name_tks"] = (
|
||||||
huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
||||||
) if name else ""
|
) if name else ""
|
||||||
else:
|
else:
|
||||||
cv["integerity_flt"] /= 2.
|
cv["integerity_flt"] /= 2.
|
||||||
@ -515,7 +515,7 @@ def parse(cv):
|
|||||||
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
||||||
# long text tokenize
|
# long text tokenize
|
||||||
|
|
||||||
if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
|
if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
|
||||||
|
|
||||||
# for yes or no field
|
# for yes or no field
|
||||||
fea = []
|
fea = []
|
||||||
|
|||||||
@ -1,12 +1,13 @@
|
|||||||
|
import pdfplumber
|
||||||
|
|
||||||
from .ocr import OCR
|
from .ocr import OCR
|
||||||
from .recognizer import Recognizer
|
from .recognizer import Recognizer
|
||||||
from .layout_recognizer import LayoutRecognizer
|
from .layout_recognizer import LayoutRecognizer
|
||||||
from .table_structure_recognizer import TableStructureRecognizer
|
from .table_structure_recognizer import TableStructureRecognizer
|
||||||
|
|
||||||
|
|
||||||
def init_in_out(args):
|
def init_in_out(args):
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import fitz
|
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
from api.utils.file_utils import traversal_files
|
from api.utils.file_utils import traversal_files
|
||||||
@ -18,13 +19,11 @@ def init_in_out(args):
|
|||||||
|
|
||||||
def pdf_pages(fnm, zoomin=3):
|
def pdf_pages(fnm, zoomin=3):
|
||||||
nonlocal outputs, images
|
nonlocal outputs, images
|
||||||
pdf = fitz.open(fnm)
|
pdf = pdfplumber.open(fnm)
|
||||||
mat = fitz.Matrix(zoomin, zoomin)
|
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||||
for i, page in enumerate(pdf):
|
enumerate(pdf.pages)]
|
||||||
pix = page.get_pixmap(matrix=mat)
|
|
||||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
for i, page in enumerate(images):
|
||||||
pix.samples)
|
|
||||||
images.append(img)
|
|
||||||
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
||||||
|
|
||||||
def images_and_outputs(fnm):
|
def images_and_outputs(fnm):
|
||||||
|
|||||||
@ -11,10 +11,6 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
from deepdoc.vision.seeit import draw_box
|
|
||||||
from deepdoc.vision import OCR, init_in_out
|
|
||||||
import argparse
|
|
||||||
import numpy as np
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(
|
sys.path.insert(
|
||||||
@ -25,6 +21,11 @@ sys.path.insert(
|
|||||||
os.path.abspath(__file__)),
|
os.path.abspath(__file__)),
|
||||||
'../../')))
|
'../../')))
|
||||||
|
|
||||||
|
from deepdoc.vision.seeit import draw_box
|
||||||
|
from deepdoc.vision import OCR, init_in_out
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
ocr = OCR()
|
ocr = OCR()
|
||||||
|
|||||||
@ -10,17 +10,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
import os, sys
|
||||||
from deepdoc.vision.seeit import draw_box
|
|
||||||
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import re
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
sys.path.insert(
|
sys.path.insert(
|
||||||
0,
|
0,
|
||||||
os.path.abspath(
|
os.path.abspath(
|
||||||
@ -29,6 +19,13 @@ sys.path.insert(
|
|||||||
os.path.abspath(__file__)),
|
os.path.abspath(__file__)),
|
||||||
'../../')))
|
'../../')))
|
||||||
|
|
||||||
|
from deepdoc.vision.seeit import draw_box
|
||||||
|
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
|
||||||
|
from api.utils.file_utils import get_project_base_directory
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
images, outputs = init_in_out(args)
|
images, outputs = init_in_out(args)
|
||||||
|
|||||||
@ -19,7 +19,7 @@ import numpy as np
|
|||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from .recognizer import Recognizer
|
from .recognizer import Recognizer
|
||||||
|
|
||||||
|
|
||||||
@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
|
|||||||
for p, n in patt:
|
for p, n in patt:
|
||||||
if re.search(p, b["text"].strip()):
|
if re.search(p, b["text"].strip()):
|
||||||
return n
|
return n
|
||||||
tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
|
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
|
||||||
if len(tks) > 3:
|
if len(tks) > 3:
|
||||||
if len(tks) < 12:
|
if len(tks) < 12:
|
||||||
return "Tx"
|
return "Tx"
|
||||||
else:
|
else:
|
||||||
return "Lx"
|
return "Lx"
|
||||||
|
|
||||||
if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
|
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
||||||
return "Nr"
|
return "Nr"
|
||||||
|
|
||||||
return "Ot"
|
return "Ot"
|
||||||
|
|||||||
@ -25,9 +25,11 @@ MINIO_PORT=9000
|
|||||||
MINIO_USER=rag_flow
|
MINIO_USER=rag_flow
|
||||||
MINIO_PASSWORD=infini_rag_flow
|
MINIO_PASSWORD=infini_rag_flow
|
||||||
|
|
||||||
|
REDIS_PASSWORD=infini_rag_flow
|
||||||
|
|
||||||
SVR_HTTP_PORT=9380
|
SVR_HTTP_PORT=9380
|
||||||
|
|
||||||
RAGFLOW_VERSION=v0.3.2
|
RAGFLOW_VERSION=0.6.0
|
||||||
|
|
||||||
TIMEZONE='Asia/Shanghai'
|
TIMEZONE='Asia/Shanghai'
|
||||||
|
|
||||||
|
|||||||
@ -50,7 +50,7 @@ The serving port of mysql inside the container. The modification should be synch
|
|||||||
The max database connection.
|
The max database connection.
|
||||||
|
|
||||||
### stale_timeout
|
### stale_timeout
|
||||||
The timeout duation in seconds.
|
The timeout duration in seconds.
|
||||||
|
|
||||||
## minio
|
## minio
|
||||||
|
|
||||||
|
|||||||
@ -29,24 +29,6 @@ services:
|
|||||||
- ragflow
|
- ragflow
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
#kibana:
|
|
||||||
# depends_on:
|
|
||||||
# es01:
|
|
||||||
# condition: service_healthy
|
|
||||||
# image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
|
||||||
# container_name: ragflow-kibana
|
|
||||||
# volumes:
|
|
||||||
# - kibanadata:/usr/share/kibana/data
|
|
||||||
# ports:
|
|
||||||
# - ${KIBANA_PORT}:5601
|
|
||||||
# environment:
|
|
||||||
# - SERVERNAME=kibana
|
|
||||||
# - ELASTICSEARCH_HOSTS=http://es01:9200
|
|
||||||
# - TZ=${TIMEZONE}
|
|
||||||
# mem_limit: ${MEM_LIMIT}
|
|
||||||
# networks:
|
|
||||||
# - ragflow
|
|
||||||
|
|
||||||
mysql:
|
mysql:
|
||||||
image: mysql:5.7.18
|
image: mysql:5.7.18
|
||||||
container_name: ragflow-mysql
|
container_name: ragflow-mysql
|
||||||
@ -74,7 +56,6 @@ services:
|
|||||||
retries: 3
|
retries: 3
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
|
|
||||||
minio:
|
minio:
|
||||||
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
||||||
container_name: ragflow-minio
|
container_name: ragflow-minio
|
||||||
@ -92,16 +73,27 @@ services:
|
|||||||
- ragflow
|
- ragflow
|
||||||
restart: always
|
restart: always
|
||||||
|
|
||||||
|
redis:
|
||||||
|
image: redis:7.2.4
|
||||||
|
container_name: ragflow-redis
|
||||||
|
command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
|
||||||
|
volumes:
|
||||||
|
- redis_data:/data
|
||||||
|
networks:
|
||||||
|
- ragflow
|
||||||
|
restart: always
|
||||||
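With the `redis` service added above (password taken from `REDIS_PASSWORD` in **docker/.env**, database 1 per **docker/service_conf.yml**), a quick connectivity check might look like the following. This is a sketch under assumptions: it uses the `redis` Python client and must run from a container on the `ragflow` network, where the hostname `redis` resolves; from the host machine the port would first need to be published:

```python
# Quick Redis connectivity check. Host "redis" resolves inside the compose
# network only; db=1 and the password mirror service_conf.yml and docker/.env.
# Assumes the redis Python client is installed (pip install redis).
import redis

r = redis.Redis(host="redis", port=6379, db=1, password="infini_rag_flow")
print(r.ping())  # True when the container is reachable and the password matches
```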
|
|
||||||
|
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
esdata01:
|
esdata01:
|
||||||
driver: local
|
driver: local
|
||||||
# kibanadata:
|
|
||||||
# driver: local
|
|
||||||
mysql_data:
|
mysql_data:
|
||||||
driver: local
|
driver: local
|
||||||
minio_data:
|
minio_data:
|
||||||
driver: local
|
driver: local
|
||||||
|
redis_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
ragflow:
|
ragflow:
|
||||||
|
|||||||
@ -4,36 +4,24 @@
|
|||||||
|
|
||||||
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/
|
export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/
|
||||||
|
|
||||||
PY=/root/miniconda3/envs/py11/bin/python
|
PY=python3
|
||||||
|
if [[ -z "$WS" || $WS -lt 1 ]]; then
|
||||||
|
WS=1
|
||||||
|
fi
|
||||||
|
|
||||||
function task_exe(){
|
function task_exe(){
|
||||||
while [ 1 -eq 1 ];do
|
while [ 1 -eq 1 ];do
|
||||||
$PY rag/svr/task_executor.py $1 $2;
|
$PY rag/svr/task_executor.py ;
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
function watch_broker(){
|
|
||||||
while [ 1 -eq 1 ];do
|
|
||||||
C=`ps aux|grep "task_broker.py"|grep -v grep|wc -l`;
|
|
||||||
if [ $C -lt 1 ];then
|
|
||||||
$PY rag/svr/task_broker.py &
|
|
||||||
fi
|
|
||||||
sleep 5;
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
function task_bro(){
|
|
||||||
watch_broker;
|
|
||||||
}
|
|
||||||
|
|
||||||
task_bro &
|
|
||||||
|
|
||||||
WS=1
|
|
||||||
for ((i=0;i<WS;i++))
|
for ((i=0;i<WS;i++))
|
||||||
do
|
do
|
||||||
task_exe $i $WS &
|
task_exe &
|
||||||
done
|
done
|
||||||
|
|
||||||
$PY api/ragflow_server.py
|
while [ 1 -eq 1 ];do
|
||||||
|
$PY api/ragflow_server.py
|
||||||
|
done
|
||||||
|
|
||||||
wait;
|
wait;
|
||||||
|
|||||||
@ -13,12 +13,12 @@ minio:
|
|||||||
user: 'rag_flow'
|
user: 'rag_flow'
|
||||||
password: 'infini_rag_flow'
|
password: 'infini_rag_flow'
|
||||||
host: 'minio:9000'
|
host: 'minio:9000'
|
||||||
|
es:
|
||||||
|
hosts: 'http://es01:9200'
|
||||||
redis:
|
redis:
|
||||||
db: 1
|
db: 1
|
||||||
password: 'infini_rag_flow'
|
password: 'infini_rag_flow'
|
||||||
host: 'redis:6379'
|
host: 'redis:6379'
|
||||||
es:
|
|
||||||
hosts: 'http://es01:9200'
|
|
||||||
user_default_llm:
|
user_default_llm:
|
||||||
factory: 'Tongyi-Qianwen'
|
factory: 'Tongyi-Qianwen'
|
||||||
api_key: 'sk-xxxxxxxxxxxxx'
|
api_key: 'sk-xxxxxxxxxxxxx'
|
||||||
@ -38,4 +38,4 @@ authentication:
|
|||||||
permission:
|
permission:
|
||||||
switch: false
|
switch: false
|
||||||
component: false
|
component: false
|
||||||
dataset: false
|
dataset: false
|
||||||
|
|||||||
132
docs/configure_knowledge_base.md
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
# Configure a knowledge base
|
||||||
|
|
||||||
|
Knowledge base, hallucination-free chat, and file management are three pillars of RAGFlow. RAGFlow's AI chats are based on knowledge bases. Each of RAGFlow's knowledge bases serves as a knowledge source, *parsing* files uploaded from your local machine and file references generated in **File Management** into the real 'knowledge' for future AI chats. This guide demonstrates some basic usages of the knowledge base feature, covering the following topics:
|
||||||
|
|
||||||
|
- Create a knowledge base
|
||||||
|
- Configure a knowledge base
|
||||||
|
- Search for a knowledge base
|
||||||
|
- Delete a knowledge base
|
||||||
|
|
||||||
|
## Create knowledge base
|
||||||
|
|
||||||
|
With multiple knowledge bases, you can build more flexible, diversified question answering. To create your first knowledge base:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
_Each time a knowledge base is created, a folder with the same name is generated in the **root/.knowledgebase** directory._
|
||||||
|
|
||||||
|
## Configure knowledge base
|
||||||
|
|
||||||
|
The following screenshot shows the configuration page of a knowledge base. A proper configuration of your knowledge base is crucial for future AI chats. For example, choosing the wrong embedding model or chunk method would cause unexpected semantic loss or mismatched answers in chats.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
This section covers the following topics:
|
||||||
|
|
||||||
|
- Select chunk method
|
||||||
|
- Select embedding model
|
||||||
|
- Upload file
|
||||||
|
- Parse file
|
||||||
|
- Intervene with file parsing results
|
||||||
|
- Run retrieval testing
|
||||||
|
|
||||||
|
### Select chunk method
|
||||||
|
|
||||||
|
RAGFlow offers multiple chunking templates to facilitate chunking files of different layouts and to ensure semantic integrity. In **Chunk method**, you can choose the default template that suits the layouts and formats of your files. The following table shows the descriptions and the compatible file formats of each supported chunk template:
|
||||||
|
|
||||||
|
| **Template** | Description | File format |
|
||||||
|
| ------------ | ------------------------------------------------------------ | ---------------------------------------------------- |
|
||||||
|
| General | Files are consecutively chunked based on a preset chunk token number. | DOCX, EXCEL, PPT, PDF, TXT, JPEG, JPG, PNG, TIF, GIF |
|
||||||
|
| Q&A | | EXCEL, CSV/TXT |
|
||||||
|
| Manual | | PDF |
|
||||||
|
| Table | | EXCEL, CSV/TXT |
|
||||||
|
| Paper | | PDF |
|
||||||
|
| Book | | DOCX, PDF, TXT |
|
||||||
|
| Laws | | DOCX, PDF, TXT |
|
||||||
|
| Presentation | | PDF, PPTX |
|
||||||
|
| Picture | | JPEG, JPG, PNG, TIF, GIF |
|
||||||
|
| One | The entire document is chunked as one. | DOCX, EXCEL, PDF, TXT |
|
||||||
|
|
||||||
|
You can also change the chunk template for a particular file on the **Datasets** page.
|
||||||
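For the **General** template, "consecutively chunked based on a preset chunk token number" means the text stream is simply cut every N tokens, regardless of headings or sections. The toy chunker below only illustrates that idea; whitespace tokenization and the 128-token limit are assumptions made for the example, not RAGFlow's actual implementation:

```python
# Toy illustration of "General" chunking: cut a token stream into fixed-size
# pieces. Whitespace tokenization and chunk_token_num=128 are assumptions for
# the example; RAGFlow's real chunker is configurable and layout-aware.
def general_chunks(text, chunk_token_num=128):
    tokens = text.split()
    return [" ".join(tokens[i:i + chunk_token_num])
            for i in range(0, len(tokens), chunk_token_num)]

doc = "word " * 300
print(len(general_chunks(doc)))  # 3 chunks: 128 + 128 + 44 tokens
```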
|
|
||||||
|

|
||||||
|
|
||||||
|
### Select embedding model
|
||||||
|
|
||||||
|
An embedding model builds vector indexes on file chunks. Once you have chosen an embedding model and used it to parse a file, you are no longer allowed to change it. To switch to a different embedding model, you *must* delete all completed file chunks in the knowledge base: every file in a knowledge base has to be parsed with the *same* embedding model so that all chunks are compared in the same embedding space (see the sketch after the model list below).
|
||||||
|
|
||||||
|
The following embedding models can be deployed locally:
|
||||||
|
|
||||||
|
- BAAI/bge-base-en-v1.5
|
||||||
|
- BAAI/bge-large-en-v1.5
|
||||||
|
- BAAI/bge-small-en-v1.5
|
||||||
|
- BAAI/bge-small-zh-v1.5
|
||||||
|
- jinaai/jina-embeddings-v2-base-en
|
||||||
|
- jinaai/jina-embeddings-v2-small-en
|
||||||
|
- nomic-ai/nomic-embed-text-v1.5
|
||||||
|
- sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
- maidalun1020/bce-embedding-base_v1
|
||||||
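The reason the embedding model is locked is that similarity scores are only meaningful between vectors produced by the same model. A brief sketch of the comparison step, assuming `sentence-transformers` is installed and using one of the locally deployable models listed above:

```python
# Sketch of why chunks and queries must share one embedding space: cosine
# similarity is only comparable when both sides come from the same model.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("BAAI/bge-small-en-v1.5")
chunk_vecs = model.encode(["RAGFlow parses files into chunks.",
                           "Redis is optional for RAGFlow."])
query_vec = model.encode("How does RAGFlow chunk documents?")
print(util.cos_sim(query_vec, chunk_vecs))  # the first chunk scores higher
```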
|
|
||||||
|
### Upload file
|
||||||
|
|
||||||
|
- RAGFlow's **File Management** allows you to link a file to multiple knowledge bases, in which case each target knowledge base holds a reference to the file.
|
||||||
|
- In **Knowledge Base**, you are also given the option of uploading a single file or a folder of files (bulk upload) from your local machine to a knowledge base, in which case the knowledge base holds file copies.
|
||||||
|
|
||||||
|
While uploading files directly to a knowledge base seems more convenient, we *highly* recommend uploading files to **File Management** and then linking them to the target knowledge bases. This way, you can avoid permanently deleting files uploaded to the knowledge base.
|
||||||
|
|
||||||
|
### Parse file
|
||||||
|
|
||||||
|
File parsing is a crucial topic in knowledge base configuration. The meaning of file parsing in RAGFlow is twofold: chunking files based on file layout and building embedding and full-text (keyword) indexes on these chunks. After having selected the chunk method and embedding model, you can start parsing a file:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- Click the play button next to **UNSTART** to start file parsing.
|
||||||
|
- If your file parsing stalls for a long time, click the red-cross icon and then refresh.
|
||||||
|
- As shown above, RAGFlow allows you to use a different chunk method for a particular file, offering flexibility beyond the default method.
|
||||||
|
- As shown above, RAGFlow allows you to enable or disable individual files, offering finer control over knowledge base-based AI chats.
|
||||||
|
|
||||||
|
### Intervene with file parsing results
|
||||||
|
|
||||||
|
RAGFlow features visibility and explainability, allowing you to view the chunking results and intervene where necessary. To do so:
|
||||||
|
|
||||||
|
1. Click a file that has completed parsing to view its chunking results:
|
||||||
|
|
||||||
|
_You are taken to the **Chunk** page:_
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
2. Hover over each snapshot for a quick view of each chunk.
|
||||||
|
|
||||||
|
3. Double click the chunked texts to add keywords or make *manual* changes where necessary:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
4. In Retrieval testing, ask a quick question in **Test text** to double check if your configurations work:
|
||||||
|
|
||||||
|
_As you can tell from the following, RAGFlow responds with truthful citations._
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### Run retrieval testing
|
||||||
|
|
||||||
|
RAGFlow uses multiple recall of both full-text search and vector search in its chats. Prior to setting up an AI chat, consider adjusting the following parameters to ensure that the intended information always turns up in answers:
|
||||||
|
|
||||||
|
- Similarity threshold: Chunks with similarities below the threshold are filtered out. Set to 0.2 by default.
|
||||||
|
- Vector similarity weight: The percentage by which vector similarity contributes to the overall score. Set to 0.3 by default (see the sketch after this list).
|
||||||
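In other words, the final ranking blends keyword and vector similarity, and anything scoring below the threshold is dropped. The sketch below is a simplified illustration of how those two knobs interact with the default values; the linear blend is an assumption for the example, not RAGFlow's exact scoring formula:

```python
# Simplified illustration of the two retrieval knobs; the linear blend is an
# assumption for the example, not RAGFlow's exact formula.
def blended_score(keyword_sim, vector_sim, vector_weight=0.3):
    return (1 - vector_weight) * keyword_sim + vector_weight * vector_sim

threshold = 0.2
for kw, vec in [(0.5, 0.8), (0.1, 0.4), (0.05, 0.1)]:
    score = blended_score(kw, vec)
    verdict = "kept" if score >= threshold else "filtered"
    print(f"keyword={kw} vector={vec} -> {score:.2f} ({verdict})")
```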
|
|
||||||
|

|
||||||
|
|
||||||
|
## Search for knowledge base
|
||||||
|
|
||||||
|
As of RAGFlow v0.5.0, the search feature is still in a rudimentary form, supporting only knowledge base search by name.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Delete knowledge base
|
||||||
|
|
||||||
|
You are allowed to delete a knowledge base. Hover your mouse over the three-dot icon of the intended knowledge base card and the **Delete** option appears. Once you delete a knowledge base, the associated folder under the **root/.knowledgebase** directory is AUTOMATICALLY REMOVED. The consequence is:
|
||||||
|
|
||||||
|
- The files uploaded directly to the knowledge base are gone;
|
||||||
|
- The file references, which you created from within **File Management**, are gone, but the associated files still exist in **File Management**.
|
||||||
|
|
||||||
|

|
||||||
@ -220,7 +220,10 @@ This will be called to get the answer to users' questions.
|
|||||||
| name | type | optional | description|
|
| name | type | optional | description|
|
||||||
|------|-------|----|----|
|
|------|-------|----|----|
|
||||||
| conversation_id| string | No | This is from calling /new_conversation.|
|
| conversation_id| string | No | This is from calling /new_conversation.|
|
||||||
| messages| json | No | All the conversation history stored here including the latest user's question.|
|
| messages| json | No | The latest question, such as `[{"role": "user", "content": "How are you doing!"}]`|
|
||||||
|
| quote | bool | Yes | Default: true |
|
||||||
|
| stream | bool | Yes | Default: true |
|
||||||
|
| doc_ids | string | Yes | Document IDs delimited by commas, e.g., `c790da40ea8911ee928e0242ac180005,c790da40ea8911ee928e0242ac180005`. The retrieved content is limited to these documents. |
|
||||||
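A minimal call putting these parameters together might look as follows. The host, port, completion path, and the `Authorization` header are placeholders and assumptions for the example; only the parameters themselves come from the table above:

```python
# Sketch of a completion request with the new parameters. The endpoint path and
# Authorization header are placeholders; conversation_id comes from
# /new_conversation as described above.
import requests

COMPLETION_URL = "http://<RAGFLOW_HOST>:9380/<completion_endpoint>"  # placeholder

payload = {
    "conversation_id": "<id returned by /new_conversation>",
    "messages": [{"role": "user", "content": "How are you doing!"}],
    "quote": True,
    "stream": False,
    "doc_ids": "c790da40ea8911ee928e0242ac180005,c790da40ea8911ee928e0242ac180005",
}
resp = requests.post(COMPLETION_URL,
                     headers={"Authorization": "Bearer <API_KEY>"},
                     json=payload)
print(resp.json())
```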
|
|
||||||
### Response
|
### Response
|
||||||
```json
|
```json
|
||||||
@ -314,10 +317,12 @@ This is usually used when upload a file to.
|
|||||||
|
|
||||||
### Parameter:
|
### Parameter:
|
||||||
|
|
||||||
| name | type | optional | description |
|
| name | type | optional | description |
|
||||||
|---------|--------|----------|----------------------------------------|
|
|-----------|--------|----------|---------------------------------------------------------|
|
||||||
| file | file | No | Upload file. |
|
| file | file | No | Upload file. |
|
||||||
| kb_name | string | No | Choose the upload knowledge base name. |
|
| kb_name | string | No | Choose the upload knowledge base name. |
|
||||||
|
| parser_id | string | Yes | Choose the parsing method. |
|
||||||
|
| run | string | Yes | Parsing will start automatically when the value is "1". |
|
||||||
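A hedged upload example using the two new parameters; the host, endpoint path, `Authorization` header, and the `parser_id` value are placeholders for illustration:

```python
# Sketch of a file upload with parser_id and run. The endpoint path, auth
# header, and parser_id value are placeholders, not confirmed by this document.
import requests

UPLOAD_URL = "http://<RAGFLOW_HOST>:9380/<upload_endpoint>"  # placeholder

with open("RL-Cache.pdf", "rb") as f:
    resp = requests.post(
        UPLOAD_URL,
        headers={"Authorization": "Bearer <API_KEY>"},
        files={"file": f},
        data={"kb_name": "my_kb", "parser_id": "<chunk_method_id>", "run": "1"},
    )
print(resp.json())
```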
|
|
||||||
### Response
|
### Response
|
||||||
```json
|
```json
|
||||||
@ -360,4 +365,39 @@ This is usually used when upload a file to.
|
|||||||
"retmsg": "success"
|
"retmsg": "success"
|
||||||
}
|
}
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Get document chunks
|
||||||
|
|
||||||
|
Get the chunks of the document based on doc_name or doc_id.
|
||||||
|
### Path: /api/list_chunks/
|
||||||
|
### Method: POST
|
||||||
|
|
||||||
|
### Parameter:
|
||||||
|
|
||||||
|
| Name | Type | Optional | Description |
|
||||||
|
|----------|--------|----------|---------------------------------|
|
||||||
|
| `doc_name` | string | Yes | The name of the document in the knowledge base. It must not be empty if `doc_id` is not set.|
|
||||||
|
| `doc_id` | string | Yes | The ID of the document in the knowledge base. It must not be empty if `doc_name` is not set.|
|
||||||
|
|
||||||
|
|
||||||
|
### Response
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"data": [
|
||||||
|
{
|
||||||
|
"content": "Figure 14: Per-request neural-net processingof RL-Cache.\n103\n(sn)\nCPU\n 102\nGPU\n8101\n100\n8\n16 64 256 1K\n4K",
|
||||||
|
"doc_name": "RL-Cache.pdf",
|
||||||
|
"img_id": "0335167613f011ef91240242ac120006-b46c3524952f82dbe061ce9b123f2211"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"content": "4.3 ProcessingOverheadof RL-CacheACKNOWLEDGMENTSThis section evaluates how eectively our RL-Cache implemen-tation leverages modern multi-core CPUs and GPUs to keep the per-request neural-net processing overhead low. Figure 14 depictsThis researchwas supported inpart by the Regional Government of Madrid (grant P2018/TCS-4499, EdgeData-CM)andU.S. National Science Foundation (grants CNS-1763617 andCNS-1717179).REFERENCES",
|
||||||
|
"doc_name": "RL-Cache.pdf",
|
||||||
|
"img_id": "0335167613f011ef91240242ac120006-d4c12c43938eb55d2d8278eea0d7e6d7"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"retcode": 0,
|
||||||
|
"retmsg": "success"
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
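For completeness, a request to this endpoint by document name could look like the sketch below. The host, port, and `Authorization` header are assumptions, and since the parameter encoding is not specified here, a JSON body is assumed:

```python
# Sketch of an /api/list_chunks/ request by doc_name. Host, port, auth header,
# and the JSON body format are assumptions for the example.
import requests

resp = requests.post(
    "http://<RAGFLOW_HOST>:9380/api/list_chunks/",
    headers={"Authorization": "Bearer <API_KEY>"},
    json={"doc_name": "RL-Cache.pdf"},
)
for chunk in resp.json().get("data", []):
    print(chunk["doc_name"], chunk["content"][:60])
```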
|
|||||||
137
docs/faq.md
@ -55,7 +55,7 @@ This feature and the related APIs are still in development. Contributions are we
|
|||||||
```
|
```
|
||||||
$ git clone https://github.com/infiniflow/ragflow.git
|
$ git clone https://github.com/infiniflow/ragflow.git
|
||||||
$ cd ragflow
|
$ cd ragflow
|
||||||
$ docker build -t infiniflow/ragflow:v0.3.2 .
|
$ docker build -t infiniflow/ragflow:latest .
|
||||||
$ cd ragflow/docker
|
$ cd ragflow/docker
|
||||||
$ chmod +x ./entrypoint.sh
|
$ chmod +x ./entrypoint.sh
|
||||||
$ docker compose up -d
|
$ docker compose up -d
|
||||||
@ -186,25 +186,40 @@ Parsing requests have to wait in queue due to limited server resources. We are c
|
|||||||
|
|
||||||
If your RAGFlow is deployed *locally*, try the following:
|
If your RAGFlow is deployed *locally*, try the following:
|
||||||
|
|
||||||
1. Check the log of your RAGFlow server to see if it is running properly:
|
1. Click the red cross icon next to **Parsing Status** and refresh the file parsing process.
|
||||||
```bash
|
2. If the issue still persists, try the following:
|
||||||
docker logs -f ragflow-server
|
- Check the log of your RAGFlow server to see if it is running properly:
|
||||||
```
|
```bash
|
||||||
2. Check if the **task_executor.py** process exists.
|
docker logs -f ragflow-server
|
||||||
3. Check if your RAGFlow server can access hf-mirror.com or huggingface.com.
|
```
|
||||||
|
- Check if the **task_executor.py** process exists.
|
||||||
|
- Check if your RAGFlow server can access hf-mirror.com or huggingface.com.
|
||||||
|
|
||||||
|
#### 4.5 Why does my pdf parsing stall near completion, while the log does not show any error?
|
||||||
|
|
||||||
#### 4.5 `Index failure`
|
If your RAGFlow is deployed *locally*, the parsing process is likely killed due to insufficient RAM. Try raising the memory allocation by increasing the `MEM_LIMIT` value in **docker/.env**.
|
||||||
|
|
||||||
|
> Ensure that you restart your RAGFlow server for your changes to take effect!
|
||||||
|
> ```bash
|
||||||
|
> docker compose stop
|
||||||
|
> ```
|
||||||
|
> ```bash
|
||||||
|
> docker compose up -d
|
||||||
|
> ```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
#### 4.6 `Index failure`
|
||||||
|
|
||||||
An index failure usually indicates an unavailable Elasticsearch service.
|
An index failure usually indicates an unavailable Elasticsearch service.
|
||||||
|
|
||||||
#### 4.6 How to check the log of RAGFlow?
|
#### 4.7 How to check the log of RAGFlow?
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
tail -f path_to_ragflow/docker/ragflow-logs/rag/*.log
|
tail -f path_to_ragflow/docker/ragflow-logs/rag/*.log
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4.7 How to check the status of each component in RAGFlow?
|
#### 4.8 How to check the status of each component in RAGFlow?
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ docker ps
|
$ docker ps
|
||||||
@ -212,13 +227,13 @@ $ docker ps
|
|||||||
*The system displays the following if all your RAGFlow components are running properly:*
|
*The system displays the following if all your RAGFlow components are running properly:*
|
||||||
|
|
||||||
```
|
```
|
||||||
5bc45806b680 infiniflow/ragflow:v0.3.2 "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
|
5bc45806b680 infiniflow/ragflow:latest "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
|
||||||
91220e3285dd docker.elastic.co/elasticsearch/elasticsearch:8.11.3 "/bin/tini -- /usr/l…" 11 hours ago Up 11 hours (healthy) 9300/tcp, 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp ragflow-es-01
|
91220e3285dd docker.elastic.co/elasticsearch/elasticsearch:8.11.3 "/bin/tini -- /usr/l…" 11 hours ago Up 11 hours (healthy) 9300/tcp, 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp ragflow-es-01
|
||||||
d8c86f06c56b mysql:5.7.18 "docker-entrypoint.s…" 7 days ago Up 16 seconds (healthy) 0.0.0.0:3306->3306/tcp, :::3306->3306/tcp ragflow-mysql
|
d8c86f06c56b mysql:5.7.18 "docker-entrypoint.s…" 7 days ago Up 16 seconds (healthy) 0.0.0.0:3306->3306/tcp, :::3306->3306/tcp ragflow-mysql
|
||||||
cd29bcb254bc quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z "/usr/bin/docker-ent…" 2 weeks ago Up 11 hours 0.0.0.0:9001->9001/tcp, :::9001->9001/tcp, 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp ragflow-minio
|
cd29bcb254bc quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z "/usr/bin/docker-ent…" 2 weeks ago Up 11 hours 0.0.0.0:9001->9001/tcp, :::9001->9001/tcp, 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp ragflow-minio
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4.8 `Exception: Can't connect to ES cluster`
|
#### 4.9 `Exception: Can't connect to ES cluster`
|
||||||
|
|
||||||
1. Check the status of your Elasticsearch component:
|
1. Check the status of your Elasticsearch component:
|
||||||
|
|
||||||
@ -245,23 +260,26 @@ $ docker ps
|
|||||||
curl http://<IP_OF_ES>:<PORT_OF_ES>
|
curl http://<IP_OF_ES>:<PORT_OF_ES>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### 4.10 Can't start ES container and get `Elasticsearch did not exit normally`
|
||||||
|
|
||||||
#### 4.9 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
|
This is because you forgot to update the `vm.max_map_count` value in **/etc/sysctl.conf** and your change to this value was reset after a system reboot.
|
||||||
|
|
||||||
Your IP address or port number may be incorrect. If you are using the default configurations, enter http://<IP_OF_YOUR_MACHINE> (**NOT 9380, AND NO PORT NUMBER REQUIRED!**) in your browser. This should work.
|
#### 4.11 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
|
||||||
|
|
||||||
#### 4.10 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`
|
Your IP address or port number may be incorrect. If you are using the default configurations, enter `http://<IP_OF_YOUR_MACHINE>` (**NOT 9380, AND NO PORT NUMBER REQUIRED!**) in your browser. This should work.
|
||||||
|
|
||||||
|
#### 4.12 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`
|
||||||
|
|
||||||
A correct Ollama IP address and port is crucial to adding models to Ollama:
|
A correct Ollama IP address and port is crucial to adding models to Ollama:
|
||||||
|
|
||||||
- If you are on demo.ragflow.io, ensure that the server hosting Ollama has a publicly accessible IP address. Note that 127.0.0.1 is not a publicly accessible IP address.
|
- If you are on demo.ragflow.io, ensure that the server hosting Ollama has a publicly accessible IP address. Note that 127.0.0.1 is not a publicly accessible IP address.
|
||||||
- If you deploy RAGFlow locally, ensure that Ollama and RAGFlow are in the same LAN and can communicate with each other.
|
- If you deploy RAGFlow locally, ensure that Ollama and RAGFlow are in the same LAN and can communicate with each other.
|
||||||
|
|
||||||
#### 4.11 Do you offer examples of using deepdoc to parse PDF or other files?
|
#### 4.13 Do you offer examples of using deepdoc to parse PDF or other files?
|
||||||
|
|
||||||
Yes, we do. See the Python files under the **rag/app** folder.
|
Yes, we do. See the Python files under the **rag/app** folder.
|
||||||
|
|
||||||
#### 4.12 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?
|
#### 4.14 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?
|
||||||
|
|
||||||
You probably forgot to update the **MAX_CONTENT_LENGTH** environment variable:
|
You probably forgot to update the **MAX_CONTENT_LENGTH** environment variable:
|
||||||
|
|
||||||
@ -280,7 +298,7 @@ docker compose up ragflow -d
|
|||||||
```
|
```
|
||||||
*Now you should be able to upload files of sizes less than 100MB.*
|
*Now you should be able to upload files of sizes less than 100MB.*
|
||||||
|
|
||||||
#### 4.13 `Table 'rag_flow.document' doesn't exist`
|
#### 4.15 `Table 'rag_flow.document' doesn't exist`
|
||||||
|
|
||||||
This exception occurs when starting up the RAGFlow server. Try the following:
|
This exception occurs when starting up the RAGFlow server. Try the following:
|
||||||
|
|
||||||
@ -303,7 +321,7 @@ This exception occurs when starting up the RAGFlow server. Try the following:
|
|||||||
docker compose up
|
docker compose up
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4.14 `hint : 102 Fail to access model Connection error`
|
#### 4.16 `hint : 102 Fail to access model Connection error`
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@ -311,6 +329,13 @@ This exception occurs when starting up the RAGFlow server. Try the following:
|
|||||||
2. Do not forget to append **/v1/** to **http://IP:port**:
|
2. Do not forget to append **/v1/** to **http://IP:port**:
|
||||||
**http://IP:port/v1/**
|
**http://IP:port/v1/**
|
||||||
|
|
||||||
|
#### 4.17 `FileNotFoundError: [Errno 2] No such file or directory`
|
||||||
|
|
||||||
|
1. Check if the status of your minio container is healthy:
|
||||||
|
```bash
|
||||||
|
docker ps
|
||||||
|
```
|
||||||
|
2. Ensure that the username and password settings of MySQL and MinIO in **docker/.env** are in line with those in **docker/service_conf.yml**.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@ -340,10 +365,78 @@ You can use Ollama to deploy local LLM. See [here](https://github.com/infiniflow
|
|||||||
|
|
||||||
### 6. How to configure RAGFlow to respond with 100% matched results, rather than utilizing LLM?
|
### 6. How to configure RAGFlow to respond with 100% matched results, rather than utilizing LLM?
|
||||||
|
|
||||||
1. Click the **Knowledge Base** tab in the middle top of the page.
|
1. Click **Knowledge Base** in the middle top of the page.
|
||||||
2. Right click the desired knowledge base to display the **Configuration** dialogue.
|
2. Right click the desired knowledge base to display the **Configuration** dialogue.
|
||||||
3. Choose **Q&A** as the chunk method and click **Save** to confirm your change.
|
3. Choose **Q&A** as the chunk method and click **Save** to confirm your change.
|
||||||
|
|
||||||
### Do I need to connect to Redis?
|
### 7. Do I need to connect to Redis?
|
||||||
|
|
||||||
No, connecting to Redis is not required to use RAGFlow.
|
No, connecting to Redis is not required.
|
||||||
|
|
||||||
|
### 8. `Error: Range of input length should be [1, 30000]`
|
||||||
|
|
||||||
|
This error occurs because there are too many chunks matching your search criteria. Try reducing the **TopN** and increasing **Similarity threshold** to fix this issue:
|
||||||
|
|
||||||
|
1. Click **Chat** in the middle top of the page.
|
||||||
|
2. Right click the desired conversation > **Edit** > **Prompt Engine**
|
||||||
|
3. Reduce the **TopN** and/or raise the **Similarity threshold**.
|
||||||
|
4. Click **OK** to confirm your changes.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
### 9. How to upgrade RAGFlow?
|
||||||
|
|
||||||
|
You can upgrade RAGFlow to either the dev version or the latest version:
|
||||||
|
|
||||||
|
- Dev versions are for developers and contributors. They are published on a nightly basis and may crash because they are not fully tested. We cannot guarantee their validity, and you are at your own risk trying out the latest, untested features.
|
||||||
|
- The latest version refers to the most recent, officially published release. It is stable and works best with regular users.
|
||||||
|
|
||||||
|
|
||||||
|
To upgrade RAGFlow to the dev version:
|
||||||
|
|
||||||
|
1. Pull the latest source code
|
||||||
|
```bash
|
||||||
|
cd ragflow
|
||||||
|
git pull
|
||||||
|
```
|
||||||
|
2. If you used `docker compose up -d` to start up RAGFlow server:
|
||||||
|
```bash
|
||||||
|
docker pull infiniflow/ragflow:dev
|
||||||
|
```
|
||||||
|
```bash
|
||||||
|
docker compose up ragflow -d
|
||||||
|
```
|
||||||
|
3. If you used `docker compose -f docker-compose-CN.yml up -d` to start up RAGFlow server:
|
||||||
|
```bash
|
||||||
|
docker pull swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:dev
|
||||||
|
```
|
||||||
|
```bash
|
||||||
|
docker compose -f docker-compose-CN.yml up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
To upgrade RAGFlow to the latest version:
|
||||||
|
|
||||||
|
1. Update **ragflow/docker/.env** as follows:
|
||||||
|
```bash
|
||||||
|
RAGFLOW_VERSION=latest
|
||||||
|
```
|
||||||
|
2. Pull the latest source code:
|
||||||
|
```bash
|
||||||
|
cd ragflow
|
||||||
|
git pull
|
||||||
|
```
|
||||||
|
|
||||||
|
3. If you used `docker compose up -d` to start up RAGFlow server:
|
||||||
|
```bash
|
||||||
|
docker pull infiniflow/ragflow:latest
|
||||||
|
```
|
||||||
|
```bash
|
||||||
|
docker compose up ragflow -d
|
||||||
|
```
|
||||||
|
4. If you used `docker compose -f docker-compose-CN.yml up -d` to start up RAGFlow server:
|
||||||
|
```bash
|
||||||
|
docker pull swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:latest
|
||||||
|
```
|
||||||
|
```bash
|
||||||
|
docker compose -f docker-compose-CN.yml up -d
|
||||||
|
```
|
||||||
|
|||||||
79
docs/manage_files.md
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
# Manage files
|
||||||
|
|
||||||
|
Knowledge base, hallucination-free chat, and file management are three pillars of RAGFlow. RAGFlow's file management allows you to upload files individually or in bulk. You can then link an uploaded file to multiple target knowledge bases. This guide showcases some basic usages of the file management feature.
|
||||||
|
|
||||||
|
## Create folder
|
||||||
|
|
||||||
|
RAGFlow's file management allows you to establish your file system with nested folder structures. To create a folder in the root directory of RAGFlow:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
> Each knowledge base in RAGFlow has a corresponding folder under the **root/.knowledgebase** directory. You are not allowed to create a subfolder within it.
|
||||||
|
|
||||||
|
## Upload file
|
||||||
|
|
||||||
|
RAGFlow's file management supports file uploads from your local machine, allowing both individual and bulk uploads:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Preview file
|
||||||
|
|
||||||
|
RAGFlow's file management supports previewing files in the following formats:
|
||||||
|
|
||||||
|
- Documents (PDF, DOCX)
|
||||||
|
- Tables (XLSX)
|
||||||
|
- Pictures (JPEG, JPG, PNG, TIF, GIF)
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Link file to knowledge bases
|
||||||
|
|
||||||
|
RAGFlow's file management allows you to *link* an uploaded file to multiple knowledge bases, creating a file reference in each target knowledge base. Therefore, deleting a file in your file management will AUTOMATICALLY REMOVE all related file references across the knowledge bases.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
You can link your file to one knowledge base or multiple knowledge bases at one time:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Move file to specified folder
|
||||||
|
|
||||||
|
As of RAGFlow v0.5.0, this feature is *not* available.
|
||||||
|
|
||||||
|
## Search files or folders
|
||||||
|
|
||||||
|
As of RAGFlow v0.5.0, the search feature is still in a rudimentary form, supporting only file and folder search in the current directory by name (files or folders in the child directory will not be retrieved).
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Rename file or folder
|
||||||
|
|
||||||
|
RAGFlow's file management allows you to rename a file or folder:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## Delete files or folders
|
||||||
|
|
||||||
|
RAGFlow's file management allows you to delete files or folders individually or in bulk.
|
||||||
|
|
||||||
|
To delete a file or folder:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
To bulk delete files or folders:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
> - You are not allowed to delete the **root/.knowledgebase** folder.
|
||||||
|
> - Deleting files that have been linked to knowledge bases will AUTOMATICALLY REMOVE all associated file references across the knowledge bases.
|
||||||
|
|
||||||
|
## Download uploaded file
|
||||||
|
|
||||||
|
RAGFlow's file management allows you to download an uploaded file:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
> As of RAGFlow v0.5.0, bulk download is not supported, nor can you download an entire folder.
|
||||||
203
docs/quickstart.md
Normal file
@ -0,0 +1,203 @@
|
|||||||
|
# Quickstart
|
||||||
|
|
||||||
|
RAGFlow is an open-source RAG (Retrieval-Augmented Generation) engine based on deep document understanding. When integrated with LLMs, it is capable of providing truthful question-answering capabilities, backed by well-founded citations from various complex formatted data.
|
||||||
|
|
||||||
|
This quick start guide describes a general process from:
|
||||||
|
|
||||||
|
- Starting up a local RAGFlow server,
|
||||||
|
- Creating a knowledge base,
|
||||||
|
- Intervening with file parsing, to
|
||||||
|
- Establishing an AI chat based on your datasets.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- CPU >= 4 cores
|
||||||
|
- RAM >= 16 GB
|
||||||
|
- Disk >= 50 GB
|
||||||
|
- Docker >= 24.0.0 & Docker Compose >= v2.26.1
|
||||||
|
|
||||||
|
> If you have not installed Docker on your local machine (Windows, Mac, or Linux), see [Install Docker Engine](https://docs.docker.com/engine/install/).
|
||||||
|
|
||||||
|
## Start up the server
|
||||||
|
|
||||||
|
1. Ensure `vm.max_map_count` >= 262144 ([more](./docs/max_map_count.md)):
|
||||||
|
|
||||||
|
> To check the value of `vm.max_map_count`:
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> $ sysctl vm.max_map_count
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> Reset `vm.max_map_count` to a value at least 262144 if it is not.
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> # In this case, we set it to 262144:
|
||||||
|
> $ sudo sysctl -w vm.max_map_count=262144
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> This change will be reset after a system reboot. To ensure your change remains permanent, add or update the `vm.max_map_count` value in **/etc/sysctl.conf** accordingly:
|
||||||
|
>
|
||||||
|
> ```bash
|
||||||
|
> vm.max_map_count=262144
|
||||||
|
> ```
|
||||||
|
|
||||||
|
2. Clone the repo:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ git clone https://github.com/infiniflow/ragflow.git
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Start up the server using the pre-built Docker images:
|
||||||
|
|
||||||
|
> Running the following commands automatically downloads the *dev* version RAGFlow Docker image. To download and run a specified Docker version, update `RAGFLOW_VERSION` in **docker/.env** to the intended version, for example `RAGFLOW_VERSION=v0.6.0`, before running the following commands.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cd ragflow/docker
|
||||||
|
$ chmod +x ./entrypoint.sh
|
||||||
|
$ docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
> The core image is about 9 GB in size and may take a while to load.
|
||||||
|
|
||||||
|
4. Check the server status after having the server up and running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ docker logs -f ragflow-server
|
||||||
|
```
|
||||||
|
|
||||||
|
_The following output confirms a successful launch of the system:_
|
||||||
|
|
||||||
|
```bash
|
||||||
|
____ ______ __
|
||||||
|
/ __ \ ____ _ ____ _ / ____// /____ _ __
|
||||||
|
/ /_/ // __ `// __ `// /_ / // __ \| | /| / /
|
||||||
|
/ _, _// /_/ // /_/ // __/ / // /_/ /| |/ |/ /
|
||||||
|
/_/ |_| \__,_/ \__, //_/ /_/ \____/ |__/|__/
|
||||||
|
/____/
|
||||||
|
|
||||||
|
* Running on all addresses (0.0.0.0)
|
||||||
|
* Running on http://127.0.0.1:9380
|
||||||
|
* Running on http://x.x.x.x:9380
|
||||||
|
INFO:werkzeug:Press CTRL+C to quit
|
||||||
|
```
|
||||||
|
|
||||||
|
> If you skip this confirmation step and directly log in to RAGFlow, your browser may prompt a `network anomaly` error because, at that moment, your RAGFlow may not be fully initialized.
|
||||||
|
|
||||||
|
5. In your web browser, enter the IP address of your server and log in to RAGFlow.
|
||||||
|
|
||||||
|
> - With default settings, you only need to enter `http://IP_OF_YOUR_MACHINE` (**sans** port number), because the default HTTP serving port `80` can be omitted.
|
||||||
|
|
||||||
|
## Configure LLMs
|
||||||
|
|
||||||
|
RAGFlow is a RAG engine, and it needs to work with an LLM to offer grounded, hallucination-free question-answering capabilities. For now, RAGFlow supports the following LLMs, and the list is expanding:
|
||||||
|
|
||||||
|
- OpenAI
|
||||||
|
- Tongyi-Qianwen
|
||||||
|
- Moonshot
|
||||||
|
- DeepSeek-V2
|
||||||
|
|
||||||
|
> RAGFlow also supports deploying LLMs locally using Ollama or Xinference, but this part is not covered in this quick start guide.
|
||||||
|
|
||||||
|
To add and configure an LLM:
|
||||||
|
|
||||||
|
1. Click on your logo on the top right of the page **>** **Model Providers**:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
> Each RAGFlow account can use **text-embedding-v2**, an embedding model of Tongyi-Qianwen, for free. This is why Tongyi-Qianwen appears in the **Added models** list. You may need to update your Tongyi-Qianwen API key at a later point.
|
||||||
|
|
||||||
|
2. Click on the desired LLM and update the API key accordingly (DeepSeek-V2 in this case):
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
*Your added models appear as follows:*
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
3. Click **System Model Settings** to select the default models:
|
||||||
|
|
||||||
|
- Chat model,
|
||||||
|
- Embedding model,
|
||||||
|
- Image-to-text model.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
> Some models, such as the image-to-text model **qwen-vl-max**, are subsidiary to a particular LLM. You may need to update your API key accordingly to use these models.
|
||||||
|
|
||||||
|
## Create your first knowledge base
|
||||||
|
|
||||||
|
You are allowed to upload files to a knowledge base in RAGFlow and parse them into datasets. A knowledge base is virtually a collection of datasets. Question answering in RAGFlow can be based on a particular knowledge base or multiple knowledge bases. File formats that RAGFlow supports include documents (PDF, DOC, DOCX, TXT, MD), tables (CSV, XLSX, XLS), pictures (JPEG, JPG, PNG, TIF, GIF), and slides (PPT, PPTX).
|
||||||
|
|
||||||
|
To create your first knowledge base:
|
||||||
|
|
||||||
|
1. Click the **Knowledge Base** tab in the top middle of the page **>** **Create knowledge base**.
|
||||||
|
|
||||||
|
2. Input the name of your knowledge base and click **OK** to confirm your changes.
|
||||||
|
|
||||||
|
_You are taken to the **Configuration** page of your knowledge base._
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
3. RAGFlow offers multiple chunk templates that cater to different document layouts and file formats. Select the embedding model and chunk method (template) for your knowledge base.
|
||||||
|
|
||||||
|
> IMPORTANT: Once you have selected an embedding model and used it to parse a file, you are no longer allowed to change it. The reason is that all files in a specific knowledge base must be parsed with the *same* embedding model, so that they are compared in the same embedding space.
|
||||||
|
|
||||||
|
_You are taken to the **Dataset** page of your knowledge base._
|
||||||
|
|
||||||
|
4. Click **+ Add file** **>** **Local files** to start uploading a particular file to the knowledge base.
|
||||||
|
|
||||||
|
5. In the uploaded file entry, click the play button to start file parsing:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
_When the file parsing completes, its parsing status changes to **SUCCESS**._
|
||||||
|
|
||||||
|
## Intervene with file parsing
|
||||||
|
|
||||||
|
RAGFlow features visibility and explainability, allowing you to view the chunking results and intervene where necessary. To do so:
|
||||||
|
|
||||||
|
1. Click on a file that has completed parsing to view its chunking results:
|
||||||
|
|
||||||
|
_You are taken to the **Chunk** page:_
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
2. Hover over each snapshot for a quick view of each chunk.
|
||||||
|
|
||||||
|
3. Double click the chunked texts to add keywords or make *manual* changes where necessary:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
4. In Retrieval testing, ask a quick question in **Test text** to double-check whether your configurations work:
|
||||||
|
|
||||||
|
_As you can tell from the following, RAGFlow responds with truthful citations._
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Set up an AI chat
|
||||||
|
|
||||||
|
Conversations in RAGFlow are based on a particular knowledge base or multiple knowledge bases. Once you have created your knowledge base and finished file parsing, you can go ahead and start an AI conversation.
|
||||||
|
|
||||||
|
1. Click the **Chat** tab in the middle top of the page **>** **Create an assistant** to show the **Chat Configuration** dialogue *of your next dialogue*.
|
||||||
|
> RAGFlow offers the flexibility of choosing a different chat model for each dialogue, while allowing you to set the default models in **System Model Settings**.
|
||||||
|
|
||||||
|
2. Update **Assistant Setting**:
|
||||||
|
|
||||||
|
- Name your assistant and specify your knowledge bases.
|
||||||
|
- **Empty response**:
|
||||||
|
- If you wish to *confine* RAGFlow's answers to your knowledge bases, leave a response here. Then when it doesn't retrieve an answer, it *uniformly* responds with what you set here (see the fallback sketch at the end of this guide).
|
||||||
|
- If you wish RAGFlow to *improvise* when it doesn't retrieve an answer from your knowledge bases, leave it blank, which may give rise to hallucinations.
|
||||||
|
|
||||||
|
3. Update **Prompt Engine** or leave it as-is to begin with.
|
||||||
|
|
||||||
|
4. Update **Model Setting**.
|
||||||
|
|
||||||
|
5. RAGFlow also offers conversation APIs. Hover over your dialogue **>** **Chat Bot API** to integrate RAGFlow's chat capabilities into your applications:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
6. Now, let's start the show:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
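The **Empty response** setting above boils down to a simple fallback rule. The following is a minimal, hypothetical sketch of that rule, not RAGFlow's actual code; `retrieve()` and `llm_answer()` are placeholder stand-ins for the real retrieval and generation steps.

```python
# Hypothetical sketch of the "Empty response" fallback described above.
# retrieve() and llm_answer() are placeholders, not RAGFlow functions.

def retrieve(question, knowledge_bases):
    # Stand-in for RAGFlow's hybrid search over the selected knowledge bases.
    return []

def llm_answer(question, context=None):
    # Stand-in for a call to the configured chat model.
    return f"LLM answer to {question!r} (grounded: {context is not None})"

def answer(question, knowledge_bases, empty_response=""):
    chunks = retrieve(question, knowledge_bases)
    if chunks:
        return llm_answer(question, context=chunks)  # grounded answer with citations
    if empty_response:
        return empty_response                        # confined to your knowledge bases
    return llm_answer(question, context=None)        # improvised; may hallucinate

print(answer("What is RAGFlow?", ["my_kb"], empty_response="Sorry, nothing relevant was found."))
```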
||||||
54
docs/start_chat.md
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# Start an AI chat
|
||||||
|
|
||||||
|
Knowledge base, hallucination-free chat, and file management are three pillars of RAGFlow. Chats in RAGFlow are based on a particular knowledge base or multiple knowledge bases. Once you have created your knowledge base and finished file parsing, you can go ahead and start an AI conversation.
|
||||||
|
|
||||||
|
## Start an AI chat
|
||||||
|
|
||||||
|
You start an AI conversation by creating an assistant.
|
||||||
|
|
||||||
|
1. Click the **Chat** tab in the middle top of the page **>** **Create an assistant** to show the **Chat Configuration** dialogue *of your next dialogue*.
|
||||||
|
|
||||||
|
> RAGFlow offers you the flexibility of choosing a different chat model for each dialogue, while allowing you to set the default models in **System Model Settings**.
|
||||||
|
|
||||||
|
2. Update **Assistant Setting**:
|
||||||
|
|
||||||
|
- **Assistant name** is the name of your chat assistant. Each assistant corresponds to a dialogue with a unique combination of knowledge bases, prompts, hybrid search configurations, and large model settings.
|
||||||
|
- **Empty response**:
|
||||||
|
- If you wish to *confine* RAGFlow's answers to your knowledge bases, leave a response here. Then when it doesn't retrieve an answer, it *uniformly* responds with what you set here.
|
||||||
|
- If you wish RAGFlow to *improvise* when it doesn't retrieve an answer from your knowledge bases, leave it blank, which may give rise to hallucinations.
|
||||||
|
- **Show Quote**: This is a key feature of RAGFlow and is enabled by default. RAGFlow does not work like a black box; instead, it clearly shows the sources of information that its responses are based on.
|
||||||
|
- Select the corresponding knowledge bases. You can select one or multiple knowledge bases, but ensure that they use the same embedding model; otherwise, an error will occur.
|
||||||
|
|
||||||
|
3. Update **Prompt Engine**:
|
||||||
|
|
||||||
|
- In **System**, fill in the prompts for your LLM; you can also leave the default prompt as-is to begin with.
|
||||||
|
- **Similarity threshold** sets the similarity "bar" for each chunk of text. The default is 0.2. Text chunks with lower similarity scores are filtered out of the final response.
|
||||||
|
- **Vector similarity weight** is set to 0.3 by default. RAGFlow uses a hybrid score system, combining keyword similarity and vector similarity, to evaluate the relevance of different text chunks. This value sets the weight assigned to the vector similarity component in the hybrid score (see the sketch after this procedure).
|
||||||
|
- **Top N** determines the *maximum* number of chunks to feed to the LLM. In other words, even if more chunks are retrieved, only the top N chunks are provided as input.
|
||||||
|
- **Variable**:
|
||||||
|
|
||||||
|
4. Update **Model Setting**:
|
||||||
|
|
||||||
|
- In **Model**, you select the chat model. Though you have selected the default chat model in **System Model Settings**, RAGFlow allows you to choose an alternative chat model for your dialogue.
|
||||||
|
- **Freedom** refers to the degree to which the LLM improvises. Each of the three levels, **Improvise**, **Precise**, and **Balance**, corresponds to a unique combination of **Temperature**, **Top P**, **Presence Penalty**, and **Frequency Penalty** (see the preset sketch after this procedure).
|
||||||
|
- **Temperature**: Level of the prediction randomness of the LLM. The higher the value, the more creative the LLM is.
|
||||||
|
- **Top P** is also known as "nucleus sampling". See [here](https://en.wikipedia.org/wiki/Top-p_sampling) for more information.
|
||||||
|
- **Max Tokens**: The maximum length of the LLM's responses. Note that the responses may be curtailed if this value is set too low.
|
||||||
|
|
||||||
|
5. Now, let's start the show:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|

|
||||||
|
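The retrieval settings above (**Similarity threshold**, **Vector similarity weight**, **Top N**) combine into a single filtering-and-ranking step. Below is a minimal sketch of that logic, not RAGFlow's actual retrieval code; the `keyword_sim` and `vector_sim` fields are hypothetical placeholders for the two similarity components.

```python
# Minimal sketch of hybrid scoring with the defaults mentioned above
# (threshold 0.2, vector weight 0.3). Not RAGFlow's actual retrieval code.

def rank_chunks(chunks, top_n, similarity_threshold=0.2, vector_weight=0.3):
    """Keep at most top_n chunks whose hybrid score clears the threshold."""
    scored = []
    for chunk in chunks:
        hybrid = (1 - vector_weight) * chunk["keyword_sim"] \
                 + vector_weight * chunk["vector_sim"]
        if hybrid >= similarity_threshold:
            scored.append((hybrid, chunk))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [chunk for _, chunk in scored[:top_n]]

# Example: the weak match is filtered out; only the strong chunk reaches the LLM.
chunks = [
    {"text": "strong match", "keyword_sim": 0.6, "vector_sim": 0.8},
    {"text": "weak match", "keyword_sim": 0.10, "vector_sim": 0.15},
]
print(rank_chunks(chunks, top_n=6))
```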
|
||||||
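As noted in **Model Setting**, each **Freedom** level bundles several sampling parameters. The exact values are not documented in this guide, so the numbers in the sketch below are purely illustrative placeholders that only show the shape of such a preset table.

```python
# Illustrative only: the Freedom levels bundle Temperature, Top P, Presence
# Penalty, and Frequency Penalty, but the real values are not published in
# this guide. The numbers below are made-up placeholders.

FREEDOM_PRESETS = {
    "Improvise": {"temperature": 0.9, "top_p": 0.9,  "presence_penalty": 0.2, "frequency_penalty": 0.2},
    "Balance":   {"temperature": 0.5, "top_p": 0.85, "presence_penalty": 0.3, "frequency_penalty": 0.5},
    "Precise":   {"temperature": 0.1, "top_p": 0.3,  "presence_penalty": 0.4, "frequency_penalty": 0.7},
}

def build_gen_conf(freedom="Balance", max_tokens=512):
    """Assemble a generation config; max_tokens caps the response length,
    so a value that is too low may curtail the answer."""
    conf = dict(FREEDOM_PRESETS[freedom])
    conf["max_tokens"] = max_tokens
    return conf

print(build_gen_conf("Precise", max_tokens=256))
```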
|
## Update settings of an existing dialogue
|
||||||
|
|
||||||
|
Hover over an intended dialogue **>** **Edit** to show the chat configuration dialogue:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Integrate chat capabilities into your application
|
||||||
|
|
||||||
|
RAGFlow also offers conversation APIs. Hover over your dialogue **>** **Chat Bot API** to integrate RAGFlow's chat capabilities into your application:
|
||||||
|
|
||||||
|

|
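As a rough illustration of what such an integration can look like, here is a hypothetical client sketch. Only the values shown in your **Chat Bot API** dialog are authoritative; the endpoint path, payload fields, and key below are placeholder assumptions.

```python
# Hypothetical sketch of calling RAGFlow's conversation API from your app.
# The endpoint path and payload fields below are PLACEHOLDERS; use the exact
# values shown in the Chat Bot API dialog of your dialogue.
import requests

BASE_URL = "http://IP_OF_YOUR_MACHINE"       # your RAGFlow server
API_KEY = "YOUR_CHAT_BOT_API_KEY"            # copied from the Chat Bot API dialog

def ask(conversation_id, question):
    resp = requests.post(
        f"{BASE_URL}/v1/api/completion",     # placeholder path; check the dialog
        headers={"Authorization": f"Bearer {API_KEY}"},
        json={
            "conversation_id": conversation_id,
            "messages": [{"role": "user", "content": question}],
        },
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()

print(ask("YOUR_CONVERSATION_ID", "What is RAGFlow?"))
```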
||||||
@ -18,14 +18,14 @@ from io import BytesIO
|
|||||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
||||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
|
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
|
||||||
tokenize_chunks, find_codec
|
tokenize_chunks, find_codec
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
"""
|
"""
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
pdf_parser = None
|
pdf_parser = None
|
||||||
sections, tbls = [], []
|
sections, tbls = [], []
|
||||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||||
@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from docx import Document
|
|||||||
from api.db import ParserType
|
from api.db import ParserType
|
||||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||||
make_colon_as_title, add_positions, tokenize_chunks, find_codec
|
make_colon_as_title, add_positions, tokenize_chunks, find_codec
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ class Pdf(PdfParser):
|
|||||||
|
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
"""
|
"""
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
pdf_parser = None
|
pdf_parser = None
|
||||||
sections = []
|
sections = []
|
||||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||||
@ -113,7 +113,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
|
|||||||
@ -2,7 +2,7 @@ import copy
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from api.db import ParserType
|
from api.db import ParserType
|
||||||
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||||
from deepdoc.parser import PdfParser, PlainParser
|
from deepdoc.parser import PdfParser, PlainParser
|
||||||
from rag.utils import num_tokens_from_string
|
from rag.utils import num_tokens_from_string
|
||||||
|
|
||||||
@ -16,7 +16,7 @@ class Pdf(PdfParser):
|
|||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
start = timer()
|
start = timer()
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename
|
"docnm_kwd": filename
|
||||||
}
|
}
|
||||||
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
# is it English
|
# is it English
|
||||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@ from docx import Document
|
|||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import re
|
import re
|
||||||
from deepdoc.parser.pdf_parser import PlainParser
|
from deepdoc.parser.pdf_parser import PlainParser
|
||||||
from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
||||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
|
|
||||||
@ -69,7 +69,7 @@ class Pdf(PdfParser):
|
|||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
start = timer()
|
start = timer()
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
res = []
|
res = []
|
||||||
pdf_parser = None
|
pdf_parser = None
|
||||||
sections = []
|
sections = []
|
||||||
@ -134,14 +134,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
sections = [(excel_parser.html(binary), "")]
|
sections = [(l, "") for l in excel_parser.html(binary) if l]
|
||||||
|
|
||||||
elif re.search(r"\.(txt|md)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
|
|||||||
@ -14,14 +14,14 @@ from tika import parser
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import re
|
import re
|
||||||
from rag.app import laws
|
from rag.app import laws
|
||||||
from rag.nlp import huqie, tokenize, find_codec
|
from rag.nlp import rag_tokenizer, tokenize, find_codec
|
||||||
from deepdoc.parser import PdfParser, ExcelParser, PlainParser
|
from deepdoc.parser import PdfParser, ExcelParser, PlainParser
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -78,14 +78,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
sections = [excel_parser.html(binary)]
|
sections = excel_parser.html(binary, 1000000000)
|
||||||
|
|
||||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
|
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
tokenize(doc, "\n".join(sections), eng)
|
tokenize(doc, "\n".join(sections), eng)
|
||||||
return [doc]
|
return [doc]
|
||||||
|
|
||||||
|
|||||||
@ -15,7 +15,7 @@ import re
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|
||||||
from api.db import ParserType
|
from api.db import ParserType
|
||||||
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||||
from deepdoc.parser import PdfParser, PlainParser
|
from deepdoc.parser import PdfParser, PlainParser
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from rag.utils import num_tokens_from_string
|
from rag.utils import num_tokens_from_string
|
||||||
@ -28,7 +28,7 @@ class Pdf(PdfParser):
|
|||||||
|
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(
|
self.__images__(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
zoomin,
|
zoomin,
|
||||||
@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
else:
|
else:
|
||||||
raise NotImplementedError("file type not supported yet(pdf supported)")
|
raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||||
|
|
||||||
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
|
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
|
||||||
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
|
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
|
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
|
||||||
# is it English
|
# is it English
|
||||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||||
print("It's English.....", eng)
|
print("It's English.....", eng)
|
||||||
|
|||||||
@ -17,7 +17,7 @@ from io import BytesIO
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from rag.nlp import tokenize, is_english
|
from rag.nlp import tokenize, is_english
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||||
from PyPDF2 import PdfReader as pdf2_read
|
from PyPDF2 import PdfReader as pdf2_read
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ class Pdf(PdfParser):
|
|||||||
|
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
callback(msg="OCR is running...")
|
callback(msg="OCR is running...")
|
||||||
self.__images__(filename if not binary else binary,
|
self.__images__(filename if not binary else binary,
|
||||||
zoomin, from_page, to_page, callback)
|
zoomin, from_page, to_page, callback)
|
||||||
callback(0.8, "Page {}~{}: OCR finished".format(
|
callback(0.8, "Page {}~{}: OCR finished".format(
|
||||||
@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
eng = lang.lower() == "english"
|
eng = lang.lower() == "english"
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
res = []
|
res = []
|
||||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||||
ppt_parser = Ppt()
|
ppt_parser = Ppt()
|
||||||
|
|||||||
@ -16,7 +16,7 @@ from io import BytesIO
|
|||||||
from nltk import word_tokenize
|
from nltk import word_tokenize
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
from rag.nlp import is_english, random_choices, find_codec
|
from rag.nlp import is_english, random_choices, find_codec
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from deepdoc.parser import ExcelParser
|
from deepdoc.parser import ExcelParser
|
||||||
|
|
||||||
|
|
||||||
@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
|
|||||||
aprefix = "Answer: " if eng else "回答:"
|
aprefix = "Answer: " if eng else "回答:"
|
||||||
d["content_with_weight"] = "\t".join(
|
d["content_with_weight"] = "\t".join(
|
||||||
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
||||||
d["content_ltks"] = huqie.qie(q)
|
d["content_ltks"] = rag_tokenizer.tokenize(q)
|
||||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||||||
res = []
|
res = []
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
@ -107,7 +107,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
@ -116,18 +116,31 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
|||||||
break
|
break
|
||||||
txt += l
|
txt += l
|
||||||
lines = txt.split("\n")
|
lines = txt.split("\n")
|
||||||
#is_english([rmPrefix(l) for l in lines[:100]])
|
comma, tab = 0, 0
|
||||||
|
for l in lines:
|
||||||
|
if len(l.split(",")) == 2: comma += 1
|
||||||
|
if len(l.split("\t")) == 2: tab += 1
|
||||||
|
delimiter = "\t" if tab >= comma else ","
|
||||||
|
|
||||||
fails = []
|
fails = []
|
||||||
for i, line in enumerate(lines):
|
question, answer = "", ""
|
||||||
arr = [l for l in line.split("\t") if len(l) > 1]
|
i = 0
|
||||||
|
while i < len(lines):
|
||||||
|
arr = lines[i].split(delimiter)
|
||||||
if len(arr) != 2:
|
if len(arr) != 2:
|
||||||
fails.append(str(i))
|
if question: answer += "\n" + lines[i]
|
||||||
continue
|
else:
|
||||||
res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
|
fails.append(str(i+1))
|
||||||
|
elif len(arr) == 2:
|
||||||
|
if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||||
|
question, answer = arr
|
||||||
|
i += 1
|
||||||
if len(res) % 999 == 0:
|
if len(res) % 999 == 0:
|
||||||
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
||||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||||
|
|
||||||
|
if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||||
|
|
||||||
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
||||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||||
|
|
||||||
|
|||||||
@ -18,7 +18,7 @@ import re
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from deepdoc.parser.resume import refactor
|
from deepdoc.parser.resume import refactor
|
||||||
from deepdoc.parser.resume import step_one, step_two
|
from deepdoc.parser.resume import step_one, step_two
|
||||||
from rag.settings import cron_logger
|
from rag.settings import cron_logger
|
||||||
@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|||||||
titles.append(str(v))
|
titles.append(str(v))
|
||||||
doc = {
|
doc = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie("-".join(titles) + "-简历")
|
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
|
||||||
}
|
}
|
||||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
pairs = []
|
pairs = []
|
||||||
for n, m in field_map.items():
|
for n, m in field_map.items():
|
||||||
if not resume.get(n):
|
if not resume.get(n):
|
||||||
@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|||||||
|
|
||||||
doc["content_with_weight"] = "\n".join(
|
doc["content_with_weight"] = "\n".join(
|
||||||
["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
|
["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
|
||||||
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
|
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
|
||||||
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
|
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
|
||||||
for n, _ in field_map.items():
|
for n, _ in field_map.items():
|
||||||
if n not in resume:
|
if n not in resume:
|
||||||
continue
|
continue
|
||||||
@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
|||||||
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
||||||
resume[n] = resume[n][0]
|
resume[n] = resume[n][0]
|
||||||
if n.find("_tks") > 0:
|
if n.find("_tks") > 0:
|
||||||
resume[n] = huqie.qieqie(resume[n])
|
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
|
||||||
doc[n] = resume[n]
|
doc[n] = resume[n]
|
||||||
|
|
||||||
print(doc)
|
print(doc)
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from openpyxl import load_workbook
|
|||||||
from dateutil.parser import parse as datetime_parse
|
from dateutil.parser import parse as datetime_parse
|
||||||
|
|
||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
from rag.nlp import huqie, is_english, tokenize, find_codec
|
from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
|
||||||
from deepdoc.parser import ExcelParser
|
from deepdoc.parser import ExcelParser
|
||||||
|
|
||||||
|
|
||||||
@ -47,6 +47,7 @@ class Excel(ExcelParser):
|
|||||||
cell.value for i,
|
cell.value for i,
|
||||||
cell in enumerate(
|
cell in enumerate(
|
||||||
rows[0]) if i not in missed]
|
rows[0]) if i not in missed]
|
||||||
|
if not headers:continue
|
||||||
data = []
|
data = []
|
||||||
for i, r in enumerate(rows[1:]):
|
for i, r in enumerate(rows[1:]):
|
||||||
rn += 1
|
rn += 1
|
||||||
@ -148,7 +149,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|||||||
txt = ""
|
txt = ""
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding)
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
while True:
|
while True:
|
||||||
@ -216,7 +217,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|||||||
for ii, row in df.iterrows():
|
for ii, row in df.iterrows():
|
||||||
d = {
|
d = {
|
||||||
"docnm_kwd": filename,
|
"docnm_kwd": filename,
|
||||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||||
}
|
}
|
||||||
row_txt = []
|
row_txt = []
|
||||||
for j in range(len(clmns)):
|
for j in range(len(clmns)):
|
||||||
@ -227,7 +228,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
|||||||
if pd.isna(row[clmns[j]]):
|
if pd.isna(row[clmns[j]]):
|
||||||
continue
|
continue
|
||||||
fld = clmns_map[j][0]
|
fld = clmns_map[j][0]
|
||||||
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
|
||||||
row[clmns[j]])
|
row[clmns[j]])
|
||||||
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
|
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
|
||||||
if not row_txt:
|
if not row_txt:
|
||||||
|
|||||||
@ -22,10 +22,11 @@ EmbeddingModel = {
|
|||||||
"Ollama": OllamaEmbed,
|
"Ollama": OllamaEmbed,
|
||||||
"OpenAI": OpenAIEmbed,
|
"OpenAI": OpenAIEmbed,
|
||||||
"Xinference": XinferenceEmbed,
|
"Xinference": XinferenceEmbed,
|
||||||
"Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
|
"Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
|
||||||
"ZHIPU-AI": ZhipuEmbed,
|
"ZHIPU-AI": ZhipuEmbed,
|
||||||
"FastEmbed": FastEmbed,
|
"FastEmbed": FastEmbed,
|
||||||
"Youdao": YoudaoEmbed
|
"Youdao": YoudaoEmbed,
|
||||||
|
"DeepSeek": DefaultEmbedding
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -45,6 +46,7 @@ ChatModel = {
|
|||||||
"Tongyi-Qianwen": QWenChat,
|
"Tongyi-Qianwen": QWenChat,
|
||||||
"Ollama": OllamaChat,
|
"Ollama": OllamaChat,
|
||||||
"Xinference": XinferenceChat,
|
"Xinference": XinferenceChat,
|
||||||
"Moonshot": MoonshotChat
|
"Moonshot": MoonshotChat,
|
||||||
|
"DeepSeek": DeepSeekChat
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -24,16 +24,7 @@ from rag.utils import num_tokens_from_string
|
|||||||
|
|
||||||
|
|
||||||
class Base(ABC):
|
class Base(ABC):
|
||||||
def __init__(self, key, model_name):
|
def __init__(self, key, model_name, base_url):
|
||||||
pass
|
|
||||||
|
|
||||||
def chat(self, system, history, gen_conf):
|
|
||||||
raise NotImplementedError("Please implement encode method!")
|
|
||||||
|
|
||||||
|
|
||||||
class GptTurbo(Base):
|
|
||||||
def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
|
|
||||||
if not base_url: base_url="https://api.openai.com/v1"
|
|
||||||
self.client = OpenAI(api_key=key, base_url=base_url)
|
self.client = OpenAI(api_key=key, base_url=base_url)
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
|
||||||
@ -53,29 +44,54 @@ class GptTurbo(Base):
|
|||||||
except openai.APIError as e:
|
except openai.APIError as e:
|
||||||
return "**ERROR**: " + str(e), 0
|
return "**ERROR**: " + str(e), 0
|
||||||
|
|
||||||
|
def chat_streamly(self, system, history, gen_conf):
|
||||||
class MoonshotChat(GptTurbo):
|
|
||||||
def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
|
|
||||||
if not base_url: base_url="https://api.moonshot.cn/v1"
|
|
||||||
self.client = OpenAI(
|
|
||||||
api_key=key, base_url=base_url)
|
|
||||||
self.model_name = model_name
|
|
||||||
|
|
||||||
def chat(self, system, history, gen_conf):
|
|
||||||
if system:
|
if system:
|
||||||
history.insert(0, {"role": "system", "content": system})
|
history.insert(0, {"role": "system", "content": system})
|
||||||
|
ans = ""
|
||||||
|
total_tokens = 0
|
||||||
try:
|
try:
|
||||||
response = self.client.chat.completions.create(
|
response = self.client.chat.completions.create(
|
||||||
model=self.model_name,
|
model=self.model_name,
|
||||||
messages=history,
|
messages=history,
|
||||||
|
stream=True,
|
||||||
**gen_conf)
|
**gen_conf)
|
||||||
ans = response.choices[0].message.content.strip()
|
for resp in response:
|
||||||
if response.choices[0].finish_reason == "length":
|
if not resp.choices[0].delta.content:continue
|
||||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
ans += resp.choices[0].delta.content
|
||||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
total_tokens += 1
|
||||||
return ans, response.usage.total_tokens
|
if resp.choices[0].finish_reason == "length":
|
||||||
|
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||||
|
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||||
|
yield ans
|
||||||
|
|
||||||
except openai.APIError as e:
|
except openai.APIError as e:
|
||||||
return "**ERROR**: " + str(e), 0
|
yield ans + "\n**ERROR**: " + str(e)
|
||||||
|
|
||||||
|
yield total_tokens
|
||||||
|
|
||||||
|
|
||||||
|
class GptTurbo(Base):
|
||||||
|
def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
|
||||||
|
if not base_url: base_url="https://api.openai.com/v1"
|
||||||
|
super().__init__(key, model_name, base_url)
|
||||||
|
|
||||||
|
|
||||||
|
class MoonshotChat(Base):
|
||||||
|
def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
|
||||||
|
if not base_url: base_url="https://api.moonshot.cn/v1"
|
||||||
|
super().__init__(key, model_name, base_url)
|
||||||
|
|
||||||
|
|
||||||
|
class XinferenceChat(Base):
|
||||||
|
def __init__(self, key=None, model_name="", base_url=""):
|
||||||
|
key = "xxx"
|
||||||
|
super().__init__(key, model_name, base_url)
|
||||||
|
|
||||||
|
|
||||||
|
class DeepSeekChat(Base):
|
||||||
|
def __init__(self, key, model_name="deepseek-chat", base_url="https://api.deepseek.com/v1"):
|
||||||
|
if not base_url: base_url="https://api.deepseek.com/v1"
|
||||||
|
super().__init__(key, model_name, base_url)
|
||||||
|
|
||||||
|
|
||||||
class QWenChat(Base):
|
class QWenChat(Base):
|
||||||
@ -106,6 +122,35 @@ class QWenChat(Base):
|
|||||||
|
|
||||||
return "**ERROR**: " + response.message, tk_count
|
return "**ERROR**: " + response.message, tk_count
|
||||||
|
|
||||||
|
def chat_streamly(self, system, history, gen_conf):
|
||||||
|
from http import HTTPStatus
|
||||||
|
if system:
|
||||||
|
history.insert(0, {"role": "system", "content": system})
|
||||||
|
ans = ""
|
||||||
|
try:
|
||||||
|
response = Generation.call(
|
||||||
|
self.model_name,
|
||||||
|
messages=history,
|
||||||
|
result_format='message',
|
||||||
|
stream=True,
|
||||||
|
**gen_conf
|
||||||
|
)
|
||||||
|
tk_count = 0
|
||||||
|
for resp in response:
|
||||||
|
if resp.status_code == HTTPStatus.OK:
|
||||||
|
ans = resp.output.choices[0]['message']['content']
|
||||||
|
tk_count = resp.usage.total_tokens
|
||||||
|
if resp.output.choices[0].get("finish_reason", "") == "length":
|
||||||
|
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||||
|
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||||
|
yield ans
|
||||||
|
else:
|
||||||
|
yield ans + "\n**ERROR**: " + resp.message if str(resp.message).find("Access")<0 else "Out of credit. Please set the API key in **settings > Model providers.**"
|
||||||
|
except Exception as e:
|
||||||
|
yield ans + "\n**ERROR**: " + str(e)
|
||||||
|
|
||||||
|
yield tk_count
|
||||||
|
|
||||||
|
|
||||||
class ZhipuChat(Base):
|
class ZhipuChat(Base):
|
||||||
def __init__(self, key, model_name="glm-3-turbo", **kwargs):
|
def __init__(self, key, model_name="glm-3-turbo", **kwargs):
|
||||||
@ -131,6 +176,35 @@ class ZhipuChat(Base):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return "**ERROR**: " + str(e), 0
|
return "**ERROR**: " + str(e), 0
|
||||||
|
|
||||||
|
def chat_streamly(self, system, history, gen_conf):
|
||||||
|
if system:
|
||||||
|
history.insert(0, {"role": "system", "content": system})
|
||||||
|
if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]
|
||||||
|
if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]
|
||||||
|
ans = ""
|
||||||
|
try:
|
||||||
|
response = self.client.chat.completions.create(
|
||||||
|
model=self.model_name,
|
||||||
|
messages=history,
|
||||||
|
stream=True,
|
||||||
|
**gen_conf
|
||||||
|
)
|
||||||
|
tk_count = 0
|
||||||
|
for resp in response:
|
||||||
|
if not resp.choices[0].delta.content:continue
|
||||||
|
delta = resp.choices[0].delta.content
|
||||||
|
ans += delta
|
||||||
|
if resp.choices[0].finish_reason == "length":
|
||||||
|
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||||
|
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||||
|
tk_count = resp.usage.total_tokens
|
||||||
|
if resp.choices[0].finish_reason == "stop": tk_count = resp.usage.total_tokens
|
||||||
|
yield ans
|
||||||
|
except Exception as e:
|
||||||
|
yield ans + "\n**ERROR**: " + str(e)
|
||||||
|
|
||||||
|
yield tk_count
|
||||||
|
|
||||||
|
|
||||||
class OllamaChat(Base):
|
class OllamaChat(Base):
|
||||||
def __init__(self, key, model_name, **kwargs):
|
def __init__(self, key, model_name, **kwargs):
|
||||||
@ -141,12 +215,12 @@ class OllamaChat(Base):
|
|||||||
if system:
|
if system:
|
||||||
history.insert(0, {"role": "system", "content": system})
|
history.insert(0, {"role": "system", "content": system})
|
||||||
try:
|
try:
|
||||||
options = {"temperature": gen_conf.get("temperature", 0.1),
|
options = {}
|
||||||
"num_predict": gen_conf.get("max_tokens", 128),
|
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
|
||||||
"top_k": gen_conf.get("top_p", 0.3),
|
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
|
||||||
"presence_penalty": gen_conf.get("presence_penalty", 0.4),
|
if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
|
||||||
"frequency_penalty": gen_conf.get("frequency_penalty", 0.7),
|
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
|
||||||
}
|
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
|
||||||
response = self.client.chat(
|
response = self.client.chat(
|
||||||
model=self.model_name,
|
model=self.model_name,
|
||||||
messages=history,
|
messages=history,
|
||||||
@ -157,25 +231,86 @@ class OllamaChat(Base):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return "**ERROR**: " + str(e), 0
|
return "**ERROR**: " + str(e), 0
|
||||||
|
|
||||||
|
def chat_streamly(self, system, history, gen_conf):
|
||||||
|
if system:
|
||||||
|
history.insert(0, {"role": "system", "content": system})
|
||||||
|
options = {}
|
||||||
|
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
|
||||||
|
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
|
||||||
|
if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
|
||||||
|
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
|
||||||
|
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
|
||||||
|
ans = ""
|
||||||
|
try:
|
||||||
|
response = self.client.chat(
|
||||||
|
model=self.model_name,
|
||||||
|
messages=history,
|
||||||
|
stream=True,
|
||||||
|
options=options
|
||||||
|
)
|
||||||
|
for resp in response:
|
||||||
|
if resp["done"]:
|
||||||
|
yield resp.get("prompt_eval_count", 0) + resp.get("eval_count", 0)
|
||||||
|
ans += resp["message"]["content"]
|
||||||
|
yield ans
|
||||||
|
except Exception as e:
|
||||||
|
yield ans + "\n**ERROR**: " + str(e)
|
||||||
|
yield 0
|
||||||
|
|
||||||
class XinferenceChat(Base):
|
|
||||||
def __init__(self, key=None, model_name="", base_url=""):
|
class LocalLLM(Base):
|
||||||
self.client = OpenAI(api_key="xxx", base_url=base_url)
|
class RPCProxy:
|
||||||
self.model_name = model_name
|
def __init__(self, host, port):
|
||||||
|
self.host = host
|
||||||
|
self.port = int(port)
|
||||||
|
self.__conn()
|
||||||
|
|
||||||
|
def __conn(self):
|
||||||
|
from multiprocessing.connection import Client
|
||||||
|
self._connection = Client(
|
||||||
|
(self.host, self.port), authkey=b'infiniflow-token4kevinhu')
|
||||||
|
|
||||||
|
def __getattr__(self, name):
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
def do_rpc(*args, **kwargs):
|
||||||
|
for _ in range(3):
|
||||||
|
try:
|
||||||
|
self._connection.send(
|
||||||
|
pickle.dumps((name, args, kwargs)))
|
||||||
|
return pickle.loads(self._connection.recv())
|
||||||
|
except Exception as e:
|
||||||
|
self.__conn()
|
||||||
|
raise Exception("RPC connection lost!")
|
||||||
|
|
||||||
|
return do_rpc
|
||||||
|
|
||||||
|
def __init__(self, key, model_name="glm-3-turbo"):
|
||||||
|
self.client = LocalLLM.RPCProxy("127.0.0.1", 7860)
|
||||||
|
|
||||||
def chat(self, system, history, gen_conf):
|
def chat(self, system, history, gen_conf):
|
||||||
if system:
|
if system:
|
||||||
history.insert(0, {"role": "system", "content": system})
|
history.insert(0, {"role": "system", "content": system})
|
||||||
try:
|
try:
|
||||||
response = self.client.chat.completions.create(
|
ans = self.client.chat(
|
||||||
model=self.model_name,
|
history,
|
||||||
messages=history,
|
gen_conf
|
||||||
**gen_conf)
|
)
|
||||||
ans = response.choices[0].message.content.strip()
|
return ans, num_tokens_from_string(ans)
|
||||||
if response.choices[0].finish_reason == "length":
|
except Exception as e:
|
||||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
|
||||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
|
||||||
return ans, response.usage.total_tokens
|
|
||||||
except openai.APIError as e:
|
|
||||||
return "**ERROR**: " + str(e), 0
|
return "**ERROR**: " + str(e), 0
|
||||||
|
|
||||||
|
def chat_streamly(self, system, history, gen_conf):
|
||||||
|
if system:
|
||||||
|
history.insert(0, {"role": "system", "content": system})
|
||||||
|
token_count = 0
|
||||||
|
answer = ""
|
||||||
|
try:
|
||||||
|
for ans in self.client.chat_streamly(history, gen_conf):
|
||||||
|
answer += ans
|
||||||
|
token_count += 1
|
||||||
|
yield answer
|
||||||
|
except Exception as e:
|
||||||
|
yield answer + "\n**ERROR**: " + str(e)
|
||||||
|
|
||||||
|
yield token_count
|
||||||
|
|||||||
@@ -26,19 +26,16 @@ from FlagEmbedding import FlagModel
 import torch
 import numpy as np

-from api.utils.file_utils import get_project_base_directory
-from rag.utils import num_tokens_from_string
+from api.utils.file_utils import get_project_base_directory, get_home_cache_dir
+from rag.utils import num_tokens_from_string, truncate


 try:
-    flag_model = FlagModel(os.path.join(
-        get_project_base_directory(),
-        "rag/res/bge-large-zh-v1.5"),
+    flag_model = FlagModel(os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
                            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                            use_fp16=torch.cuda.is_available())
 except Exception as e:
     model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
-                                  local_dir=os.path.join(get_project_base_directory(), "rag/res/bge-large-zh-v1.5"),
+                                  local_dir=os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
                                   local_dir_use_symlinks=False)
     flag_model = FlagModel(model_dir,
                            query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
@@ -56,7 +53,7 @@ class Base(ABC):
         raise NotImplementedError("Please implement encode method!")


-class HuEmbedding(Base):
+class DefaultEmbedding(Base):
     def __init__(self, *args, **kwargs):
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ -72,7 +69,7 @@ class HuEmbedding(Base):
         self.model = flag_model

     def encode(self, texts: list, batch_size=32):
-        texts = [t[:2000] for t in texts]
+        texts = [truncate(t, 2048) for t in texts]
         token_count = 0
         for t in texts:
             token_count += num_tokens_from_string(t)
@@ -95,13 +92,14 @@ class OpenAIEmbed(Base):
         self.model_name = model_name

     def encode(self, texts: list, batch_size=32):
+        texts = [truncate(t, 8196) for t in texts]
         res = self.client.embeddings.create(input=texts,
                                             model=self.model_name)
         return np.array([d.embedding for d in res.data]
                         ), res.usage.total_tokens

     def encode_queries(self, text):
-        res = self.client.embeddings.create(input=[text],
+        res = self.client.embeddings.create(input=[truncate(text, 8196)],
                                             model=self.model_name)
         return np.array(res.data[0].embedding), res.usage.total_tokens

@@ -115,7 +113,7 @@ class QWenEmbed(Base):
         import dashscope
         res = []
         token_count = 0
-        texts = [txt[:2048] for txt in texts]
+        texts = [truncate(t, 2048) for t in texts]
         for i in range(0, len(texts), batch_size):
             resp = dashscope.TextEmbedding.call(
                 model=self.model_name,
@@ -238,8 +236,8 @@ class YoudaoEmbed(Base):
         try:
             print("LOADING BCE...")
             YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
-                get_project_base_directory(),
-                "rag/res/bce-embedding-base_v1"))
+                get_home_cache_dir(),
+                "bce-embedding-base_v1"))
         except Exception as e:
             YoudaoEmbed._client = qanthing(
                 model_name_or_path=model_name.replace(
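Several embedders now clip their inputs with truncate(t, n) from rag.utils instead of raw slicing, and the BGE weights are resolved under a per-user cache directory via get_home_cache_dir(). The implementations of those two helpers are not shown in this diff; a rough sketch of what they typically look like (names follow the diff, bodies are assumptions):

    import os

    def get_home_cache_dir():
        # assumption: model weights cached under ~/.ragflow rather than inside the repo tree
        d = os.path.join(os.path.expanduser("~"), ".ragflow")
        os.makedirs(d, exist_ok=True)
        return d

    def truncate(string, max_len):
        # assumption: plain character clipping; a token-aware version would measure
        # length with the embedding model's tokenizer instead
        return string if len(string) <= max_len else string[:max_len]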
@@ -2,9 +2,10 @@ import argparse
 import pickle
 import random
 import time
+from copy import deepcopy
 from multiprocessing.connection import Listener
 from threading import Thread
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer


 def torch_gc():
@@ -95,6 +96,32 @@ def chat(messages, gen_conf):
         return str(e)


+def chat_streamly(messages, gen_conf):
+    global tokenizer
+    model = Model()
+    try:
+        torch_gc()
+        conf = deepcopy(gen_conf)
+        print(messages, conf)
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+        streamer = TextStreamer(tokenizer)
+        conf["inputs"] = model_inputs.input_ids
+        conf["streamer"] = streamer
+        conf["max_new_tokens"] = conf["max_tokens"]
+        del conf["max_tokens"]
+        thread = Thread(target=model.generate, kwargs=conf)
+        thread.start()
+        for _, new_text in enumerate(streamer):
+            yield new_text
+    except Exception as e:
+        yield "**ERROR**: " + str(e)
+
+
 def Model():
     global models
     random.seed(time.time())
@@ -113,6 +140,7 @@ if __name__ == "__main__":

     handler = RPCHandler()
     handler.register_function(chat)
+    handler.register_function(chat_streamly)

     models = []
     for _ in range(1):
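chat_streamly runs model.generate on a background Thread and reads tokens from a transformers streamer as they are produced. The same pattern, sketched with TextIteratorStreamer (the iterable variant of the streamer; model and tokenizer loading is elided and assumed to be done elsewhere):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def stream_generate(model, tokenizer, messages, max_new_tokens=256):
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer([text], return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
        Thread(target=model.generate, kwargs=kwargs).start()  # generation runs off-thread
        for piece in streamer:                                # yields decoded text chunks as they arrive
            yield piece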
@@ -2,7 +2,7 @@ import random
 from collections import Counter

 from rag.utils import num_tokens_from_string
-from . import huqie
+from . import rag_tokenizer
 import re
 import copy

@@ -28,11 +28,17 @@ all_codecs = [
 def find_codec(blob):
     global all_codecs
     for c in all_codecs:
+        try:
+            blob[:1024].decode(c)
+            return c
+        except Exception as e:
+            pass
         try:
             blob.decode(c)
             return c
         except Exception as e:
             pass

     return "utf-8"


@@ -109,8 +115,8 @@ def is_english(texts):
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
     t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
-    d["content_ltks"] = huqie.qie(t)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = rag_tokenizer.tokenize(t)
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


 def tokenize_chunks(chunks, doc, eng, pdf_parser):
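find_codec now tries to decode only the first kilobyte of the blob before falling back to a full decode, which keeps codec detection cheap on large files; the prefix probe can still fail on a multi-byte sequence cut at the 1024-byte boundary, which is why the whole-blob attempt is kept as the second step. A self-contained sketch of the same idea (codec list shortened, names assumed):

    CANDIDATE_CODECS = ["utf-8", "gb2312", "gbk", "big5", "latin-1"]  # assumed subset

    def detect_codec(blob: bytes) -> str:
        for codec in CANDIDATE_CODECS:
            try:
                blob[:1024].decode(codec)   # cheap probe on a 1 KB prefix
                return codec
            except UnicodeDecodeError:
                pass
            try:
                blob.decode(codec)          # slower, exact check on the whole blob
                return codec
            except UnicodeDecodeError:
                pass
        return "utf-8"                      # default, mirroring the diff's fallback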
@@ -1,475 +0,0 @@
(legacy chunker module deleted in its entirety: the Apache-2.0 license header and the HuChunker class, together with its PdfChunker, DocxChunker, ExcelChunker, PptChunker and TextChunker subclasses, their regex-based garbage-line filtering and heading-level matching, the naive_text_chunk and text_chunks tree-building logic, the per-format __call__ implementations, and the command-line test block; no replacement lines were added.)
@@ -7,14 +7,13 @@ import logging
 import copy
 from elasticsearch_dsl import Q

-from rag.nlp import huqie, term_weight, synonym
+from rag.nlp import rag_tokenizer, term_weight, synonym


 class EsQueryer:
     def __init__(self, es):
         self.tw = term_weight.Dealer()
         self.es = es
-        self.syn = synonym.Dealer(None)
+        self.syn = synonym.Dealer()
         self.flds = ["ask_tks^10", "ask_small_tks"]

     @staticmethod
@@ -37,7 +36,7 @@ class EsQueryer:
         patts = [
             (r"是*(什么样的|哪家|一下|那家|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀)是*", ""),
             (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
-            (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down)", " ")
+            (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down) ", " ")
         ]
         for r, p in patts:
             txt = re.sub(r, p, txt, flags=re.IGNORECASE)
@@ -45,18 +44,19 @@ class EsQueryer:

     def question(self, txt, tbl="qa", min_match="60%"):
         txt = re.sub(
-            r"[ \r\n\t,,。??/`!!&]+",
+            r"[ \r\n\t,,。??/`!!&\^%%]+",
             " ",
-            huqie.tradi2simp(
-                huqie.strQ2B(
+            rag_tokenizer.tradi2simp(
+                rag_tokenizer.strQ2B(
                     txt.lower()))).strip()
         txt = EsQueryer.rmWWW(txt)

         if not self.isChinese(txt):
-            tks = huqie.qie(txt).split(" ")
-            q = copy.deepcopy(tks)
-            for i in range(1, len(tks)):
-                q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
+            tks = rag_tokenizer.tokenize(txt).split(" ")
+            tks_w = self.tw.weights(tks)
+            q = [re.sub(r"[ \\\"']+", "", tk)+"^{:.4f}".format(w) for tk, w in tks_w]
+            for i in range(1, len(tks_w)):
+                q.append("\"%s %s\"^%.4f" % (tks_w[i - 1][0], tks_w[i][0], max(tks_w[i - 1][1], tks_w[i][1])*2))
             if not q:
                 q.append(txt)
             return Q("bool",
@@ -65,7 +65,7 @@ class EsQueryer:
                      boost=1)#, minimum_should_match=min_match)
                      ), tks

-        def needQieqie(tk):
+        def need_fine_grained_tokenize(tk):
             if len(tk) < 4:
                 return False
             if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
@@ -81,7 +81,7 @@ class EsQueryer:
         logging.info(json.dumps(twts, ensure_ascii=False))
         tms = []
         for tk, w in sorted(twts, key=lambda x: x[1] * -1):
-            sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
+            sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
             sm = [
                 re.sub(
                     r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
@@ -110,10 +110,10 @@ class EsQueryer:
             if len(twts) > 1:
                 tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
             if re.match(r"[0-9a-z ]+$", tt):
-                tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
+                tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)

             syns = " OR ".join(
-                ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
+                ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
             if syns:
                 tms = f"({tms})^5 OR ({syns})^0.7"

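For non-Chinese questions the query builder no longer gives every token and bigram a fixed boost; it asks term_weight.Dealer for per-token weights and writes them into Lucene-style ^boost suffixes, so rare, informative words dominate the should clauses. A toy sketch of that weighting step, separate from the diff (weights below are made up for illustration):

    def boosted_query_terms(tokens_with_weights):
        """tokens_with_weights: [(token, weight), ...] as returned by a term-weight model."""
        parts = ['{}^{:.4f}'.format(tk, w) for tk, w in tokens_with_weights]
        # adjacent pairs become quoted phrases boosted by the stronger of the two weights
        for (t1, w1), (t2, w2) in zip(tokens_with_weights, tokens_with_weights[1:]):
            parts.append('"{} {}"^{:.4f}'.format(t1, t2, max(w1, w2) * 2))
        return " ".join(parts)

    print(boosted_query_terms([("vector", 0.62), ("database", 0.31), ("tutorial", 0.07)]))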
@@ -14,7 +14,7 @@ from nltk.stem import PorterStemmer, WordNetLemmatizer
 from api.utils.file_utils import get_project_base_directory


-class Huqie:
+class RagTokenizer:
     def key_(self, line):
         return str(line.lower().encode("utf-8"))[2:-1]

@@ -241,7 +241,7 @@ class Huqie:

         return self.score_(res[::-1])

-    def qie(self, line):
+    def tokenize(self, line):
         line = self._strQ2B(line).lower()
         line = self._tradi2simp(line)
         zh_num = len([1 for c in line if is_chinese(c)])
@@ -298,7 +298,7 @@ class Huqie:
         print("[TKS]", self.merge_(res))
         return self.merge_(res)

-    def qieqie(self, tks):
+    def fine_grained_tokenize(self, tks):
         tks = tks.split(" ")
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
         if zh_num < len(tks) * 0.2:
@@ -371,53 +371,53 @@ def naiveQie(txt):
     return tks


-hq = Huqie()
-qie = hq.qie
-qieqie = hq.qieqie
-tag = hq.tag
-freq = hq.freq
-loadUserDict = hq.loadUserDict
-addUserDict = hq.addUserDict
-tradi2simp = hq._tradi2simp
-strQ2B = hq._strQ2B
+tokenizer = RagTokenizer()
+tokenize = tokenizer.tokenize
+fine_grained_tokenize = tokenizer.fine_grained_tokenize
+tag = tokenizer.tag
+freq = tokenizer.freq
+loadUserDict = tokenizer.loadUserDict
+addUserDict = tokenizer.addUserDict
+tradi2simp = tokenizer._tradi2simp
+strQ2B = tokenizer._strQ2B

 if __name__ == '__main__':
-    huqie = Huqie(debug=True)
+    tknzr = RagTokenizer(debug=True)
     # huqie.addUserDict("/tmp/tmp.new.tks.dict")
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("虽然我不怎么玩")
+    tks = tknzr.tokenize("虽然我不怎么玩")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
+    tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("这周日你去吗?这周日你有空吗?")
+    tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
+    tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
         "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
     if len(sys.argv) < 2:
         sys.exit()
-    huqie.DEBUG = False
-    huqie.loadUserDict(sys.argv[1])
+    tknzr.DEBUG = False
+    tknzr.loadUserDict(sys.argv[1])
     of = open(sys.argv[2], "r")
     while True:
         line = of.readline()
         if not line:
             break
-        print(huqie.qie(line))
+        print(tknzr.tokenize(line))
     of.close()
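Because the module keeps exporting bound methods at import time, call sites only have to swap names: huqie.qie becomes rag_tokenizer.tokenize and huqie.qieqie becomes rag_tokenizer.fine_grained_tokenize, while tag, freq, tradi2simp and strQ2B stay the same apart from the instance they are bound to. A usage sketch under that assumption (output depends on the bundled dictionaries):

    from rag.nlp import rag_tokenizer

    coarse = rag_tokenizer.tokenize("多校划片就是一个小区对应多个小学初中")  # space-joined coarse tokens
    fine = rag_tokenizer.fine_grained_tokenize(coarse)                      # re-splits long tokens
    print(coarse)
    print(fine)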
@@ -9,7 +9,7 @@ from dataclasses import dataclass

 from rag.settings import es_logger
 from rag.utils import rmSpace
-from rag.nlp import huqie, query
+from rag.nlp import rag_tokenizer, query
 import numpy as np


@@ -52,16 +52,21 @@ class Dealer:
     def search(self, req, idxnm, emb_mdl=None):
         qst = req.get("question", "")
         bqry, keywords = self.qryr.question(qst)
-        if req.get("kb_ids"):
-            bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
-        if req.get("doc_ids"):
-            bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
-        if "available_int" in req:
-            if req["available_int"] == 0:
-                bqry.filter.append(Q("range", available_int={"lt": 1}))
-            else:
-                bqry.filter.append(
-                    Q("bool", must_not=Q("range", available_int={"lt": 1})))
+        def add_filters(bqry):
+            nonlocal req
+            if req.get("kb_ids"):
+                bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
+            if req.get("doc_ids"):
+                bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
+            if "available_int" in req:
+                if req["available_int"] == 0:
+                    bqry.filter.append(Q("range", available_int={"lt": 1}))
+                else:
+                    bqry.filter.append(
+                        Q("bool", must_not=Q("range", available_int={"lt": 1})))
+            return bqry
+
+        bqry = add_filters(bqry)
         bqry.boost = 0.05

         s = Search()
@@ -117,8 +122,7 @@ class Dealer:
         es_logger.info("TOTAL: {}".format(self.es.getTotal(res)))
         if self.es.getTotal(res) == 0 and "knn" in s:
             bqry, _ = self.qryr.question(qst, min_match="10%")
-            if req.get("kb_ids"):
-                bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
+            bqry = add_filters(bqry)
             s["query"] = bqry.to_dict()
             s["knn"]["filter"] = bqry.to_dict()
             s["knn"]["similarity"] = 0.17
@@ -128,7 +132,7 @@ class Dealer:
         kwds = set([])
         for k in keywords:
             kwds.add(k)
-            for kk in huqie.qieqie(k).split(" "):
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -243,7 +247,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))

-        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -251,7 +255,7 @@ class Dealer:
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
-                                                            huqie.qie(
+                                                            rag_tokenizer.tokenize(
                                                                 self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
@@ -310,8 +314,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           huqie.qie(ans).split(" "),
-                                           huqie.qie(inst).split(" "))
+                                           rag_tokenizer.tokenize(ans).split(" "),
+                                           rag_tokenizer.tokenize(inst).split(" "))

     def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
@@ -385,7 +389,7 @@ class Dealer:
         for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
             fld, v = r.group(1), r.group(3)
             match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
-                fld, huqie.qieqie(huqie.qie(v)))
+                fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
             replaces.append(
                 ("{}{}'{}'".format(
                     r.group(1),
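Hoisting the kb_id, doc_id and available_int restrictions into an add_filters closure means the low-recall retry path (min_match="10%") reapplies exactly the same filters instead of duplicating only the kb_ids clause, as the second hunk shows. The shape of the pattern in isolation, sketched with elasticsearch_dsl (request fields here are assumptions for illustration):

    from elasticsearch_dsl import Q

    def make_filter_adder(req):
        # returns a closure that applies the same request-scoped filters to any bool query
        def add_filters(bqry):
            if req.get("kb_ids"):
                bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
            if req.get("doc_ids"):
                bqry.filter.append(Q("terms", doc_id=req["doc_ids"]))
            return bqry
        return add_filters

    add_filters = make_filter_adder({"kb_ids": ["kb1"], "doc_ids": []})
    bqry = add_filters(Q("bool", must=Q("match", content_ltks="vector database")))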
@@ -17,7 +17,7 @@ class Dealer:
         try:
             self.dictionary = json.load(open(path, 'r'))
         except Exception as e:
-            logging.warn("Miss synonym.json")
+            logging.warn("Missing synonym.json")
             self.dictionary = {}

         if not redis:
@@ -4,7 +4,7 @@ import json
 import re
 import os
 import numpy as np
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from api.utils.file_utils import get_project_base_directory


@@ -83,7 +83,7 @@ class Dealer:
             txt = re.sub(p, r, txt)

         res = []
-        for t in huqie.qie(txt).split(" "):
+        for t in rag_tokenizer.tokenize(txt).split(" "):
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
@@ -161,7 +161,7 @@ class Dealer:
             return m[self.ne[t]]

         def postag(t):
-            t = huqie.tag(t)
+            t = rag_tokenizer.tag(t)
             if t in set(["r", "c", "d"]):
                 return 0.3
             if t in set(["ns", "nt"]):
@@ -175,14 +175,14 @@ class Dealer:
         def freq(t):
             if re.match(r"[0-9. -]{2,}$", t):
                 return 3
-            s = huqie.freq(t)
+            s = rag_tokenizer.freq(t)
             if not s and re.match(r"[a-z. -]+$", t):
                 return 300
             if not s:
                 s = 0

             if not s and len(t) >= 4:
-                s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     s = np.min([freq(tt) for tt in s]) / 6.
                 else:
@@ -198,7 +198,7 @@ class Dealer:
             elif re.match(r"[a-z. -]+$", t):
                 return 300
             elif len(t) >= 4:
-                s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     return max(3, np.min([df(tt) for tt in s]) / 6.)

@@ -47,3 +47,9 @@ cron_logger = getLogger("cron_logger")
 cron_logger.setLevel(20)
 chunk_logger = getLogger("chunk_logger")
 database_logger = getLogger("database")
+
+SVR_QUEUE_NAME = "rag_flow_svr_queue"
+SVR_QUEUE_RETENTION = 60*60
+SVR_QUEUE_MAX_LEN = 1024
+SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
+SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
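The new SVR_QUEUE_* constants back a Redis-stream work queue between the web API and the task executor. A rough sketch of how such a queue is typically produced to and consumed from with redis-py consumer groups follows; the connection details and message shape are assumptions, not shown in this diff:

    import json, redis

    r = redis.Redis(host="localhost", port=6379, decode_responses=True)
    QUEUE, GROUP, CONSUMER = "rag_flow_svr_queue", "rag_flow_svr_consumer_group", "rag_flow_svr_consumer"

    # producer side: push a task id onto the capped stream
    r.xadd(QUEUE, {"message": json.dumps({"id": "task-123"})}, maxlen=1024)

    # consumer side: read one pending entry for this group, then acknowledge it
    try:
        r.xgroup_create(QUEUE, GROUP, id="0", mkstream=True)
    except redis.ResponseError:
        pass  # group already exists
    for _, messages in r.xreadgroup(GROUP, CONSUMER, {QUEUE: ">"}, count=1, block=1000):
        for msg_id, fields in messages:
            print(json.loads(fields["message"]))
            r.xack(QUEUE, GROUP, msg_id)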
@@ -4,13 +4,14 @@ import traceback

 from api.db.db_models import close_connection
 from api.db.services.task_service import TaskService
-from rag.utils import MINIO
+from rag.settings import cron_logger
+from rag.utils.minio_conn import MINIO
 from rag.utils.redis_conn import REDIS_CONN


 def collect():
     doc_locations = TaskService.get_ongoing_doc_name()
-    #print(tasks)
+    print(doc_locations)
     if len(doc_locations) == 0:
         time.sleep(1)
         return
@@ -28,7 +29,7 @@ def main():
             if REDIS_CONN.exist(key):continue
             file_bin = MINIO.get(kb_id, loc)
             REDIS_CONN.transaction(key, file_bin, 12 * 60)
-            print("CACHE:", loc)
+            cron_logger.info("CACHE: {}".format(loc))
         except Exception as e:
             traceback.print_stack(e)
     except Exception as e:
|
|||||||
#
|
|
||||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from datetime import datetime
|
|
||||||
from api.db.db_models import Task
|
|
||||||
from api.db.db_utils import bulk_insert_into_db
|
|
||||||
from api.db.services.task_service import TaskService
|
|
||||||
from deepdoc.parser import PdfParser
|
|
||||||
from deepdoc.parser.excel_parser import HuExcelParser
|
|
||||||
from rag.settings import cron_logger
|
|
||||||
from rag.utils import MINIO
|
|
||||||
from rag.utils import findMaxTm
|
|
||||||
import pandas as pd
|
|
||||||
from api.db import FileType, TaskStatus
|
|
||||||
from api.db.services.document_service import DocumentService
|
|
||||||
from api.settings import database_logger
|
|
||||||
from api.utils import get_format_time, get_uuid
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
|
||||||
from rag.utils.redis_conn import REDIS_CONN
|
|
||||||
from api.db.db_models import init_database_tables as init_web_db
|
|
||||||
from api.db.init_data import init_web_data
|
|
||||||
|
|
||||||
|
|
||||||
def collect(tm):
|
|
||||||
docs = DocumentService.get_newly_uploaded(tm)
|
|
||||||
if len(docs) == 0:
|
|
||||||
return pd.DataFrame()
|
|
||||||
docs = pd.DataFrame(docs)
|
|
||||||
mtm = docs["update_time"].max()
|
|
||||||
cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
def set_dispatching(docid):
|
|
||||||
try:
|
|
||||||
DocumentService.update_by_id(
|
|
||||||
docid, {"progress": random.random() * 1 / 100.,
|
|
||||||
"progress_msg": "Task dispatched...",
|
|
||||||
"process_begin_at": get_format_time()
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))
|
|
||||||
|
|
||||||
|
|
||||||
def dispatch():
|
|
||||||
tm_fnm = os.path.join(
|
|
||||||
get_project_base_directory(),
|
|
||||||
"rag/res",
|
|
||||||
f"broker.tm")
|
|
||||||
tm = findMaxTm(tm_fnm)
|
|
||||||
rows = collect(tm)
|
|
||||||
if len(rows) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
tmf = open(tm_fnm, "a+")
|
|
||||||
for _, r in rows.iterrows():
|
|
||||||
try:
|
|
||||||
tsks = TaskService.query(doc_id=r["id"])
|
|
||||||
if tsks:
|
|
||||||
for t in tsks:
|
|
||||||
TaskService.delete_by_id(t.id)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.exception(e)
|
|
||||||
|
|
||||||
def new_task():
|
|
||||||
nonlocal r
|
|
||||||
return {
|
|
||||||
"id": get_uuid(),
|
|
||||||
"doc_id": r["id"]
|
|
||||||
}
|
|
||||||
|
|
||||||
tsks = []
|
|
||||||
try:
|
|
||||||
file_bin = MINIO.get(r["kb_id"], r["location"])
|
|
||||||
if REDIS_CONN.is_alive():
|
|
||||||
try:
|
|
||||||
REDIS_CONN.set("{}/{}".format(r["kb_id"], r["location"]), file_bin, 12*60)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
|
|
||||||
|
|
||||||
if r["type"] == FileType.PDF.value:
|
|
||||||
do_layout = r["parser_config"].get("layout_recognize", True)
|
|
||||||
pages = PdfParser.total_page_number(r["name"], file_bin)
|
|
||||||
page_size = r["parser_config"].get("task_page_size", 12)
|
|
||||||
if r["parser_id"] == "paper":
|
|
||||||
page_size = r["parser_config"].get("task_page_size", 22)
|
|
||||||
if r["parser_id"] == "one":
|
|
||||||
page_size = 1000000000
|
|
||||||
if not do_layout:
|
|
||||||
page_size = 1000000000
|
|
||||||
page_ranges = r["parser_config"].get("pages")
|
|
||||||
if not page_ranges:
|
|
||||||
page_ranges = [(1, 100000)]
|
|
||||||
for s, e in page_ranges:
|
|
||||||
s -= 1
|
|
||||||
s = max(0, s)
|
|
||||||
e = min(e - 1, pages)
|
|
||||||
for p in range(s, e, page_size):
|
|
||||||
task = new_task()
|
|
||||||
task["from_page"] = p
|
|
||||||
task["to_page"] = min(p + page_size, e)
|
|
||||||
tsks.append(task)
|
|
||||||
|
|
||||||
elif r["parser_id"] == "table":
|
|
||||||
rn = HuExcelParser.row_number(
|
|
||||||
r["name"], file_bin)
|
|
||||||
for i in range(0, rn, 3000):
|
|
||||||
task = new_task()
|
|
||||||
task["from_page"] = i
|
|
||||||
task["to_page"] = min(i + 3000, rn)
|
|
||||||
tsks.append(task)
|
|
||||||
else:
|
|
||||||
tsks.append(new_task())
|
|
||||||
|
|
||||||
bulk_insert_into_db(Task, tsks, True)
|
|
||||||
set_dispatching(r["id"])
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.exception(e)
|
|
||||||
|
|
||||||
tmf.write(str(r["update_time"]) + "\n")
|
|
||||||
tmf.close()
|
|
||||||
|
|
||||||
|
|
||||||
def update_progress():
|
|
||||||
docs = DocumentService.get_unfinished_docs()
|
|
||||||
for d in docs:
|
|
||||||
try:
|
|
||||||
tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
|
|
||||||
if not tsks:
|
|
||||||
continue
|
|
||||||
msg = []
|
|
||||||
prg = 0
|
|
||||||
finished = True
|
|
||||||
bad = 0
|
|
||||||
status = TaskStatus.RUNNING.value
|
|
||||||
for t in tsks:
|
|
||||||
if 0 <= t.progress < 1:
|
|
||||||
finished = False
|
|
||||||
prg += t.progress if t.progress >= 0 else 0
|
|
||||||
msg.append(t.progress_msg)
|
|
||||||
if t.progress == -1:
|
|
||||||
bad += 1
|
|
||||||
prg /= len(tsks)
|
|
||||||
if finished and bad:
|
|
||||||
prg = -1
|
|
||||||
status = TaskStatus.FAIL.value
|
|
||||||
elif finished:
|
|
||||||
status = TaskStatus.DONE.value
|
|
||||||
|
|
||||||
msg = "\n".join(msg)
|
|
||||||
info = {
|
|
||||||
"process_duation": datetime.timestamp(
|
|
||||||
datetime.now()) -
|
|
||||||
d["process_begin_at"].timestamp(),
|
|
||||||
"run": status}
|
|
||||||
if prg != 0:
|
|
||||||
info["progress"] = prg
|
|
||||||
if msg:
|
|
||||||
info["progress_msg"] = msg
|
|
||||||
DocumentService.update_by_id(d["id"], info)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.error("fetch task exception:" + str(e))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
peewee_logger = logging.getLogger('peewee')
|
|
||||||
peewee_logger.propagate = False
|
|
||||||
peewee_logger.addHandler(database_logger.handlers[0])
|
|
||||||
peewee_logger.setLevel(database_logger.level)
|
|
||||||
# init db
|
|
||||||
init_web_db()
|
|
||||||
init_web_data()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
dispatch()
|
|
||||||
time.sleep(1)
|
|
||||||
update_progress()
|
|
||||||
@@ -24,16 +24,18 @@ import sys
 import time
 import traceback
 from functools import partial
-from rag.utils import MINIO
+
+from api.db.services.file2document_service import File2DocumentService
+from rag.utils.minio_conn import MINIO
 from api.db.db_models import close_connection
-from rag.settings import database_logger
+from rag.settings import database_logger, SVR_QUEUE_NAME
 from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
 from multiprocessing import Pool
 import numpy as np
 from elasticsearch_dsl import Q
 from multiprocessing.context import TimeoutError
 from api.db.services.task_service import TaskService
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from timeit import default_timer as timer
 from rag.utils import rmSpace, findMaxTm

@@ -78,7 +80,7 @@ def set_progress(task_id, from_page=0, to_page=-1,

     if to_page > 0:
         if msg:
-            msg = f"Page({from_page+1}~{to_page+1}): " + msg
+            msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
     d = {"progress_msg": msg}
     if prog is not None:
         d["progress"] = prog
@@ -87,43 +89,42 @@ def set_progress(task_id, from_page=0, to_page=-1,
     except Exception as e:
         cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))

+    close_connection()
     if cancel:
         sys.exit()


-def collect(comm, mod, tm):
-    tasks = TaskService.get_tasks(tm, mod, comm)
-    #print(tasks)
-    if len(tasks) == 0:
-        time.sleep(1)
-        return pd.DataFrame()
+def collect():
+    try:
+        payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
+        if not payload:
+            time.sleep(1)
+            return pd.DataFrame()
+    except Exception as e:
+        cron_logger.error("Get task event from queue exception:" + str(e))
+        return pd.DataFrame()
+
+    msg = payload.get_message()
+    payload.ack()
+    if not msg: return pd.DataFrame()
+
+    if TaskService.do_cancel(msg["id"]):
+        cron_logger.info("Task {} has been canceled.".format(msg["id"]))
+        return pd.DataFrame()
+    tasks = TaskService.get_tasks(msg["id"])
+    assert tasks, "{} empty task!".format(msg["id"])
     tasks = pd.DataFrame(tasks)
-    mtm = tasks["update_time"].max()
-    cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
     return tasks


 def get_minio_binary(bucket, name):
-    global MINIO
-    if REDIS_CONN.is_alive():
-        try:
-            for _ in range(30):
-                if REDIS_CONN.exist("{}/{}".format(bucket, name)):
-                    time.sleep(1)
-                    break
-                time.sleep(1)
-            r = REDIS_CONN.get("{}/{}".format(bucket, name))
-            if r: return r
-            cron_logger.warning("Cache missing: {}".format(name))
-        except Exception as e:
-            cron_logger.warning("Get redis[EXCEPTION]:" + str(e))
     return MINIO.get(bucket, name)


 def build(row):
     if row["size"] > DOC_MAXIMUM_SIZE:
         set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
                      (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
         return []

     callback = partial(
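collect() now pulls work from the Redis queue instead of polling MySQL by timestamp: consume one payload, ack it, drop it early if the task was cancelled, and only then materialize the task rows into a DataFrame. The control flow in isolation looks roughly like this; queue_consumer, TaskService and the message shape are taken from the diff, while the helper callables below are placeholders:

    import pandas as pd

    def collect_once(payload, fetch_tasks, is_cancelled):
        """One poll of the work queue; returns a (possibly empty) DataFrame of task rows.

        payload      -> object with get_message()/ack(), like the Redis consumer payload in the diff
        fetch_tasks  -> callable(task_id) returning task dicts (TaskService.get_tasks in the diff)
        is_cancelled -> callable(task_id) -> bool (TaskService.do_cancel in the diff)
        """
        msg = payload.get_message()
        payload.ack()                    # ack immediately; a failed run is re-dispatched at the document level
        if not msg or is_cancelled(msg["id"]):
            return pd.DataFrame()
        tasks = fetch_tasks(msg["id"])
        return pd.DataFrame(tasks) if tasks else pd.DataFrame()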
@@ -132,19 +133,17 @@ def build(row):
         row["from_page"],
         row["to_page"])
     chunker = FACTORY[row["parser_id"].lower()]
-    pool = Pool(processes=1)
     try:
         st = timer()
-        thr = pool.apply_async(get_minio_binary, args=(row["kb_id"], row["location"]))
-        binary = thr.get(timeout=90)
-        pool.terminate()
+        bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
+        binary = get_minio_binary(bucket, name)
         cron_logger.info(
-            "From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
+            "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
         cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
                             to_page=row["to_page"], lang=row["language"], callback=callback,
                             kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
         cron_logger.info(
-            "Chunkking({}) {}/{}".format(timer()-st, row["location"], row["name"]))
+            "Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
     except TimeoutError as e:
         callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
         cron_logger.error(
@@ -156,7 +155,6 @@ def build(row):
         else:
             callback(-1, f"Internal server error: %s" %
                      str(e).replace("'", ""))
-        pool.terminate()
         traceback.print_exc()

         cron_logger.error(
@@ -175,7 +173,7 @@ def build(row):
         d.update(ck)
         md5 = hashlib.md5()
         md5.update((ck["content_with_weight"] +
                     str(d["doc_id"])).encode("utf-8"))
         d["_id"] = md5.hexdigest()
         d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
         d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
@@ -247,41 +245,33 @@ def embedding(docs, mdl, parser_config={}, callback=None):
     return tk_count


-def main(comm, mod):
-    tm_fnm = os.path.join(
-        get_project_base_directory(),
-        "rag/res",
-        f"{comm}-{mod}.tm")
-    tm = findMaxTm(tm_fnm)
-    rows = collect(comm, mod, tm)
+def main():
+    rows = collect()
     if len(rows) == 0:
         return

-    tmf = open(tm_fnm, "a+")
     for _, r in rows.iterrows():
         callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
-        #callback(random.random()/10., "Task has been received.")
         try:
             embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
         except Exception as e:
-            traceback.print_stack(e)
-            callback(prog=-1, msg=str(e))
+            callback(-1, msg=str(e))
+            cron_logger.error(str(e))
             continue

         st = timer()
         cks = build(r)
-        cron_logger.info("Build chunks({}): {}".format(r["name"], timer()-st))
+        cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
         if cks is None:
             continue
         if not cks:
-            tmf.write(str(r["update_time"]) + "\n")
             callback(1., "No chunk! Done!")
             continue
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         callback(
             msg="Finished slicing files(%d). Start to embedding the content." %
             len(cks))
         st = timer()
         try:
             tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
@@ -289,14 +279,19 @@ def main(comm, mod):
             callback(-1, "Embedding error:{}".format(str(e)))
             cron_logger.error(str(e))
|
cron_logger.error(str(e))
|
||||||
tk_count = 0
|
tk_count = 0
|
||||||
cron_logger.info("Embedding elapsed({}): {}".format(r["name"], timer()-st))
|
cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||||
|
|
||||||
callback(msg="Finished embedding({})! Start to build index!".format(timer()-st))
|
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
|
||||||
init_kb(r)
|
init_kb(r)
|
||||||
chunk_count = len(set([c["_id"] for c in cks]))
|
chunk_count = len(set([c["_id"] for c in cks]))
|
||||||
st = timer()
|
st = timer()
|
||||||
es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
|
es_r = ""
|
||||||
cron_logger.info("Indexing elapsed({}): {}".format(r["name"], timer()-st))
|
for b in range(0, len(cks), 32):
|
||||||
|
es_r = ELASTICSEARCH.bulk(cks[b:b + 32], search.index_name(r["tenant_id"]))
|
||||||
|
if b % 128 == 0:
|
||||||
|
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
|
||||||
|
|
||||||
|
cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
||||||
if es_r:
|
if es_r:
|
||||||
callback(-1, "Index failure!")
|
callback(-1, "Index failure!")
|
||||||
ELASTICSEARCH.deleteByQuery(
|
ELASTICSEARCH.deleteByQuery(
|
||||||
@ -311,11 +306,8 @@ def main(comm, mod):
|
|||||||
DocumentService.increment_chunk_num(
|
DocumentService.increment_chunk_num(
|
||||||
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
||||||
cron_logger.info(
|
cron_logger.info(
|
||||||
"Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
|
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
|
||||||
r["id"], tk_count, len(cks), timer()-st))
|
r["id"], tk_count, len(cks), timer() - st))
|
||||||
|
|
||||||
tmf.write(str(r["update_time"]) + "\n")
|
|
||||||
tmf.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -324,8 +316,5 @@ if __name__ == "__main__":
|
|||||||
peewee_logger.addHandler(database_logger.handlers[0])
|
peewee_logger.addHandler(database_logger.handlers[0])
|
||||||
peewee_logger.setLevel(database_logger.level)
|
peewee_logger.setLevel(database_logger.level)
|
||||||
|
|
||||||
#from mpi4py import MPI
|
|
||||||
#comm = MPI.COMM_WORLD
|
|
||||||
while True:
|
while True:
|
||||||
main(int(sys.argv[2]), int(sys.argv[1]))
|
main()
|
||||||
close_connection()
|
|
||||||
|
|||||||
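The indexing change above replaces a single ELASTICSEARCH.bulk() call with batches of 32 chunks and reports progress between 0.8 and 0.9 along the way. A minimal sketch of that loop pulled out into a standalone helper; the helper name, the import paths and the default batch sizes are illustrative, only the bulk()/callback usage mirrors the diff:

    from rag.nlp import search                    # assumed import path
    from rag.utils.es_conn import ELASTICSEARCH   # assumed import path


    def index_in_batches(cks, tenant_id, callback, batch_size=32, report_every=128):
        # Send chunks to Elasticsearch in small batches and surface progress to
        # the caller between 80% and 90% while indexing is running.
        es_r = ""
        for b in range(0, len(cks), batch_size):
            es_r = ELASTICSEARCH.bulk(cks[b:b + batch_size], search.index_name(tenant_id))
            if b % report_every == 0:
                callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
        return es_r  # in the diff, a non-empty result is treated as an index failure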
@@ -15,9 +15,6 @@ def singleton(cls, *args, **kw):
     return _singleton


-from .minio_conn import MINIO
-from .es_conn import ELASTICSEARCH


 def rmSpace(txt):
     txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
     return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
@@ -66,3 +63,7 @@ def num_tokens_from_string(string: str) -> int:
     num_tokens = len(encoder.encode(string))
     return num_tokens

+
+def truncate(string: str, max_len: int) -> int:
+    """Returns truncated text if the length of text exceed max_len."""
+    return encoder.decode(encoder.encode(string)[:max_len])
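The new truncate() helper clips text to a token budget rather than a character budget, reusing the same tiktoken encoder that num_tokens_from_string() already relies on. A small usage sketch; the encoder name below is an assumption for illustration, since the module builds its own encoder instance:

    import tiktoken

    encoder = tiktoken.get_encoding("cl100k_base")  # assumed encoding, not necessarily the one rag.utils uses


    def truncate(string: str, max_len: int) -> str:
        """Return the text cut down to at most max_len tokens."""
        return encoder.decode(encoder.encode(string)[:max_len])


    text = "RAGFlow slices documents into chunks before embedding and indexing them."
    print(truncate(text, 5))   # roughly the first five tokens of the sentence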
@@ -15,7 +15,7 @@ es_logger.info("Elasticsearch version: "+str(elasticsearch.__version__))


 @singleton
-class HuEs:
+class ESConnection:
     def __init__(self):
         self.info = {}
         self.conn()
@@ -43,6 +43,9 @@ class HuEs:
         v = v["number"].split(".")[0]
         return int(v) >= 7

+    def health(self):
+        return dict(self.es.cluster.health())
+
     def upsert(self, df, idxnm=""):
         res = []
         for d in df:
@@ -454,4 +457,4 @@ class HuEs:
                 scroll_size = len(page['hits']['hits'])


-ELASTICSEARCH = HuEs()
+ELASTICSEARCH = ESConnection()
@@ -8,7 +8,7 @@ from rag.utils import singleton


 @singleton
-class HuMinio(object):
+class RAGFlowMinio(object):
     def __init__(self):
         self.conn = None
         self.__open__()
@@ -34,8 +34,18 @@ class HuMinio(object):
         del self.conn
         self.conn = None

+    def health(self):
+        bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
+        if not self.conn.bucket_exists(bucket):
+            self.conn.make_bucket(bucket)
+        r = self.conn.put_object(bucket, fnm,
+                                 BytesIO(binary),
+                                 len(binary)
+                                 )
+        return r
+
     def put(self, bucket, fnm, binary):
-        for _ in range(10):
+        for _ in range(3):
             try:
                 if not self.conn.bucket_exists(bucket):
                     self.conn.make_bucket(bucket)
@@ -86,10 +96,12 @@ class HuMinio(object):
             time.sleep(1)
         return

-MINIO = HuMinio()
+MINIO = RAGFlowMinio()


 if __name__ == "__main__":
-    conn = HuMinio()
+    conn = RAGFlowMinio()
     fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
     from PIL import Image
     img = Image.open(fnm)
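Both storage clients gain a health() probe in this changeset: ESConnection.health() returns the cluster health document, and RAGFlowMinio.health() round-trips a tiny object through a probe bucket (the Redis client gets a similar method in the next file). A sketch of how such probes could be aggregated for a status view; the wrapper function is illustrative and not part of the changeset:

    from rag.utils.es_conn import ELASTICSEARCH   # assumed import path
    from rag.utils.minio_conn import MINIO        # assumed import path


    def service_status():
        # Best-effort health report: a probe that raises is recorded as
        # unhealthy instead of crashing the whole status call.
        status = {}
        for name, probe in (("elasticsearch", ELASTICSEARCH.health),
                            ("minio", MINIO.health)):
            try:
                status[name] = {"ok": True, "detail": str(probe())}
            except Exception as e:
                status[name] = {"ok": False, "error": str(e)}
        return status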
@@ -5,6 +5,27 @@ import logging
 from rag import settings
 from rag.utils import singleton


+class Payload:
+    def __init__(self, consumer, queue_name, group_name, msg_id, message):
+        self.__consumer = consumer
+        self.__queue_name = queue_name
+        self.__group_name = group_name
+        self.__msg_id = msg_id
+        self.__message = json.loads(message['message'])
+
+    def ack(self):
+        try:
+            self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
+        return False
+
+    def get_message(self):
+        return self.__message
+
+
 @singleton
 class RedisDB:
     def __init__(self):
@@ -14,14 +35,19 @@ class RedisDB:

     def __open__(self):
         try:
-            self.REDIS = redis.Redis(host=self.config.get("host", "redis").split(":")[0],
+            self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0],
                                      port=int(self.config.get("host", ":6379").split(":")[1]),
                                      db=int(self.config.get("db", 1)),
-                                     password=self.config.get("password"))
+                                     password=self.config.get("password"),
+                                     decode_responses=True)
         except Exception as e:
             logging.warning("Redis can't be connected.")
         return self.REDIS

+    def health(self, queue_name):
+        self.REDIS.ping()
+        return self.REDIS.xinfo_groups(queue_name)[0]
+
     def is_alive(self):
         return self.REDIS is not None

@@ -70,5 +96,48 @@ class RedisDB:
             self.__open__()
         return False

+    def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool:
+        try:
+            payload = {"message": json.dumps(message)}
+            pipeline = self.REDIS.pipeline()
+            pipeline.xadd(queue, payload)
+            pipeline.expire(queue, exp)
+            pipeline.execute()
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
+        return False

-REDIS_CONN = RedisDB()
+    def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
+        try:
+            group_info = self.REDIS.xinfo_groups(queue_name)
+            if not any(e["name"] == group_name for e in group_info):
+                self.REDIS.xgroup_create(
+                    queue_name,
+                    group_name,
+                    id="$",
+                    mkstream=True
+                )
+            args = {
+                "groupname": group_name,
+                "consumername": consumer_name,
+                "count": 1,
+                "block": 10000,
+                "streams": {queue_name: msg_id},
+            }
+            messages = self.REDIS.xreadgroup(**args)
+            if not messages:
+                return None
+            stream, element_list = messages[0]
+            msg_id, payload = element_list[0]
+            res = Payload(self.REDIS, queue_name, group_name, msg_id, payload)
+            return res
+        except Exception as e:
+            if 'key' in str(e):
+                pass
+            else:
+                logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e))
+        return None
+
+
+REDIS_CONN = RedisDB()
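Taken together, Payload, queue_product() and queue_consumer() replace the old timestamp-file hand-off between the task broker and the task executor with a Redis Stream plus a consumer group. A minimal end-to-end sketch; the queue, group and consumer names are placeholders, not the constants RAGFlow actually uses:

    from rag.utils.redis_conn import REDIS_CONN   # assumed import path

    QUEUE, GROUP, CONSUMER = "task_queue", "task_group", "executor_0"   # placeholder names

    # Producer side: push a task reference onto the stream as a JSON message
    # (XADD plus an EXPIRE on the stream key, per queue_product above).
    REDIS_CONN.queue_product(QUEUE, message={"id": "task-123", "doc_id": "doc-456"})

    # Consumer side, which is what the executor's collect() now does: ensure the
    # consumer group exists, block up to 10 seconds for one message, then let the
    # caller read and acknowledge it so the group does not redeliver it.
    payload = REDIS_CONN.queue_consumer(QUEUE, GROUP, CONSUMER)
    if payload:
        msg = payload.get_message()    # e.g. {"id": "task-123", "doc_id": "doc-456"}
        payload.ack()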
@@ -50,7 +50,6 @@ joblib==1.3.2
 lxml==5.1.0
 MarkupSafe==2.1.5
 minio==7.2.4
-mpi4py==3.1.5
 mpmath==1.3.0
 multidict==6.0.5
 multiprocess==0.70.16
@@ -69,6 +68,7 @@ nvidia-cusparse-cu12==12.1.0.106
 nvidia-nccl-cu12==2.19.3
 nvidia-nvjitlink-cu12==12.3.101
 nvidia-nvtx-cu12==12.1.105
+ollama==0.1.9
 onnxruntime-gpu==1.17.1
 openai==1.12.0
 opencv-python==4.9.0.80
@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
 pydantic==2.6.2
 pydantic_core==2.16.3
 PyJWT==2.8.0
-PyMuPDF==1.23.25
-PyMuPDFb==1.23.22
 PyMySQL==1.1.0
 PyPDF2==3.0.1
 pypdfium2==4.27.0
@@ -102,6 +100,7 @@ python-dotenv==1.0.1
 python-pptx==0.6.23
 pytz==2024.1
 PyYAML==6.0.1
+redis==5.0.3
 regex==2023.12.25
 requests==2.31.0
 ruamel.yaml==0.18.6
@@ -134,4 +133,4 @@ xxhash==3.4.1
 yarl==1.9.4
 zhipuai==2.0.1
 BCEmbedding
 loguru==0.7.2
124  requirements_dev.txt  (new file)
@@ -0,0 +1,124 @@
+accelerate==0.27.2
+aiohttp==3.9.3
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+Aspose.Slides==24.2.0
+attrs==23.2.0
+blinker==1.7.0
+cachelib==0.12.0
+cachetools==5.3.3
+certifi==2024.2.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+coloredlogs==15.0.1
+cryptography==42.0.5
+dashscope==1.14.1
+datasets==2.17.1
+datrie==0.8.2
+demjson3==3.0.6
+dill==0.3.8
+distro==1.9.0
+elastic-transport==8.12.0
+elasticsearch==8.12.1
+elasticsearch-dsl==8.12.0
+et-xmlfile==1.1.0
+filelock==3.13.1
+fastembed==0.2.6
+FlagEmbedding==1.2.5
+Flask==3.0.2
+Flask-Cors==4.0.0
+Flask-Login==0.6.3
+Flask-Session==0.6.0
+flatbuffers==23.5.26
+frozenlist==1.4.1
+fsspec==2023.10.0
+h11==0.14.0
+hanziconv==0.3.2
+httpcore==1.0.4
+httpx==0.27.0
+huggingface-hub==0.20.3
+humanfriendly==10.0
+idna==3.6
+install==1.3.5
+itsdangerous==2.1.2
+Jinja2==3.1.3
+joblib==1.3.2
+lxml==5.1.0
+MarkupSafe==2.1.5
+minio==7.2.4
+mpi4py==3.1.5
+mpmath==1.3.0
+multidict==6.0.5
+multiprocess==0.70.16
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.4
+openai==1.12.0
+opencv-python==4.9.0.80
+openpyxl==3.1.2
+packaging==23.2
+pandas==2.2.1
+pdfminer.six==20221105
+pdfplumber==0.10.4
+peewee==3.17.1
+pillow==10.2.0
+protobuf==4.25.3
+psutil==5.9.8
+pyarrow==15.0.0
+pyarrow-hotfix==0.6
+pyclipper==1.3.0.post5
+pycparser==2.21
+pycryptodome==3.20.0
+pycryptodome-test-vectors==1.0.14
+pycryptodomex==3.20.0
+pydantic==2.6.2
+pydantic_core==2.16.3
+PyJWT==2.8.0
+PyMySQL==1.1.0
+PyPDF2==3.0.1
+pypdfium2==4.27.0
+python-dateutil==2.8.2
+python-docx==1.1.0
+python-dotenv==1.0.1
+python-pptx==0.6.23
+pytz==2024.1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+ruamel.yaml==0.18.6
+ruamel.yaml.clib==0.2.8
+safetensors==0.4.2
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+sentence-transformers==2.4.0
+shapely==2.0.3
+six==1.16.0
+sniffio==1.3.1
+StrEnum==0.4.15
+sympy==1.12
+threadpoolctl==3.3.0
+tika==2.6.0
+tiktoken==0.6.0
+tokenizers==0.15.2
+torch==2.2.1
+tqdm==4.66.2
+transformers==4.38.1
+triton==2.2.0
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+Werkzeug==3.0.1
+xgboost==2.0.3
+XlsxWriter==3.2.0
+xpinyin==0.7.6
+xxhash==3.4.1
+yarl==1.9.4
+zhipuai==2.0.1
+BCEmbedding
+loguru==0.7.2
+ollama==0.1.8
+redis==5.0.4
@@ -1,7 +1,9 @@
 import { defineConfig } from 'umi';
+import { appName } from './src/conf.json';
 import routes from './src/routes';

 export default defineConfig({
+  title: appName,
   outputPath: 'dist',
   // alias: { '@': './src' },
   npmClient: 'npm',
@@ -25,10 +27,13 @@ export default defineConfig({
     },
   },
   devtool: 'source-map',
+  copy: ['src/conf.json'],
   proxy: {
     '/v1': {
-      target: 'http://192.168.200.233:9380/',
+      target: '',
       changeOrigin: true,
+      ws: true,
+      logger: console,
       // pathRewrite: { '^/v1': '/v1' },
     },
   },
4355  web/package-lock.json  (generated)

@@ -3,7 +3,7 @@
   "author": "zhaofengchao <13723060510@163.com>",
   "scripts": {
     "build": "umi build",
-    "dev": "cross-env PORT=9000 umi dev",
+    "dev": "cross-env UMI_DEV_SERVER_COMPRESS=none umi dev",
     "postinstall": "umi setup",
     "lint": "umi lint --eslint-only",
     "setup": "umi setup",
@@ -13,15 +13,19 @@
     "@ant-design/icons": "^5.2.6",
     "@ant-design/pro-components": "^2.6.46",
     "@ant-design/pro-layout": "^7.17.16",
+    "@js-preview/excel": "^1.7.8",
     "ahooks": "^3.7.10",
     "antd": "^5.12.7",
     "axios": "^1.6.3",
     "classnames": "^2.5.1",
     "dayjs": "^1.11.10",
+    "eventsource-parser": "^1.1.2",
     "i18next": "^23.7.16",
+    "i18next-browser-languagedetector": "^8.0.0",
     "js-base64": "^3.7.5",
     "jsencrypt": "^3.3.2",
     "lodash": "^4.17.21",
+    "mammoth": "^1.7.2",
     "rc-tween-one": "^3.0.6",
     "react-chat-elements": "^12.0.13",
     "react-copy-to-clipboard": "^5.1.0",
@@ -31,6 +35,7 @@
     "react-pdf-highlighter": "^6.1.0",
     "react-string-replace": "^1.1.1",
     "react-syntax-highlighter": "^15.5.0",
+    "reactflow": "^11.11.2",
     "recharts": "^2.12.4",
     "remark-gfm": "^4.0.0",
     "umi": "^4.0.90",
New and removed SVG icon assets (vector path data not reproduced here):

24  web/src/assets/svg/es.svg  (new file; After: 1.9 KiB)
6   web/src/assets/svg/llm/deepseek.svg  (new file; After: 1.6 KiB)
29  deleted SVG icon (viewBox 0 0 32 34; Before: 3.0 KiB)
10  web/src/assets/svg/minio.svg  (new file; After: 1.3 KiB)
9   web/src/assets/svg/mysql.svg  (new file; After: 2.0 KiB)
6   web/src/assets/svg/redis.svg  (new file; After: 2.0 KiB)