Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)

Compare commits: 49 commits
Commit SHA1s:
48607c3cfb, d15ba37313, a553dc8dbd, eb27a4309e, 48e1534bf4, e9d19c4684, 8d6d7f6887,
a6e4b74d94, a5aed2412f, 2810c60757, 62afcf5ac8, a74c755d83, 7013d7f620, de839fc3f0,
c6b6c748ae, ca5acc151a, 385dbe5ab5, 3050a8cb07, 9c77d367d0, 5f03a4de11, 290e5d958d,
9703633a57, 7d3b68bb1e, c89f3c3cdb, 5d7f573379, cab274f560, 7059ec2298, 674b3aeafd,
4c1476032d, 2af74cc494, 38f0cc016f, 6874c6f3a7, 8acc01a227, 8c07992b6c, aee8b48d2f,
daf215d266, cdcc779705, d589b0f568, 9d60a84958, aadb9cbec8, 038822f3bd, ae501c58fa,
944776f207, f1c98aad6b, ab06f502d7, 6329339a32, 84b39c60f6, eb62c669ae, f69ff39fa0
.gitignore (vendored, 2 changes)

@@ -27,3 +27,5 @@ Cargo.lock
 
 # Exclude the log folder
 docker/ragflow-logs/
+/flask_session
+/logs
@@ -4,7 +4,7 @@ USER root
 WORKDIR /ragflow
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf
@@ -9,7 +9,7 @@ RUN /root/miniconda3/envs/py11/bin/pip install onnxruntime-gpu --extra-index-url
 
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf
@@ -34,7 +34,7 @@ ADD ./requirements.txt ./requirements.txt
 RUN apt install openmpi-bin openmpi-common libopenmpi-dev
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
 
 RUN apt-get update && \
@@ -35,7 +35,7 @@ RUN dnf install -y openmpi openmpi-devel python3-openmpi
 ENV C_INCLUDE_PATH /usr/include/openmpi-x86_64:$C_INCLUDE_PATH
 ENV LD_LIBRARY_PATH /usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install $(grep -ivE "mpi4py" ./requirements.txt) # without mpi4py==3.1.5
 RUN conda run -n py11 pip install redis
 
README.md (86 changes)

@@ -15,12 +15,12 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
+alt="docker pull infiniflow/ragflow:v0.5.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 

@@ -58,13 +58,14 @@
 
 ## 📌 Latest Features
 
-- 2024-04-19 Support conversation API ([detail](./docs/conversation_api.md)).
-- 2024-04-16 Add an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding).
-- 2024-04-16 Add [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
-- 2024-04-11 Support [Xinference](./docs/xinference.md) for local LLM deployment.
-- 2024-04-10 Add a new layout recognization model for analyzing Laws documentation.
-- 2024-04-08 Support [Ollama](./docs/ollama.md) for local LLM deployment.
-- 2024-04-07 Support Chinese UI.
+- 2024-05-08 Integrates LLM DeepSeek.
+- 2024-04-26 Adds file management.
+- 2024-04-19 Supports conversation API ([detail](./docs/conversation_api.md)).
+- 2024-04-16 Integrates an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding), and [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
+- 2024-04-11 Supports [Xinference](./docs/xinference.md) for local LLM deployment.
+- 2024-04-10 Adds a new layout recognition model for analyzing Laws documentation.
+- 2024-04-08 Supports [Ollama](./docs/ollama.md) for local LLM deployment.
+- 2024-04-07 Supports Chinese UI.
 
 ## 🔎 System Architecture
 

@@ -118,6 +119,7 @@
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
+> Please note that running the above commands will automatically download the development version docker image of RAGFlow. If you want to download and run a specific version of docker image, please find the RAGFLOW_VERSION variable in the docker/.env file, change it to the corresponding version, for example, RAGFLOW_VERSION=v0.5.0, and run the above commands.
 
 > The core image is about 9 GB in size and may take a while to load.
 

@@ -179,12 +181,72 @@ To build the Docker images from source:
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:dev .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ Launch Service from Source
+
+To launch the service from source, please follow these steps:
+
+1. Clone the repository
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. Create a virtual environment (ensure Anaconda or Miniconda is installed)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+If CUDA version is greater than 12.0, execute the following additional commands:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. Copy the entry script and configure environment variables
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+Use the following commands to obtain the Python path and the ragflow project path:
+```bash
+$ which python
+$ pwd
+```
+
+Set the output of `which python` as the value for `PY` and the output of `pwd` as the value for `PYTHONPATH`.
+
+If `LD_LIBRARY_PATH` is already configured, it can be commented out.
+
+```bash
+# Adjust configurations according to your actual situation; the two export commands are newly added.
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# Optional: Add Hugging Face mirror
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. Start the base services
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. Check the configuration files
+Ensure that the settings in **docker/.env** match those in **conf/service_conf.yaml**. The IP addresses and ports for related services in **service_conf.yaml** should be changed to the local machine IP and ports exposed by the container.
+
+6. Launch the service
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+
 ## 📚 Documentation
 
 - [FAQ](./docs/faq.md)
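Editorial aside on step 5 of the newly added "Launch Service from Source" section above: it asks you to keep docker/.env and conf/service_conf.yaml in agreement. Below is a small helper sketch, not part of the repository, that simply prints both files so the host/port values can be compared by eye; it assumes the default file locations and that PyYAML is available in the environment.

```python
# Editorial sketch (not from the ragflow repo): print docker/.env and
# conf/service_conf.yaml so their service addresses can be compared manually.
import yaml  # assumes PyYAML is installed in the active environment

with open("docker/.env") as f:
    env_lines = [ln.strip() for ln in f if ln.strip() and not ln.startswith("#")]
print("docker/.env:")
for ln in env_lines:
    print("  ", ln)

with open("conf/service_conf.yaml") as f:
    conf = yaml.safe_load(f)
print("conf/service_conf.yaml:")
print(yaml.dump(conf, default_flow_style=False))
```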
README_ja.md (76 changes)

@@ -15,12 +15,12 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
+alt="docker pull infiniflow/ragflow:v0.5.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 

@@ -58,6 +58,8 @@
 
 ## 📌 最新の機能
 
+- 2024-05-08
+- 2024-04-26 「ファイル管理」機能を追加しました。
 - 2024-04-19 会話 API をサポートします ([詳細](./docs/conversation_api.md))。
 - 2024-04-16 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) から埋め込みモデル「bce-embedding-base_v1」を追加します。
 - 2024-04-16 [FastEmbed](https://github.com/qdrant/fastembed) は、軽量かつ高速な埋め込み用に設計されています。

@@ -119,7 +121,9 @@
 $ docker compose up -d
 ```
 
-> コアイメージのサイズは約 15 GB で、ロードに時間がかかる場合があります。
+> 上記のコマンドを実行すると、RAGFlowの開発版dockerイメージが自動的にダウンロードされます。 特定のバージョンのDockerイメージをダウンロードして実行したい場合は、docker/.envファイルのRAGFLOW_VERSION変数を見つけて、対応するバージョンに変更してください。 例えば、RAGFLOW_VERSION=v0.5.0として、上記のコマンドを実行してください。
+
+> コアイメージのサイズは約 9 GB で、ロードに時間がかかる場合があります。
 
 4. サーバーを立ち上げた後、サーバーの状態を確認する:
 

@@ -179,12 +183,72 @@
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:v0.5.0 .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ ソースコードからサービスを起動する方法
+
+ソースコードからサービスを起動する場合は、以下の手順に従ってください:
+
+1. リポジトリをクローンします
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. 仮想環境を作成します(AnacondaまたはMinicondaがインストールされていることを確認してください)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+CUDAのバージョンが12.0以上の場合、以下の追加コマンドを実行してください:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. エントリースクリプトをコピーし、環境変数を設定します
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+以下のコマンドでPythonのパスとragflowプロジェクトのパスを取得します:
+```bash
+$ which python
+$ pwd
+```
+
+`which python`の出力を`PY`の値として、`pwd`の出力を`PYTHONPATH`の値として設定します。
+
+`LD_LIBRARY_PATH`が既に設定されている場合は、コメントアウトできます。
+
+```bash
+# 実際の状況に応じて設定を調整してください。以下の二つのexportは新たに追加された設定です
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# オプション:Hugging Faceミラーを追加
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. 基本サービスを起動します
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. 設定ファイルを確認します
+**docker/.env**内の設定が**conf/service_conf.yaml**内の設定と一致していることを確認してください。**service_conf.yaml**内の関連サービスのIPアドレスとポートは、ローカルマシンのIPアドレスとコンテナが公開するポートに変更する必要があります。
+
+6. サービスを起動します
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+
 ## 📚 ドキュメンテーション
 
 - [FAQ](./docs/faq.md)
README_zh.md (79 changes)

@@ -15,12 +15,12 @@
 <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
 </a>
 <a href="https://demo.ragflow.io" target="_blank">
-<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
 <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.2-brightgreen"
-alt="docker pull infiniflow/ragflow:v0.3.2"></a>
+<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
+alt="docker pull infiniflow/ragflow:v0.5.0"></a>
 <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
 </a>
 </p>
 

@@ -58,9 +58,10 @@
 
 ## 📌 新增功能
 
+- 2024-05-08 集成大模型 DeepSeek
+- 2024-04-26 增添了'文件管理'功能.
 - 2024-04-19 支持对话 API ([更多](./docs/conversation_api.md)).
-- 2024-04-16 添加嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 。
-- 2024-04-16 添加 [FastEmbed](https://github.com/qdrant/fastembed) 专为轻型和高速嵌入而设计。
+- 2024-04-16 集成嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 和 专为轻型和高速嵌入而设计的 [FastEmbed](https://github.com/qdrant/fastembed) 。
 - 2024-04-11 支持用 [Xinference](./docs/xinference.md) 本地化部署大模型。
 - 2024-04-10 为‘Laws’版面分析增加了底层模型。
 - 2024-04-08 支持用 [Ollama](./docs/ollama.md) 本地化部署大模型。

@@ -119,7 +120,9 @@
 $ docker compose -f docker-compose-CN.yml up -d
 ```
 
-> 核心镜像文件大约 15 GB,可能需要一定时间拉取。请耐心等待。
+> 请注意,运行上述命令会自动下载 RAGFlow 的开发版本 docker 镜像。如果你想下载并运行特定版本的 docker 镜像,请在 docker/.env 文件中找到 RAGFLOW_VERSION 变量,将其改为对应版本。例如 RAGFLOW_VERSION=v0.5.0,然后运行上述命令。
+
+> 核心镜像文件大约 9 GB,可能需要一定时间拉取。请耐心等待。
 
 4. 服务器启动成功后再次确认服务器状态:
 

@@ -179,12 +182,72 @@
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:v0.5.0 .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ 源码启动服务
+
+如需从源码启动服务,请参考以下步骤:
+
+1. 克隆仓库
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. 创建虚拟环境(确保已安装 Anaconda 或 Miniconda)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+如果cuda > 12.0,需额外执行以下命令:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. 拷贝入口脚本并配置环境变量
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+使用以下命令获取python路径及ragflow项目路径:
+```bash
+$ which python
+$ pwd
+```
+
+将上述`which python`的输出作为`PY`的值,将`pwd`的输出作为`PYTHONPATH`的值。
+
+`LD_LIBRARY_PATH`如果环境已经配置好,可以注释掉。
+
+```bash
+# 此处配置需要按照实际情况调整,两个export为新增配置
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# 可选:添加Hugging Face镜像
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. 启动基础服务
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. 检查配置文件
+确保**docker/.env**中的配置与**conf/service_conf.yaml**中配置一致, **service_conf.yaml**中相关服务的IP地址与端口应该改成本机IP地址及容器映射出来的端口。
+
+6. 启动服务
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
+
 ## 📚 技术文档
 
 - [FAQ](./docs/faq.md)
@@ -33,7 +33,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
 from itsdangerous import URLSafeTimedSerializer
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 
 
 def generate_confirmation_token(tenent_id):
@@ -20,8 +20,9 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, huqie
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.nlp import search, rag_tokenizer
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService

@@ -124,10 +125,10 @@ def set():
 d = {
 "id": req["chunk_id"],
 "content_with_weight": req["content_with_weight"]}
-d["content_ltks"] = huqie.qie(req["content_with_weight"])
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 d["important_kwd"] = req["important_kwd"]
-d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
+d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
 if "available_int" in req:
 d["available_int"] = req["available_int"]
 

@@ -151,7 +152,7 @@ def set():
 retmsg="Q&A must be separated by TAB/ENTER key.")
 q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
 d = beAdoc(d, arr[0], arr[1], not any(
-[huqie.is_chinese(t) for t in q + a]))
+[rag_tokenizer.is_chinese(t) for t in q + a]))
 
 v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
 v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]

@@ -201,11 +202,11 @@ def create():
 md5 = hashlib.md5()
 md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
 chunck_id = md5.hexdigest()
-d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
 "content_with_weight": req["content_with_weight"]}
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 d["important_kwd"] = req.get("important_kwd", [])
-d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
+d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
 d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
 d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
 
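The hunks above consistently replace the old `huqie` tokenizer with the `rag_tokenizer` module: `huqie.qie` becomes `rag_tokenizer.tokenize`, `huqie.qieqie` becomes `rag_tokenizer.fine_grained_tokenize`, and `is_chinese` keeps its name under the new module. A minimal sketch of the renamed calls as they appear in the diff; the sample input string is made up, not taken from the repository, and the sketch assumes the package layout of this commit range (`rag.nlp.rag_tokenizer`).

```python
# Illustrative sketch of the rename shown above, runnable inside a ragflow checkout.
from rag.nlp import rag_tokenizer  # previously: from rag.nlp import huqie

text = "RAGFlow splits documents into retrievable chunks"  # hypothetical sample input

content_ltks = rag_tokenizer.tokenize(text)                          # was huqie.qie(text)
content_sm_ltks = rag_tokenizer.fine_grained_tokenize(content_ltks)  # was huqie.qieqie(content_ltks)
has_chinese = any(rag_tokenizer.is_chinese(t) for t in text)         # was huqie.is_chinese(t)
```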
@@ -35,13 +35,7 @@ def set_dialog():
 top_n = req.get("top_n", 6)
 similarity_threshold = req.get("similarity_threshold", 0.1)
 vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
-llm_setting = req.get("llm_setting", {
-"temperature": 0.1,
-"top_p": 0.3,
-"frequency_penalty": 0.7,
-"presence_penalty": 0.4,
-"max_tokens": 215
-})
+llm_setting = req.get("llm_setting", {})
 default_prompt = {
 "system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。
 以下是知识库:
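After this change, `set_dialog` no longer injects hard-coded LLM defaults when a request omits `llm_setting`; it falls back to an empty dict. A hedged sketch of a request fragment that restores the old defaults explicitly; the key names and values are taken from the removed lines above, and everything else about the request body lies outside this excerpt.

```python
# Hypothetical request fragment for set_dialog(); only the llm_setting keys and
# values are grounded in the removed default dict shown in the diff above.
llm_setting = {
    "temperature": 0.1,
    "top_p": 0.3,
    "frequency_penalty": 0.7,
    "presence_penalty": 0.4,
    "max_tokens": 215,
}
request_fragment = {"llm_setting": llm_setting}  # merged into the dialog payload by the caller
```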
@@ -14,7 +14,6 @@
 # limitations under the License
 #
 
-import base64
 import os
 import pathlib
 import re

@@ -23,8 +22,13 @@ import flask
 from elasticsearch_dsl import Q
 from flask import request
 from flask_login import login_required, current_user
+
+from api.db.db_models import Task
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
+from api.db.services.task_service import TaskService, queue_tasks
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
@@ -48,34 +52,35 @@ def upload():
 if 'file' not in request.files:
 return get_json_result(
 data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
-file = request.files['file']
-if file.filename == '':
+file_objs = request.files.getlist('file')
+for file_obj in file_objs:
+if file_obj.filename == '':
 return get_json_result(
 data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
 
+err = []
+for file in file_objs:
 try:
 e, kb = KnowledgebaseService.get_by_id(kb_id)
 if not e:
-return get_data_error_result(
-retmsg="Can't find this knowledgebase!")
+raise LookupError("Can't find this knowledgebase!")
 MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
 if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
-return get_data_error_result(
-retmsg="Exceed the maximum file number of a free user!")
+raise RuntimeError("Exceed the maximum file number of a free user!")
 
 filename = duplicate_name(
 DocumentService.query,
 name=file.filename,
 kb_id=kb.id)
 filetype = filename_type(filename)
-if not filetype:
-return get_data_error_result(
-retmsg="This type of file has not been supported yet!")
+if filetype == FileType.OTHER.value:
+raise RuntimeError("This type of file has not been supported yet!")
 
 location = filename
 while MINIO.obj_exist(kb_id, location):
 location += "_"
-blob = request.files['file'].read()
+blob = file.read()
 MINIO.put(kb_id, location, blob)
 doc = {
 "id": get_uuid(),

@@ -93,10 +98,13 @@ def upload():
 doc["parser_id"] = ParserType.PICTURE.value
 if re.search(r"\.(ppt|pptx|pages)$", filename):
 doc["parser_id"] = ParserType.PRESENTATION.value
-doc = DocumentService.insert(doc)
-return get_json_result(data=doc.to_json())
+DocumentService.insert(doc)
 except Exception as e:
-return server_error_response(e)
+err.append(file.filename + ": " + str(e))
+if err:
+return get_json_result(
+data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
+return get_json_result(data=True)
 
 
 @manager.route('/create', methods=['POST'])
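With these hunks, the document upload endpoint accepts several files in one multipart request (`request.files.getlist('file')`) and collects per-file failures into a single error message instead of aborting on the first one. A hedged client sketch using the `requests` library follows; the base URL, the route path, and the `kb_id` form field are assumptions, since they sit outside the lines shown above, and session authentication is omitted.

```python
# Hypothetical multipart upload against the modified endpoint; URL, route, and
# kb_id form field are placeholders inferred from context, auth is omitted.
import requests

url = "http://127.0.0.1:9380/v1/document/upload"  # hypothetical address and route
files = [
    ("file", open("report.pdf", "rb")),   # multiple parts may share the field name "file"
    ("file", open("notes.docx", "rb")),
]
resp = requests.post(url, data={"kb_id": "<knowledgebase-id>"}, files=files)
print(resp.json())  # data=true on success; retmsg lists "<filename>: <error>" lines otherwise
```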
@@ -218,26 +226,37 @@ def change_status():
 @validate_request("doc_id")
 def rm():
 req = request.json
+doc_ids = req["doc_id"]
+if isinstance(doc_ids, str): doc_ids = [doc_ids]
+errors = ""
+for doc_id in doc_ids:
 try:
-e, doc = DocumentService.get_by_id(req["doc_id"])
+e, doc = DocumentService.get_by_id(doc_id)
+
 if not e:
 return get_data_error_result(retmsg="Document not found!")
-tenant_id = DocumentService.get_tenant_id(req["doc_id"])
+tenant_id = DocumentService.get_tenant_id(doc_id)
 if not tenant_id:
 return get_data_error_result(retmsg="Tenant not found!")
+
 ELASTICSEARCH.deleteByQuery(
 Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+
 DocumentService.increment_chunk_num(
 doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
 if not DocumentService.delete(doc):
 return get_data_error_result(
 retmsg="Database error (Document removal)!")
+
+informs = File2DocumentService.get_by_document_id(doc_id)
+if not informs:
 MINIO.rm(doc.kb_id, doc.location)
-return get_json_result(data=True)
+else:
+File2DocumentService.delete_by_document_id(doc_id)
 except Exception as e:
-return server_error_response(e)
+errors += str(e)
+
+if errors: return server_error_response(e)
+return get_json_result(data=True)
 
 
 @manager.route('/run', methods=['POST'])
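`rm` now accepts either a single document id or a list under the same `doc_id` key, deletes each document in turn, and reports an error only after the whole batch has been attempted. Two request bodies it accepts after this change; the ids are placeholders.

```python
# Both forms are accepted by the modified rm endpoint; ids below are placeholders.
payload_single = {"doc_id": "docid-0001"}
payload_batch = {"doc_id": ["docid-0001", "docid-0002", "docid-0003"]}
```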
@@ -260,6 +279,14 @@ def run():
 ELASTICSEARCH.deleteByQuery(
 Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
+
+if str(req["run"]) == TaskStatus.RUNNING.value:
+TaskService.filter_delete([Task.doc_id == id])
+e, doc = DocumentService.get_by_id(id)
+doc = doc.to_dict()
+doc["tenant_id"] = tenant_id
+bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
+queue_tasks(doc, bucket, name)
+
 return get_json_result(data=True)
 except Exception as e:
 return server_error_response(e)
@@ -289,6 +316,11 @@ def rename():
 return get_data_error_result(
 retmsg="Database error (Document rename)!")
+
+informs = File2DocumentService.get_by_document_id(req["doc_id"])
+if informs:
+e, file = FileService.get_by_id(informs[0].file_id)
+FileService.update_by_id(file.id, {"name": req["name"]})
+
 return get_json_result(data=True)
 except Exception as e:
 return server_error_response(e)
@@ -302,7 +334,13 @@ def get(doc_id):
 if not e:
 return get_data_error_result(retmsg="Document not found!")
+
+informs = File2DocumentService.get_by_document_id(doc_id)
+if not informs:
 response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
+else:
+e, file = FileService.get_by_id(informs[0].file_id)
+response = flask.make_response(MINIO.get(file.parent_id, doc.location))
+
 ext = re.search(r"\.([^.]+)$", doc.name)
 if ext:
 if doc.type == FileType.VISUAL.value:
@@ -338,7 +376,8 @@ def change_parser():
 return get_data_error_result(retmsg="Not supported yet!")
 
 e = DocumentService.update_by_id(doc.id,
-{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
+{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
+"run": TaskStatus.UNSTART.value})
 if not e:
 return get_data_error_result(retmsg="Document not found!")
 if "parser_config" in req:
api/apps/file2document_app.py (new file, 137 lines)

@@ -0,0 +1,137 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+#
+from elasticsearch_dsl import Q
+
+from api.db.db_models import File2Document
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
+
+from flask import request
+from flask_login import login_required, current_user
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
+from api.utils import get_uuid
+from api.db import FileType
+from api.db.services.document_service import DocumentService
+from api.settings import RetCode
+from api.utils.api_utils import get_json_result
+from rag.nlp import search
+from rag.utils.es_conn import ELASTICSEARCH
+
+
+@manager.route('/convert', methods=['POST'])
+@login_required
+@validate_request("file_ids", "kb_ids")
+def convert():
+req = request.json
+kb_ids = req["kb_ids"]
+file_ids = req["file_ids"]
+file2documents = []
+
+try:
+for file_id in file_ids:
+e, file = FileService.get_by_id(file_id)
+file_ids_list = [file_id]
+if file.type == FileType.FOLDER.value:
+file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
+for id in file_ids_list:
+informs = File2DocumentService.get_by_file_id(id)
+# delete
+for inform in informs:
+doc_id = inform.document_id
+e, doc = DocumentService.get_by_id(doc_id)
+if not e:
+return get_data_error_result(retmsg="Document not found!")
+tenant_id = DocumentService.get_tenant_id(doc_id)
+if not tenant_id:
+return get_data_error_result(retmsg="Tenant not found!")
+ELASTICSEARCH.deleteByQuery(
+Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+DocumentService.increment_chunk_num(
+doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
+if not DocumentService.delete(doc):
+return get_data_error_result(
+retmsg="Database error (Document removal)!")
+File2DocumentService.delete_by_file_id(id)
+
+# insert
+for kb_id in kb_ids:
+e, kb = KnowledgebaseService.get_by_id(kb_id)
+if not e:
+return get_data_error_result(
+retmsg="Can't find this knowledgebase!")
+e, file = FileService.get_by_id(id)
+if not e:
+return get_data_error_result(
+retmsg="Can't find this file!")
+
+doc = DocumentService.insert({
+"id": get_uuid(),
+"kb_id": kb.id,
+"parser_id": kb.parser_id,
+"parser_config": kb.parser_config,
+"created_by": current_user.id,
+"type": file.type,
+"name": file.name,
+"location": file.location,
+"size": file.size
+})
+file2document = File2DocumentService.insert({
+"id": get_uuid(),
+"file_id": id,
+"document_id": doc.id,
+})
+file2documents.append(file2document.to_json())
+return get_json_result(data=file2documents)
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/rm', methods=['POST'])
+@login_required
+@validate_request("file_ids")
+def rm():
+req = request.json
+file_ids = req["file_ids"]
+if not file_ids:
+return get_json_result(
+data=False, retmsg='Lack of "Files ID"', retcode=RetCode.ARGUMENT_ERROR)
+try:
+for file_id in file_ids:
+informs = File2DocumentService.get_by_file_id(file_id)
+if not informs:
+return get_data_error_result(retmsg="Inform not found!")
+for inform in informs:
+if not inform:
+return get_data_error_result(retmsg="Inform not found!")
+File2DocumentService.delete_by_file_id(file_id)
+doc_id = inform.document_id
+e, doc = DocumentService.get_by_id(doc_id)
+if not e:
+return get_data_error_result(retmsg="Document not found!")
+tenant_id = DocumentService.get_tenant_id(doc_id)
+if not tenant_id:
+return get_data_error_result(retmsg="Tenant not found!")
+ELASTICSEARCH.deleteByQuery(
+Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+DocumentService.increment_chunk_num(
+doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
+if not DocumentService.delete(doc):
+return get_data_error_result(
+retmsg="Database error (Document removal)!")
+return get_json_result(data=True)
+except Exception as e:
+return server_error_response(e)
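This new blueprint links entries in the file manager to knowledge-base documents. A hedged example of the request bodies it validates (`file_ids` plus `kb_ids` for `/convert`, `file_ids` for `/rm`); the id strings and the prefix under which the blueprint is mounted are placeholders.

```python
# Placeholders only; the field names match the validate_request decorators above.
convert_payload = {
    "file_ids": ["file-0001", "file-0002"],  # files (or folders) to convert into documents
    "kb_ids": ["kb-0001"],                   # target knowledge bases
}
rm_payload = {"file_ids": ["file-0001"]}     # remove the documents generated from these files
```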
api/apps/file_app.py (new file, 347 lines)

@@ -0,0 +1,347 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+#
+import os
+import pathlib
+import re
+
+import flask
+from elasticsearch_dsl import Q
+from flask import request
+from flask_login import login_required, current_user
+
+from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
+from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
+from api.utils import get_uuid
+from api.db import FileType
+from api.db.services import duplicate_name
+from api.db.services.file_service import FileService
+from api.settings import RetCode
+from api.utils.api_utils import get_json_result
+from api.utils.file_utils import filename_type
+from rag.nlp import search
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils.minio_conn import MINIO
+
+
+@manager.route('/upload', methods=['POST'])
+@login_required
+# @validate_request("parent_id")
+def upload():
+pf_id = request.form.get("parent_id")
+
+if not pf_id:
+root_folder = FileService.get_root_folder(current_user.id)
+pf_id = root_folder.id
+
+if 'file' not in request.files:
+return get_json_result(
+data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
+file_objs = request.files.getlist('file')
+
+for file_obj in file_objs:
+if file_obj.filename == '':
+return get_json_result(
+data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
+file_res = []
+try:
+for file_obj in file_objs:
+e, file = FileService.get_by_id(pf_id)
+if not e:
+return get_data_error_result(
+retmsg="Can't find this folder!")
+MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(current_user.id) >= MAX_FILE_NUM_PER_USER:
+return get_data_error_result(
+retmsg="Exceed the maximum file number of a free user!")
+
+# split file name path
+if not file_obj.filename:
+e, file = FileService.get_by_id(pf_id)
+file_obj_names = [file.name, file_obj.filename]
+else:
+full_path = '/' + file_obj.filename
+file_obj_names = full_path.split('/')
+file_len = len(file_obj_names)
+
+# get folder
+file_id_list = FileService.get_id_list_by_id(pf_id, file_obj_names, 1, [pf_id])
+len_id_list = len(file_id_list)
+
+# create folder
+if file_len != len_id_list:
+e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
+if not e:
+return get_data_error_result(retmsg="Folder not found!")
+last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
+len_id_list)
+else:
+e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
+if not e:
+return get_data_error_result(retmsg="Folder not found!")
+last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
+len_id_list)
+
+# file type
+filetype = filename_type(file_obj_names[file_len - 1])
+location = file_obj_names[file_len - 1]
+while MINIO.obj_exist(last_folder.id, location):
+location += "_"
+blob = file_obj.read()
+filename = duplicate_name(
+FileService.query,
+name=file_obj_names[file_len - 1],
+parent_id=last_folder.id)
+file = {
+"id": get_uuid(),
+"parent_id": last_folder.id,
+"tenant_id": current_user.id,
+"created_by": current_user.id,
+"type": filetype,
+"name": filename,
+"location": location,
+"size": len(blob),
+}
+file = FileService.insert(file)
+MINIO.put(last_folder.id, location, blob)
+file_res.append(file.to_json())
+return get_json_result(data=file_res)
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/create', methods=['POST'])
+@login_required
+@validate_request("name")
+def create():
+req = request.json
+pf_id = request.json.get("parent_id")
+input_file_type = request.json.get("type")
+if not pf_id:
+root_folder = FileService.get_root_folder(current_user.id)
+pf_id = root_folder.id
+
+try:
+if not FileService.is_parent_folder_exist(pf_id):
+return get_json_result(
+data=False, retmsg="Parent Folder Doesn't Exist!", retcode=RetCode.OPERATING_ERROR)
+if FileService.query(name=req["name"], parent_id=pf_id):
+return get_data_error_result(
+retmsg="Duplicated folder name in the same folder.")
+
+if input_file_type == FileType.FOLDER.value:
+file_type = FileType.FOLDER.value
+else:
+file_type = FileType.VIRTUAL.value
+
+file = FileService.insert({
+"id": get_uuid(),
+"parent_id": pf_id,
+"tenant_id": current_user.id,
+"created_by": current_user.id,
+"name": req["name"],
+"location": "",
+"size": 0,
+"type": file_type
+})
+
+return get_json_result(data=file.to_json())
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/list', methods=['GET'])
+@login_required
+def list():
+pf_id = request.args.get("parent_id")
+
+keywords = request.args.get("keywords", "")
+
+page_number = int(request.args.get("page", 1))
+items_per_page = int(request.args.get("page_size", 15))
+orderby = request.args.get("orderby", "create_time")
+desc = request.args.get("desc", True)
+if not pf_id:
+root_folder = FileService.get_root_folder(current_user.id)
+pf_id = root_folder.id
+try:
+e, file = FileService.get_by_id(pf_id)
+if not e:
+return get_data_error_result(retmsg="Folder not found!")
+
+files, total = FileService.get_by_pf_id(
+current_user.id, pf_id, page_number, items_per_page, orderby, desc, keywords)
+
+parent_folder = FileService.get_parent_folder(pf_id)
+if not FileService.get_parent_folder(pf_id):
+return get_json_result(retmsg="File not found!")
+
+return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/root_folder', methods=['GET'])
+@login_required
+def get_root_folder():
+try:
+root_folder = FileService.get_root_folder(current_user.id)
+return get_json_result(data={"root_folder": root_folder.to_json()})
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/parent_folder', methods=['GET'])
+@login_required
+def get_parent_folder():
+file_id = request.args.get("file_id")
+try:
+e, file = FileService.get_by_id(file_id)
+if not e:
+return get_data_error_result(retmsg="Folder not found!")
+
+parent_folder = FileService.get_parent_folder(file_id)
+return get_json_result(data={"parent_folder": parent_folder.to_json()})
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/all_parent_folder', methods=['GET'])
+@login_required
+def get_all_parent_folders():
+file_id = request.args.get("file_id")
+try:
+e, file = FileService.get_by_id(file_id)
+if not e:
+return get_data_error_result(retmsg="Folder not found!")
+
+parent_folders = FileService.get_all_parent_folders(file_id)
+parent_folders_res = []
+for parent_folder in parent_folders:
+parent_folders_res.append(parent_folder.to_json())
+return get_json_result(data={"parent_folders": parent_folders_res})
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/rm', methods=['POST'])
+@login_required
+@validate_request("file_ids")
+def rm():
+req = request.json
+file_ids = req["file_ids"]
+try:
+for file_id in file_ids:
+e, file = FileService.get_by_id(file_id)
+if not e:
+return get_data_error_result(retmsg="File or Folder not found!")
+if not file.tenant_id:
+return get_data_error_result(retmsg="Tenant not found!")
+
+if file.type == FileType.FOLDER.value:
+file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
+for inner_file_id in file_id_list:
+e, file = FileService.get_by_id(inner_file_id)
+if not e:
+return get_data_error_result(retmsg="File not found!")
+MINIO.rm(file.parent_id, file.location)
+FileService.delete_folder_by_pf_id(current_user.id, file_id)
+else:
+if not FileService.delete(file):
+return get_data_error_result(
+retmsg="Database error (File removal)!")
+
+# delete file2document
+informs = File2DocumentService.get_by_file_id(file_id)
+for inform in informs:
+doc_id = inform.document_id
+e, doc = DocumentService.get_by_id(doc_id)
+if not e:
+return get_data_error_result(retmsg="Document not found!")
+tenant_id = DocumentService.get_tenant_id(doc_id)
+if not tenant_id:
+return get_data_error_result(retmsg="Tenant not found!")
+ELASTICSEARCH.deleteByQuery(
+Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+DocumentService.increment_chunk_num(
+doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
+if not DocumentService.delete(doc):
+return get_data_error_result(
+retmsg="Database error (Document removal)!")
+File2DocumentService.delete_by_file_id(file_id)
+
+return get_json_result(data=True)
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/rename', methods=['POST'])
+@login_required
+@validate_request("file_id", "name")
+def rename():
+req = request.json
+try:
+e, file = FileService.get_by_id(req["file_id"])
+if not e:
+return get_data_error_result(retmsg="File not found!")
+if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
+file.name.lower()).suffix:
+return get_json_result(
+data=False,
+retmsg="The extension of file can't be changed",
+retcode=RetCode.ARGUMENT_ERROR)
+if FileService.query(name=req["name"], pf_id=file.parent_id):
+return get_data_error_result(
+retmsg="Duplicated file name in the same folder.")
+
+if not FileService.update_by_id(
+req["file_id"], {"name": req["name"]}):
+return get_data_error_result(
+retmsg="Database error (File rename)!")
+
+informs = File2DocumentService.get_by_file_id(req["file_id"])
+if informs:
+if not DocumentService.update_by_id(
+informs[0].document_id, {"name": req["name"]}):
+return get_data_error_result(
+retmsg="Database error (Document rename)!")
+
+return get_json_result(data=True)
+except Exception as e:
+return server_error_response(e)
+
+
+@manager.route('/get/<file_id>', methods=['GET'])
+# @login_required
+def get(file_id):
+try:
+e, file = FileService.get_by_id(file_id)
+if not e:
+return get_data_error_result(retmsg="Document not found!")
+
+response = flask.make_response(MINIO.get(file.parent_id, file.location))
+ext = re.search(r"\.([^.]+)$", file.name)
+if ext:
+if file.type == FileType.VISUAL.value:
+response.headers.set('Content-Type', 'image/%s' % ext.group(1))
+else:
+response.headers.set(
+'Content-Type',
+'application/%s' %
+ext.group(1))
+return response
+except Exception as e:
+return server_error_response(e)
@@ -28,7 +28,7 @@ from api.db.db_models import Knowledgebase
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
 from rag.nlp import search
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH


 @manager.route('/create', methods=['post'])
@@ -111,7 +111,7 @@ def detail():
 @login_required
 def list():
     page_number = request.args.get("page", 1)
-    items_per_page = request.args.get("page_size", 15)
+    items_per_page = request.args.get("page_size", 150)
     orderby = request.args.get("orderby", "create_time")
     desc = request.args.get("desc", True)
     try:
@@ -24,10 +24,11 @@ from api.db.db_models import TenantLLM
 from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
 from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
-from api.db import UserTenantRole, LLMType
+from api.db import UserTenantRole, LLMType, FileType
 from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
     LLM_FACTORY, LLM_BASE_URL
 from api.db.services.user_service import UserService, TenantService, UserTenantService
+from api.db.services.file_service import FileService
 from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, cors_reponse

@@ -221,6 +222,17 @@ def user_register(user_id, user):
         "invited_by": user_id,
         "role": UserTenantRole.OWNER
     }
+    file_id = get_uuid()
+    file = {
+        "id": file_id,
+        "parent_id": file_id,
+        "tenant_id": user_id,
+        "created_by": user_id,
+        "name": "/",
+        "type": FileType.FOLDER.value,
+        "size": 0,
+        "location": "",
+    }
    tenant_llm = []
    for llm in LLMService.query(fid=LLM_FACTORY):
        tenant_llm.append({"tenant_id": user_id,
@@ -236,6 +248,7 @@ def user_register(user_id, user):
     TenantService.insert(**tenant)
     UserTenantService.insert(**usr_tenant)
     TenantLLMService.insert_many(tenant_llm)
+    FileService.insert(file)
     return UserService.query(email=user["email"])
@@ -45,6 +45,8 @@ class FileType(StrEnum):
     VISUAL = 'visual'
     AURAL = 'aural'
     VIRTUAL = 'virtual'
+    FOLDER = 'folder'
+    OTHER = "other"


 class LLMType(StrEnum):
@@ -62,6 +64,7 @@ class ChatStyle(StrEnum):


 class TaskStatus(StrEnum):
+    UNSTART = "0"
     RUNNING = "1"
     CANCEL = "2"
     DONE = "3"
@@ -669,6 +669,61 @@ class Document(DataBaseModel):
         db_table = "document"


+class File(DataBaseModel):
+    id = CharField(
+        max_length=32,
+        primary_key=True,
+    )
+    parent_id = CharField(
+        max_length=32,
+        null=False,
+        help_text="parent folder id",
+        index=True)
+    tenant_id = CharField(
+        max_length=32,
+        null=False,
+        help_text="tenant id",
+        index=True)
+    created_by = CharField(
+        max_length=32,
+        null=False,
+        help_text="who created it")
+    name = CharField(
+        max_length=255,
+        null=False,
+        help_text="file name or folder name",
+        index=True)
+    location = CharField(
+        max_length=255,
+        null=True,
+        help_text="where dose it store")
+    size = IntegerField(default=0)
+    type = CharField(max_length=32, null=False, help_text="file extension")
+
+    class Meta:
+        db_table = "file"
+
+
+class File2Document(DataBaseModel):
+    id = CharField(
+        max_length=32,
+        primary_key=True,
+    )
+    file_id = CharField(
+        max_length=32,
+        null=True,
+        help_text="file id",
+        index=True)
+    document_id = CharField(
+        max_length=32,
+        null=True,
+        help_text="document id",
+        index=True)
+
+    class Meta:
+        db_table = "file2document"
+
+
 class Task(DataBaseModel):
     id = CharField(max_length=32, primary_key=True)
     doc_id = CharField(max_length=32, null=False, index=True)
@@ -124,6 +124,11 @@ factory_infos = [{
    "logo": "",
    "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
    "status": "1",
+},{
+    "name": "DeepSeek",
+    "logo": "",
+    "tags": "LLM",
+    "status": "1",
 },
 # {
 #     "name": "文心一言",
@@ -331,6 +336,21 @@ def init_llm_factory():
         "max_tokens": 512,
         "model_type": LLMType.EMBEDDING.value
     },
+    # ------------------------ DeepSeek -----------------------
+    {
+        "fid": factory_infos[8]["name"],
+        "llm_name": "deepseek-chat",
+        "tags": "LLM,CHAT,",
+        "max_tokens": 32768,
+        "model_type": LLMType.CHAT.value
+    },
+    {
+        "fid": factory_infos[8]["name"],
+        "llm_name": "deepseek-coder",
+        "tags": "LLM,CHAT,",
+        "max_tokens": 16385,
+        "model_type": LLMType.CHAT.value
+    },
    ]
    for info in factory_infos:
        try:
@@ -136,7 +136,7 @@ def chat(dialog, messages, **kwargs):
     chat_logger.info("User: {}|Assistant: {}".format(
         msg[-1]["content"], answer))

-    if knowledges and prompt_config.get("quote", True):
+    if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
         answer, idx = retrievaler.insert_citations(answer,
                                                    [ck["content_ltks"]
                                                     for ck in kbinfos["chunks"]],
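The added `kwargs.get("quote", True)` guard lets a caller switch citation insertion off per request, on top of the dialog's own prompt config. A minimal usage sketch (hypothetical call site; `dialog` and `messages` are whatever the API layer already passes to `chat`):

    # Passing quote=False now suppresses retrievaler.insert_citations(...) for this answer only.
    ans = chat(dialog, messages, quote=False)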
@@ -13,10 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from peewee import Expression
+import random
+from datetime import datetime
+from elasticsearch_dsl import Q
+
+from api.settings import stat_logger
+from api.utils import current_timestamp, get_format_time
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils.minio_conn import MINIO
+from rag.nlp import search
+
 from api.db import FileType, TaskStatus
-from api.db.db_models import DB, Knowledgebase, Tenant
+from api.db.db_models import DB, Knowledgebase, Tenant, Task
 from api.db.db_models import Document
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -71,7 +79,21 @@ class DocumentService(CommonService):

     @classmethod
     @DB.connection_context()
-    def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
+    def remove_document(cls, doc, tenant_id):
+        ELASTICSEARCH.deleteByQuery(
+            Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
+
+        cls.increment_chunk_num(
+            doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
+        if not cls.delete(doc):
+            raise RuntimeError("Database error (Document removal)!")
+
+        MINIO.rm(doc.kb_id, doc.location)
+        return cls.delete_by_id(doc.id)
+
+    @classmethod
+    @DB.connection_context()
+    def get_newly_uploaded(cls):
         fields = [
             cls.model.id,
             cls.model.kb_id,
@@ -93,11 +115,9 @@ class DocumentService(CommonService):
             cls.model.status == StatusEnum.VALID.value,
             ~(cls.model.type == FileType.VIRTUAL.value),
             cls.model.progress == 0,
-            cls.model.update_time >= tm,
-            cls.model.run == TaskStatus.RUNNING.value,
-            (Expression(cls.model.create_time, "%%", comm) == mod))\
-            .order_by(cls.model.update_time.asc())\
-            .paginate(1, items_per_page)
+            cls.model.update_time >= current_timestamp() - 1000 * 600,
+            cls.model.run == TaskStatus.RUNNING.value)\
+            .order_by(cls.model.update_time.asc())
         return list(docs.dicts())

     @classmethod
@@ -177,3 +197,55 @@ class DocumentService(CommonService):
             on=(Knowledgebase.id == cls.model.kb_id)).where(
             Knowledgebase.tenant_id == tenant_id)
         return len(docs)
+
+    @classmethod
+    @DB.connection_context()
+    def begin2parse(cls, docid):
+        cls.update_by_id(
+            docid, {"progress": random.random() * 1 / 100.,
+                    "progress_msg": "Task dispatched...",
+                    "process_begin_at": get_format_time()
+                    })
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress(cls):
+        docs = cls.get_unfinished_docs()
+        for d in docs:
+            try:
+                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
+                if not tsks:
+                    continue
+                msg = []
+                prg = 0
+                finished = True
+                bad = 0
+                status = TaskStatus.RUNNING.value
+                for t in tsks:
+                    if 0 <= t.progress < 1:
+                        finished = False
+                    prg += t.progress if t.progress >= 0 else 0
+                    msg.append(t.progress_msg)
+                    if t.progress == -1:
+                        bad += 1
+                prg /= len(tsks)
+                if finished and bad:
+                    prg = -1
+                    status = TaskStatus.FAIL.value
+                elif finished:
+                    status = TaskStatus.DONE.value
+
+                msg = "\n".join(msg)
+                info = {
+                    "process_duation": datetime.timestamp(
+                        datetime.now()) -
+                    d["process_begin_at"].timestamp(),
+                    "run": status}
+                if prg != 0:
+                    info["progress"] = prg
+                if msg:
+                    info["progress_msg"] = msg
+                cls.update_by_id(d["id"], info)
+            except Exception as e:
+                stat_logger.error("fetch task exception:" + str(e))
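The new `DocumentService.remove_document` bundles the cleanup steps that API handlers otherwise repeat inline (Elasticsearch delete-by-query, chunk/token counter rollback, MinIO object removal, database row delete). A minimal sketch of how a caller might use it (assumes the `get_by_id`/`get_tenant_id` lookups already shown in this diff):

    e, doc = DocumentService.get_by_id(doc_id)
    tenant_id = DocumentService.get_tenant_id(doc_id)
    if e and tenant_id:
        # one call: drops the ES chunks, decrements counters, removes the blob and the row
        DocumentService.remove_document(doc, tenant_id)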
api/db/services/file2document_service.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
from datetime import datetime

from api.db.db_models import DB
from api.db.db_models import File, Document, File2Document
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file_service import FileService
from api.utils import current_timestamp, datetime_format


class File2DocumentService(CommonService):
    model = File2Document

    @classmethod
    @DB.connection_context()
    def get_by_file_id(cls, file_id):
        objs = cls.model.select().where(cls.model.file_id == file_id)
        return objs

    @classmethod
    @DB.connection_context()
    def get_by_document_id(cls, document_id):
        objs = cls.model.select().where(cls.model.document_id == document_id)
        return objs

    @classmethod
    @DB.connection_context()
    def insert(cls, obj):
        if not cls.save(**obj):
            raise RuntimeError("Database error (File)!")
        e, obj = cls.get_by_id(obj["id"])
        if not e:
            raise RuntimeError("Database error (File retrieval)!")
        return obj

    @classmethod
    @DB.connection_context()
    def delete_by_file_id(cls, file_id):
        return cls.model.delete().where(cls.model.file_id == file_id).execute()

    @classmethod
    @DB.connection_context()
    def delete_by_document_id(cls, doc_id):
        return cls.model.delete().where(cls.model.document_id == doc_id).execute()

    @classmethod
    @DB.connection_context()
    def update_by_file_id(cls, file_id, obj):
        obj["update_time"] = current_timestamp()
        obj["update_date"] = datetime_format(datetime.now())
        num = cls.model.update(obj).where(cls.model.id == file_id).execute()
        e, obj = cls.get_by_id(cls.model.id)
        return obj

    @classmethod
    @DB.connection_context()
    def get_minio_address(cls, doc_id=None, file_id=None):
        if doc_id:
            ids = File2DocumentService.get_by_document_id(doc_id)
        else:
            ids = File2DocumentService.get_by_file_id(file_id)
        if ids:
            e, file = FileService.get_by_id(ids[0].file_id)
            return file.parent_id, file.location
        else:
            assert doc_id, "please specify doc_id"
            e, doc = DocumentService.get_by_id(doc_id)
            return doc.kb_id, doc.location
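`File2DocumentService.get_minio_address` resolves where a document's raw bytes live in MinIO: the linked `File`'s parent folder and location when a file-to-document mapping exists, otherwise the knowledge base id and location stored on the `Document` itself. A minimal retrieval sketch (hypothetical `doc_id`; `MINIO.get(bucket, name)` is used the same way elsewhere in this diff):

    bucket, location = File2DocumentService.get_minio_address(doc_id=doc_id)
    blob = MINIO.get(bucket, location)  # raw document bytes from whichever bucket was resolved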
api/db/services/file_service.py (new file, 243 lines)
@@ -0,0 +1,243 @@
|
#
|
||||||
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
from flask_login import current_user
|
||||||
|
from peewee import fn
|
||||||
|
|
||||||
|
from api.db import FileType
|
||||||
|
from api.db.db_models import DB, File2Document, Knowledgebase
|
||||||
|
from api.db.db_models import File, Document
|
||||||
|
from api.db.services.common_service import CommonService
|
||||||
|
from api.utils import get_uuid
|
||||||
|
|
||||||
|
|
||||||
|
class FileService(CommonService):
|
||||||
|
model = File
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page,
|
||||||
|
orderby, desc, keywords):
|
||||||
|
if keywords:
|
||||||
|
files = cls.model.select().where(
|
||||||
|
(cls.model.tenant_id == tenant_id)
|
||||||
|
& (cls.model.parent_id == pf_id), (fn.LOWER(cls.model.name).like(f"%%{keywords.lower()}%%")))
|
||||||
|
else:
|
||||||
|
files = cls.model.select().where((cls.model.tenant_id == tenant_id)
|
||||||
|
& (cls.model.parent_id == pf_id))
|
||||||
|
count = files.count()
|
||||||
|
if desc:
|
||||||
|
files = files.order_by(cls.model.getter_by(orderby).desc())
|
||||||
|
else:
|
||||||
|
files = files.order_by(cls.model.getter_by(orderby).asc())
|
||||||
|
|
||||||
|
files = files.paginate(page_number, items_per_page)
|
||||||
|
|
||||||
|
res_files = list(files.dicts())
|
||||||
|
for file in res_files:
|
||||||
|
if file["type"] == FileType.FOLDER.value:
|
||||||
|
file["size"] = cls.get_folder_size(file["id"])
|
||||||
|
file['kbs_info'] = []
|
||||||
|
continue
|
||||||
|
kbs_info = cls.get_kb_id_by_file_id(file['id'])
|
||||||
|
file['kbs_info'] = kbs_info
|
||||||
|
|
||||||
|
return res_files, count
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_kb_id_by_file_id(cls, file_id):
|
||||||
|
kbs = (cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
|
||||||
|
.join(File2Document, on=(File2Document.file_id == file_id))
|
||||||
|
.join(Document, on=(File2Document.document_id == Document.id))
|
||||||
|
.join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
|
||||||
|
.where(cls.model.id == file_id))
|
||||||
|
if not kbs: return []
|
||||||
|
kbs_info_list = []
|
||||||
|
for kb in list(kbs.dicts()):
|
||||||
|
kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
|
||||||
|
return kbs_info_list
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_by_pf_id_name(cls, id, name):
|
||||||
|
file = cls.model.select().where((cls.model.parent_id == id) & (cls.model.name == name))
|
||||||
|
if file.count():
|
||||||
|
e, file = cls.get_by_id(file[0].id)
|
||||||
|
if not e:
|
||||||
|
raise RuntimeError("Database error (File retrieval)!")
|
||||||
|
return file
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_id_list_by_id(cls, id, name, count, res):
|
||||||
|
if count < len(name):
|
||||||
|
file = cls.get_by_pf_id_name(id, name[count])
|
||||||
|
if file:
|
||||||
|
res.append(file.id)
|
||||||
|
return cls.get_id_list_by_id(file.id, name, count + 1, res)
|
||||||
|
else:
|
||||||
|
return res
|
||||||
|
else:
|
||||||
|
return res
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_all_innermost_file_ids(cls, folder_id, result_ids):
|
||||||
|
subfolders = cls.model.select().where(cls.model.parent_id == folder_id)
|
||||||
|
if subfolders.exists():
|
||||||
|
for subfolder in subfolders:
|
||||||
|
cls.get_all_innermost_file_ids(subfolder.id, result_ids)
|
||||||
|
else:
|
||||||
|
result_ids.append(folder_id)
|
||||||
|
return result_ids
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def create_folder(cls, file, parent_id, name, count):
|
||||||
|
if count > len(name) - 2:
|
||||||
|
return file
|
||||||
|
else:
|
||||||
|
file = cls.insert({
|
||||||
|
"id": get_uuid(),
|
||||||
|
"parent_id": parent_id,
|
||||||
|
"tenant_id": current_user.id,
|
||||||
|
"created_by": current_user.id,
|
||||||
|
"name": name[count],
|
||||||
|
"location": "",
|
||||||
|
"size": 0,
|
||||||
|
"type": FileType.FOLDER.value
|
||||||
|
})
|
||||||
|
return cls.create_folder(file, file.id, name, count + 1)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def is_parent_folder_exist(cls, parent_id):
|
||||||
|
parent_files = cls.model.select().where(cls.model.id == parent_id)
|
||||||
|
if parent_files.count():
|
||||||
|
return True
|
||||||
|
cls.delete_folder_by_pf_id(parent_id)
|
||||||
|
return False
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_root_folder(cls, tenant_id):
|
||||||
|
file = cls.model.select().where(cls.model.tenant_id == tenant_id and
|
||||||
|
cls.model.parent_id == cls.model.id)
|
||||||
|
if not file:
|
||||||
|
file_id = get_uuid()
|
||||||
|
file = {
|
||||||
|
"id": file_id,
|
||||||
|
"parent_id": file_id,
|
||||||
|
"tenant_id": tenant_id,
|
||||||
|
"created_by": tenant_id,
|
||||||
|
"name": "/",
|
||||||
|
"type": FileType.FOLDER.value,
|
||||||
|
"size": 0,
|
||||||
|
"location": "",
|
||||||
|
}
|
||||||
|
cls.save(**file)
|
||||||
|
else:
|
||||||
|
file_id = file[0].id
|
||||||
|
|
||||||
|
e, file = cls.get_by_id(file_id)
|
||||||
|
if not e:
|
||||||
|
raise RuntimeError("Database error (File retrieval)!")
|
||||||
|
return file
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_parent_folder(cls, file_id):
|
||||||
|
file = cls.model.select().where(cls.model.id == file_id)
|
||||||
|
if file.count():
|
||||||
|
e, file = cls.get_by_id(file[0].parent_id)
|
||||||
|
if not e:
|
||||||
|
raise RuntimeError("Database error (File retrieval)!")
|
||||||
|
else:
|
||||||
|
raise RuntimeError("Database error (File doesn't exist)!")
|
||||||
|
return file
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_all_parent_folders(cls, start_id):
|
||||||
|
parent_folders = []
|
||||||
|
current_id = start_id
|
||||||
|
while current_id:
|
||||||
|
e, file = cls.get_by_id(current_id)
|
||||||
|
if file.parent_id != file.id and e:
|
||||||
|
parent_folders.append(file)
|
||||||
|
current_id = file.parent_id
|
||||||
|
else:
|
||||||
|
parent_folders.append(file)
|
||||||
|
break
|
||||||
|
return parent_folders
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def insert(cls, file):
|
||||||
|
if not cls.save(**file):
|
||||||
|
raise RuntimeError("Database error (File)!")
|
||||||
|
e, file = cls.get_by_id(file["id"])
|
||||||
|
if not e:
|
||||||
|
raise RuntimeError("Database error (File retrieval)!")
|
||||||
|
return file
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def delete(cls, file):
|
||||||
|
return cls.delete_by_id(file.id)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def delete_by_pf_id(cls, folder_id):
|
||||||
|
return cls.model.delete().where(cls.model.parent_id == folder_id).execute()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def delete_folder_by_pf_id(cls, user_id, folder_id):
|
||||||
|
try:
|
||||||
|
files = cls.model.select().where((cls.model.tenant_id == user_id)
|
||||||
|
& (cls.model.parent_id == folder_id))
|
||||||
|
for file in files:
|
||||||
|
cls.delete_folder_by_pf_id(user_id, file.id)
|
||||||
|
return cls.model.delete().where((cls.model.tenant_id == user_id)
|
||||||
|
& (cls.model.id == folder_id)).execute(),
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
raise RuntimeError("Database error (File retrieval)!")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_file_count(cls, tenant_id):
|
||||||
|
files = cls.model.select(cls.model.id).where(cls.model.tenant_id == tenant_id)
|
||||||
|
return len(files)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@DB.connection_context()
|
||||||
|
def get_folder_size(cls, folder_id):
|
||||||
|
size = 0
|
||||||
|
|
||||||
|
def dfs(parent_id):
|
||||||
|
nonlocal size
|
||||||
|
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(
|
||||||
|
cls.model.parent_id == parent_id, cls.model.id != parent_id):
|
||||||
|
size += f.size
|
||||||
|
if f.type == FileType.FOLDER.value:
|
||||||
|
dfs(f.id)
|
||||||
|
|
||||||
|
dfs(folder_id)
|
||||||
|
return size
|
||||||
|
|
||||||
@@ -128,7 +128,9 @@ class TenantLLMService(CommonService):
         else:
             assert False, "LLM type error"

-        num = cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)\
+        num = 0
+        for u in cls.query(tenant_id = tenant_id, llm_name=mdlnm):
+            num += cls.model.update(used_tokens = u.used_tokens + used_tokens)\
             .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == mdlnm)\
             .execute()
         return num
@ -15,13 +15,19 @@
|
|||||||
#
|
#
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from peewee import Expression
|
from api.db.db_utils import bulk_insert_into_db
|
||||||
from api.db.db_models import DB
|
from deepdoc.parser import PdfParser
|
||||||
|
from peewee import JOIN
|
||||||
|
from api.db.db_models import DB, File2Document, File
|
||||||
from api.db import StatusEnum, FileType, TaskStatus
|
from api.db import StatusEnum, FileType, TaskStatus
|
||||||
from api.db.db_models import Task, Document, Knowledgebase, Tenant
|
from api.db.db_models import Task, Document, Knowledgebase, Tenant
|
||||||
from api.db.services.common_service import CommonService
|
from api.db.services.common_service import CommonService
|
||||||
from api.db.services.document_service import DocumentService
|
from api.db.services.document_service import DocumentService
|
||||||
from api.utils import current_timestamp
|
from api.utils import current_timestamp, get_uuid
|
||||||
|
from deepdoc.parser.excel_parser import RAGFlowExcelParser
|
||||||
|
from rag.settings import SVR_QUEUE_NAME
|
||||||
|
from rag.utils.minio_conn import MINIO
|
||||||
|
from rag.utils.redis_conn import REDIS_CONN
|
||||||
|
|
||||||
|
|
||||||
class TaskService(CommonService):
|
class TaskService(CommonService):
|
||||||
@ -29,7 +35,7 @@ class TaskService(CommonService):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
|
def get_tasks(cls, task_id):
|
||||||
fields = [
|
fields = [
|
||||||
cls.model.id,
|
cls.model.id,
|
||||||
cls.model.doc_id,
|
cls.model.doc_id,
|
||||||
@ -48,26 +54,16 @@ class TaskService(CommonService):
|
|||||||
Tenant.img2txt_id,
|
Tenant.img2txt_id,
|
||||||
Tenant.asr_id,
|
Tenant.asr_id,
|
||||||
cls.model.update_time]
|
cls.model.update_time]
|
||||||
with DB.lock("get_task", -1):
|
|
||||||
docs = cls.model.select(*fields) \
|
docs = cls.model.select(*fields) \
|
||||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||||
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
||||||
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
|
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
|
||||||
.where(
|
.where(cls.model.id == task_id)
|
||||||
Document.status == StatusEnum.VALID.value,
|
|
||||||
Document.run == TaskStatus.RUNNING.value,
|
|
||||||
~(Document.type == FileType.VIRTUAL.value),
|
|
||||||
cls.model.progress == 0,
|
|
||||||
#cls.model.update_time >= tm,
|
|
||||||
#(Expression(cls.model.create_time, "%%", comm) == mod)
|
|
||||||
)\
|
|
||||||
.order_by(cls.model.update_time.asc())\
|
|
||||||
.paginate(0, items_per_page)
|
|
||||||
docs = list(docs.dicts())
|
docs = list(docs.dicts())
|
||||||
if not docs: return []
|
if not docs: return []
|
||||||
if not takeit: return docs
|
|
||||||
|
|
||||||
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
|
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
|
||||||
|
progress=random.random() / 10.).where(
|
||||||
cls.model.id == docs[0]["id"]).execute()
|
cls.model.id == docs[0]["id"]).execute()
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
@ -75,20 +71,21 @@ class TaskService(CommonService):
|
|||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
def get_ongoing_doc_name(cls):
|
def get_ongoing_doc_name(cls):
|
||||||
with DB.lock("get_task", -1):
|
with DB.lock("get_task", -1):
|
||||||
docs = cls.model.select(*[Document.kb_id, Document.location]) \
|
docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
|
||||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||||
|
.join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
|
||||||
|
.join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
|
||||||
.where(
|
.where(
|
||||||
Document.status == StatusEnum.VALID.value,
|
Document.status == StatusEnum.VALID.value,
|
||||||
Document.run == TaskStatus.RUNNING.value,
|
Document.run == TaskStatus.RUNNING.value,
|
||||||
~(Document.type == FileType.VIRTUAL.value),
|
~(Document.type == FileType.VIRTUAL.value),
|
||||||
cls.model.progress >= 0,
|
|
||||||
cls.model.progress < 1,
|
cls.model.progress < 1,
|
||||||
cls.model.create_time >= current_timestamp() - 180000
|
cls.model.create_time >= current_timestamp() - 1000 * 600
|
||||||
)
|
)
|
||||||
docs = list(docs.dicts())
|
docs = list(docs.dicts())
|
||||||
if not docs: return []
|
if not docs: return []
|
||||||
|
|
||||||
return list(set([(d["kb_id"], d["location"]) for d in docs]))
|
return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs]))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
@ -111,3 +108,55 @@ class TaskService(CommonService):
|
|||||||
if "progress" in info:
|
if "progress" in info:
|
||||||
cls.model.update(progress=info["progress"]).where(
|
cls.model.update(progress=info["progress"]).where(
|
||||||
cls.model.id == id).execute()
|
cls.model.id == id).execute()
|
||||||
|
|
||||||
|
|
||||||
|
def queue_tasks(doc, bucket, name):
|
||||||
|
def new_task():
|
||||||
|
nonlocal doc
|
||||||
|
return {
|
||||||
|
"id": get_uuid(),
|
||||||
|
"doc_id": doc["id"]
|
||||||
|
}
|
||||||
|
tsks = []
|
||||||
|
|
||||||
|
if doc["type"] == FileType.PDF.value:
|
||||||
|
file_bin = MINIO.get(bucket, name)
|
||||||
|
do_layout = doc["parser_config"].get("layout_recognize", True)
|
||||||
|
pages = PdfParser.total_page_number(doc["name"], file_bin)
|
||||||
|
page_size = doc["parser_config"].get("task_page_size", 12)
|
||||||
|
if doc["parser_id"] == "paper":
|
||||||
|
page_size = doc["parser_config"].get("task_page_size", 22)
|
||||||
|
if doc["parser_id"] == "one":
|
||||||
|
page_size = 1000000000
|
||||||
|
if not do_layout:
|
||||||
|
page_size = 1000000000
|
||||||
|
page_ranges = doc["parser_config"].get("pages")
|
||||||
|
if not page_ranges:
|
||||||
|
page_ranges = [(1, 100000)]
|
||||||
|
for s, e in page_ranges:
|
||||||
|
s -= 1
|
||||||
|
s = max(0, s)
|
||||||
|
e = min(e - 1, pages)
|
||||||
|
for p in range(s, e, page_size):
|
||||||
|
task = new_task()
|
||||||
|
task["from_page"] = p
|
||||||
|
task["to_page"] = min(p + page_size, e)
|
||||||
|
tsks.append(task)
|
||||||
|
|
||||||
|
elif doc["parser_id"] == "table":
|
||||||
|
file_bin = MINIO.get(bucket, name)
|
||||||
|
rn = RAGFlowExcelParser.row_number(
|
||||||
|
doc["name"], file_bin)
|
||||||
|
for i in range(0, rn, 3000):
|
||||||
|
task = new_task()
|
||||||
|
task["from_page"] = i
|
||||||
|
task["to_page"] = min(i + 3000, rn)
|
||||||
|
tsks.append(task)
|
||||||
|
else:
|
||||||
|
tsks.append(new_task())
|
||||||
|
|
||||||
|
bulk_insert_into_db(Task, tsks, True)
|
||||||
|
DocumentService.begin2parse(doc["id"])
|
||||||
|
|
||||||
|
for t in tsks:
|
||||||
|
REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
|
||||||
@@ -18,10 +18,14 @@ import logging
 import os
 import signal
 import sys
+import time
 import traceback
+from concurrent.futures import ThreadPoolExecutor

 from werkzeug.serving import run_simple
 from api.apps import app
 from api.db.runtime_config import RuntimeConfig
+from api.db.services.document_service import DocumentService
 from api.settings import (
     HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
 )
@@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
 from api.db.init_data import init_web_data
 from api.versions import get_versions


+def update_progress():
+    while True:
+        time.sleep(1)
+        try:
+            DocumentService.update_progress()
+        except Exception as e:
+            stat_logger.error("update_progress exception:" + str(e))
+
+
 if __name__ == '__main__':
     print("""
     ____                 ______ __
@@ -71,6 +85,9 @@ if __name__ == '__main__':
     peewee_logger.addHandler(database_logger.handlers[0])
     peewee_logger.setLevel(database_logger.level)

+    thr = ThreadPoolExecutor(max_workers=1)
+    thr.submit(update_progress)
+
     # start http server
     try:
         stat_logger.info("RAG Flow http server start...")
@@ -32,7 +32,7 @@ access_logger = getLogger("access")
 database_logger = getLogger("database")
 chat_logger = getLogger("chat")

-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
 from rag.nlp import search
 from api.utils import get_base_config, decrypt_database_config
@@ -19,7 +19,7 @@ import os
 import re
 from io import BytesIO

-import fitz
+import pdfplumber
 from PIL import Image
 from cachetools import LRUCache, cached
 from ruamel.yaml import YAML
@@ -66,6 +66,15 @@ def get_rag_python_directory(*args):
     return get_rag_directory("python", *args)


+def get_home_cache_dir():
+    dir = os.path.join(os.path.expanduser('~'), ".ragflow")
+    try:
+        os.mkdir(dir)
+    except OSError as error:
+        pass
+    return dir
+
+
 @cached(cache=LRUCache(maxsize=10))
 def load_json_conf(conf_path):
     if os.path.isabs(conf_path):
@@ -155,17 +164,17 @@ def filename_type(filename):
         return FileType.AURAL.value

     if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
-        return FileType.VISUAL
+        return FileType.VISUAL.value
+
+    return FileType.OTHER.value


 def thumbnail(filename, blob):
     filename = filename.lower()
     if re.match(r".*\.pdf$", filename):
-        pdf = fitz.open(stream=blob, filetype="pdf")
-        pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
+        pdf = pdfplumber.open(BytesIO(blob))
         buffered = BytesIO()
-        Image.frombytes("RGB", [pix.width, pix.height],
-                        pix.samples).save(buffered, format="png")
+        pdf.pages[0].to_image().annotated.save(buffered, format="png")
         return "data:image/png;base64," + \
             base64.b64encode(buffered.getvalue()).decode("utf-8")
@@ -13,12 +13,12 @@ minio:
   user: 'rag_flow'
   password: 'infini_rag_flow'
   host: 'minio:9000'
+es:
+  hosts: 'http://es01:9200'
 redis:
   db: 1
   password: 'infini_rag_flow'
   host: 'redis:6379'
-es:
-  hosts: 'http://es01:9200'
 user_default_llm:
   factory: 'Tongyi-Qianwen'
   api_key: 'sk-xxxxxxxxxxxxx'
@@ -1,6 +1,6 @@


-from .pdf_parser import HuParser as PdfParser, PlainParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
-from .docx_parser import HuDocxParser as DocxParser
+from .docx_parser import RAGFlowDocxParser as DocxParser
-from .excel_parser import HuExcelParser as ExcelParser
+from .excel_parser import RAGFlowExcelParser as ExcelParser
-from .ppt_parser import HuPptParser as PptParser
+from .ppt_parser import RAGFlowPptParser as PptParser
@@ -3,11 +3,11 @@ from docx import Document
 import re
 import pandas as pd
 from collections import Counter
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from io import BytesIO


-class HuDocxParser:
+class RAGFlowDocxParser:

     def __extract_table_content(self, tb):
         df = []
@@ -35,14 +35,14 @@ class HuDocxParser:
         for p, n in patt:
             if re.search(p, b):
                 return n
-        tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
+        tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
         if len(tks) > 3:
             if len(tks) < 12:
                 return "Tx"
             else:
                 return "Lx"

-        if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+        if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
             return "Nr"

         return "Ot"
@@ -6,7 +6,7 @@ from io import BytesIO
 from rag.nlp import find_codec


-class HuExcelParser:
+class RAGFlowExcelParser:
     def html(self, fnm):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
@@ -69,10 +69,10 @@ class HuExcelParser:

         if fnm.split(".")[-1].lower() in ["csv", "txt"]:
             encoding = find_codec(binary)
-            txt = binary.decode(encoding)
+            txt = binary.decode(encoding, errors="ignore")
             return len(txt.split("\n"))


 if __name__ == "__main__":
-    psr = HuExcelParser()
+    psr = RAGFlowExcelParser()
     psr(sys.argv[1])
@ -2,7 +2,6 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import fitz
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import torch
|
import torch
|
||||||
@ -16,14 +15,14 @@ from PyPDF2 import PdfReader as pdf2_read
|
|||||||
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
from api.utils.file_utils import get_project_base_directory
|
||||||
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
class HuParser:
|
class RAGFlowPdfParser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.ocr = OCR()
|
self.ocr = OCR()
|
||||||
if hasattr(self, "model_speciess"):
|
if hasattr(self, "model_speciess"):
|
||||||
@ -95,13 +94,13 @@ class HuParser:
|
|||||||
h = max(self.__height(up), self.__height(down))
|
h = max(self.__height(up), self.__height(down))
|
||||||
y_dis = self._y_dis(up, down)
|
y_dis = self._y_dis(up, down)
|
||||||
LEN = 6
|
LEN = 6
|
||||||
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
|
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
|
||||||
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
|
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
|
||||||
tks_all = up["text"][-LEN:].strip() \
|
tks_all = up["text"][-LEN:].strip() \
|
||||||
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
||||||
up["text"][-1] + down["text"][0]) else "") \
|
up["text"][-1] + down["text"][0]) else "") \
|
||||||
+ down["text"][:LEN].strip()
|
+ down["text"][:LEN].strip()
|
||||||
tks_all = huqie.qie(tks_all).split(" ")
|
tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
|
||||||
fea = [
|
fea = [
|
||||||
up.get("R", -1) == down.get("R", -1),
|
up.get("R", -1) == down.get("R", -1),
|
||||||
y_dis / h,
|
y_dis / h,
|
||||||
@ -142,8 +141,8 @@ class HuParser:
|
|||||||
tks_down[-1] == tks_up[-1],
|
tks_down[-1] == tks_up[-1],
|
||||||
max(down["in_row"], up["in_row"]),
|
max(down["in_row"], up["in_row"]),
|
||||||
abs(down["in_row"] - up["in_row"]),
|
abs(down["in_row"] - up["in_row"]),
|
||||||
len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
|
len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
|
||||||
len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
|
len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
|
||||||
]
|
]
|
||||||
return fea
|
return fea
|
||||||
|
|
||||||
@ -470,7 +469,8 @@ class HuParser:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
|
if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
|
||||||
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
|
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
|
||||||
|
or not down["text"].strip():
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -598,7 +598,7 @@ class HuParser:
|
|||||||
|
|
||||||
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
||||||
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
||||||
or huqie.is_chinese(b["text"].strip()[0]) \
|
or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
|
||||||
or b["top"] > b_["bottom"]:
|
or b["top"] > b_["bottom"]:
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
@ -921,9 +921,7 @@ class HuParser:
|
|||||||
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
||||||
return len(pdf.pages)
|
return len(pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pdf = fitz.open(fnm) if not binary else fitz.open(
|
logging.error(str(e))
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
return len(pdf)
|
|
||||||
|
|
||||||
def __images__(self, fnm, zoomin=3, page_from=0,
|
def __images__(self, fnm, zoomin=3, page_from=0,
|
||||||
page_to=299, callback=None):
|
page_to=299, callback=None):
|
||||||
@ -945,23 +943,7 @@ class HuParser:
|
|||||||
self.pdf.pages[page_from:page_to]]
|
self.pdf.pages[page_from:page_to]]
|
||||||
self.total_page = len(self.pdf.pages)
|
self.total_page = len(self.pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.pdf = fitz.open(fnm) if isinstance(
|
logging.error(str(e))
|
||||||
fnm, str) else fitz.open(
|
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
self.page_images = []
|
|
||||||
self.page_chars = []
|
|
||||||
mat = fitz.Matrix(zoomin, zoomin)
|
|
||||||
self.total_page = len(self.pdf)
|
|
||||||
for i, page in enumerate(self.pdf):
|
|
||||||
if i < page_from:
|
|
||||||
continue
|
|
||||||
if i >= page_to:
|
|
||||||
break
|
|
||||||
pix = page.get_pixmap(matrix=mat)
|
|
||||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
|
||||||
pix.samples)
|
|
||||||
self.page_images.append(img)
|
|
||||||
self.page_chars.append([])
|
|
||||||
|
|
||||||
self.outlines = []
|
self.outlines = []
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from io import BytesIO
 from pptx import Presentation


-class HuPptParser(object):
+class RAGFlowPptParser(object):
     def __init__(self):
         super().__init__()
@@ -1,6 +1,6 @@
 import re,json,os
 import pandas as pd
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from . import regions
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
@@ -22,14 +22,14 @@ def baike(cid, default_v=0):
 def corpNorm(nm, add_region=True):
     global CORP_TKS
     if not nm or type(nm)!=type(""):return ""
-    nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
+    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
     nm = re.sub(r"&amp;", "&", nm)
     nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
     nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
     nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
     if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm

-    tks = huqie.qie(nm).split(" ")
+    tks = rag_tokenizer.tokenize(nm).split(" ")
     reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:
@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
|
|||||||
traceback, signal
|
traceback, signal
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||||
from rag.nlp import huqie, surname
|
from rag.nlp import rag_tokenizer, surname
|
||||||
from xpinyin import Pinyin
|
from xpinyin import Pinyin
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ def forEdu(cv):
|
|||||||
if n.get("school_name") and isinstance(n["school_name"], str):
|
if n.get("school_name") and isinstance(n["school_name"], str):
|
||||||
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
||||||
e["sch_nm_kwd"] = sch[-1]
|
e["sch_nm_kwd"] = sch[-1]
|
||||||
fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
|
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
|
||||||
|
|
||||||
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
 maj.append(n["discipline_name"])
@@ -166,10 +166,10 @@ def forEdu(cv):
 if "tag_kwd" not in cv: cv["tag_kwd"] = []
 if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")

-if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
+if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
-if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
+if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
-if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
+if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
-if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
+if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

 return cv

@@ -187,11 +187,11 @@ def forProj(cv):
 if n.get("achivement"): desc.append(str(n["achivement"]))

 if pro_nms:
-# cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
+# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
-cv["project_name_tks"] = huqie.qie(pro_nms[0])
+cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
 if desc:
-cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
+cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
-cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
+cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))

 return cv

@@ -280,25 +280,25 @@ def forWork(cv):
 if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]

 if fea["position_name"]:
-cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
+cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
-cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
+cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
-cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
+cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))

 if fea["industry_name"]:
-cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
+cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
-cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
+cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
-cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
+cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))

 if fea["corporation_name"]:
 cv["corporation_name_kwd"] = fea["corporation_name"][0]
 cv["corp_nm_kwd"] = fea["corporation_name"]
-cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
+cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
-cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
+cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
-cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
+cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))

 if fea["responsibilities"]:
-cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
+cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
-cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
+cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

 if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
 re.match(r"[^0-9]+$", str(i))]
@@ -444,15 +444,15 @@ def parse(cv):
 if nms:
 t = k[:-4]
 cv[f"{t}_kwd"] = nms
-cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
+cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
 except Exception as e:
 print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
 cv[k] = []

 # tokenize fields
 if k in tks_fld:
-cv[f"{k}_tks"] = huqie.qie(cv[k])
+cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
-if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
+if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

 # keyword fields
 if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
@@ -492,7 +492,7 @@ def parse(cv):
 cv["name_kwd"] = name
 cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
 cv["name_tks"] = (
-huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
+rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
 ) if name else ""
 else:
 cv["integerity_flt"] /= 2.
@@ -515,7 +515,7 @@ def parse(cv):
 cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
 # long text tokenize

-if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
+if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

 # for yes or no field
 fea = []
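The hunks above are a mechanical rename of the tokenizer module: `huqie.qie` becomes `rag_tokenizer.tokenize` and `huqie.qieqie` becomes `rag_tokenizer.fine_grained_tokenize`. A minimal sketch of how the renamed calls are combined in this file, assuming only that the new functions are drop-in replacements taking a string and returning space-joined tokens:

```python
# Sketch only; tokenize_field is a hypothetical helper, not part of the diff.
from rag.nlp import rag_tokenizer

def tokenize_field(values):
    """Coarse tokens plus a fine-grained pass, as the renamed API is used above."""
    tks = rag_tokenizer.tokenize(" ".join(values))
    return tks, rag_tokenizer.fine_grained_tokenize(tks)
```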
@@ -1,12 +1,13 @@
+import pdfplumber

 from .ocr import OCR
 from .recognizer import Recognizer
 from .layout_recognizer import LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer


 def init_in_out(args):
 from PIL import Image
-import fitz
 import os
 import traceback
 from api.utils.file_utils import traversal_files
@@ -18,13 +19,11 @@ def init_in_out(args):

 def pdf_pages(fnm, zoomin=3):
 nonlocal outputs, images
-pdf = fitz.open(fnm)
+pdf = pdfplumber.open(fnm)
-mat = fitz.Matrix(zoomin, zoomin)
+images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
-for i, page in enumerate(pdf):
+enumerate(pdf.pages)]
-pix = page.get_pixmap(matrix=mat)
-img = Image.frombytes("RGB", [pix.width, pix.height],
+for i, page in enumerate(images):
-pix.samples)
-images.append(img)
 outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")

 def images_and_outputs(fnm):
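The hunk above swaps PyMuPDF (`fitz`) for `pdfplumber` when rasterizing PDF pages. A standalone sketch of the same idea, assuming pdfplumber is installed; `to_image(resolution=...).annotated` yields a PIL image, as in the replacement code:

```python
import pdfplumber

def pdf_pages_to_images(fnm, zoomin=3):
    # Render each page at 72 * zoomin DPI, mirroring the new pdf_pages() body.
    with pdfplumber.open(fnm) as pdf:
        return [p.to_image(resolution=72 * zoomin).annotated for p in pdf.pages]
```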
@@ -11,10 +11,6 @@
 # limitations under the License.
 #

-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import OCR, init_in_out
-import argparse
-import numpy as np
 import os
 import sys
 sys.path.insert(
@@ -25,6 +21,11 @@ sys.path.insert(
 os.path.abspath(__file__)),
 '../../')))

+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import OCR, init_in_out
+import argparse
+import numpy as np


 def main(args):
 ocr = OCR()
@@ -10,17 +10,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os, sys
-from deepdoc.vision.seeit import draw_box
-from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
-from api.utils.file_utils import get_project_base_directory
-import argparse
-import os
-import sys
-import re

-import numpy as np

 sys.path.insert(
 0,
 os.path.abspath(
@@ -29,6 +19,13 @@ sys.path.insert(
 os.path.abspath(__file__)),
 '../../')))

+from deepdoc.vision.seeit import draw_box
+from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
+from api.utils.file_utils import get_project_base_directory
+import argparse
+import re
+import numpy as np


 def main(args):
 images, outputs = init_in_out(args)
@@ -19,7 +19,7 @@ import numpy as np
 from huggingface_hub import snapshot_download

 from api.utils.file_utils import get_project_base_directory
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from .recognizer import Recognizer


@@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
 for p, n in patt:
 if re.search(p, b["text"].strip()):
 return n
-tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
+tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
 if len(tks) > 3:
 if len(tks) < 12:
 return "Tx"
 else:
 return "Lx"

-if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
 return "Nr"

 return "Ot"
@@ -25,9 +25,11 @@ MINIO_PORT=9000
 MINIO_USER=rag_flow
 MINIO_PASSWORD=infini_rag_flow

+
+REDIS_PASSWORD=infini_rag_flow

 SVR_HTTP_PORT=9380

-RAGFLOW_VERSION=v0.3.2
+RAGFLOW_VERSION=dev

 TIMEZONE='Asia/Shanghai'

@@ -50,7 +50,7 @@ The serving port of mysql inside the container. The modification should be synch
 The max database connection.

 ### stale_timeout
-The timeout duation in seconds.
+The timeout duration in seconds.

 ## minio

@@ -29,24 +29,6 @@ services:
 - ragflow
 restart: always

-#kibana:
-#  depends_on:
-#    es01:
-#      condition: service_healthy
-#  image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
-#  container_name: ragflow-kibana
-#  volumes:
-#    - kibanadata:/usr/share/kibana/data
-#  ports:
-#    - ${KIBANA_PORT}:5601
-#  environment:
-#    - SERVERNAME=kibana
-#    - ELASTICSEARCH_HOSTS=http://es01:9200
-#    - TZ=${TIMEZONE}
-#  mem_limit: ${MEM_LIMIT}
-#  networks:
-#    - ragflow
-
 mysql:
 image: mysql:5.7.18
 container_name: ragflow-mysql
@@ -74,7 +56,6 @@ services:
 retries: 3
 restart: always

-
 minio:
 image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
 container_name: ragflow-minio
@@ -92,16 +73,27 @@ services:
 - ragflow
 restart: always

+redis:
+image: redis:7.2.4
+container_name: ragflow-redis
+command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
+volumes:
+- redis_data:/data
+networks:
+- ragflow
+restart: always



 volumes:
 esdata01:
 driver: local
-# kibanadata:
-#   driver: local
 mysql_data:
 driver: local
 minio_data:
 driver: local
+redis_data:
+driver: local

 networks:
 ragflow:
@@ -12,28 +12,14 @@ function task_exe(){
 done
 }

-function watch_broker(){
-while [ 1 -eq 1 ];do
-C=`ps aux|grep "task_broker.py"|grep -v grep|wc -l`;
-if [ $C -lt 1 ];then
-$PY rag/svr/task_broker.py &
-fi
-sleep 5;
-done
-}

-function task_bro(){
-watch_broker;
-}

-task_bro &

 WS=1
 for ((i=0;i<WS;i++))
 do
 task_exe $i $WS &
 done

-$PY api/ragflow_server.py
+while [ 1 -eq 1 ];do
+$PY api/ragflow_server.py
+done

 wait;
@ -13,12 +13,12 @@ minio:
|
|||||||
user: 'rag_flow'
|
user: 'rag_flow'
|
||||||
password: 'infini_rag_flow'
|
password: 'infini_rag_flow'
|
||||||
host: 'minio:9000'
|
host: 'minio:9000'
|
||||||
|
es:
|
||||||
|
hosts: 'http://es01:9200'
|
||||||
redis:
|
redis:
|
||||||
db: 1
|
db: 1
|
||||||
password: 'infini_rag_flow'
|
password: 'infini_rag_flow'
|
||||||
host: 'redis:6379'
|
host: 'redis:6379'
|
||||||
es:
|
|
||||||
hosts: 'http://es01:9200'
|
|
||||||
user_default_llm:
|
user_default_llm:
|
||||||
factory: 'Tongyi-Qianwen'
|
factory: 'Tongyi-Qianwen'
|
||||||
api_key: 'sk-xxxxxxxxxxxxx'
|
api_key: 'sk-xxxxxxxxxxxxx'
|
||||||
@@ -221,6 +221,7 @@ This will be called to get the answer to users' questions.
 |------|-------|----|----|
 | conversation_id| string | No | This is from calling /new_conversation.|
 | messages| json | No | All the conversation history stored here including the latest user's question.|
+| quote | bool | Yes | Default: true |

 ### Response
 ```json
 86  docs/faq.md
@@ -55,7 +55,7 @@ This feature and the related APIs are still in development. Contributions are we
 ```
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow
-$ docker build -t infiniflow/ragflow:v0.3.2 .
+$ docker build -t infiniflow/ragflow:latest .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
@@ -193,18 +193,31 @@ docker logs -f ragflow-server
 2. Check if the **task_executor.py** process exists.
 3. Check if your RAGFlow server can access hf-mirror.com or huggingface.com.

+#### 4.5 Why does my pdf parsing stall near completion, while the log does not show any error?
+
-#### 4.5 `Index failure`
+If your RAGFlow is deployed *locally*, the parsing process is likely killed due to insufficient RAM. Try increasing your memory allocation by increasing the `MEM_LIMIT` value in **docker/.env**.

+> Ensure that you restart up your RAGFlow server for your changes to take effect!
+> ```bash
+> docker compose stop
+> ```
+> ```bash
+> docker compose up -d
+> ```
+
+
+
+#### 4.6 `Index failure`

 An index failure usually indicates an unavailable Elasticsearch service.

-#### 4.6 How to check the log of RAGFlow?
+#### 4.7 How to check the log of RAGFlow?

 ```bash
 tail -f path_to_ragflow/docker/ragflow-logs/rag/*.log
 ```

-#### 4.7 How to check the status of each component in RAGFlow?
+#### 4.8 How to check the status of each component in RAGFlow?

 ```bash
 $ docker ps
@@ -212,13 +225,13 @@ $ docker ps
 *The system displays the following if all your RAGFlow components are running properly:*

 ```
-5bc45806b680 infiniflow/ragflow:v0.3.2 "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
+5bc45806b680 infiniflow/ragflow:latest "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
 91220e3285dd docker.elastic.co/elasticsearch/elasticsearch:8.11.3 "/bin/tini -- /usr/l…" 11 hours ago Up 11 hours (healthy) 9300/tcp, 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp ragflow-es-01
 d8c86f06c56b mysql:5.7.18 "docker-entrypoint.s…" 7 days ago Up 16 seconds (healthy) 0.0.0.0:3306->3306/tcp, :::3306->3306/tcp ragflow-mysql
 cd29bcb254bc quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z "/usr/bin/docker-ent…" 2 weeks ago Up 11 hours 0.0.0.0:9001->9001/tcp, :::9001->9001/tcp, 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp ragflow-minio
 ```

-#### 4.8 `Exception: Can't connect to ES cluster`
+#### 4.9 `Exception: Can't connect to ES cluster`

 1. Check the status of your Elasticsearch component:

@@ -245,23 +258,26 @@ $ docker ps
 curl http://<IP_OF_ES>:<PORT_OF_ES>
 ```

+#### 4.10 Can't start ES container and get `Elasticsearch did not exit normally`
+
-#### 4.9 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
+This is because you forgot to update the `vm.max_map_count` value in **/etc/sysctl.conf** and your change to this value was reset after a system reboot.

+#### 4.11 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
+
 Your IP address or port number may be incorrect. If you are using the default configurations, enter http://<IP_OF_YOUR_MACHINE> (**NOT 9380, AND NO PORT NUMBER REQUIRED!**) in your browser. This should work.

-#### 4.10 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`
+#### 4.12 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`

 A correct Ollama IP address and port is crucial to adding models to Ollama:

 - If you are on demo.ragflow.io, ensure that the server hosting Ollama has a publicly accessible IP address.Note that 127.0.0.1 is not a publicly accessible IP address.
 - If you deploy RAGFlow locally, ensure that Ollama and RAGFlow are in the same LAN and can comunicate with each other.

-#### 4.11 Do you offer examples of using deepdoc to parse PDF or other files?
+#### 4.13 Do you offer examples of using deepdoc to parse PDF or other files?

 Yes, we do. See the Python files under the **rag/app** folder.

-#### 4.12 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?
+#### 4.14 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?

 You probably forgot to update the **MAX_CONTENT_LENGTH** environment variable:

@@ -280,7 +296,7 @@ docker compose up ragflow -d
 ```
 *Now you should be able to upload files of sizes less than 100MB.*

-#### 4.13 `Table 'rag_flow.document' doesn't exist`
+#### 4.15 `Table 'rag_flow.document' doesn't exist`

 This exception occurs when starting up the RAGFlow server. Try the following:

@@ -303,7 +319,7 @@ This exception occurs when starting up the RAGFlow server. Try the following:
 docker compose up
 ```

-#### 4.14 `hint : 102 Fail to access model Connection error`
+#### 4.16 `hint : 102 Fail to access model Connection error`

 

@@ -311,6 +327,13 @@ This exception occurs when starting up the RAGFlow server. Try the following:
 2. Do not forget to append **/v1/** to **http://IP:port**:
 **http://IP:port/v1/**

+#### 4.17 `FileNotFoundError: [Errno 2] No such file or directory`
+
+1. Check if the status of your minio container is healthy:
+```bash
+docker ps
+```
+2. Ensure that the username and password settings of MySQL and MinIO in **docker/.env** are in line with those in **docker/service_conf.yml**.

 ## Usage

@@ -340,10 +363,43 @@ You can use Ollama to deploy local LLM. See [here](https://github.com/infiniflow

 ### 6. How to configure RAGFlow to respond with 100% matched results, rather than utilizing LLM?

-1. Click the **Knowledge Base** tab in the middle top of the page.
+1. Click **Knowledge Base** in the middle top of the page.
 2. Right click the desired knowledge base to display the **Configuration** dialogue.
 3. Choose **Q&A** as the chunk method and click **Save** to confirm your change.

-### Do I need to connect to Redis?
+### 7 Do I need to connect to Redis?

-No, connecting to Redis is not required to use RAGFlow.
+No, connecting to Redis is not required.

+### 8 `Error: Range of input length should be [1, 30000]`
+
+This error occurs because there are too many chunks matching your search criteria. Try reducing the **TopN** and increasing **Similarity threshold** to fix this issue:
+
+1. Click **Chat** in the middle top of the page.
+2. Right click the desired conversation > **Edit** > **Prompt Engine**
+3. Reduce the **TopN** and/or raise **Silimarity threshold**.
+4. Click **OK** to confirm your changes.
+
+
+
+### 9 How to update RAGFlow to the latest version?
+
+1. Pull the latest source code
+```bash
+cd ragflow
+git pull
+```
+2. If you used `docker compose up -d` to start up RAGFlow server:
+```bash
+docker pull infiniflow/ragflow:dev
+```
+```bash
+docker compose up ragflow -d
+```
+3. If you used `docker compose -f docker-compose-CN.yml up -d` to start up RAGFlow server:
+```bash
+docker pull swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:dev
+```
+```bash
+docker compose -f docker-compose-CN.yml up -d
+```
@@ -18,7 +18,7 @@ from io import BytesIO
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
 hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
 tokenize_chunks, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser


@@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 """
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 pdf_parser = None
 sections, tbls = [], []
 if re.search(r"\.docx$", filename, re.IGNORECASE):
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
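The decode change above (repeated in several parsers below) tolerates files whose detected codec does not cover every byte. A small sketch of the pattern, assuming `find_codec` behaves as in `rag/nlp`:

```python
from rag.nlp import find_codec

def decode_binary(binary: bytes) -> str:
    # find_codec() guesses a codec; errors="ignore" keeps parsing alive when a few
    # bytes fall outside that codec instead of raising UnicodeDecodeError.
    encoding = find_codec(binary)
    return binary.decode(encoding, errors="ignore")
```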
@@ -19,7 +19,7 @@ from docx import Document
 from api.db import ParserType
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
 make_colon_as_title, add_positions, tokenize_chunks, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger

@@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 """
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 pdf_parser = None
 sections = []
 if re.search(r"\.docx$", filename, re.IGNORECASE):
@@ -113,7 +113,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
@@ -2,7 +2,7 @@ import copy
 import re

 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string

@@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 doc = {
 "docnm_kwd": filename
 }
-doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 # is it English
 eng = lang.lower() == "english" # pdf_parser.is_english
@@ -16,7 +16,7 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger

@@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 res = []
 pdf_parser = None
 sections = []
@@ -141,7 +141,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
@@ -14,7 +14,7 @@ from tika import parser
 from io import BytesIO
 import re
 from rag.app import laws
-from rag.nlp import huqie, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser


@@ -85,7 +85,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
@@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 tokenize(doc, "\n".join(sections), eng)
 return [doc]

@@ -15,7 +15,7 @@ import re
 from collections import Counter

 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 else:
 raise NotImplementedError("file type not supported yet(pdf supported)")

-doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
+doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
-"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
+"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
-doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
 # is it English
 eng = lang.lower() == "english" # pdf_parser.is_english
 print("It's English.....", eng)
@@ -17,7 +17,7 @@ from io import BytesIO
 from PIL import Image

 from rag.nlp import tokenize, is_english
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read

@@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 eng = lang.lower() == "english"
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 res = []
 if re.search(r"\.pptx?$", filename, re.IGNORECASE):
 ppt_parser = Ppt()
@@ -16,7 +16,7 @@ from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import ExcelParser


@@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
 aprefix = "Answer: " if eng else "回答:"
 d["content_with_weight"] = "\t".join(
 [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
-d["content_ltks"] = huqie.qie(q)
+d["content_ltks"] = rag_tokenizer.tokenize(q)
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 return d


@@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 res = []
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
 if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
 callback(0.1, "Start to parse.")
@@ -107,7 +107,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
@@ -116,18 +116,31 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
 break
 txt += l
 lines = txt.split("\n")
-#is_english([rmPrefix(l) for l in lines[:100]])
+comma, tab = 0, 0
+for l in lines:
+if len(l.split(",")) == 2: comma += 1
+if len(l.split("\t")) == 2: tab += 1
+delimiter = "\t" if tab >= comma else ","

 fails = []
-for i, line in enumerate(lines):
+question, answer = "", ""
-arr = [l for l in line.split("\t") if len(l) > 1]
+i = 0
+while i < len(lines):
+arr = lines[i].split(delimiter)
 if len(arr) != 2:
-fails.append(str(i))
+if question: answer += "\n" + lines[i]
-continue
+else:
-res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
+fails.append(str(i+1))
+elif len(arr) == 2:
+if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
+question, answer = arr
+i += 1
 if len(res) % 999 == 0:
 callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
 f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

+if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))

 callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
 f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

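The rewritten Q&A text loader above first votes between tab and comma as the column delimiter, then lets an answer span several lines by appending continuation lines to the current pair. A condensed sketch of that control flow, with a hypothetical `parse_qa` helper name and `beAdoc` passed in rather than imported:

```python
from copy import deepcopy

def parse_qa(lines, doc, eng, beAdoc):
    # Pick the delimiter that most often splits a line into exactly two columns.
    comma = sum(1 for l in lines if len(l.split(",")) == 2)
    tab = sum(1 for l in lines if len(l.split("\t")) == 2)
    delimiter = "\t" if tab >= comma else ","

    res, fails, question, answer = [], [], "", ""
    for i, line in enumerate(lines):
        arr = line.split(delimiter)
        if len(arr) == 2:
            if question and answer:          # flush the previous Q/A pair
                res.append(beAdoc(deepcopy(doc), question, answer, eng))
            question, answer = arr
        elif question:                        # continuation of a multi-line answer
            answer += "\n" + line
        else:
            fails.append(str(i + 1))
    if question:
        res.append(beAdoc(deepcopy(doc), question, answer, eng))
    return res, fails
```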
@@ -18,7 +18,7 @@ import re
 import pandas as pd
 import requests
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser.resume import refactor
 from deepdoc.parser.resume import step_one, step_two
 from rag.settings import cron_logger
@@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
 titles.append(str(v))
 doc = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie("-".join(titles) + "-简历")
+"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
 }
-doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
 pairs = []
 for n, m in field_map.items():
 if not resume.get(n):
@@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):

 doc["content_with_weight"] = "\n".join(
 ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
-doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
+doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
-doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
+doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
 for n, _ in field_map.items():
 if n not in resume:
 continue
@@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
 len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
 resume[n] = resume[n][0]
 if n.find("_tks") > 0:
-resume[n] = huqie.qieqie(resume[n])
+resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
 doc[n] = resume[n]

 print(doc)
@@ -20,7 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse

 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie, is_english, tokenize, find_codec
+from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser


@@ -47,6 +47,7 @@ class Excel(ExcelParser):
 cell.value for i,
 cell in enumerate(
 rows[0]) if i not in missed]
+if not headers:continue
 data = []
 for i, r in enumerate(rows[1:]):
 rn += 1
@@ -148,7 +149,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
 txt = ""
 if binary:
 encoding = find_codec(binary)
-txt = binary.decode(encoding)
+txt = binary.decode(encoding, errors="ignore")
 else:
 with open(filename, "r") as f:
 while True:
@@ -216,7 +217,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
 for ii, row in df.iterrows():
 d = {
 "docnm_kwd": filename,
-"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
 }
 row_txt = []
 for j in range(len(clmns)):
@@ -227,7 +228,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
 if pd.isna(row[clmns[j]]):
 continue
 fld = clmns_map[j][0]
-d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
+d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
 row[clmns[j]])
 row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
 if not row_txt:
@@ -22,7 +22,7 @@ EmbeddingModel = {
 "Ollama": OllamaEmbed,
 "OpenAI": OpenAIEmbed,
 "Xinference": XinferenceEmbed,
-"Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
+"Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
 "ZHIPU-AI": ZhipuEmbed,
 "FastEmbed": FastEmbed,
 "Youdao": YoudaoEmbed
@@ -45,6 +45,7 @@ ChatModel = {
 "Tongyi-Qianwen": QWenChat,
 "Ollama": OllamaChat,
 "Xinference": XinferenceChat,
-"Moonshot": MoonshotChat
+"Moonshot": MoonshotChat,
+"DeepSeek": DeepSeekChat
 }

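The registry change above adds `DeepSeekChat` under the `"DeepSeek"` key of `ChatModel`. A minimal usage sketch of how such a factory dict is resolved; the key and model values below are placeholders, not taken from the diff:

```python
from rag.llm import ChatModel

factory = "DeepSeek"                              # e.g. stored per tenant
chat_cls = ChatModel[factory]                     # -> DeepSeekChat
mdl = chat_cls("sk-placeholder", "deepseek-chat") # (api key, model name)
answer, used_tokens = mdl.chat(
    "You are a helpful assistant.",
    [{"role": "user", "content": "Hello"}],
    {"temperature": 0.1},
)
```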
@@ -24,16 +24,7 @@ from rag.utils import num_tokens_from_string


 class Base(ABC):
-def __init__(self, key, model_name):
+def __init__(self, key, model_name, base_url):
-pass
-
-def chat(self, system, history, gen_conf):
-raise NotImplementedError("Please implement encode method!")
-
-
-class GptTurbo(Base):
-def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
-if not base_url: base_url="https://api.openai.com/v1"
 self.client = OpenAI(api_key=key, base_url=base_url)
 self.model_name = model_name

@@ -54,28 +45,28 @@ class GptTurbo(Base):
 return "**ERROR**: " + str(e), 0


-class MoonshotChat(GptTurbo):
+class GptTurbo(Base):
+def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
+if not base_url: base_url="https://api.openai.com/v1"
+super().__init__(key, model_name, base_url)
+
+
+class MoonshotChat(Base):
 def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
 if not base_url: base_url="https://api.moonshot.cn/v1"
-self.client = OpenAI(
+super().__init__(key, model_name, base_url)
-api_key=key, base_url=base_url)
-self.model_name = model_name
-
-def chat(self, system, history, gen_conf):
-if system:
+class XinferenceChat(Base):
-history.insert(0, {"role": "system", "content": system})
+def __init__(self, key=None, model_name="", base_url=""):
-try:
+key = "xxx"
-response = self.client.chat.completions.create(
+super().__init__(key, model_name, base_url)
-model=self.model_name,
-messages=history,
-**gen_conf)
+class DeepSeekChat(Base):
-ans = response.choices[0].message.content.strip()
+def __init__(self, key, model_name="deepseek-chat", base_url="https://api.deepseek.com/v1"):
-if response.choices[0].finish_reason == "length":
+if not base_url: base_url="https://api.deepseek.com/v1"
-ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
+super().__init__(key, model_name, base_url)
-[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
-return ans, response.usage.total_tokens
-except openai.APIError as e:
-return "**ERROR**: " + str(e), 0


 class QWenChat(Base):
@@ -141,12 +132,12 @@ class OllamaChat(Base):
 if system:
 history.insert(0, {"role": "system", "content": system})
 try:
-options = {"temperature": gen_conf.get("temperature", 0.1),
+options = {}
-"num_predict": gen_conf.get("max_tokens", 128),
+if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
-"top_k": gen_conf.get("top_p", 0.3),
+if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
-"presence_penalty": gen_conf.get("presence_penalty", 0.4),
+if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
-"frequency_penalty": gen_conf.get("frequency_penalty", 0.7),
+if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
-}
+if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
 response = self.client.chat(
 model=self.model_name,
 messages=history,
@@ -157,25 +148,3 @@ class OllamaChat(Base):
 except Exception as e:
 return "**ERROR**: " + str(e), 0


-class XinferenceChat(Base):
-def __init__(self, key=None, model_name="", base_url=""):
-self.client = OpenAI(api_key="xxx", base_url=base_url)
-self.model_name = model_name
-
-def chat(self, system, history, gen_conf):
-if system:
-history.insert(0, {"role": "system", "content": system})
-try:
-response = self.client.chat.completions.create(
-model=self.model_name,
-messages=history,
-**gen_conf)
-ans = response.choices[0].message.content.strip()
-if response.choices[0].finish_reason == "length":
-ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
-[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
-return ans, response.usage.total_tokens
-except openai.APIError as e:
-return "**ERROR**: " + str(e), 0
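The refactor above moves the OpenAI-compatible client construction into `Base`, so `GptTurbo`, `MoonshotChat`, `XinferenceChat`, and the new `DeepSeekChat` now differ only in their default `base_url`; it also stops sending default sampling options to Ollama unless the caller actually set them. A compressed sketch of the resulting hierarchy, assuming the shared `chat()` body shown earlier in the hunk stays on `Base`:

```python
from abc import ABC
from openai import OpenAI

class Base(ABC):
    def __init__(self, key, model_name, base_url):
        # One OpenAI-compatible client for every provider; chat() is inherited.
        self.client = OpenAI(api_key=key, base_url=base_url)
        self.model_name = model_name

class DeepSeekChat(Base):
    def __init__(self, key, model_name="deepseek-chat",
                 base_url="https://api.deepseek.com/v1"):
        super().__init__(key, model_name,
                         base_url or "https://api.deepseek.com/v1")
```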
@@ -26,19 +26,17 @@ from FlagEmbedding import FlagModel
 import torch
 import numpy as np

-from api.utils.file_utils import get_project_base_directory
+from api.utils.file_utils import get_project_base_directory, get_home_cache_dir
 from rag.utils import num_tokens_from_string


 try:
-flag_model = FlagModel(os.path.join(
+flag_model = FlagModel(os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
-get_project_base_directory(),
-"rag/res/bge-large-zh-v1.5"),
 query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
 use_fp16=torch.cuda.is_available())
 except Exception as e:
 model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
-local_dir=os.path.join(get_project_base_directory(), "rag/res/bge-large-zh-v1.5"),
+local_dir=os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
 local_dir_use_symlinks=False)
 flag_model = FlagModel(model_dir,
 query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
@@ -56,7 +54,7 @@ class Base(ABC):
 raise NotImplementedError("Please implement encode method!")


-class HuEmbedding(Base):
+class DefaultEmbedding(Base):
 def __init__(self, *args, **kwargs):
 """
 If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ -97,8 +95,7 @@ class OpenAIEmbed(Base):
 def encode(self, texts: list, batch_size=32):
 res = self.client.embeddings.create(input=texts,
 model=self.model_name)
-return np.array([d.embedding for d in res.data]
+return np.array([d.embedding for d in res.data]), res.usage.total_tokens
-), res.usage.total_tokens

 def encode_queries(self, text):
 res = self.client.embeddings.create(input=[text],
@@ -238,8 +235,8 @@ class YoudaoEmbed(Base):
 try:
 print("LOADING BCE...")
 YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
-get_project_base_directory(),
+get_home_cache_dir(),
-"rag/res/bce-embedding-base_v1"))
+"bce-embedding-base_v1"))
 except Exception as e:
 YoudaoEmbed._client = qanthing(
 model_name_or_path=model_name.replace(
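The embedding hunks above relocate model downloads from `rag/res` inside the repository to a per-user cache directory returned by `get_home_cache_dir()`. A hedged sketch of what such a helper commonly looks like; the exact path below is an assumption for illustration, not taken from this diff:

```python
import os

def get_home_cache_dir():
    # Assumed layout: a writable ~/.ragflow cache for downloaded models.
    d = os.path.join(os.path.expanduser("~"), ".ragflow")
    os.makedirs(d, exist_ok=True)
    return d
```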
@@ -2,7 +2,7 @@ import random
 from collections import Counter

 from rag.utils import num_tokens_from_string
-from . import huqie
+from . import rag_tokenizer
 import re
 import copy

@@ -28,11 +28,17 @@ all_codecs = [
 def find_codec(blob):
 global all_codecs
 for c in all_codecs:
+try:
+blob[:1024].decode(c)
+return c
+except Exception as e:
+pass
 try:
 blob.decode(c)
 return c
 except Exception as e:
 pass

 return "utf-8"


@@ -109,8 +115,8 @@ def is_english(texts):
 def tokenize(d, t, eng):
 d["content_with_weight"] = t
 t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
-d["content_ltks"] = huqie.qie(t)
+d["content_ltks"] = rag_tokenizer.tokenize(t)
-d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


 def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
|||||||
@ -1,475 +0,0 @@
|
|||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
import re
|
|
||||||
import os
|
|
||||||
import copy
|
|
||||||
import base64
|
|
||||||
import magic
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import List
|
|
||||||
import numpy as np
|
|
||||||
from io import BytesIO
|
|
||||||
|
|
||||||
|
|
||||||
class HuChunker:
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Fields:
|
|
||||||
text_chunks: List = None
|
|
||||||
table_chunks: List = None
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.MAX_LVL = 12
|
|
||||||
self.proj_patt = [
|
|
||||||
(r"第[零一二三四五六七八九十百]+章", 1),
|
|
||||||
(r"第[零一二三四五六七八九十百]+[条节]", 2),
|
|
||||||
(r"[零一二三四五六七八九十百]+[、 ]", 3),
|
|
||||||
(r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
|
|
||||||
(r"[0-9]+(、|\.[ ]|\.[^0-9])", 5),
|
|
||||||
(r"[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 6),
|
|
||||||
(r"[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 7),
|
|
||||||
(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 8),
|
|
||||||
(r".{,48}[::??]@", 9),
|
|
||||||
(r"[0-9]+)", 10),
|
|
||||||
(r"[\((][0-9]+[)\)]", 11),
|
|
||||||
(r"[零一二三四五六七八九十百]+是", 12),
|
|
||||||
(r"[⚫•➢✓ ]", 12)
|
|
||||||
]
|
|
||||||
self.lines = []
|
|
||||||
|
|
||||||
def _garbage(self, txt):
|
|
||||||
patt = [
|
|
||||||
r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
|
|
||||||
r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
|
|
||||||
r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
|
|
||||||
r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
|
|
||||||
r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
|
|
||||||
r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
|
|
||||||
r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
|
|
||||||
r"^(时间|签字|签章)[::]",
|
|
||||||
r"(参考文献|目录索引|图表索引)",
|
|
||||||
r"[ ]*年[ ]+月[ ]+日",
|
|
||||||
r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
|
|
||||||
r"\.{10,}",
|
|
||||||
r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
|
|
||||||
]
|
|
||||||
return any([re.search(p, txt) for p in patt])
|
|
||||||
|
|
||||||
def _proj_match(self, line):
|
|
||||||
for p, j in self.proj_patt:
|
|
||||||
if re.match(p, line):
|
|
||||||
return j
|
|
||||||
return
|
|
||||||
|
|
||||||
def _does_proj_match(self):
|
|
||||||
mat = [None for _ in range(len(self.lines))]
|
|
||||||
for i in range(len(self.lines)):
|
|
||||||
mat[i] = self._proj_match(self.lines[i])
|
|
||||||
return mat
|
|
||||||
|
|
||||||
def naive_text_chunk(self, text, ti="", MAX_LEN=612):
|
|
||||||
if text:
|
|
||||||
self.lines = [l.strip().replace(u'\u3000', u' ')
|
|
||||||
.replace(u'\xa0', u'')
|
|
||||||
for l in text.split("\n\n")]
|
|
||||||
self.lines = [l for l in self.lines if not self._garbage(l)]
|
|
||||||
self.lines = [re.sub(r"([ ]+| )", " ", l)
|
|
||||||
for l in self.lines if l]
|
|
||||||
if not self.lines:
|
|
||||||
return []
|
|
||||||
arr = self.lines
|
|
||||||
|
|
||||||
res = [""]
|
|
||||||
i = 0
|
|
||||||
while i < len(arr):
|
|
||||||
a = arr[i]
|
|
||||||
if not a:
|
|
||||||
i += 1
|
|
||||||
continue
|
|
||||||
if len(a) > MAX_LEN:
|
|
||||||
a_ = a.split("\n")
|
|
||||||
if len(a_) >= 2:
|
|
||||||
arr.pop(i)
|
|
||||||
for j in range(2, len(a_) + 1):
|
|
||||||
if len("\n".join(a_[:j])) >= MAX_LEN:
|
|
||||||
arr.insert(i, "\n".join(a_[:j - 1]))
|
|
||||||
arr.insert(i + 1, "\n".join(a_[j - 1:]))
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
assert False, f"Can't split: {a}"
|
|
||||||
continue
|
|
||||||
|
|
||||||
if len(res[-1]) < MAX_LEN / 3:
|
|
||||||
res[-1] += "\n" + a
|
|
||||||
else:
|
|
||||||
res.append(a)
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
if ti:
|
|
||||||
for i in range(len(res)):
|
|
||||||
if res[i].find("——来自") >= 0:
|
|
||||||
continue
|
|
||||||
res[i] += f"\t——来自“{ti}”"
|
|
||||||
|
|
||||||
return res
|
|
||||||
|
|
||||||
def _merge(self):
|
|
||||||
# merge continuous same level text
|
|
||||||
lines = [self.lines[0]] if self.lines else []
|
|
||||||
for i in range(1, len(self.lines)):
|
|
||||||
if self.mat[i] == self.mat[i - 1] \
|
|
||||||
and len(lines[-1]) < 256 \
|
|
||||||
and len(self.lines[i]) < 256:
|
|
||||||
lines[-1] += "\n" + self.lines[i]
|
|
||||||
continue
|
|
||||||
lines.append(self.lines[i])
|
|
||||||
self.lines = lines
|
|
||||||
self.mat = self._does_proj_match()
|
|
||||||
return self.mat
|
|
||||||
|
|
||||||
def text_chunks(self, text):
|
|
||||||
if text:
|
|
||||||
self.lines = [l.strip().replace(u'\u3000', u' ')
|
|
||||||
.replace(u'\xa0', u'')
|
|
||||||
for l in re.split(r"[\r\n]", text)]
|
|
||||||
self.lines = [l for l in self.lines if not self._garbage(l)]
|
|
||||||
self.lines = [l for l in self.lines if l]
|
|
||||||
self.mat = self._does_proj_match()
|
|
||||||
mat = self._merge()
|
|
||||||
|
|
||||||
tree = []
|
|
||||||
for i in range(len(self.lines)):
|
|
||||||
tree.append({"proj": mat[i],
|
|
||||||
"children": [],
|
|
||||||
"read": False})
|
|
||||||
# find all children
|
|
||||||
for i in range(len(self.lines) - 1):
|
|
||||||
if tree[i]["proj"] is None:
|
|
||||||
continue
|
|
||||||
ed = i + 1
|
|
||||||
while ed < len(tree) and (tree[ed]["proj"] is None or
|
|
||||||
tree[ed]["proj"] > tree[i]["proj"]):
|
|
||||||
ed += 1
|
|
||||||
|
|
||||||
nxt = tree[i]["proj"] + 1
|
|
||||||
st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
|
|
||||||
while nxt not in st:
|
|
||||||
nxt += 1
|
|
||||||
if nxt > self.MAX_LVL:
|
|
||||||
break
|
|
||||||
if nxt <= self.MAX_LVL:
|
|
||||||
for j in range(i + 1, ed):
|
|
||||||
if tree[j]["proj"] is not None:
|
|
||||||
break
|
|
||||||
tree[i]["children"].append(j)
|
|
||||||
for j in range(i + 1, ed):
|
|
||||||
if tree[j]["proj"] != nxt:
|
|
||||||
continue
|
|
||||||
tree[i]["children"].append(j)
|
|
||||||
else:
|
|
||||||
for j in range(i + 1, ed):
|
|
||||||
tree[i]["children"].append(j)
|
|
||||||
|
|
||||||
# get DFS combinations, find all the paths to leaf
|
|
||||||
paths = []
|
|
||||||
|
|
||||||
def dfs(i, path):
|
|
||||||
nonlocal tree, paths
|
|
||||||
path.append(i)
|
|
||||||
tree[i]["read"] = True
|
|
||||||
if len(self.lines[i]) > 256:
|
|
||||||
paths.append(path)
|
|
||||||
return
|
|
||||||
if not tree[i]["children"]:
|
|
||||||
if len(path) > 1 or len(self.lines[i]) >= 32:
|
|
||||||
paths.append(path)
|
|
||||||
return
|
|
||||||
for j in tree[i]["children"]:
|
|
||||||
dfs(j, copy.deepcopy(path))
|
|
||||||
|
|
||||||
for i, t in enumerate(tree):
|
|
||||||
if t["read"]:
|
|
||||||
continue
|
|
||||||
dfs(i, [])
|
|
||||||
|
|
||||||
# concat txt on the path for all paths
|
|
||||||
res = []
|
|
||||||
lines = np.array(self.lines)
|
|
||||||
for p in paths:
|
|
||||||
if len(p) < 2:
|
|
||||||
tree[p[0]]["read"] = False
|
|
||||||
continue
|
|
||||||
txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
|
|
||||||
res.append(txt)
|
|
||||||
# concat continuous orphans
|
|
||||||
assert len(tree) == len(lines)
|
|
||||||
ii = 0
|
|
||||||
while ii < len(tree):
|
|
||||||
if tree[ii]["read"]:
|
|
||||||
ii += 1
|
|
||||||
continue
|
|
||||||
txt = lines[ii]
|
|
||||||
e = ii + 1
|
|
||||||
while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
|
|
||||||
txt += "\n" + lines[e]
|
|
||||||
e += 1
|
|
||||||
res.append(txt)
|
|
||||||
ii = e
|
|
||||||
|
|
||||||
# if the node has not been read, find its daddy
|
|
||||||
def find_daddy(st):
|
|
||||||
nonlocal lines, tree
|
|
||||||
proj = tree[st]["proj"]
|
|
||||||
if len(self.lines[st]) > 512:
|
|
||||||
return [st]
|
|
||||||
if proj is None:
|
|
||||||
proj = self.MAX_LVL + 1
|
|
||||||
for i in range(st - 1, -1, -1):
|
|
||||||
if tree[i]["proj"] and tree[i]["proj"] < proj:
|
|
||||||
a = [st] + find_daddy(i)
|
|
||||||
return a
|
|
||||||
return []
|
|
||||||
|
|
||||||
return res
|
|
||||||
|
|
||||||
|
|
||||||
class PdfChunker(HuChunker):
|
|
||||||
|
|
||||||
def __init__(self, pdf_parser):
|
|
||||||
self.pdf = pdf_parser
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def tableHtmls(self, pdfnm):
|
|
||||||
_, tbls = self.pdf(pdfnm, return_html=True)
|
|
||||||
res = []
|
|
||||||
for img, arr in tbls:
|
|
||||||
if arr[0].find("<table>") < 0:
|
|
||||||
continue
|
|
||||||
buffered = BytesIO()
|
|
||||||
if img:
|
|
||||||
img.save(buffered, format="JPEG")
|
|
||||||
img_str = base64.b64encode(
|
|
||||||
buffered.getvalue()).decode('utf-8') if img else ""
|
|
||||||
res.append({"table": arr[0], "image": img_str})
|
|
||||||
return res
|
|
||||||
|
|
||||||
def html(self, pdfnm):
|
|
||||||
txts, tbls = self.pdf(pdfnm, return_html=True)
|
|
||||||
res = []
|
|
||||||
txt_cks = self.text_chunks(txts)
|
|
||||||
for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
|
|
||||||
for c in txt_cks]:
|
|
||||||
buffered = BytesIO()
|
|
||||||
if img:
|
|
||||||
img.save(buffered, format="JPEG")
|
|
||||||
img_str = base64.b64encode(
|
|
||||||
buffered.getvalue()).decode('utf-8') if img else ""
|
|
||||||
res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
|
|
||||||
"image": img_str})
|
|
||||||
|
|
||||||
for img, arr in tbls:
|
|
||||||
if not arr:
|
|
||||||
continue
|
|
||||||
buffered = BytesIO()
|
|
||||||
if img:
|
|
||||||
img.save(buffered, format="JPEG")
|
|
||||||
img_str = base64.b64encode(
|
|
||||||
buffered.getvalue()).decode('utf-8') if img else ""
|
|
||||||
res.append({"table": arr[0], "image": img_str})
|
|
||||||
|
|
||||||
return res
|
|
||||||
|
|
||||||
def __call__(self, pdfnm, return_image=True, naive_chunk=False):
|
|
||||||
flds = self.Fields()
|
|
||||||
text, tbls = self.pdf(pdfnm)
|
|
||||||
fnm = pdfnm
|
|
||||||
txt_cks = self.text_chunks(text) if not naive_chunk else \
|
|
||||||
self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
|
|
||||||
flds.text_chunks = [(self.pdf.remove_tag(c),
|
|
||||||
self.pdf.crop(c) if return_image else None) for c in txt_cks]
|
|
||||||
|
|
||||||
flds.table_chunks = [(arr, img if return_image else None)
|
|
||||||
for img, arr in tbls]
|
|
||||||
return flds
|
|
||||||
|
|
||||||
|
|
||||||
class DocxChunker(HuChunker):
|
|
||||||
|
|
||||||
def __init__(self, doc_parser):
|
|
||||||
self.doc = doc_parser
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def _does_proj_match(self):
|
|
||||||
mat = []
|
|
||||||
for s in self.styles:
|
|
||||||
s = s.split(" ")[-1]
|
|
||||||
try:
|
|
||||||
mat.append(int(s))
|
|
||||||
except Exception as e:
|
|
||||||
mat.append(None)
|
|
||||||
return mat
|
|
||||||
|
|
||||||
def _merge(self):
|
|
||||||
i = 1
|
|
||||||
while i < len(self.lines):
|
|
||||||
if self.mat[i] == self.mat[i - 1] \
|
|
||||||
and len(self.lines[i - 1]) < 256 \
|
|
||||||
and len(self.lines[i]) < 256:
|
|
||||||
self.lines[i - 1] += "\n" + self.lines[i]
|
|
||||||
self.styles.pop(i)
|
|
||||||
self.lines.pop(i)
|
|
||||||
self.mat.pop(i)
|
|
||||||
continue
|
|
||||||
i += 1
|
|
||||||
self.mat = self._does_proj_match()
|
|
||||||
return self.mat
|
|
||||||
|
|
||||||
def __call__(self, fnm):
|
|
||||||
flds = self.Fields()
|
|
||||||
flds.title = os.path.splitext(
|
|
||||||
os.path.basename(fnm))[0] if isinstance(
|
|
||||||
fnm, type("")) else ""
|
|
||||||
secs, tbls = self.doc(fnm)
|
|
||||||
self.lines = [l for l, s in secs]
|
|
||||||
self.styles = [s for l, s in secs]
|
|
||||||
|
|
||||||
txt_cks = self.text_chunks("")
|
|
||||||
flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
|
|
||||||
flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
|
|
||||||
return flds
|
|
||||||
|
|
||||||
|
|
||||||
class ExcelChunker(HuChunker):
|
|
||||||
|
|
||||||
def __init__(self, excel_parser):
|
|
||||||
self.excel = excel_parser
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def __call__(self, fnm):
|
|
||||||
flds = self.Fields()
|
|
||||||
flds.text_chunks = [(t, None) for t in self.excel(fnm)]
|
|
||||||
flds.table_chunks = []
|
|
||||||
return flds
|
|
||||||
|
|
||||||
|
|
||||||
class PptChunker(HuChunker):
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
def __extract(self, shape):
|
|
||||||
if shape.shape_type == 19:
|
|
||||||
tb = shape.table
|
|
||||||
rows = []
|
|
||||||
for i in range(1, len(tb.rows)):
|
|
||||||
rows.append("; ".join([tb.cell(
|
|
||||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
|
||||||
return "\n".join(rows)
|
|
||||||
|
|
||||||
if shape.has_text_frame:
|
|
||||||
return shape.text_frame.text
|
|
||||||
|
|
||||||
if shape.shape_type == 6:
|
|
||||||
texts = []
|
|
||||||
for p in shape.shapes:
|
|
||||||
t = self.__extract(p)
|
|
||||||
if t:
|
|
||||||
texts.append(t)
|
|
||||||
return "\n".join(texts)
|
|
||||||
|
|
||||||
def __call__(self, fnm):
|
|
||||||
from pptx import Presentation
|
|
||||||
ppt = Presentation(fnm) if isinstance(
|
|
||||||
fnm, str) else Presentation(
|
|
||||||
BytesIO(fnm))
|
|
||||||
txts = []
|
|
||||||
for slide in ppt.slides:
|
|
||||||
texts = []
|
|
||||||
for shape in slide.shapes:
|
|
||||||
txt = self.__extract(shape)
|
|
||||||
if txt:
|
|
||||||
texts.append(txt)
|
|
||||||
txts.append("\n".join(texts))
|
|
||||||
|
|
||||||
import aspose.slides as slides
|
|
||||||
import aspose.pydrawing as drawing
|
|
||||||
imgs = []
|
|
||||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
|
||||||
for slide in presentation.slides:
|
|
||||||
buffered = BytesIO()
|
|
||||||
slide.get_thumbnail(
|
|
||||||
0.5, 0.5).save(
|
|
||||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
|
||||||
imgs.append(buffered.getvalue())
|
|
||||||
assert len(imgs) == len(
|
|
||||||
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
|
||||||
|
|
||||||
flds = self.Fields()
|
|
||||||
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
|
|
||||||
flds.table_chunks = []
|
|
||||||
|
|
||||||
return flds
|
|
||||||
|
|
||||||
|
|
||||||
class TextChunker(HuChunker):
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Fields:
|
|
||||||
text_chunks: List = None
|
|
||||||
table_chunks: List = None
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def is_binary_file(file_path):
|
|
||||||
mime = magic.Magic(mime=True)
|
|
||||||
if isinstance(file_path, str):
|
|
||||||
file_type = mime.from_file(file_path)
|
|
||||||
else:
|
|
||||||
file_type = mime.from_buffer(file_path)
|
|
||||||
if 'text' in file_type:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
def __call__(self, fnm):
|
|
||||||
flds = self.Fields()
|
|
||||||
if self.is_binary_file(fnm):
|
|
||||||
return flds
|
|
||||||
txt = ""
|
|
||||||
if isinstance(fnm, str):
|
|
||||||
with open(fnm, "r") as f:
|
|
||||||
txt = f.read()
|
|
||||||
else:
|
|
||||||
txt = fnm.decode("utf-8")
|
|
||||||
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
|
|
||||||
flds.table_chunks = []
|
|
||||||
return flds
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import sys
|
|
||||||
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
||||||
if sys.argv[1].split(".")[-1].lower() == "pdf":
|
|
||||||
from deepdoc.parser import PdfParser
|
|
||||||
ckr = PdfChunker(PdfParser())
|
|
||||||
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
|
||||||
from deepdoc.parser import DocxParser
|
|
||||||
ckr = DocxChunker(DocxParser())
|
|
||||||
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
|
||||||
from deepdoc.parser import ExcelParser
|
|
||||||
ckr = ExcelChunker(ExcelParser())
|
|
||||||
|
|
||||||
# ckr.html(sys.argv[1])
|
|
||||||
print(ckr(sys.argv[1]))
|
|
||||||
@ -7,14 +7,13 @@ import logging
|
|||||||
import copy
|
import copy
|
||||||
from elasticsearch_dsl import Q
|
from elasticsearch_dsl import Q
|
||||||
|
|
||||||
from rag.nlp import huqie, term_weight, synonym
|
from rag.nlp import rag_tokenizer, term_weight, synonym
|
||||||
|
|
||||||
|
|
||||||
class EsQueryer:
|
class EsQueryer:
|
||||||
def __init__(self, es):
|
def __init__(self, es):
|
||||||
self.tw = term_weight.Dealer()
|
self.tw = term_weight.Dealer()
|
||||||
self.es = es
|
self.es = es
|
||||||
self.syn = synonym.Dealer(None)
|
self.syn = synonym.Dealer()
|
||||||
self.flds = ["ask_tks^10", "ask_small_tks"]
|
self.flds = ["ask_tks^10", "ask_small_tks"]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -47,13 +46,13 @@ class EsQueryer:
|
|||||||
txt = re.sub(
|
txt = re.sub(
|
||||||
r"[ \r\n\t,,。??/`!!&]+",
|
r"[ \r\n\t,,。??/`!!&]+",
|
||||||
" ",
|
" ",
|
||||||
huqie.tradi2simp(
|
rag_tokenizer.tradi2simp(
|
||||||
huqie.strQ2B(
|
rag_tokenizer.strQ2B(
|
||||||
txt.lower()))).strip()
|
txt.lower()))).strip()
|
||||||
txt = EsQueryer.rmWWW(txt)
|
txt = EsQueryer.rmWWW(txt)
|
||||||
|
|
||||||
if not self.isChinese(txt):
|
if not self.isChinese(txt):
|
||||||
tks = huqie.qie(txt).split(" ")
|
tks = rag_tokenizer.tokenize(txt).split(" ")
|
||||||
q = copy.deepcopy(tks)
|
q = copy.deepcopy(tks)
|
||||||
for i in range(1, len(tks)):
|
for i in range(1, len(tks)):
|
||||||
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
|
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
|
||||||
@ -65,7 +64,7 @@ class EsQueryer:
|
|||||||
boost=1)#, minimum_should_match=min_match)
|
boost=1)#, minimum_should_match=min_match)
|
||||||
), tks
|
), tks
|
||||||
|
|
||||||
def needQieqie(tk):
|
def need_fine_grained_tokenize(tk):
|
||||||
if len(tk) < 4:
|
if len(tk) < 4:
|
||||||
return False
|
return False
|
||||||
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
|
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
|
||||||
@ -81,7 +80,7 @@ class EsQueryer:
|
|||||||
logging.info(json.dumps(twts, ensure_ascii=False))
|
logging.info(json.dumps(twts, ensure_ascii=False))
|
||||||
tms = []
|
tms = []
|
||||||
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
||||||
sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
|
sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
|
||||||
sm = [
|
sm = [
|
||||||
re.sub(
|
re.sub(
|
||||||
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
|
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
|
||||||
@ -110,10 +109,10 @@ class EsQueryer:
|
|||||||
if len(twts) > 1:
|
if len(twts) > 1:
|
||||||
tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
|
tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
|
||||||
if re.match(r"[0-9a-z ]+$", tt):
|
if re.match(r"[0-9a-z ]+$", tt):
|
||||||
tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
|
tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)
|
||||||
|
|
||||||
syns = " OR ".join(
|
syns = " OR ".join(
|
||||||
["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
|
["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
|
||||||
if syns:
|
if syns:
|
||||||
tms = f"({tms})^5 OR ({syns})^0.7"
|
tms = f"({tms})^5 OR ({syns})^0.7"
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,7 @@ from nltk.stem import PorterStemmer, WordNetLemmatizer
|
|||||||
from api.utils.file_utils import get_project_base_directory
|
from api.utils.file_utils import get_project_base_directory
|
||||||
|
|
||||||
|
|
||||||
class Huqie:
|
class RagTokenizer:
|
||||||
def key_(self, line):
|
def key_(self, line):
|
||||||
return str(line.lower().encode("utf-8"))[2:-1]
|
return str(line.lower().encode("utf-8"))[2:-1]
|
||||||
|
|
||||||
@ -241,7 +241,7 @@ class Huqie:
|
|||||||
|
|
||||||
return self.score_(res[::-1])
|
return self.score_(res[::-1])
|
||||||
|
|
||||||
def qie(self, line):
|
def tokenize(self, line):
|
||||||
line = self._strQ2B(line).lower()
|
line = self._strQ2B(line).lower()
|
||||||
line = self._tradi2simp(line)
|
line = self._tradi2simp(line)
|
||||||
zh_num = len([1 for c in line if is_chinese(c)])
|
zh_num = len([1 for c in line if is_chinese(c)])
|
||||||
@ -298,7 +298,7 @@ class Huqie:
|
|||||||
print("[TKS]", self.merge_(res))
|
print("[TKS]", self.merge_(res))
|
||||||
return self.merge_(res)
|
return self.merge_(res)
|
||||||
|
|
||||||
def qieqie(self, tks):
|
def fine_grained_tokenize(self, tks):
|
||||||
tks = tks.split(" ")
|
tks = tks.split(" ")
|
||||||
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
|
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
|
||||||
if zh_num < len(tks) * 0.2:
|
if zh_num < len(tks) * 0.2:
|
||||||
@@ -371,53 +371,53 @@ def naiveQie(txt):
    return tks


-hq = Huqie()
-qie = hq.qie
-qieqie = hq.qieqie
-tag = hq.tag
-freq = hq.freq
-loadUserDict = hq.loadUserDict
-addUserDict = hq.addUserDict
-tradi2simp = hq._tradi2simp
-strQ2B = hq._strQ2B
+tokenizer = RagTokenizer()
+tokenize = tokenizer.tokenize
+fine_grained_tokenize = tokenizer.fine_grained_tokenize
+tag = tokenizer.tag
+freq = tokenizer.freq
+loadUserDict = tokenizer.loadUserDict
+addUserDict = tokenizer.addUserDict
+tradi2simp = tokenizer._tradi2simp
+strQ2B = tokenizer._strQ2B

if __name__ == '__main__':
-    huqie = Huqie(debug=True)
+    tknzr = RagTokenizer(debug=True)
    # huqie.addUserDict("/tmp/tmp.new.tks.dict")
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("虽然我不怎么玩")
+    tks = tknzr.tokenize("虽然我不怎么玩")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
+    tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("这周日你去吗?这周日你有空吗?")
+    tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
+    tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
-    tks = huqie.qie(
+    tks = tknzr.tokenize(
        "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
-    print(huqie.qieqie(tks))
+    print(tknzr.fine_grained_tokenize(tks))
    if len(sys.argv) < 2:
        sys.exit()
-    huqie.DEBUG = False
-    huqie.loadUserDict(sys.argv[1])
+    tknzr.DEBUG = False
+    tknzr.loadUserDict(sys.argv[1])
    of = open(sys.argv[2], "r")
    while True:
        line = of.readline()
        if not line:
            break
-        print(huqie.qie(line))
+        print(tknzr.tokenize(line))
    of.close()
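Note: this changeset renames the tokenizer module (huqie -> rag_tokenizer) and its methods (qie -> tokenize, qieqie -> fine_grained_tokenize). A minimal sketch of the renamed module-level API, taken from the aliases above; the sample string is arbitrary:

# Hedged sketch of the renamed tokenizer API.
from rag.nlp import rag_tokenizer

coarse = rag_tokenizer.tokenize("多校划片就是一个小区对应多个小学初中")
fine = rag_tokenizer.fine_grained_tokenize(coarse)   # further splits long Chinese tokens
print(coarse)
print(fine)
# The same calls were previously spelled huqie.qie(...) and huqie.qieqie(...).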
@ -9,7 +9,7 @@ from dataclasses import dataclass
|
|||||||
|
|
||||||
from rag.settings import es_logger
|
from rag.settings import es_logger
|
||||||
from rag.utils import rmSpace
|
from rag.utils import rmSpace
|
||||||
from rag.nlp import huqie, query
|
from rag.nlp import rag_tokenizer, query
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
@ -128,7 +128,7 @@ class Dealer:
|
|||||||
kwds = set([])
|
kwds = set([])
|
||||||
for k in keywords:
|
for k in keywords:
|
||||||
kwds.add(k)
|
kwds.add(k)
|
||||||
for kk in huqie.qieqie(k).split(" "):
|
for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
|
||||||
if len(kk) < 2:
|
if len(kk) < 2:
|
||||||
continue
|
continue
|
||||||
if kk in kwds:
|
if kk in kwds:
|
||||||
@ -243,7 +243,7 @@ class Dealer:
|
|||||||
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
|
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
|
||||||
len(ans_v[0]), len(chunk_v[0]))
|
len(ans_v[0]), len(chunk_v[0]))
|
||||||
|
|
||||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
|
chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
|
||||||
for ck in chunks]
|
for ck in chunks]
|
||||||
cites = {}
|
cites = {}
|
||||||
thr = 0.63
|
thr = 0.63
|
||||||
@ -251,7 +251,7 @@ class Dealer:
|
|||||||
for i, a in enumerate(pieces_):
|
for i, a in enumerate(pieces_):
|
||||||
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
||||||
chunk_v,
|
chunk_v,
|
||||||
huqie.qie(
|
rag_tokenizer.tokenize(
|
||||||
self.qryr.rmWWW(pieces_[i])).split(" "),
|
self.qryr.rmWWW(pieces_[i])).split(" "),
|
||||||
chunks_tks,
|
chunks_tks,
|
||||||
tkweight, vtweight)
|
tkweight, vtweight)
|
||||||
@ -310,8 +310,8 @@ class Dealer:
|
|||||||
def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
|
def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
|
||||||
return self.qryr.hybrid_similarity(ans_embd,
|
return self.qryr.hybrid_similarity(ans_embd,
|
||||||
ins_embd,
|
ins_embd,
|
||||||
huqie.qie(ans).split(" "),
|
rag_tokenizer.tokenize(ans).split(" "),
|
||||||
huqie.qie(inst).split(" "))
|
rag_tokenizer.tokenize(inst).split(" "))
|
||||||
|
|
||||||
def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
|
def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
|
||||||
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
|
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
|
||||||
@ -385,7 +385,7 @@ class Dealer:
|
|||||||
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
|
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
|
||||||
fld, v = r.group(1), r.group(3)
|
fld, v = r.group(1), r.group(3)
|
||||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
|
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
|
||||||
fld, huqie.qieqie(huqie.qie(v)))
|
fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
|
||||||
replaces.append(
|
replaces.append(
|
||||||
("{}{}'{}'".format(
|
("{}{}'{}'".format(
|
||||||
r.group(1),
|
r.group(1),
|
||||||
|
|||||||
@ -17,7 +17,7 @@ class Dealer:
|
|||||||
try:
|
try:
|
||||||
self.dictionary = json.load(open(path, 'r'))
|
self.dictionary = json.load(open(path, 'r'))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warn("Miss synonym.json")
|
logging.warn("Missing synonym.json")
|
||||||
self.dictionary = {}
|
self.dictionary = {}
|
||||||
|
|
||||||
if not redis:
|
if not redis:
|
||||||
|
|||||||
@ -4,7 +4,7 @@ import json
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from rag.nlp import huqie
|
from rag.nlp import rag_tokenizer
|
||||||
from api.utils.file_utils import get_project_base_directory
|
from api.utils.file_utils import get_project_base_directory
|
||||||
|
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ class Dealer:
|
|||||||
txt = re.sub(p, r, txt)
|
txt = re.sub(p, r, txt)
|
||||||
|
|
||||||
res = []
|
res = []
|
||||||
for t in huqie.qie(txt).split(" "):
|
for t in rag_tokenizer.tokenize(txt).split(" "):
|
||||||
tk = t
|
tk = t
|
||||||
if (stpwd and tk in self.stop_words) or (
|
if (stpwd and tk in self.stop_words) or (
|
||||||
re.match(r"[0-9]$", tk) and not num):
|
re.match(r"[0-9]$", tk) and not num):
|
||||||
@ -161,7 +161,7 @@ class Dealer:
|
|||||||
return m[self.ne[t]]
|
return m[self.ne[t]]
|
||||||
|
|
||||||
def postag(t):
|
def postag(t):
|
||||||
t = huqie.tag(t)
|
t = rag_tokenizer.tag(t)
|
||||||
if t in set(["r", "c", "d"]):
|
if t in set(["r", "c", "d"]):
|
||||||
return 0.3
|
return 0.3
|
||||||
if t in set(["ns", "nt"]):
|
if t in set(["ns", "nt"]):
|
||||||
@ -175,14 +175,14 @@ class Dealer:
|
|||||||
def freq(t):
|
def freq(t):
|
||||||
if re.match(r"[0-9. -]{2,}$", t):
|
if re.match(r"[0-9. -]{2,}$", t):
|
||||||
return 3
|
return 3
|
||||||
s = huqie.freq(t)
|
s = rag_tokenizer.freq(t)
|
||||||
if not s and re.match(r"[a-z. -]+$", t):
|
if not s and re.match(r"[a-z. -]+$", t):
|
||||||
return 300
|
return 300
|
||||||
if not s:
|
if not s:
|
||||||
s = 0
|
s = 0
|
||||||
|
|
||||||
if not s and len(t) >= 4:
|
if not s and len(t) >= 4:
|
||||||
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
|
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
|
||||||
if len(s) > 1:
|
if len(s) > 1:
|
||||||
s = np.min([freq(tt) for tt in s]) / 6.
|
s = np.min([freq(tt) for tt in s]) / 6.
|
||||||
else:
|
else:
|
||||||
@ -198,7 +198,7 @@ class Dealer:
|
|||||||
elif re.match(r"[a-z. -]+$", t):
|
elif re.match(r"[a-z. -]+$", t):
|
||||||
return 300
|
return 300
|
||||||
elif len(t) >= 4:
|
elif len(t) >= 4:
|
||||||
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
|
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
|
||||||
if len(s) > 1:
|
if len(s) > 1:
|
||||||
return max(3, np.min([df(tt) for tt in s]) / 6.)
|
return max(3, np.min([df(tt) for tt in s]) / 6.)
|
||||||
|
|
||||||
|
|||||||
@@ -47,3 +47,9 @@ cron_logger = getLogger("cron_logger")
cron_logger.setLevel(20)
chunk_logger = getLogger("chunk_logger")
database_logger = getLogger("database")
+
+SVR_QUEUE_NAME = "rag_flow_svr_queue"
+SVR_QUEUE_RETENTION = 60*60
+SVR_QUEUE_MAX_LEN = 1024
+SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
+SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"
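Note: these constants name the Redis stream and consumer group that the task pipeline switches to in this changeset. A hedged sketch of the producer side, using the queue_product helper added to rag/utils/redis_conn.py later in this diff; the task dict fields are hypothetical:

# Hedged sketch: enqueueing a task event onto the new Redis stream.
from rag.settings import SVR_QUEUE_NAME
from rag.utils.redis_conn import REDIS_CONN

task = {"id": "task-uuid", "doc_id": "doc-uuid"}        # hypothetical payload
ok = REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=task)
if not ok:
    raise RuntimeError("failed to enqueue task event")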
|
|||||||
@ -4,13 +4,14 @@ import traceback
|
|||||||
|
|
||||||
from api.db.db_models import close_connection
|
from api.db.db_models import close_connection
|
||||||
from api.db.services.task_service import TaskService
|
from api.db.services.task_service import TaskService
|
||||||
from rag.utils import MINIO
|
from rag.settings import cron_logger
|
||||||
|
from rag.utils.minio_conn import MINIO
|
||||||
from rag.utils.redis_conn import REDIS_CONN
|
from rag.utils.redis_conn import REDIS_CONN
|
||||||
|
|
||||||
|
|
||||||
def collect():
|
def collect():
|
||||||
doc_locations = TaskService.get_ongoing_doc_name()
|
doc_locations = TaskService.get_ongoing_doc_name()
|
||||||
#print(tasks)
|
print(doc_locations)
|
||||||
if len(doc_locations) == 0:
|
if len(doc_locations) == 0:
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
return
|
return
|
||||||
@ -28,7 +29,7 @@ def main():
|
|||||||
if REDIS_CONN.exist(key):continue
|
if REDIS_CONN.exist(key):continue
|
||||||
file_bin = MINIO.get(kb_id, loc)
|
file_bin = MINIO.get(kb_id, loc)
|
||||||
REDIS_CONN.transaction(key, file_bin, 12 * 60)
|
REDIS_CONN.transaction(key, file_bin, 12 * 60)
|
||||||
print("CACHE:", loc)
|
cron_logger.info("CACHE: {}".format(loc))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
traceback.print_stack(e)
|
traceback.print_stack(e)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -1,193 +0,0 @@
|
|||||||
#
|
|
||||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from datetime import datetime
|
|
||||||
from api.db.db_models import Task
|
|
||||||
from api.db.db_utils import bulk_insert_into_db
|
|
||||||
from api.db.services.task_service import TaskService
|
|
||||||
from deepdoc.parser import PdfParser
|
|
||||||
from deepdoc.parser.excel_parser import HuExcelParser
|
|
||||||
from rag.settings import cron_logger
|
|
||||||
from rag.utils import MINIO
|
|
||||||
from rag.utils import findMaxTm
|
|
||||||
import pandas as pd
|
|
||||||
from api.db import FileType, TaskStatus
|
|
||||||
from api.db.services.document_service import DocumentService
|
|
||||||
from api.settings import database_logger
|
|
||||||
from api.utils import get_format_time, get_uuid
|
|
||||||
from api.utils.file_utils import get_project_base_directory
|
|
||||||
from rag.utils.redis_conn import REDIS_CONN
|
|
||||||
from api.db.db_models import init_database_tables as init_web_db
|
|
||||||
from api.db.init_data import init_web_data
|
|
||||||
|
|
||||||
|
|
||||||
def collect(tm):
|
|
||||||
docs = DocumentService.get_newly_uploaded(tm)
|
|
||||||
if len(docs) == 0:
|
|
||||||
return pd.DataFrame()
|
|
||||||
docs = pd.DataFrame(docs)
|
|
||||||
mtm = docs["update_time"].max()
|
|
||||||
cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
|
|
||||||
return docs
|
|
||||||
|
|
||||||
|
|
||||||
def set_dispatching(docid):
|
|
||||||
try:
|
|
||||||
DocumentService.update_by_id(
|
|
||||||
docid, {"progress": random.random() * 1 / 100.,
|
|
||||||
"progress_msg": "Task dispatched...",
|
|
||||||
"process_begin_at": get_format_time()
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))
|
|
||||||
|
|
||||||
|
|
||||||
def dispatch():
|
|
||||||
tm_fnm = os.path.join(
|
|
||||||
get_project_base_directory(),
|
|
||||||
"rag/res",
|
|
||||||
f"broker.tm")
|
|
||||||
tm = findMaxTm(tm_fnm)
|
|
||||||
rows = collect(tm)
|
|
||||||
if len(rows) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
tmf = open(tm_fnm, "a+")
|
|
||||||
for _, r in rows.iterrows():
|
|
||||||
try:
|
|
||||||
tsks = TaskService.query(doc_id=r["id"])
|
|
||||||
if tsks:
|
|
||||||
for t in tsks:
|
|
||||||
TaskService.delete_by_id(t.id)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.exception(e)
|
|
||||||
|
|
||||||
def new_task():
|
|
||||||
nonlocal r
|
|
||||||
return {
|
|
||||||
"id": get_uuid(),
|
|
||||||
"doc_id": r["id"]
|
|
||||||
}
|
|
||||||
|
|
||||||
tsks = []
|
|
||||||
try:
|
|
||||||
file_bin = MINIO.get(r["kb_id"], r["location"])
|
|
||||||
if REDIS_CONN.is_alive():
|
|
||||||
try:
|
|
||||||
REDIS_CONN.set("{}/{}".format(r["kb_id"], r["location"]), file_bin, 12*60)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
|
|
||||||
|
|
||||||
if r["type"] == FileType.PDF.value:
|
|
||||||
do_layout = r["parser_config"].get("layout_recognize", True)
|
|
||||||
pages = PdfParser.total_page_number(r["name"], file_bin)
|
|
||||||
page_size = r["parser_config"].get("task_page_size", 12)
|
|
||||||
if r["parser_id"] == "paper":
|
|
||||||
page_size = r["parser_config"].get("task_page_size", 22)
|
|
||||||
if r["parser_id"] == "one":
|
|
||||||
page_size = 1000000000
|
|
||||||
if not do_layout:
|
|
||||||
page_size = 1000000000
|
|
||||||
page_ranges = r["parser_config"].get("pages")
|
|
||||||
if not page_ranges:
|
|
||||||
page_ranges = [(1, 100000)]
|
|
||||||
for s, e in page_ranges:
|
|
||||||
s -= 1
|
|
||||||
s = max(0, s)
|
|
||||||
e = min(e - 1, pages)
|
|
||||||
for p in range(s, e, page_size):
|
|
||||||
task = new_task()
|
|
||||||
task["from_page"] = p
|
|
||||||
task["to_page"] = min(p + page_size, e)
|
|
||||||
tsks.append(task)
|
|
||||||
|
|
||||||
elif r["parser_id"] == "table":
|
|
||||||
rn = HuExcelParser.row_number(
|
|
||||||
r["name"], file_bin)
|
|
||||||
for i in range(0, rn, 3000):
|
|
||||||
task = new_task()
|
|
||||||
task["from_page"] = i
|
|
||||||
task["to_page"] = min(i + 3000, rn)
|
|
||||||
tsks.append(task)
|
|
||||||
else:
|
|
||||||
tsks.append(new_task())
|
|
||||||
|
|
||||||
bulk_insert_into_db(Task, tsks, True)
|
|
||||||
set_dispatching(r["id"])
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.exception(e)
|
|
||||||
|
|
||||||
tmf.write(str(r["update_time"]) + "\n")
|
|
||||||
tmf.close()
|
|
||||||
|
|
||||||
|
|
||||||
def update_progress():
|
|
||||||
docs = DocumentService.get_unfinished_docs()
|
|
||||||
for d in docs:
|
|
||||||
try:
|
|
||||||
tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
|
|
||||||
if not tsks:
|
|
||||||
continue
|
|
||||||
msg = []
|
|
||||||
prg = 0
|
|
||||||
finished = True
|
|
||||||
bad = 0
|
|
||||||
status = TaskStatus.RUNNING.value
|
|
||||||
for t in tsks:
|
|
||||||
if 0 <= t.progress < 1:
|
|
||||||
finished = False
|
|
||||||
prg += t.progress if t.progress >= 0 else 0
|
|
||||||
msg.append(t.progress_msg)
|
|
||||||
if t.progress == -1:
|
|
||||||
bad += 1
|
|
||||||
prg /= len(tsks)
|
|
||||||
if finished and bad:
|
|
||||||
prg = -1
|
|
||||||
status = TaskStatus.FAIL.value
|
|
||||||
elif finished:
|
|
||||||
status = TaskStatus.DONE.value
|
|
||||||
|
|
||||||
msg = "\n".join(msg)
|
|
||||||
info = {
|
|
||||||
"process_duation": datetime.timestamp(
|
|
||||||
datetime.now()) -
|
|
||||||
d["process_begin_at"].timestamp(),
|
|
||||||
"run": status}
|
|
||||||
if prg != 0:
|
|
||||||
info["progress"] = prg
|
|
||||||
if msg:
|
|
||||||
info["progress_msg"] = msg
|
|
||||||
DocumentService.update_by_id(d["id"], info)
|
|
||||||
except Exception as e:
|
|
||||||
cron_logger.error("fetch task exception:" + str(e))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
peewee_logger = logging.getLogger('peewee')
|
|
||||||
peewee_logger.propagate = False
|
|
||||||
peewee_logger.addHandler(database_logger.handlers[0])
|
|
||||||
peewee_logger.setLevel(database_logger.level)
|
|
||||||
# init db
|
|
||||||
init_web_db()
|
|
||||||
init_web_data()
|
|
||||||
|
|
||||||
while True:
|
|
||||||
dispatch()
|
|
||||||
time.sleep(1)
|
|
||||||
update_progress()
|
|
||||||
@@ -24,16 +24,18 @@ import sys
import time
import traceback
from functools import partial
-from rag.utils import MINIO
+from api.db.services.file2document_service import File2DocumentService
+from rag.utils.minio_conn import MINIO
from api.db.db_models import close_connection
-from rag.settings import database_logger
+from rag.settings import database_logger, SVR_QUEUE_NAME
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from multiprocessing import Pool
import numpy as np
from elasticsearch_dsl import Q
from multiprocessing.context import TimeoutError
from api.db.services.task_service import TaskService
-from rag.utils import ELASTICSEARCH
+from rag.utils.es_conn import ELASTICSEARCH
from timeit import default_timer as timer
from rag.utils import rmSpace, findMaxTm

@@ -87,36 +89,34 @@ def set_progress(task_id, from_page=0, to_page=-1,
    except Exception as e:
        cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))

+    close_connection()
    if cancel:
        sys.exit()


-def collect(comm, mod, tm):
-    tasks = TaskService.get_tasks(tm, mod, comm)
-    #print(tasks)
-    if len(tasks) == 0:
+def collect():
+    try:
+        payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
+        if not payload:
            time.sleep(1)
            return pd.DataFrame()
+    except Exception as e:
+        cron_logger.error("Get task event from queue exception:" + str(e))
+        return pd.DataFrame()
+
+    msg = payload.get_message()
+    payload.ack()
+    if not msg: return pd.DataFrame()
+
+    if TaskService.do_cancel(msg["id"]):
+        return pd.DataFrame()
+    tasks = TaskService.get_tasks(msg["id"])
+    assert tasks, "{} empty task!".format(msg["id"])
    tasks = pd.DataFrame(tasks)
-    mtm = tasks["update_time"].max()
-    cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
    return tasks


def get_minio_binary(bucket, name):
-    global MINIO
-    if REDIS_CONN.is_alive():
-        try:
-            for _ in range(30):
-                if REDIS_CONN.exist("{}/{}".format(bucket, name)):
-                    time.sleep(1)
-                    break
-                time.sleep(1)
-            r = REDIS_CONN.get("{}/{}".format(bucket, name))
-            if r: return r
-            cron_logger.warning("Cache missing: {}".format(name))
-        except Exception as e:
-            cron_logger.warning("Get redis[EXCEPTION]:" + str(e))
    return MINIO.get(bucket, name)


@ -132,12 +132,10 @@ def build(row):
|
|||||||
row["from_page"],
|
row["from_page"],
|
||||||
row["to_page"])
|
row["to_page"])
|
||||||
chunker = FACTORY[row["parser_id"].lower()]
|
chunker = FACTORY[row["parser_id"].lower()]
|
||||||
pool = Pool(processes=1)
|
|
||||||
try:
|
try:
|
||||||
st = timer()
|
st = timer()
|
||||||
thr = pool.apply_async(get_minio_binary, args=(row["kb_id"], row["location"]))
|
bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
|
||||||
binary = thr.get(timeout=90)
|
binary = get_minio_binary(bucket, name)
|
||||||
pool.terminate()
|
|
||||||
cron_logger.info(
|
cron_logger.info(
|
||||||
"From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
|
"From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
|
||||||
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
|
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
|
||||||
@ -156,7 +154,6 @@ def build(row):
|
|||||||
else:
|
else:
|
||||||
callback(-1, f"Internal server error: %s" %
|
callback(-1, f"Internal server error: %s" %
|
||||||
str(e).replace("'", ""))
|
str(e).replace("'", ""))
|
||||||
pool.terminate()
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
cron_logger.error(
|
cron_logger.error(
|
||||||
@ -247,20 +244,13 @@ def embedding(docs, mdl, parser_config={}, callback=None):
|
|||||||
return tk_count
|
return tk_count
|
||||||
|
|
||||||
|
|
||||||
def main(comm, mod):
|
def main():
|
||||||
tm_fnm = os.path.join(
|
rows = collect()
|
||||||
get_project_base_directory(),
|
|
||||||
"rag/res",
|
|
||||||
f"{comm}-{mod}.tm")
|
|
||||||
tm = findMaxTm(tm_fnm)
|
|
||||||
rows = collect(comm, mod, tm)
|
|
||||||
if len(rows) == 0:
|
if len(rows) == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
tmf = open(tm_fnm, "a+")
|
|
||||||
for _, r in rows.iterrows():
|
for _, r in rows.iterrows():
|
||||||
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
|
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
|
||||||
#callback(random.random()/10., "Task has been received.")
|
|
||||||
try:
|
try:
|
||||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -274,7 +264,6 @@ def main(comm, mod):
|
|||||||
if cks is None:
|
if cks is None:
|
||||||
continue
|
continue
|
||||||
if not cks:
|
if not cks:
|
||||||
tmf.write(str(r["update_time"]) + "\n")
|
|
||||||
callback(1., "No chunk! Done!")
|
callback(1., "No chunk! Done!")
|
||||||
continue
|
continue
|
||||||
# TODO: exception handler
|
# TODO: exception handler
|
||||||
@ -314,8 +303,6 @@ def main(comm, mod):
|
|||||||
"Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
|
"Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
|
||||||
r["id"], tk_count, len(cks), timer()-st))
|
r["id"], tk_count, len(cks), timer()-st))
|
||||||
|
|
||||||
tmf.write(str(r["update_time"]) + "\n")
|
|
||||||
tmf.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -324,8 +311,5 @@ if __name__ == "__main__":
|
|||||||
peewee_logger.addHandler(database_logger.handlers[0])
|
peewee_logger.addHandler(database_logger.handlers[0])
|
||||||
peewee_logger.setLevel(database_logger.level)
|
peewee_logger.setLevel(database_logger.level)
|
||||||
|
|
||||||
#from mpi4py import MPI
|
|
||||||
#comm = MPI.COMM_WORLD
|
|
||||||
while True:
|
while True:
|
||||||
main(int(sys.argv[2]), int(sys.argv[1]))
|
main()
|
||||||
close_connection()
|
|
||||||
|
|||||||
@@ -15,9 +15,6 @@ def singleton(cls, *args, **kw):
    return _singleton


-from .minio_conn import MINIO
-from .es_conn import ELASTICSEARCH
-
def rmSpace(txt):
    txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
@ -15,7 +15,7 @@ es_logger.info("Elasticsearch version: "+str(elasticsearch.__version__))
|
|||||||
|
|
||||||
|
|
||||||
@singleton
|
@singleton
|
||||||
class HuEs:
|
class ESConnection:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.info = {}
|
self.info = {}
|
||||||
self.conn()
|
self.conn()
|
||||||
@ -454,4 +454,4 @@ class HuEs:
|
|||||||
scroll_size = len(page['hits']['hits'])
|
scroll_size = len(page['hits']['hits'])
|
||||||
|
|
||||||
|
|
||||||
ELASTICSEARCH = HuEs()
|
ELASTICSEARCH = ESConnection()
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from rag.utils import singleton
|
|||||||
|
|
||||||
|
|
||||||
@singleton
|
@singleton
|
||||||
class HuMinio(object):
|
class RAGFlowMinio(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.conn = None
|
self.conn = None
|
||||||
self.__open__()
|
self.__open__()
|
||||||
@ -35,7 +35,7 @@ class HuMinio(object):
|
|||||||
self.conn = None
|
self.conn = None
|
||||||
|
|
||||||
def put(self, bucket, fnm, binary):
|
def put(self, bucket, fnm, binary):
|
||||||
for _ in range(10):
|
for _ in range(3):
|
||||||
try:
|
try:
|
||||||
if not self.conn.bucket_exists(bucket):
|
if not self.conn.bucket_exists(bucket):
|
||||||
self.conn.make_bucket(bucket)
|
self.conn.make_bucket(bucket)
|
||||||
@ -86,10 +86,12 @@ class HuMinio(object):
|
|||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
return
|
return
|
||||||
|
|
||||||
MINIO = HuMinio()
|
|
||||||
|
MINIO = RAGFlowMinio()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
conn = HuMinio()
|
conn = RAGFlowMinio()
|
||||||
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
img = Image.open(fnm)
|
img = Image.open(fnm)
|
||||||
|
|||||||
@@ -5,6 +5,27 @@ import logging
from rag import settings
from rag.utils import singleton


+class Payload:
+    def __init__(self, consumer, queue_name, group_name, msg_id, message):
+        self.__consumer = consumer
+        self.__queue_name = queue_name
+        self.__group_name = group_name
+        self.__msg_id = msg_id
+        self.__message = json.loads(message['message'])
+
+    def ack(self):
+        try:
+            self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
+        return False
+
+    def get_message(self):
+        return self.__message
+
+
@singleton
class RedisDB:
    def __init__(self):
@@ -14,10 +35,11 @@ class RedisDB:

    def __open__(self):
        try:
-            self.REDIS = redis.Redis(host=self.config.get("host", "redis").split(":")[0],
+            self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0],
                                     port=int(self.config.get("host", ":6379").split(":")[1]),
                                     db=int(self.config.get("db", 1)),
-                                     password=self.config.get("password"))
+                                     password=self.config.get("password"),
+                                     decode_responses=True)
        except Exception as e:
            logging.warning("Redis can't be connected.")
        return self.REDIS
@@ -70,5 +92,48 @@ class RedisDB:
            self.__open__()
        return False

+    def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool:
+        try:
+            payload = {"message": json.dumps(message)}
+            pipeline = self.REDIS.pipeline()
+            pipeline.xadd(queue, payload)
+            pipeline.expire(queue, exp)
+            pipeline.execute()
+            return True
+        except Exception as e:
+            logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
+        return False
+
+    def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
+        try:
+            group_info = self.REDIS.xinfo_groups(queue_name)
+            if not any(e["name"] == group_name for e in group_info):
+                self.REDIS.xgroup_create(
+                    queue_name,
+                    group_name,
+                    id="$",
+                    mkstream=True
+                )
+            args = {
+                "groupname": group_name,
+                "consumername": consumer_name,
+                "count": 1,
+                "block": 10000,
+                "streams": {queue_name: msg_id},
+            }
+            messages = self.REDIS.xreadgroup(**args)
+            if not messages:
+                return None
+            stream, element_list = messages[0]
+            msg_id, payload = element_list[0]
+            res = Payload(self.REDIS, queue_name, group_name, msg_id, payload)
+            return res
+        except Exception as e:
+            if 'key' in str(e):
+                pass
+            else:
+                logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e))
+        return None
+
+
REDIS_CONN = RedisDB()
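Note: the consumer side of this new stream API is used by collect() in rag/svr/task_executor.py earlier in this changeset. A hedged sketch of that usage; the group and consumer names below are the ones passed in that file:

# Hedged sketch: reading one task event from the stream and acknowledging it.
from rag.settings import SVR_QUEUE_NAME
from rag.utils.redis_conn import REDIS_CONN

payload = REDIS_CONN.queue_consumer(
    SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
if payload:
    msg = payload.get_message()   # dict decoded from the stream entry
    payload.ack()                 # xack so the entry is not redelivered
    print(msg.get("id"))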
@@ -50,7 +50,6 @@ joblib==1.3.2
lxml==5.1.0
MarkupSafe==2.1.5
minio==7.2.4
-mpi4py==3.1.5
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
@@ -69,6 +68,7 @@ nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu12==12.1.105
+ollama==0.1.9
onnxruntime-gpu==1.17.1
openai==1.12.0
opencv-python==4.9.0.80
@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
pydantic==2.6.2
pydantic_core==2.16.3
PyJWT==2.8.0
-PyMuPDF==1.23.25
-PyMuPDFb==1.23.22
PyMySQL==1.1.0
PyPDF2==3.0.1
pypdfium2==4.27.0
@@ -102,6 +100,7 @@ python-dotenv==1.0.1
python-pptx==0.6.23
pytz==2024.1
PyYAML==6.0.1
+redis==5.0.3
regex==2023.12.25
requests==2.31.0
ruamel.yaml==0.18.6
|
|||||||
requirements_dev.txt (new file, 126 lines)
@@ -0,0 +1,126 @@
|
accelerate==0.27.2
|
||||||
|
aiohttp==3.9.3
|
||||||
|
aiosignal==1.3.1
|
||||||
|
annotated-types==0.6.0
|
||||||
|
anyio==4.3.0
|
||||||
|
argon2-cffi==23.1.0
|
||||||
|
argon2-cffi-bindings==21.2.0
|
||||||
|
Aspose.Slides==24.2.0
|
||||||
|
attrs==23.2.0
|
||||||
|
blinker==1.7.0
|
||||||
|
cachelib==0.12.0
|
||||||
|
cachetools==5.3.3
|
||||||
|
certifi==2024.2.2
|
||||||
|
cffi==1.16.0
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
click==8.1.7
|
||||||
|
coloredlogs==15.0.1
|
||||||
|
cryptography==42.0.5
|
||||||
|
dashscope==1.14.1
|
||||||
|
datasets==2.17.1
|
||||||
|
datrie==0.8.2
|
||||||
|
demjson3==3.0.6
|
||||||
|
dill==0.3.8
|
||||||
|
distro==1.9.0
|
||||||
|
elastic-transport==8.12.0
|
||||||
|
elasticsearch==8.12.1
|
||||||
|
elasticsearch-dsl==8.12.0
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
filelock==3.13.1
|
||||||
|
fastembed==0.2.6
|
||||||
|
FlagEmbedding==1.2.5
|
||||||
|
Flask==3.0.2
|
||||||
|
Flask-Cors==4.0.0
|
||||||
|
Flask-Login==0.6.3
|
||||||
|
Flask-Session==0.6.0
|
||||||
|
flatbuffers==23.5.26
|
||||||
|
frozenlist==1.4.1
|
||||||
|
fsspec==2023.10.0
|
||||||
|
h11==0.14.0
|
||||||
|
hanziconv==0.3.2
|
||||||
|
httpcore==1.0.4
|
||||||
|
httpx==0.27.0
|
||||||
|
huggingface-hub==0.20.3
|
||||||
|
humanfriendly==10.0
|
||||||
|
idna==3.6
|
||||||
|
install==1.3.5
|
||||||
|
itsdangerous==2.1.2
|
||||||
|
Jinja2==3.1.3
|
||||||
|
joblib==1.3.2
|
||||||
|
lxml==5.1.0
|
||||||
|
MarkupSafe==2.1.5
|
||||||
|
minio==7.2.4
|
||||||
|
mpi4py==3.1.5
|
||||||
|
mpmath==1.3.0
|
||||||
|
multidict==6.0.5
|
||||||
|
multiprocess==0.70.16
|
||||||
|
networkx==3.2.1
|
||||||
|
nltk==3.8.1
|
||||||
|
numpy==1.26.4
|
||||||
|
openai==1.12.0
|
||||||
|
opencv-python==4.9.0.80
|
||||||
|
openpyxl==3.1.2
|
||||||
|
packaging==23.2
|
||||||
|
pandas==2.2.1
|
||||||
|
pdfminer.six==20221105
|
||||||
|
pdfplumber==0.10.4
|
||||||
|
peewee==3.17.1
|
||||||
|
pillow==10.2.0
|
||||||
|
protobuf==4.25.3
|
||||||
|
psutil==5.9.8
|
||||||
|
pyarrow==15.0.0
|
||||||
|
pyarrow-hotfix==0.6
|
||||||
|
pyclipper==1.3.0.post5
|
||||||
|
pycparser==2.21
|
||||||
|
pycryptodome==3.20.0
|
||||||
|
pycryptodome-test-vectors==1.0.14
|
||||||
|
pycryptodomex==3.20.0
|
||||||
|
pydantic==2.6.2
|
||||||
|
pydantic_core==2.16.3
|
||||||
|
PyJWT==2.8.0
|
||||||
|
PyMuPDF==1.23.25
|
||||||
|
PyMuPDFb==1.23.22
|
||||||
|
PyMySQL==1.1.0
|
||||||
|
PyPDF2==3.0.1
|
||||||
|
pypdfium2==4.27.0
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
python-docx==1.1.0
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
python-pptx==0.6.23
|
||||||
|
pytz==2024.1
|
||||||
|
PyYAML==6.0.1
|
||||||
|
regex==2023.12.25
|
||||||
|
requests==2.31.0
|
||||||
|
ruamel.yaml==0.18.6
|
||||||
|
ruamel.yaml.clib==0.2.8
|
||||||
|
safetensors==0.4.2
|
||||||
|
scikit-learn==1.4.1.post1
|
||||||
|
scipy==1.12.0
|
||||||
|
sentence-transformers==2.4.0
|
||||||
|
shapely==2.0.3
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.3.1
|
||||||
|
StrEnum==0.4.15
|
||||||
|
sympy==1.12
|
||||||
|
threadpoolctl==3.3.0
|
||||||
|
tika==2.6.0
|
||||||
|
tiktoken==0.6.0
|
||||||
|
tokenizers==0.15.2
|
||||||
|
torch==2.2.1
|
||||||
|
tqdm==4.66.2
|
||||||
|
transformers==4.38.1
|
||||||
|
triton==2.2.0
|
||||||
|
typing_extensions==4.10.0
|
||||||
|
tzdata==2024.1
|
||||||
|
urllib3==2.2.1
|
||||||
|
Werkzeug==3.0.1
|
||||||
|
xgboost==2.0.3
|
||||||
|
XlsxWriter==3.2.0
|
||||||
|
xpinyin==0.7.6
|
||||||
|
xxhash==3.4.1
|
||||||
|
yarl==1.9.4
|
||||||
|
zhipuai==2.0.1
|
||||||
|
BCEmbedding
|
||||||
|
loguru==0.7.2
|
||||||
|
ollama==0.1.8
|
||||||
|
redis==5.0.4
|
||||||
@@ -27,7 +27,7 @@ export default defineConfig({
     devtool: 'source-map',
     proxy: {
       '/v1': {
-        target: 'http://192.168.200.233:9380/',
+        target: 'http://123.60.95.134:9380/',
         changeOrigin: true,
         // pathRewrite: { '^/v1': '/v1' },
       },
web/package-lock.json (generated, 2384 lines changed; diff not shown because it is too large)
@@ -3,7 +3,7 @@
   "author": "zhaofengchao <13723060510@163.com>",
   "scripts": {
     "build": "umi build",
-    "dev": "cross-env PORT=9000 umi dev",
+    "dev": "cross-env PORT=9200 umi dev",
     "postinstall": "umi setup",
     "lint": "umi lint --eslint-only",
     "setup": "umi setup",
@@ -13,6 +13,7 @@
     "@ant-design/icons": "^5.2.6",
     "@ant-design/pro-components": "^2.6.46",
     "@ant-design/pro-layout": "^7.17.16",
+    "@js-preview/excel": "^1.7.8",
     "ahooks": "^3.7.10",
     "antd": "^5.12.7",
     "axios": "^1.6.3",
@@ -25,12 +26,14 @@
     "rc-tween-one": "^3.0.6",
     "react-chat-elements": "^12.0.13",
     "react-copy-to-clipboard": "^5.1.0",
+    "react-file-viewer": "^1.2.1",
     "react-i18next": "^14.0.0",
     "react-infinite-scroll-component": "^6.1.0",
     "react-markdown": "^9.0.1",
     "react-pdf-highlighter": "^6.1.0",
     "react-string-replace": "^1.1.1",
     "react-syntax-highlighter": "^15.5.0",
+    "reactflow": "^11.11.2",
     "recharts": "^2.12.4",
     "remark-gfm": "^4.0.0",
     "umi": "^4.0.90",
web/src/assets/svg/llm/deepseek.svg (new file, 6 lines, 1.6 KiB)
@@ -0,0 +1,6 @@
+<svg t="1715133624982" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="4263"
+    width="200" height="200">
+    <path
+        d="M320.512 804.864C46.08 676.864 77.824 274.432 362.496 274.432c34.816 0 86.016-7.168 114.688-14.336 59.392-16.384 99.328-10.24 69.632 10.24-9.216 7.168-15.36 19.456-13.312 28.672 5.12 20.48 158.72 161.792 177.152 161.792 27.648 0 27.648-32.768 1.024-57.344-43.008-38.912-55.296-90.112-35.84-141.312l9.216-26.624 54.272 52.224c35.84 34.816 58.368 49.152 68.608 44.032 9.216-4.096 30.72-9.216 49.152-12.288 18.432-2.048 38.912-10.24 45.056-18.432 19.456-23.552 43.008-17.408 35.84 9.216-3.072 12.288-6.144 27.648-6.144 34.816 0 23.552-62.464 83.968-92.16 90.112-23.552 5.12-30.72 12.288-30.72 30.72 0 46.08-38.912 148.48-75.776 198.656l-37.888 51.2 36.864 15.36c56.32 23.552 40.96 41.984-37.888 43.008-43.008 1.024-75.776 7.168-92.16 18.432-68.608 45.056-198.656 50.176-281.6 12.288z m251.904-86.016c-24.576-27.648-66.56-79.872-93.184-117.76-69.632-98.304-158.72-150.528-256-150.528-37.888 0-38.912 1.024-38.912 34.816 0 94.208 99.328 240.64 175.104 257.024 38.912 9.216 59.392-7.168 39.936-29.696-7.168-9.216-10.24-23.552-6.144-31.744 5.12-14.336 9.216-14.336 38.912 1.024 18.432 9.216 50.176 29.696 69.632 45.056 35.84 27.648 58.368 37.888 96.256 39.936 14.336 1.024 9.216-10.24-25.6-48.128z m88.064-145.408c8.192-13.312-31.744-78.848-56.32-92.16-10.24-6.144-26.624-10.24-34.816-10.24-23.552 0-20.48 27.648 4.096 33.792 13.312 3.072 20.48 14.336 20.48 29.696 0 13.312 5.12 29.696 12.288 36.864 15.36 15.36 46.08 16.384 54.272 2.048z"
+        fill="#4D6BFE" p-id="4264"></path>
+</svg>
@@ -74,9 +74,9 @@ export const useFetchParserListOnMount = (
     setSelectedTag(parserId);
   }, [parserId, documentId]);

-  const handleChange = (tag: string, checked: boolean) => {
-    const nextSelectedTag = checked ? tag : selectedTag;
-    setSelectedTag(nextSelectedTag);
+  const handleChange = (tag: string) => {
+    // const nextSelectedTag = checked ? tag : selectedTag;
+    setSelectedTag(tag);
   };

   return { parserList: nextParserList, handleChange, selectedTag };
@@ -8,3 +8,7 @@
   cursor: help;
   writing-mode: horizontal-tb;
 }
+
+.chunkMethod {
+  margin-bottom: 0;
+}
@@ -13,9 +13,9 @@ import {
   Form,
   InputNumber,
   Modal,
+  Select,
   Space,
   Switch,
-  Tag,
   Tooltip,
 } from 'antd';
 import omit from 'lodash/omit';
@@ -25,8 +25,6 @@ import { useFetchParserListOnMount } from './hooks';
 import { useTranslate } from '@/hooks/commonHooks';
 import styles from './index.less';

-const { CheckableTag } = Tag;
-
 interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
   loading: boolean;
   onOk: (
@@ -50,6 +48,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
   visible,
   documentExtension,
   parserConfig,
+  loading,
 }) => {
   const { parserList, handleChange, selectedTag } = useFetchParserListOnMount(
     documentId,
@@ -111,23 +110,17 @@ const ChunkMethodModal: React.FC<IProps> = ({
       onOk={handleOk}
       onCancel={hideModal}
       afterClose={afterClose}
+      confirmLoading={loading}
     >
       <Space size={[0, 8]} wrap>
-        <div className={styles.tags}>
-          {parserList.map((x) => {
-            return (
-              <CheckableTag
-                key={x.value}
-                checked={selectedTag === x.value}
-                onChange={(checked) => {
-                  handleChange(x.value, checked);
-                }}
-              >
-                {x.label}
-              </CheckableTag>
-            );
-          })}
-        </div>
+        <Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
+          <Select
+            style={{ width: 120 }}
+            onChange={handleChange}
+            value={selectedTag}
+            options={parserList}
+          />
+        </Form.Item>
       </Space>
       {hideDivider || <Divider></Divider>}
       <Form name="dynamic_form_nest_item" autoComplete="off" form={form}>
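Note: the hunks above replace the CheckableTag list with a plain antd Select driven by the simplified single-argument handleChange(tag). The snippet below is a minimal, self-contained sketch of that pattern only; the component name and the option values ('naive', 'qa', 'table') are illustrative assumptions, not taken from the repository.

// Illustrative sketch: a Select-based chunk-method picker using the
// single-argument handleChange signature introduced above.
import { Form, Select } from 'antd';
import React, { useState } from 'react';

const parserOptions = [
  { label: 'General', value: 'naive' }, // assumed option values
  { label: 'Q&A', value: 'qa' },
  { label: 'Table', value: 'table' },
];

const ChunkMethodPicker: React.FC = () => {
  const [selectedTag, setSelectedTag] = useState('naive');

  // antd passes the selected value as the first argument of onChange.
  const handleChange = (tag: string) => {
    setSelectedTag(tag);
  };

  return (
    <Form.Item label="Chunk method">
      <Select
        style={{ width: 120 }}
        onChange={handleChange}
        value={selectedTag}
        options={parserOptions}
      />
    </Form.Item>
  );
};

export default ChunkMethodPicker;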
web/src/components/file-upload-modal/index.less (new file, 8 lines)
@@ -0,0 +1,8 @@
+.uploader {
+  :global {
+    .ant-upload-list {
+      max-height: 40vh;
+      overflow-y: auto;
+    }
+  }
+}
@@ -1,3 +1,4 @@
+import { useTranslate } from '@/hooks/commonHooks';
 import { IModalProps } from '@/interfaces/common';
 import { InboxOutlined } from '@ant-design/icons';
 import {
@@ -12,6 +13,8 @@ import {
 } from 'antd';
 import { Dispatch, SetStateAction, useState } from 'react';

+import styles from './index.less';
+
 const { Dragger } = Upload;

 const FileUpload = ({
@@ -23,6 +26,7 @@ const FileUpload = ({
   fileList: UploadFile[];
   setFileList: Dispatch<SetStateAction<UploadFile[]>>;
 }) => {
+  const { t } = useTranslate('fileManager');
   const props: UploadProps = {
     multiple: true,
     onRemove: (file) => {
@@ -43,17 +47,12 @@ const FileUpload = ({
   };

   return (
-    <Dragger {...props}>
+    <Dragger {...props} className={styles.uploader}>
       <p className="ant-upload-drag-icon">
         <InboxOutlined />
       </p>
-      <p className="ant-upload-text">
-        Click or drag file to this area to upload
-      </p>
-      <p className="ant-upload-hint">
-        Support for a single or bulk upload. Strictly prohibited from uploading
-        company data or other banned files.
-      </p>
+      <p className="ant-upload-text">{t('uploadTitle')}</p>
+      <p className="ant-upload-hint">{t('uploadDescription')}</p>
     </Dragger>
   );
 };
@@ -64,18 +63,29 @@ const FileUploadModal = ({
   loading,
   onOk: onFileUploadOk,
 }: IModalProps<UploadFile[]>) => {
+  const { t } = useTranslate('fileManager');
   const [value, setValue] = useState<string | number>('local');
   const [fileList, setFileList] = useState<UploadFile[]>([]);
   const [directoryFileList, setDirectoryFileList] = useState<UploadFile[]>([]);

-  const onOk = () => {
-    return onFileUploadOk?.([...fileList, ...directoryFileList]);
+  const clearFileList = () => {
+    setFileList([]);
+    setDirectoryFileList([]);
+  };
+
+  const onOk = async () => {
+    const ret = await onFileUploadOk?.([...fileList, ...directoryFileList]);
+    return ret;
+  };
+
+  const afterClose = () => {
+    clearFileList();
   };

   const items: TabsProps['items'] = [
     {
       key: '1',
-      label: 'File',
+      label: t('file'),
       children: (
         <FileUpload
           directory={false}
@@ -86,7 +96,7 @@ const FileUploadModal = ({
     },
     {
       key: '2',
-      label: 'Directory',
+      label: t('directory'),
       children: (
         <FileUpload
           directory
@@ -100,17 +110,18 @@ const FileUploadModal = ({
   return (
     <>
       <Modal
-        title="File upload"
+        title={t('uploadFile')}
         open={visible}
         onOk={onOk}
         onCancel={hideModal}
         confirmLoading={loading}
+        afterClose={afterClose}
       >
         <Flex gap={'large'} vertical>
           <Segmented
             options={[
-              { label: 'Local uploads', value: 'local' },
-              { label: 'S3 uploads', value: 's3' },
+              { label: t('local'), value: 'local' },
+              { label: t('s3'), value: 's3' },
             ]}
             block
             value={value}
@@ -119,7 +130,7 @@ const FileUploadModal = ({
           {value === 'local' ? (
             <Tabs defaultActiveKey="1" items={items} />
           ) : (
-            'coming soon'
+            t('comingSoon', { keyPrefix: 'common' })
          )}
        </Flex>
      </Modal>
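Note: the upload modal now awaits the parent's onOk and resets its file lists only in afterClose, so antd's confirmLoading spinner covers the whole upload and the state is cleared once the dialog has fully closed. A minimal sketch of that pattern follows, assuming an antd Modal; the component and handler names are hypothetical, not the repository's.

// Illustrative sketch of the async-onOk / afterClose pattern above.
import { Modal } from 'antd';
import type { UploadFile } from 'antd';
import React, { useState } from 'react';

interface Props {
  visible: boolean;
  loading?: boolean;
  hideModal(): void;
  onOk?(files: UploadFile[]): Promise<any> | void;
}

const UploadModalSketch: React.FC<Props> = ({ visible, loading, hideModal, onOk }) => {
  const [fileList, setFileList] = useState<UploadFile[]>([]);

  const handleOk = async () => {
    // Returning the awaited result lets the caller decide whether to hide the modal.
    return await onOk?.(fileList);
  };

  // afterClose fires once the closing animation finishes, so the list is reset
  // only after the modal is fully gone.
  const afterClose = () => setFileList([]);

  return (
    <Modal
      title="Upload file"
      open={visible}
      confirmLoading={loading}
      onOk={handleOk}
      onCancel={hideModal}
      afterClose={afterClose}
    >
      {/* uploader UI omitted in this sketch */}
    </Modal>
  );
};

export default UploadModalSketch;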
@@ -46,3 +46,25 @@ export const LanguageTranslationMap = {
   Chinese: 'zh',
   'Traditional Chinese': 'zh-TRADITIONAL',
 };
+
+export const FileMimeTypeMap = {
+  bmp: 'image/bmp',
+  csv: 'text/csv',
+  odt: 'application/vnd.oasis.opendocument.text',
+  doc: 'application/msword',
+  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  gif: 'image/gif',
+  htm: 'text/htm',
+  html: 'text/html',
+  jpg: 'image/jpg',
+  jpeg: 'image/jpeg',
+  pdf: 'application/pdf',
+  png: 'image/png',
+  ppt: 'application/vnd.ms-powerpoint',
+  pptx: 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+  tiff: 'image/tiff',
+  txt: 'text/plain',
+  xls: 'application/vnd.ms-excel',
+  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+  mp4: 'video/mp4',
+};
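Note: a short sketch of how an extension-to-MIME map like the FileMimeTypeMap added above might be consulted when building a preview or download request. The getFileMimeType helper, the trimmed map copy, and the octet-stream fallback are illustrative assumptions, not part of the repository.

// Self-contained example: look up a Content-Type from a file name's extension.
const FileMimeTypeMap: Record<string, string> = {
  pdf: 'application/pdf',
  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  txt: 'text/plain',
}; // trimmed copy of the map above, kept small for the example

// Hypothetical helper: fall back to a generic binary type for unknown extensions.
const getFileMimeType = (fileName: string): string => {
  const extension = fileName.split('.').pop()?.toLowerCase() ?? '';
  return FileMimeTypeMap[extension] ?? 'application/octet-stream';
};

console.log(getFileMimeType('report.PDF')); // 'application/pdf'
console.log(getFileMimeType('archive.zip')); // 'application/octet-stream'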
@@ -7,6 +7,7 @@ import { useCallback, useMemo, useState } from 'react';
 import { IHighlight } from 'react-pdf-highlighter';
 import { useDispatch, useSelector } from 'umi';
 import { useGetKnowledgeSearchParams } from './routeHook';
+import { useOneNamespaceEffectsLoading } from './storeHooks';

 export const useGetDocumentUrl = (documentId: string) => {
   const url = useMemo(() => {
@@ -160,12 +161,12 @@ export const useRemoveDocument = () => {
   const { knowledgeId } = useGetKnowledgeSearchParams();

   const removeDocument = useCallback(
-    (documentId: string) => {
+    (documentIds: string[]) => {
       try {
         return dispatch<any>({
           type: 'kFModel/document_rm',
           payload: {
-            doc_id: documentId,
+            doc_id: documentIds,
             kb_id: knowledgeId,
           },
         });
@@ -184,12 +185,12 @@ export const useUploadDocument = () => {
   const { knowledgeId } = useGetKnowledgeSearchParams();

   const uploadDocument = useCallback(
-    (file: UploadFile) => {
+    (fileList: UploadFile[]) => {
       try {
         return dispatch<any>({
           type: 'kFModel/upload_document',
           payload: {
-            file,
+            fileList,
             kb_id: knowledgeId,
           },
         });
@@ -222,3 +223,8 @@ export const useRunDocument = () => {

   return runDocumentByIds;
 };
+
+export const useSelectRunDocumentLoading = () => {
+  const loading = useOneNamespaceEffectsLoading('kFModel', ['document_run']);
+  return loading;
+};
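Note: with these changes removeDocument accepts an array of document ids and uploadDocument takes the whole antd file list in one payload. Below is a hedged sketch of how a caller might use the new signatures; the declared hook shapes and Promise return types are assumptions for illustration, and useBatchDocumentActions is a hypothetical wrapper, not code from the repository.

import type { UploadFile } from 'antd';

// Assumed shapes mirroring the hunks above; the real hooks dispatch model effects.
declare function useRemoveDocument(): (documentIds: string[]) => Promise<any>;
declare function useUploadDocument(): (fileList: UploadFile[]) => Promise<any>;

export const useBatchDocumentActions = () => {
  const removeDocument = useRemoveDocument();
  const uploadDocument = useUploadDocument();

  // One dispatch now covers every selected row instead of one call per id.
  const removeSelected = (selectedRowKeys: string[]) => removeDocument(selectedRowKeys);

  // The whole file list travels in a single payload.
  const uploadAll = (fileList: UploadFile[]) => uploadDocument(fileList);

  return { removeSelected, uploadAll };
};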
@@ -125,13 +125,19 @@ export const useFetchKnowledgeBaseConfiguration = () => {
   }, [fetchKnowledgeBaseConfiguration]);
 };

+export const useSelectKnowledgeList = () => {
+  const knowledgeModel = useSelector((state) => state.knowledgeModel);
+  const { data = [] } = knowledgeModel;
+  return data;
+};
+
 export const useFetchKnowledgeList = (
   shouldFilterListWithoutDocument: boolean = false,
 ) => {
   const dispatch = useDispatch();
   const loading = useOneNamespaceEffectsLoading('knowledgeModel', ['getList']);

-  const knowledgeModel = useSelector((state: any) => state.knowledgeModel);
+  const knowledgeModel = useSelector((state) => state.knowledgeModel);
   const { data = [] } = knowledgeModel;
   const list: IKnowledge[] = useMemo(() => {
     return shouldFilterListWithoutDocument
@@ -5,6 +5,7 @@ import {
   IThirdOAIModelCollection,
 } from '@/interfaces/database/llm';
 import { IAddLlmRequestBody } from '@/interfaces/request/llm';
+import { sortLLmFactoryListBySpecifiedOrder } from '@/utils/commonUtil';
 import { useCallback, useEffect, useMemo } from 'react';
 import { useDispatch, useSelector } from 'umi';

@@ -110,13 +111,12 @@ export const useFetchLlmFactoryListOnMount = () => {
   const factoryList = useSelectLlmFactoryList();
   const myLlmList = useSelectMyLlmList();

-  const list = useMemo(
-    () =>
-      factoryList.filter((x) =>
+  const list = useMemo(() => {
+    const currentList = factoryList.filter((x) =>
       Object.keys(myLlmList).every((y) => y !== x.name),
-      ),
-    [factoryList, myLlmList],
     );
+    return sortLLmFactoryListBySpecifiedOrder(currentList);
+  }, [factoryList, myLlmList]);

   const fetchLlmFactoryList = useCallback(() => {
     dispatch({
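Note: sortLLmFactoryListBySpecifiedOrder is pulled in to pin a preferred ordering onto the filtered factory list, but its implementation is not shown in this diff. The sketch below is only one plausible way such a helper could work; the order list, item shape, and helper name variant are assumptions.

interface LlmFactoryItem {
  name: string;
}

// Assumed preferred ordering; unknown factories keep their relative order at the end.
const specifiedOrder = ['OpenAI', 'Tongyi-Qianwen', 'ZHIPU-AI', 'Ollama'];

const sortBySpecifiedOrder = <T extends LlmFactoryItem>(list: T[]): T[] => {
  const rank = (name: string) => {
    const index = specifiedOrder.indexOf(name);
    return index === -1 ? Number.MAX_SAFE_INTEGER : index;
  };
  // Array.prototype.sort is stable, so ties preserve the incoming order.
  return [...list].sort((a, b) => rank(a.name) - rank(b.name));
};

console.log(sortBySpecifiedOrder([{ name: 'Ollama' }, { name: 'OpenAI' }, { name: 'Other' }]));
// [{ name: 'OpenAI' }, { name: 'Ollama' }, { name: 'Other' }]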
@@ -14,5 +14,5 @@ export interface IModalProps<T> {
   hideModal(): void;
   visible: boolean;
   loading?: boolean;
-  onOk?(payload?: T): Promise<void> | void;
+  onOk?(payload?: T): Promise<any> | void;
 }
@@ -21,11 +21,11 @@ export interface LlmSetting {
 }

 export interface Variable {
-  frequency_penalty: number;
-  max_tokens: number;
-  presence_penalty: number;
-  temperature: number;
-  top_p: number;
+  frequency_penalty?: number;
+  max_tokens?: number;
+  presence_penalty?: number;
+  temperature?: number;
+  top_p?: number;
 }

 export interface IDialog {
@@ -38,7 +38,7 @@ export interface IDialog {
   kb_names: string[];
   language: string;
   llm_id: string;
-  llm_setting: LlmSetting;
+  llm_setting: Variable;
   llm_setting_type: string;
   name: string;
   prompt_config: PromptConfig;
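Note: making every Variable field optional lets a dialog's llm_setting carry only the values the user actually changed. The sketch below shows one way such a sparse override can be merged onto defaults; the default numbers are illustrative, not the project's.

// Illustrative sketch of a sparse llm_setting override merged onto defaults.
interface Variable {
  frequency_penalty?: number;
  max_tokens?: number;
  presence_penalty?: number;
  temperature?: number;
  top_p?: number;
}

// Assumed defaults, for demonstration only.
const defaults: Required<Variable> = {
  frequency_penalty: 0.7,
  max_tokens: 512,
  presence_penalty: 0.4,
  temperature: 0.1,
  top_p: 0.3,
};

// Only the keys the user actually touched need to be sent.
const userOverrides: Variable = { temperature: 0.5 };

const effectiveSetting = { ...defaults, ...userOverrides };
console.log(effectiveSetting.temperature); // 0.5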
@@ -1,5 +1,5 @@
 import { ReactComponent as StarIon } from '@/assets/svg/chat-star.svg';
-// import { ReactComponent as FileIcon } from '@/assets/svg/file-management.svg';
+import { ReactComponent as FileIcon } from '@/assets/svg/file-management.svg';
 import { ReactComponent as KnowledgeBaseIcon } from '@/assets/svg/knowledge-base.svg';
 import { ReactComponent as Logo } from '@/assets/svg/logo.svg';
 import { useTranslate } from '@/hooks/commonHooks';
@@ -25,7 +25,7 @@ const RagHeader = () => {
     () => [
       { path: '/knowledge', name: t('knowledgeBase'), icon: KnowledgeBaseIcon },
       { path: '/chat', name: t('chat'), icon: StarIon },
-      // { path: '/file', name: 'File Management', icon: FileIcon },
+      { path: '/file', name: t('fileManager'), icon: FileIcon },
     ],
     [t],
   );
@@ -15,3 +15,7 @@
   vertical-align: middle;
   cursor: pointer;
 }
+
+.language {
+  cursor: pointer;
+}
@@ -1,6 +1,5 @@
-import { ReactComponent as TranslationIcon } from '@/assets/svg/translation.svg';
 import { useTranslate } from '@/hooks/commonHooks';
-import { GithubOutlined } from '@ant-design/icons';
+import { DownOutlined, GithubOutlined } from '@ant-design/icons';
 import { Dropdown, MenuProps, Space } from 'antd';
 import camelCase from 'lodash/camelCase';
 import React from 'react';
@@ -8,6 +7,7 @@ import User from '../user';

 import { LanguageList } from '@/constants/common';
 import { useChangeLanguage } from '@/hooks/logicHooks';
+import { useSelector } from 'umi';
 import styled from './index.less';

 const Circle = ({ children, ...restProps }: React.PropsWithChildren) => {
@@ -25,6 +25,7 @@ const handleGithubCLick = () => {
 const RightToolBar = () => {
   const { t } = useTranslate('common');
   const changeLanguage = useChangeLanguage();
+  const { language = '' } = useSelector((state) => state.settingModel.userInfo);

   const handleItemClick: MenuProps['onClick'] = ({ key }) => {
     changeLanguage(key);
@@ -40,14 +41,15 @@ const RightToolBar = () => {
   return (
     <div className={styled.toolbarWrapper}>
       <Space wrap size={16}>
+        <Dropdown menu={{ items, onClick: handleItemClick }} placement="bottom">
+          <Space className={styled.language}>
+            <b>{t(camelCase(language))}</b>
+            <DownOutlined />
+          </Space>
+        </Dropdown>
         <Circle>
           <GithubOutlined onClick={handleGithubCLick} />
         </Circle>
-        <Dropdown menu={{ items, onClick: handleItemClick }} placement="bottom">
-          <Circle>
-            <TranslationIcon />
-          </Circle>
-        </Dropdown>
         {/* <Circle>
           <MonIcon />
         </Circle> */}
@@ -42,3 +42,17 @@
     }
   }
 }
+
+.textEllipsis() {
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.multipleLineEllipsis(@line) {
+  display: -webkit-box;
+  -webkit-box-orient: vertical;
+  -webkit-line-clamp: @line;
+  overflow: hidden;
+  text-overflow: ellipsis;
+}
@@ -22,6 +22,9 @@ export default {
     languagePlaceholder: 'select your language',
     copy: 'Copy',
     copied: 'Copied',
+    comingSoon: 'Coming Soon',
+    download: 'Download',
+    close: 'Close',
   },
   login: {
     login: 'Sign in',
@@ -52,6 +55,7 @@ export default {
     home: 'Home',
     setting: '用户设置',
     logout: '登出',
+    fileManager: 'File Management',
   },
   knowledgeList: {
     welcome: 'Welcome back',
@@ -60,6 +64,7 @@ export default {
     name: 'Name',
     namePlaceholder: 'Please input name!',
     doc: 'Docs',
+    searchKnowledgePlaceholder: 'Search',
   },
   knowledgeDetails: {
     dataset: 'Dataset',
@@ -274,6 +279,8 @@ export default {
     keyword: 'Keyword',
     function: 'Function',
     chunkMessage: 'Please input value!',
+    full: 'Full text',
+    ellipse: 'Ellipse',
   },
   chat: {
     createAssistant: 'Create an Assistant',
@@ -459,6 +466,7 @@ export default {
     renamed: 'Renamed',
     operated: 'Operated',
     updated: 'Updated',
+    uploaded: 'Uploaded',
     200: 'The server successfully returns the requested data.',
     201: 'Create or modify data successfully.',
     202: 'A request has been queued in the background (asynchronous task).',
@@ -480,6 +488,24 @@ export default {
     networkAnomaly: 'network anomaly',
     hint: 'hint',
   },
+  fileManager: {
+    name: 'Name',
+    uploadDate: 'Upload Date',
+    knowledgeBase: 'Knowledge Base',
+    size: 'Size',
+    action: 'Action',
+    addToKnowledge: 'Add to Knowledge Base',
+    pleaseSelect: 'Please select',
+    newFolder: 'New Folder',
+    file: 'File',
+    uploadFile: 'Upload File',
+    directory: 'Directory',
+    uploadTitle: 'Click or drag file to this area to upload',
+    uploadDescription:
+      'Support for a single or bulk upload. Strictly prohibited from uploading company data or other banned files.',
+    local: 'Local uploads',
+    s3: 'S3 uploads',
+  },
   footer: {
     profile: 'All rights reserved @ React',
   },
@@ -22,6 +22,9 @@ export default {
     languagePlaceholder: '請選擇語言',
     copy: '複製',
     copied: '複製成功',
+    comingSoon: '即將推出',
+    download: '下載',
+    close: '关闭',
   },
   login: {
     login: '登入',
@@ -52,6 +55,7 @@ export default {
     home: '首頁',
     setting: '用戶設置',
     logout: '登出',
+    fileManager: '文件管理',
   },
   knowledgeList: {
     welcome: '歡迎回來',
@@ -60,6 +64,7 @@ export default {
     name: '名稱',
     namePlaceholder: '請輸入名稱',
     doc: '文件',
+    searchKnowledgePlaceholder: '搜索',
   },
   knowledgeDetails: {
     dataset: '數據集',
@@ -218,7 +223,7 @@ export default {
     您只需與<i>'ragflow'</i>交談即可列出所有符合資格的候選人。
     </p>
     `,
-    table: `支持<p><b>excel</b>和<b>csv/txt</b>格式文件。</p><p>以下是一些提示: <ul> <li>对于Csv或Txt文件,列之间的分隔符为 <em><b>tab</b></em>。</li> <li>第一行必须是列标题。</li> <li>列标题必须是有意义的术语,以便我们的法学硕士能够理解。列举一些同义词时最好使用斜杠<i>'/'</i>来分隔,甚至更好使用方括号枚举值,例如 <i>“性別/性別(男性,女性)”</i>.<p>以下是标题的一些示例:<ol> <li>供应商/供货商<b>'tab'</b>顏色(黃色、紅色、棕色)<b>'tab'</b>性別(男、女)<b>'tab'</B>尺码(m、l、xl、xxl)</li> <li>姓名/名字<b>'tab'</b>電話/手機/微信<b>'tab'</b>最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,mpa,mba,emba)</li> </ol> </p> </li> <li>表中的每一行都将被视为一个块。</li> </ul>`,
+    table: `支持<p><b>excel</b>和<b>csv/txt</b>格式文件。</p><p>以下是一些提示: <ul> <li>对于Csv或Txt文件,列之间的分隔符为 <em><b>tab</b></em>。</li> <li>第一行必须是列标题。</li> <li>列标题必须是有意义的术语,以便我们的大語言模型能够理解。列举一些同义词时最好使用斜杠<i>'/'</i>来分隔,甚至更好使用方括号枚举值,例如 <i>“性別/性別(男性,女性)”</i>.<p>以下是标题的一些示例:<ol> <li>供应商/供货商<b>'tab'</b>顏色(黃色、紅色、棕色)<b>'tab'</b>性別(男、女)<b>'tab'</B>尺码(m、l、xl、xxl)</li> <li>姓名/名字<b>'tab'</b>電話/手機/微信<b>'tab'</b>最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,mpa,mba,emba)</li> </ol> </p> </li> <li>表中的每一行都将被视为一个块。</li> </ul>`,
     picture: `
   <p>支持圖像文件。視頻即將推出。</p><p>
   如果圖片中有文字,則應用 OCR 提取文字作為其文字描述。
@@ -247,6 +252,8 @@ export default {
     keyword: '關鍵詞',
     function: '函數',
     chunkMessage: '請輸入值!',
+    full: '全文',
+    ellipse: '省略',
   },
   chat: {
     createAssistant: '新建助理',
@@ -424,6 +431,7 @@ export default {
     renamed: '重命名成功',
     operated: '操作成功',
     updated: '更新成功',
+    uploaded: '上傳成功',
     200: '服務器成功返回請求的數據。',
     201: '新建或修改數據成功。',
     202: '一個請求已經進入後台排隊(異步任務)。',
@@ -444,6 +452,23 @@ export default {
     networkAnomaly: '網絡異常',
     hint: '提示',
   },
+  fileManager: {
+    name: '名稱',
+    uploadDate: '上傳日期',
+    knowledgeBase: '知識庫',
+    size: '大小',
+    action: '操作',
+    addToKnowledge: '添加到知識庫',
+    pleaseSelect: '請選擇',
+    newFolder: '新建文件夾',
+    uploadFile: '上傳文件',
+    uploadTitle: '點擊或拖拽文件至此區域即可上傳',
+    uploadDescription: '支持單次或批量上傳。嚴禁上傳公司數據或其他違禁文件。',
+    file: '文件',
+    directory: '文件夾',
+    local: '本地上傳',
+    s3: 'S3 上傳',
+  },
   footer: {
     profile: '“保留所有權利 @ react”',
   },
@@ -22,6 +22,9 @@ export default {
     languagePlaceholder: '请选择语言',
     copy: '复制',
     copied: '复制成功',
+    comingSoon: '即将推出',
+    download: '下载',
+    close: '关闭',
   },
   login: {
     login: '登录',
@@ -52,6 +55,7 @@ export default {
     home: '首页',
     setting: '用户设置',
     logout: '登出',
+    fileManager: '文件管理',
   },
   knowledgeList: {
     welcome: '欢迎回来',
@@ -60,6 +64,7 @@ export default {
     name: '名称',
     namePlaceholder: '请输入名称',
     doc: '文档',
+    searchKnowledgePlaceholder: '搜索',
   },
   knowledgeDetails: {
     dataset: '数据集',
@@ -225,7 +230,7 @@ export default {
     <ul>
       <li>对于 csv 或 txt 文件,列之间的分隔符为 <em><b>TAB</b></em>。</li>
       <li>第一行必须是列标题。</li>
-      <li>列标题必须是有意义的术语,以便我们的法学硕士能够理解。
+      <li>列标题必须是有意义的术语,以便我们的大语言模型能够理解。
       列举一些同义词时最好使用斜杠<i>'/'</i>来分隔,甚至更好
       使用方括号枚举值,例如 <i>'gender/sex(male,female)'</i>.<p>
       以下是标题的一些示例:<ol>
@@ -264,6 +269,8 @@ export default {
     keyword: '关键词',
     function: '函数',
     chunkMessage: '请输入值!',
+    full: '全文',
+    ellipse: '省略',
   },
   chat: {
     createAssistant: '新建助理',
@@ -298,7 +305,7 @@ export default {
     systemTip:
       '当LLM回答问题时,你需要LLM遵循的说明,比如角色设计、答案长度和答案语言等。',
     topN: 'Top N',
-    topNTip: `并非所有相似度得分高于“相似度阈值”的块都会被提供给法学硕士。 LLM 只能看到这些“Top N”块。`,
+    topNTip: `并非所有相似度得分高于“相似度阈值”的块都会被提供给大语言模型。 LLM 只能看到这些“Top N”块。`,
     variable: '变量',
     variableTip: `如果您使用对话 API,变量可能会帮助您使用不同的策略与客户聊天。
       这些变量用于填写提示中的“系统”部分,以便给LLM一个提示。
@@ -315,7 +322,7 @@ export default {
     improvise: '即兴创作',
     precise: '精确',
     balance: '平衡',
-    freedomTip: `“精确”意味着法学硕士会保守并谨慎地回答你的问题。 “即兴发挥”意味着你希望法学硕士能够自由地畅所欲言。 “平衡”是谨慎与自由之间的平衡。`,
+    freedomTip: `“精确”意味着大语言模型会保守并谨慎地回答你的问题。 “即兴发挥”意味着你希望大语言模型能够自由地畅所欲言。 “平衡”是谨慎与自由之间的平衡。`,
     temperature: '温度',
     temperatureMessage: '温度是必填项',
     temperatureTip:
@@ -441,6 +448,7 @@ export default {
     renamed: '重命名成功',
     operated: '操作成功',
     updated: '更新成功',
+    uploaded: '上传成功',
     200: '服务器成功返回请求的数据。',
     201: '新建或修改数据成功。',
     202: '一个请求已经进入后台排队(异步任务)。',
@@ -461,6 +469,24 @@ export default {
     networkAnomaly: '网络异常',
     hint: '提示',
   },
+  fileManager: {
+    name: '名称',
+    uploadDate: '上传日期',
+    knowledgeBase: '知识库',
+    size: '大小',
+    action: '操作',
+    addToKnowledge: '添加到知识库',
+    pleaseSelect: '请选择',
+    newFolder: '新建文件夹',
+    uploadFile: '上传文件',
+    uploadTitle: '点击或拖拽文件至此区域即可上传',
+    uploadDescription:
+      '支持单次或批量上传。 严禁上传公司数据或其他违禁文件。',
+    file: '文件',
+    directory: '文件夹',
+    local: '本地上传',
+    s3: 'S3 上传',
+  },
   footer: {
     profile: 'All rights reserved @ React',
   },
@@ -14,6 +14,10 @@
   .chunkText;
 }

+.contentEllipsis {
+  .multipleLineEllipsis(3);
+}
+
 .chunkCard {
   width: 100%;
 }
Some files were not shown because too many files have changed in this diff.