Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Compare commits
53 Commits
48607c3cfb, d15ba37313, a553dc8dbd, eb27a4309e, 48e1534bf4, e9d19c4684, 8d6d7f6887, a6e4b74d94,
a5aed2412f, 2810c60757, 62afcf5ac8, a74c755d83, 7013d7f620, de839fc3f0, c6b6c748ae, ca5acc151a,
385dbe5ab5, 3050a8cb07, 9c77d367d0, 5f03a4de11, 290e5d958d, 9703633a57, 7d3b68bb1e, c89f3c3cdb,
5d7f573379, cab274f560, 7059ec2298, 674b3aeafd, 4c1476032d, 2af74cc494, 38f0cc016f, 6874c6f3a7,
8acc01a227, 8c07992b6c, aee8b48d2f, daf215d266, cdcc779705, d589b0f568, 9d60a84958, aadb9cbec8,
038822f3bd, ae501c58fa, 944776f207, f1c98aad6b, ab06f502d7, 6329339a32, 84b39c60f6, eb62c669ae,
f69ff39fa0, b1cd203904, b75d75e995, 76c477f211, 1b01c4fe69
.github/ISSUE_TEMPLATE/bug_report.yml (vendored): 2 changed lines

@@ -1,5 +1,5 @@
 name: Bug Report
-description: Create a bug issue for infinity
+description: Create a bug issue for RAGFlow
 title: "[Bug]: "
 labels: [bug]
 body:
.github/ISSUE_TEMPLATE/feature_request.md (vendored): 2 changed lines

@@ -1,7 +1,7 @@
 ---
 name: Feature request
 title: '[Feature Request]: '
-about: Suggest an idea for Infinity
+about: Suggest an idea for RAGFlow
 labels: ''
 ---
 
.github/ISSUE_TEMPLATE/feature_request.yml (vendored): 2 changed lines

@@ -1,5 +1,5 @@
 name: Feature request
-description: Propose a feature request for infinity.
+description: Propose a feature request for RAGFlow.
 title: "[Feature Request]: "
 labels: [feature request]
 body:
.github/ISSUE_TEMPLATE/question.yml (vendored): 2 changed lines

@@ -1,5 +1,5 @@
 name: Question
-description: Ask questions on infinity
+description: Ask questions on RAGFlow
 title: "[Question]: "
 labels: [question]
 body:
.github/ISSUE_TEMPLATE/subtask.yml (vendored): 2 changed lines

@@ -1,5 +1,5 @@
 name: Subtask
-description: "Propose a subtask for infinity"
+description: "Propose a subtask for RAGFlow"
 title: "[Subtask]: "
 labels: [subtask]
 
.gitignore (vendored): 2 changed lines

@@ -27,3 +27,5 @@ Cargo.lock
 
 # Exclude the log folder
 docker/ragflow-logs/
+/flask_session
+/logs
@@ -4,7 +4,7 @@ USER root
 WORKDIR /ragflow
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf

@@ -9,7 +9,7 @@ RUN /root/miniconda3/envs/py11/bin/pip install onnxruntime-gpu --extra-index-url
 
 
 ADD ./web ./web
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 
 ADD ./api ./api
 ADD ./conf ./conf

@@ -34,7 +34,7 @@ ADD ./requirements.txt ./requirements.txt
 RUN apt install openmpi-bin openmpi-common libopenmpi-dev
 ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install -i https://mirrors.aliyun.com/pypi/simple/ -r ./requirements.txt
 
 RUN apt-get update && \

@@ -35,7 +35,7 @@ RUN dnf install -y openmpi openmpi-devel python3-openmpi
 ENV C_INCLUDE_PATH /usr/include/openmpi-x86_64:$C_INCLUDE_PATH
 ENV LD_LIBRARY_PATH /usr/lib64/openmpi/lib:$LD_LIBRARY_PATH
 RUN rm /root/miniconda3/envs/py11/compiler_compat/ld
-RUN cd ./web && npm i && npm run build
+RUN cd ./web && npm i --force && npm run build
 RUN conda run -n py11 pip install $(grep -ivE "mpi4py" ./requirements.txt) # without mpi4py==3.1.5
 RUN conda run -n py11 pip install redis
 
README.md: 88 changed lines

@@ -11,16 +11,16 @@
 </p>
 
 <p align="center">
-  <a href="https://github.com/infiniflow/infinity/releases/latest">
+  <a href="https://github.com/infiniflow/ragflow/releases/latest">
   <img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
   </a>
   <a href="https://demo.ragflow.io" target="_blank">
-  <img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
+  <img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
   <a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
-  <img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.1-brightgreen"
-  alt="docker pull infiniflow/ragflow:v0.3.1"></a>
+  <img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
+  alt="docker pull infiniflow/ragflow:v0.5.0"></a>
   <a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
-  <img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
+  <img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
   </a>
 </p>
 
@@ -58,13 +58,14 @@
 
 ## 📌 Latest Features
 
-- 2024-04-19 Support conversation API ([detail](./docs/conversation_api.md)).
-- 2024-04-16 Add an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding).
-- 2024-04-16 Add [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
-- 2024-04-11 Support [Xinference](./docs/xinference.md) for local LLM deployment.
-- 2024-04-10 Add a new layout recognization model for analyzing Laws documentation.
-- 2024-04-08 Support [Ollama](./docs/ollama.md) for local LLM deployment.
-- 2024-04-07 Support Chinese UI.
+- 2024-05-08 Integrates LLM DeepSeek.
+- 2024-04-26 Adds file management.
+- 2024-04-19 Supports conversation API ([detail](./docs/conversation_api.md)).
+- 2024-04-16 Integrates an embedding model 'bce-embedding-base_v1' from [BCEmbedding](https://github.com/netease-youdao/BCEmbedding), and [FastEmbed](https://github.com/qdrant/fastembed), which is designed specifically for light and speedy embedding.
+- 2024-04-11 Supports [Xinference](./docs/xinference.md) for local LLM deployment.
+- 2024-04-10 Adds a new layout recognition model for analyzing Laws documentation.
+- 2024-04-08 Supports [Ollama](./docs/ollama.md) for local LLM deployment.
+- 2024-04-07 Supports Chinese UI.
 
 ## 🔎 System Architecture
 
@@ -118,6 +119,7 @@
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
+> Please note that running the above commands will automatically download the development version docker image of RAGFlow. If you want to download and run a specific version of docker image, please find the RAGFLOW_VERSION variable in the docker/.env file, change it to the corresponding version, for example, RAGFLOW_VERSION=v0.5.0, and run the above commands.
 
 > The core image is about 9 GB in size and may take a while to load.
 
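As a concrete illustration of the note above, pinning the image to a released tag rather than the dev build could look like the sketch below. The sed one-liner is just one way to make the edit, and the exact layout of docker/.env may differ from what is assumed here.

```bash
# Hedged sketch: pin RAGFLOW_VERSION in docker/.env before starting the stack.
$ cd ragflow/docker
$ sed -i 's/^RAGFLOW_VERSION=.*/RAGFLOW_VERSION=v0.5.0/' .env   # or edit .env by hand
$ docker compose up -d                                          # pulls the pinned image instead of dev
```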
@@ -179,12 +181,72 @@ To build the Docker images from source:
 ```bash
 $ git clone https://github.com/infiniflow/ragflow.git
 $ cd ragflow/
-$ docker build -t infiniflow/ragflow:v0.3.1 .
+$ docker build -t infiniflow/ragflow:dev .
 $ cd ragflow/docker
 $ chmod +x ./entrypoint.sh
 $ docker compose up -d
 ```
 
+## 🛠️ Launch Service from Source
+
+To launch the service from source, please follow these steps:
+
+1. Clone the repository
+```bash
+$ git clone https://github.com/infiniflow/ragflow.git
+$ cd ragflow/
+```
+
+2. Create a virtual environment (ensure Anaconda or Miniconda is installed)
+```bash
+$ conda create -n ragflow python=3.11.0
+$ conda activate ragflow
+$ pip install -r requirements.txt
+```
+If CUDA version is greater than 12.0, execute the following additional commands:
+```bash
+$ pip uninstall -y onnxruntime-gpu
+$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+```
+
+3. Copy the entry script and configure environment variables
+```bash
+$ cp docker/entrypoint.sh .
+$ vi entrypoint.sh
+```
+Use the following commands to obtain the Python path and the ragflow project path:
+```bash
+$ which python
+$ pwd
+```
+
+Set the output of `which python` as the value for `PY` and the output of `pwd` as the value for `PYTHONPATH`.
+
+If `LD_LIBRARY_PATH` is already configured, it can be commented out.
+
+```bash
+# Adjust configurations according to your actual situation; the two export commands are newly added.
+PY=${PY}
+export PYTHONPATH=${PYTHONPATH}
+# Optional: Add Hugging Face mirror
+export HF_ENDPOINT=https://hf-mirror.com
+```
+
+4. Start the base services
+```bash
+$ cd docker
+$ docker compose -f docker-compose-base.yml up -d
+```
+
+5. Check the configuration files
+Ensure that the settings in **docker/.env** match those in **conf/service_conf.yaml**. The IP addresses and ports for related services in **service_conf.yaml** should be changed to the local machine IP and ports exposed by the container.
+
+6. Launch the service
+```bash
+$ chmod +x ./entrypoint.sh
+$ bash ./entrypoint.sh
+```
 
 ## 📚 Documentation
 
 - [FAQ](./docs/faq.md)
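For step 3 above, a filled-in entrypoint.sh header might look like the following minimal sketch. The conda and repository paths are hypothetical placeholders, not values from the repository; substitute the actual output of `which python` and `pwd` on your machine.

```bash
# Illustrative values only (assumed paths):
PY=/root/miniconda3/envs/ragflow/bin/python   # output of `which python`
export PYTHONPATH=/root/ragflow               # output of `pwd` at the repository root
# export LD_LIBRARY_PATH=...                  # comment out if your environment already sets it
export HF_ENDPOINT=https://hf-mirror.com      # optional Hugging Face mirror
```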
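For step 5 above, one quick, non-authoritative way to compare the two files is to dump the relevant entries side by side; the commands below are only a sketch and assume nothing about the exact keys in your copies.

```bash
# Show the variables the containers are started with...
$ grep -vE '^\s*(#|$)' docker/.env
# ...and the endpoints the API server will try to reach.
$ grep -nE 'host|port' conf/service_conf.yaml
# Any container hostnames in service_conf.yaml should be replaced with the local
# machine's IP, and the ports with those published by docker-compose-base.yml.
```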
78
README_ja.md
78
README_ja.md
@ -11,16 +11,16 @@
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/infiniflow/infinity/releases/latest">
|
||||
<a href="https://github.com/infiniflow/ragflow/releases/latest">
|
||||
<img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
|
||||
</a>
|
||||
<a href="https://demo.ragflow.io" target="_blank">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
|
||||
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
||||
<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.1-brightgreen"
|
||||
alt="docker pull infiniflow/ragflow:v0.3.1"></a>
|
||||
<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
|
||||
alt="docker pull infiniflow/ragflow:v0.5.0"></a>
|
||||
<a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
|
||||
<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
|
||||
<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@ -58,6 +58,8 @@
|
||||
|
||||
## 📌 最新の機能
|
||||
|
||||
- 2024-05-08
|
||||
- 2024-04-26 「ファイル管理」機能を追加しました。
|
||||
- 2024-04-19 会話 API をサポートします ([詳細](./docs/conversation_api.md))。
|
||||
- 2024-04-16 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) から埋め込みモデル「bce-embedding-base_v1」を追加します。
|
||||
- 2024-04-16 [FastEmbed](https://github.com/qdrant/fastembed) は、軽量かつ高速な埋め込み用に設計されています。
|
||||
@ -119,7 +121,9 @@
|
||||
$ docker compose up -d
|
||||
```
|
||||
|
||||
> コアイメージのサイズは約 15 GB で、ロードに時間がかかる場合があります。
|
||||
> 上記のコマンドを実行すると、RAGFlowの開発版dockerイメージが自動的にダウンロードされます。 特定のバージョンのDockerイメージをダウンロードして実行したい場合は、docker/.envファイルのRAGFLOW_VERSION変数を見つけて、対応するバージョンに変更してください。 例えば、RAGFLOW_VERSION=v0.5.0として、上記のコマンドを実行してください。
|
||||
|
||||
> コアイメージのサイズは約 9 GB で、ロードに時間がかかる場合があります。
|
||||
|
||||
4. サーバーを立ち上げた後、サーバーの状態を確認する:
|
||||
|
||||
@ -179,12 +183,72 @@
|
||||
```bash
|
||||
$ git clone https://github.com/infiniflow/ragflow.git
|
||||
$ cd ragflow/
|
||||
$ docker build -t infiniflow/ragflow:v0.3.1 .
|
||||
$ docker build -t infiniflow/ragflow:v0.5.0 .
|
||||
$ cd ragflow/docker
|
||||
$ chmod +x ./entrypoint.sh
|
||||
$ docker compose up -d
|
||||
```
|
||||
|
||||
## 🛠️ ソースコードからサービスを起動する方法
|
||||
|
||||
ソースコードからサービスを起動する場合は、以下の手順に従ってください:
|
||||
|
||||
1. リポジトリをクローンします
|
||||
```bash
|
||||
$ git clone https://github.com/infiniflow/ragflow.git
|
||||
$ cd ragflow/
|
||||
```
|
||||
|
||||
2. 仮想環境を作成します(AnacondaまたはMinicondaがインストールされていることを確認してください)
|
||||
```bash
|
||||
$ conda create -n ragflow python=3.11.0
|
||||
$ conda activate ragflow
|
||||
$ pip install -r requirements.txt
|
||||
```
|
||||
CUDAのバージョンが12.0以上の場合、以下の追加コマンドを実行してください:
|
||||
```bash
|
||||
$ pip uninstall -y onnxruntime-gpu
|
||||
$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
|
||||
```
|
||||
|
||||
3. エントリースクリプトをコピーし、環境変数を設定します
|
||||
```bash
|
||||
$ cp docker/entrypoint.sh .
|
||||
$ vi entrypoint.sh
|
||||
```
|
||||
以下のコマンドでPythonのパスとragflowプロジェクトのパスを取得します:
|
||||
```bash
|
||||
$ which python
|
||||
$ pwd
|
||||
```
|
||||
|
||||
`which python`の出力を`PY`の値として、`pwd`の出力を`PYTHONPATH`の値として設定します。
|
||||
|
||||
`LD_LIBRARY_PATH`が既に設定されている場合は、コメントアウトできます。
|
||||
|
||||
```bash
|
||||
# 実際の状況に応じて設定を調整してください。以下の二つのexportは新たに追加された設定です
|
||||
PY=${PY}
|
||||
export PYTHONPATH=${PYTHONPATH}
|
||||
# オプション:Hugging Faceミラーを追加
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
```
|
||||
|
||||
4. 基本サービスを起動します
|
||||
```bash
|
||||
$ cd docker
|
||||
$ docker compose -f docker-compose-base.yml up -d
|
||||
```
|
||||
|
||||
5. 設定ファイルを確認します
|
||||
**docker/.env**内の設定が**conf/service_conf.yaml**内の設定と一致していることを確認してください。**service_conf.yaml**内の関連サービスのIPアドレスとポートは、ローカルマシンのIPアドレスとコンテナが公開するポートに変更する必要があります。
|
||||
|
||||
6. サービスを起動します
|
||||
```bash
|
||||
$ chmod +x ./entrypoint.sh
|
||||
$ bash ./entrypoint.sh
|
||||
```
|
||||
|
||||
## 📚 ドキュメンテーション
|
||||
|
||||
- [FAQ](./docs/faq.md)
|
||||
|
||||
81
README_zh.md
81
README_zh.md
@ -11,16 +11,16 @@
|
||||
</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://github.com/infiniflow/infinity/releases/latest">
|
||||
<a href="https://github.com/infiniflow/ragflow/releases/latest">
|
||||
<img src="https://img.shields.io/github/v/release/infiniflow/ragflow?color=blue&label=Latest%20Release" alt="Latest Release">
|
||||
</a>
|
||||
<a href="https://demo.ragflow.io" target="_blank">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/RAGFLOW-LLM-white?&labelColor=dd0af7"></a>
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Online-Demo-4e6b99"></a>
|
||||
<a href="https://hub.docker.com/r/infiniflow/ragflow" target="_blank">
|
||||
<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.3.1-brightgreen"
|
||||
alt="docker pull infiniflow/ragflow:v0.3.1"></a>
|
||||
<img src="https://img.shields.io/badge/docker_pull-ragflow:v0.5.0-brightgreen"
|
||||
alt="docker pull infiniflow/ragflow:v0.5.0"></a>
|
||||
<a href="https://github.com/infiniflow/ragflow/blob/main/LICENSE">
|
||||
<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=7d09f1" alt="license">
|
||||
<img height="21" src="https://img.shields.io/badge/License-Apache--2.0-ffffff?style=flat-square&labelColor=d4eaf7&color=1570EF" alt="license">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
@ -58,9 +58,10 @@
|
||||
|
||||
## 📌 新增功能
|
||||
|
||||
- 2024-05-08 集成大模型 DeepSeek
|
||||
- 2024-04-26 增添了'文件管理'功能.
|
||||
- 2024-04-19 支持对话 API ([更多](./docs/conversation_api.md)).
|
||||
- 2024-04-16 添加嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 。
|
||||
- 2024-04-16 添加 [FastEmbed](https://github.com/qdrant/fastembed) 专为轻型和高速嵌入而设计。
|
||||
- 2024-04-16 集成嵌入模型 [BCEmbedding](https://github.com/netease-youdao/BCEmbedding) 和 专为轻型和高速嵌入而设计的 [FastEmbed](https://github.com/qdrant/fastembed) 。
|
||||
- 2024-04-11 支持用 [Xinference](./docs/xinference.md) 本地化部署大模型。
|
||||
- 2024-04-10 为‘Laws’版面分析增加了底层模型。
|
||||
- 2024-04-08 支持用 [Ollama](./docs/ollama.md) 本地化部署大模型。
|
||||
@ -119,7 +120,9 @@
|
||||
$ docker compose -f docker-compose-CN.yml up -d
|
||||
```
|
||||
|
||||
> 核心镜像文件大约 15 GB,可能需要一定时间拉取。请耐心等待。
|
||||
> 请注意,运行上述命令会自动下载 RAGFlow 的开发版本 docker 镜像。如果你想下载并运行特定版本的 docker 镜像,请在 docker/.env 文件中找到 RAGFLOW_VERSION 变量,将其改为对应版本。例如 RAGFLOW_VERSION=v0.5.0,然后运行上述命令。
|
||||
|
||||
> 核心镜像文件大约 9 GB,可能需要一定时间拉取。请耐心等待。
|
||||
|
||||
4. 服务器启动成功后再次确认服务器状态:
|
||||
|
||||
@ -179,12 +182,72 @@
|
||||
```bash
|
||||
$ git clone https://github.com/infiniflow/ragflow.git
|
||||
$ cd ragflow/
|
||||
$ docker build -t infiniflow/ragflow:v0.3.1 .
|
||||
$ docker build -t infiniflow/ragflow:v0.5.0 .
|
||||
$ cd ragflow/docker
|
||||
$ chmod +x ./entrypoint.sh
|
||||
$ docker compose up -d
|
||||
```
|
||||
|
||||
## 🛠️ 源码启动服务
|
||||
|
||||
如需从源码启动服务,请参考以下步骤:
|
||||
|
||||
1. 克隆仓库
|
||||
```bash
|
||||
$ git clone https://github.com/infiniflow/ragflow.git
|
||||
$ cd ragflow/
|
||||
```
|
||||
|
||||
2. 创建虚拟环境(确保已安装 Anaconda 或 Miniconda)
|
||||
```bash
|
||||
$ conda create -n ragflow python=3.11.0
|
||||
$ conda activate ragflow
|
||||
$ pip install -r requirements.txt
|
||||
```
|
||||
如果cuda > 12.0,需额外执行以下命令:
|
||||
```bash
|
||||
$ pip uninstall -y onnxruntime-gpu
|
||||
$ pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
|
||||
```
|
||||
|
||||
3. 拷贝入口脚本并配置环境变量
|
||||
```bash
|
||||
$ cp docker/entrypoint.sh .
|
||||
$ vi entrypoint.sh
|
||||
```
|
||||
使用以下命令获取python路径及ragflow项目路径:
|
||||
```bash
|
||||
$ which python
|
||||
$ pwd
|
||||
```
|
||||
|
||||
将上述`which python`的输出作为`PY`的值,将`pwd`的输出作为`PYTHONPATH`的值。
|
||||
|
||||
`LD_LIBRARY_PATH`如果环境已经配置好,可以注释掉。
|
||||
|
||||
```bash
|
||||
# 此处配置需要按照实际情况调整,两个export为新增配置
|
||||
PY=${PY}
|
||||
export PYTHONPATH=${PYTHONPATH}
|
||||
# 可选:添加Hugging Face镜像
|
||||
export HF_ENDPOINT=https://hf-mirror.com
|
||||
```
|
||||
|
||||
4. 启动基础服务
|
||||
```bash
|
||||
$ cd docker
|
||||
$ docker compose -f docker-compose-base.yml up -d
|
||||
```
|
||||
|
||||
5. 检查配置文件
|
||||
确保**docker/.env**中的配置与**conf/service_conf.yaml**中配置一致, **service_conf.yaml**中相关服务的IP地址与端口应该改成本机IP地址及容器映射出来的端口。
|
||||
|
||||
6. 启动服务
|
||||
```bash
|
||||
$ chmod +x ./entrypoint.sh
|
||||
$ bash ./entrypoint.sh
|
||||
```
|
||||
|
||||
## 📚 技术文档
|
||||
|
||||
- [FAQ](./docs/faq.md)
|
||||
|
||||
@@ -33,7 +33,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
 from itsdangerous import URLSafeTimedSerializer
 
 from api.utils.file_utils import filename_type, thumbnail
-from rag.utils import MINIO
+from rag.utils.minio_conn import MINIO
 
 
 def generate_confirmation_token(tenent_id):
@@ -20,8 +20,9 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, huqie
-from rag.utils import ELASTICSEARCH, rmSpace
+from rag.nlp import search, rag_tokenizer
+from rag.utils.es_conn import ELASTICSEARCH
+from rag.utils import rmSpace
 from api.db import LLMType, ParserType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import TenantLLMService
@ -124,10 +125,10 @@ def set():
|
||||
d = {
|
||||
"id": req["chunk_id"],
|
||||
"content_with_weight": req["content_with_weight"]}
|
||||
d["content_ltks"] = huqie.qie(req["content_with_weight"])
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
d["important_kwd"] = req["important_kwd"]
|
||||
d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
|
||||
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
|
||||
if "available_int" in req:
|
||||
d["available_int"] = req["available_int"]
|
||||
|
||||
@ -151,7 +152,7 @@ def set():
|
||||
retmsg="Q&A must be separated by TAB/ENTER key.")
|
||||
q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
|
||||
d = beAdoc(d, arr[0], arr[1], not any(
|
||||
[huqie.is_chinese(t) for t in q + a]))
|
||||
[rag_tokenizer.is_chinese(t) for t in q + a]))
|
||||
|
||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||
@ -201,11 +202,11 @@ def create():
|
||||
md5 = hashlib.md5()
|
||||
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
|
||||
chunck_id = md5.hexdigest()
|
||||
d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
|
||||
d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
|
||||
"content_with_weight": req["content_with_weight"]}
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
d["important_kwd"] = req.get("important_kwd", [])
|
||||
d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
|
||||
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
|
||||
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
||||
|
||||
|
||||
@ -35,13 +35,7 @@ def set_dialog():
|
||||
top_n = req.get("top_n", 6)
|
||||
similarity_threshold = req.get("similarity_threshold", 0.1)
|
||||
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
|
||||
llm_setting = req.get("llm_setting", {
|
||||
"temperature": 0.1,
|
||||
"top_p": 0.3,
|
||||
"frequency_penalty": 0.7,
|
||||
"presence_penalty": 0.4,
|
||||
"max_tokens": 215
|
||||
})
|
||||
llm_setting = req.get("llm_setting", {})
|
||||
default_prompt = {
|
||||
"system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。
|
||||
以下是知识库:
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
# limitations under the License
|
||||
#
|
||||
|
||||
import base64
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
@ -23,8 +22,13 @@ import flask
|
||||
from elasticsearch_dsl import Q
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
|
||||
from api.db.db_models import Task
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.task_service import TaskService, queue_tasks
|
||||
from rag.nlp import search
|
||||
from rag.utils import ELASTICSEARCH
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||
@ -48,34 +52,35 @@ def upload():
|
||||
if 'file' not in request.files:
|
||||
return get_json_result(
|
||||
data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
|
||||
file = request.files['file']
|
||||
if file.filename == '':
|
||||
|
||||
file_objs = request.files.getlist('file')
|
||||
for file_obj in file_objs:
|
||||
if file_obj.filename == '':
|
||||
return get_json_result(
|
||||
data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
|
||||
|
||||
err = []
|
||||
for file in file_objs:
|
||||
try:
|
||||
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
||||
if not e:
|
||||
return get_data_error_result(
|
||||
retmsg="Can't find this knowledgebase!")
|
||||
raise LookupError("Can't find this knowledgebase!")
|
||||
MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
|
||||
if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
|
||||
return get_data_error_result(
|
||||
retmsg="Exceed the maximum file number of a free user!")
|
||||
raise RuntimeError("Exceed the maximum file number of a free user!")
|
||||
|
||||
filename = duplicate_name(
|
||||
DocumentService.query,
|
||||
name=file.filename,
|
||||
kb_id=kb.id)
|
||||
filetype = filename_type(filename)
|
||||
if not filetype:
|
||||
return get_data_error_result(
|
||||
retmsg="This type of file has not been supported yet!")
|
||||
if filetype == FileType.OTHER.value:
|
||||
raise RuntimeError("This type of file has not been supported yet!")
|
||||
|
||||
location = filename
|
||||
while MINIO.obj_exist(kb_id, location):
|
||||
location += "_"
|
||||
blob = request.files['file'].read()
|
||||
blob = file.read()
|
||||
MINIO.put(kb_id, location, blob)
|
||||
doc = {
|
||||
"id": get_uuid(),
|
||||
@ -93,10 +98,13 @@ def upload():
|
||||
doc["parser_id"] = ParserType.PICTURE.value
|
||||
if re.search(r"\.(ppt|pptx|pages)$", filename):
|
||||
doc["parser_id"] = ParserType.PRESENTATION.value
|
||||
doc = DocumentService.insert(doc)
|
||||
return get_json_result(data=doc.to_json())
|
||||
DocumentService.insert(doc)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
err.append(file.filename + ": " + str(e))
|
||||
if err:
|
||||
return get_json_result(
|
||||
data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
|
||||
return get_json_result(data=True)
|
||||
|
||||
|
||||
@manager.route('/create', methods=['POST'])
|
||||
@ -218,26 +226,37 @@ def change_status():
|
||||
@validate_request("doc_id")
|
||||
def rm():
|
||||
req = request.json
|
||||
doc_ids = req["doc_id"]
|
||||
if isinstance(doc_ids, str): doc_ids = [doc_ids]
|
||||
errors = ""
|
||||
for doc_id in doc_ids:
|
||||
try:
|
||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
if not tenant_id:
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
|
||||
DocumentService.increment_chunk_num(
|
||||
doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
|
||||
if not DocumentService.delete(doc):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document removal)!")
|
||||
|
||||
informs = File2DocumentService.get_by_document_id(doc_id)
|
||||
if not informs:
|
||||
MINIO.rm(doc.kb_id, doc.location)
|
||||
return get_json_result(data=True)
|
||||
else:
|
||||
File2DocumentService.delete_by_document_id(doc_id)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
errors += str(e)
|
||||
|
||||
if errors: return server_error_response(e)
|
||||
return get_json_result(data=True)
|
||||
|
||||
|
||||
@manager.route('/run', methods=['POST'])
|
||||
@ -260,6 +279,14 @@ def run():
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
|
||||
|
||||
if str(req["run"]) == TaskStatus.RUNNING.value:
|
||||
TaskService.filter_delete([Task.doc_id == id])
|
||||
e, doc = DocumentService.get_by_id(id)
|
||||
doc = doc.to_dict()
|
||||
doc["tenant_id"] = tenant_id
|
||||
bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
|
||||
queue_tasks(doc, bucket, name)
|
||||
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -289,6 +316,11 @@ def rename():
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document rename)!")
|
||||
|
||||
informs = File2DocumentService.get_by_document_id(req["doc_id"])
|
||||
if informs:
|
||||
e, file = FileService.get_by_id(informs[0].file_id)
|
||||
FileService.update_by_id(file.id, {"name": req["name"]})
|
||||
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -302,7 +334,13 @@ def get(doc_id):
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
|
||||
informs = File2DocumentService.get_by_document_id(doc_id)
|
||||
if not informs:
|
||||
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
|
||||
else:
|
||||
e, file = FileService.get_by_id(informs[0].file_id)
|
||||
response = flask.make_response(MINIO.get(file.parent_id, doc.location))
|
||||
|
||||
ext = re.search(r"\.([^.]+)$", doc.name)
|
||||
if ext:
|
||||
if doc.type == FileType.VISUAL.value:
|
||||
@ -338,7 +376,8 @@ def change_parser():
|
||||
return get_data_error_result(retmsg="Not supported yet!")
|
||||
|
||||
e = DocumentService.update_by_id(doc.id,
|
||||
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
|
||||
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
|
||||
"run": TaskStatus.UNSTART.value})
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
if "parser_config" in req:
|
||||
|
||||
137
api/apps/file2document_app.py
Normal file
137
api/apps/file2document_app.py
Normal file
@ -0,0 +1,137 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License
|
||||
#
|
||||
from elasticsearch_dsl import Q
|
||||
|
||||
from api.db.db_models import File2Document
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||
from api.utils import get_uuid
|
||||
from api.db import FileType
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.settings import RetCode
|
||||
from api.utils.api_utils import get_json_result
|
||||
from rag.nlp import search
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
|
||||
|
||||
@manager.route('/convert', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("file_ids", "kb_ids")
|
||||
def convert():
|
||||
req = request.json
|
||||
kb_ids = req["kb_ids"]
|
||||
file_ids = req["file_ids"]
|
||||
file2documents = []
|
||||
|
||||
try:
|
||||
for file_id in file_ids:
|
||||
e, file = FileService.get_by_id(file_id)
|
||||
file_ids_list = [file_id]
|
||||
if file.type == FileType.FOLDER.value:
|
||||
file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
|
||||
for id in file_ids_list:
|
||||
informs = File2DocumentService.get_by_file_id(id)
|
||||
# delete
|
||||
for inform in informs:
|
||||
doc_id = inform.document_id
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
if not tenant_id:
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
DocumentService.increment_chunk_num(
|
||||
doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
|
||||
if not DocumentService.delete(doc):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document removal)!")
|
||||
File2DocumentService.delete_by_file_id(id)
|
||||
|
||||
# insert
|
||||
for kb_id in kb_ids:
|
||||
e, kb = KnowledgebaseService.get_by_id(kb_id)
|
||||
if not e:
|
||||
return get_data_error_result(
|
||||
retmsg="Can't find this knowledgebase!")
|
||||
e, file = FileService.get_by_id(id)
|
||||
if not e:
|
||||
return get_data_error_result(
|
||||
retmsg="Can't find this file!")
|
||||
|
||||
doc = DocumentService.insert({
|
||||
"id": get_uuid(),
|
||||
"kb_id": kb.id,
|
||||
"parser_id": kb.parser_id,
|
||||
"parser_config": kb.parser_config,
|
||||
"created_by": current_user.id,
|
||||
"type": file.type,
|
||||
"name": file.name,
|
||||
"location": file.location,
|
||||
"size": file.size
|
||||
})
|
||||
file2document = File2DocumentService.insert({
|
||||
"id": get_uuid(),
|
||||
"file_id": id,
|
||||
"document_id": doc.id,
|
||||
})
|
||||
file2documents.append(file2document.to_json())
|
||||
return get_json_result(data=file2documents)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/rm', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("file_ids")
|
||||
def rm():
|
||||
req = request.json
|
||||
file_ids = req["file_ids"]
|
||||
if not file_ids:
|
||||
return get_json_result(
|
||||
data=False, retmsg='Lack of "Files ID"', retcode=RetCode.ARGUMENT_ERROR)
|
||||
try:
|
||||
for file_id in file_ids:
|
||||
informs = File2DocumentService.get_by_file_id(file_id)
|
||||
if not informs:
|
||||
return get_data_error_result(retmsg="Inform not found!")
|
||||
for inform in informs:
|
||||
if not inform:
|
||||
return get_data_error_result(retmsg="Inform not found!")
|
||||
File2DocumentService.delete_by_file_id(file_id)
|
||||
doc_id = inform.document_id
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
if not tenant_id:
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
DocumentService.increment_chunk_num(
|
||||
doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
|
||||
if not DocumentService.delete(doc):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document removal)!")
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
347
api/apps/file_app.py
Normal file
347
api/apps/file_app.py
Normal file
@ -0,0 +1,347 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License
|
||||
#
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import flask
|
||||
from elasticsearch_dsl import Q
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||
from api.utils import get_uuid
|
||||
from api.db import FileType
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.file_service import FileService
|
||||
from api.settings import RetCode
|
||||
from api.utils.api_utils import get_json_result
|
||||
from api.utils.file_utils import filename_type
|
||||
from rag.nlp import search
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils.minio_conn import MINIO
|
||||
|
||||
|
||||
@manager.route('/upload', methods=['POST'])
|
||||
@login_required
|
||||
# @validate_request("parent_id")
|
||||
def upload():
|
||||
pf_id = request.form.get("parent_id")
|
||||
|
||||
if not pf_id:
|
||||
root_folder = FileService.get_root_folder(current_user.id)
|
||||
pf_id = root_folder.id
|
||||
|
||||
if 'file' not in request.files:
|
||||
return get_json_result(
|
||||
data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
|
||||
file_objs = request.files.getlist('file')
|
||||
|
||||
for file_obj in file_objs:
|
||||
if file_obj.filename == '':
|
||||
return get_json_result(
|
||||
data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
|
||||
file_res = []
|
||||
try:
|
||||
for file_obj in file_objs:
|
||||
e, file = FileService.get_by_id(pf_id)
|
||||
if not e:
|
||||
return get_data_error_result(
|
||||
retmsg="Can't find this folder!")
|
||||
MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
|
||||
if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(current_user.id) >= MAX_FILE_NUM_PER_USER:
|
||||
return get_data_error_result(
|
||||
retmsg="Exceed the maximum file number of a free user!")
|
||||
|
||||
# split file name path
|
||||
if not file_obj.filename:
|
||||
e, file = FileService.get_by_id(pf_id)
|
||||
file_obj_names = [file.name, file_obj.filename]
|
||||
else:
|
||||
full_path = '/' + file_obj.filename
|
||||
file_obj_names = full_path.split('/')
|
||||
file_len = len(file_obj_names)
|
||||
|
||||
# get folder
|
||||
file_id_list = FileService.get_id_list_by_id(pf_id, file_obj_names, 1, [pf_id])
|
||||
len_id_list = len(file_id_list)
|
||||
|
||||
# create folder
|
||||
if file_len != len_id_list:
|
||||
e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Folder not found!")
|
||||
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
|
||||
len_id_list)
|
||||
else:
|
||||
e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Folder not found!")
|
||||
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
|
||||
len_id_list)
|
||||
|
||||
# file type
|
||||
filetype = filename_type(file_obj_names[file_len - 1])
|
||||
location = file_obj_names[file_len - 1]
|
||||
while MINIO.obj_exist(last_folder.id, location):
|
||||
location += "_"
|
||||
blob = file_obj.read()
|
||||
filename = duplicate_name(
|
||||
FileService.query,
|
||||
name=file_obj_names[file_len - 1],
|
||||
parent_id=last_folder.id)
|
||||
file = {
|
||||
"id": get_uuid(),
|
||||
"parent_id": last_folder.id,
|
||||
"tenant_id": current_user.id,
|
||||
"created_by": current_user.id,
|
||||
"type": filetype,
|
||||
"name": filename,
|
||||
"location": location,
|
||||
"size": len(blob),
|
||||
}
|
||||
file = FileService.insert(file)
|
||||
MINIO.put(last_folder.id, location, blob)
|
||||
file_res.append(file.to_json())
|
||||
return get_json_result(data=file_res)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/create', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("name")
|
||||
def create():
|
||||
req = request.json
|
||||
pf_id = request.json.get("parent_id")
|
||||
input_file_type = request.json.get("type")
|
||||
if not pf_id:
|
||||
root_folder = FileService.get_root_folder(current_user.id)
|
||||
pf_id = root_folder.id
|
||||
|
||||
try:
|
||||
if not FileService.is_parent_folder_exist(pf_id):
|
||||
return get_json_result(
|
||||
data=False, retmsg="Parent Folder Doesn't Exist!", retcode=RetCode.OPERATING_ERROR)
|
||||
if FileService.query(name=req["name"], parent_id=pf_id):
|
||||
return get_data_error_result(
|
||||
retmsg="Duplicated folder name in the same folder.")
|
||||
|
||||
if input_file_type == FileType.FOLDER.value:
|
||||
file_type = FileType.FOLDER.value
|
||||
else:
|
||||
file_type = FileType.VIRTUAL.value
|
||||
|
||||
file = FileService.insert({
|
||||
"id": get_uuid(),
|
||||
"parent_id": pf_id,
|
||||
"tenant_id": current_user.id,
|
||||
"created_by": current_user.id,
|
||||
"name": req["name"],
|
||||
"location": "",
|
||||
"size": 0,
|
||||
"type": file_type
|
||||
})
|
||||
|
||||
return get_json_result(data=file.to_json())
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/list', methods=['GET'])
|
||||
@login_required
|
||||
def list():
|
||||
pf_id = request.args.get("parent_id")
|
||||
|
||||
keywords = request.args.get("keywords", "")
|
||||
|
||||
page_number = int(request.args.get("page", 1))
|
||||
items_per_page = int(request.args.get("page_size", 15))
|
||||
orderby = request.args.get("orderby", "create_time")
|
||||
desc = request.args.get("desc", True)
|
||||
if not pf_id:
|
||||
root_folder = FileService.get_root_folder(current_user.id)
|
||||
pf_id = root_folder.id
|
||||
try:
|
||||
e, file = FileService.get_by_id(pf_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Folder not found!")
|
||||
|
||||
files, total = FileService.get_by_pf_id(
|
||||
current_user.id, pf_id, page_number, items_per_page, orderby, desc, keywords)
|
||||
|
||||
parent_folder = FileService.get_parent_folder(pf_id)
|
||||
if not FileService.get_parent_folder(pf_id):
|
||||
return get_json_result(retmsg="File not found!")
|
||||
|
||||
return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/root_folder', methods=['GET'])
|
||||
@login_required
|
||||
def get_root_folder():
|
||||
try:
|
||||
root_folder = FileService.get_root_folder(current_user.id)
|
||||
return get_json_result(data={"root_folder": root_folder.to_json()})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/parent_folder', methods=['GET'])
|
||||
@login_required
|
||||
def get_parent_folder():
|
||||
file_id = request.args.get("file_id")
|
||||
try:
|
||||
e, file = FileService.get_by_id(file_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Folder not found!")
|
||||
|
||||
parent_folder = FileService.get_parent_folder(file_id)
|
||||
return get_json_result(data={"parent_folder": parent_folder.to_json()})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/all_parent_folder', methods=['GET'])
|
||||
@login_required
|
||||
def get_all_parent_folders():
|
||||
file_id = request.args.get("file_id")
|
||||
try:
|
||||
e, file = FileService.get_by_id(file_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Folder not found!")
|
||||
|
||||
parent_folders = FileService.get_all_parent_folders(file_id)
|
||||
parent_folders_res = []
|
||||
for parent_folder in parent_folders:
|
||||
parent_folders_res.append(parent_folder.to_json())
|
||||
return get_json_result(data={"parent_folders": parent_folders_res})
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/rm', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("file_ids")
|
||||
def rm():
|
||||
req = request.json
|
||||
file_ids = req["file_ids"]
|
||||
try:
|
||||
for file_id in file_ids:
|
||||
e, file = FileService.get_by_id(file_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="File or Folder not found!")
|
||||
if not file.tenant_id:
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
|
||||
if file.type == FileType.FOLDER.value:
|
||||
file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
|
||||
for inner_file_id in file_id_list:
|
||||
e, file = FileService.get_by_id(inner_file_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="File not found!")
|
||||
MINIO.rm(file.parent_id, file.location)
|
||||
FileService.delete_folder_by_pf_id(current_user.id, file_id)
|
||||
else:
|
||||
if not FileService.delete(file):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (File removal)!")
|
||||
|
||||
# delete file2document
|
||||
informs = File2DocumentService.get_by_file_id(file_id)
|
||||
for inform in informs:
|
||||
doc_id = inform.document_id
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
if not tenant_id:
|
||||
return get_data_error_result(retmsg="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
DocumentService.increment_chunk_num(
|
||||
doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
|
||||
if not DocumentService.delete(doc):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document removal)!")
|
||||
File2DocumentService.delete_by_file_id(file_id)
|
||||
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/rename', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("file_id", "name")
|
||||
def rename():
|
||||
req = request.json
|
||||
try:
|
||||
e, file = FileService.get_by_id(req["file_id"])
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="File not found!")
|
||||
if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
|
||||
file.name.lower()).suffix:
|
||||
return get_json_result(
|
||||
data=False,
|
||||
retmsg="The extension of file can't be changed",
|
||||
retcode=RetCode.ARGUMENT_ERROR)
|
||||
if FileService.query(name=req["name"], pf_id=file.parent_id):
|
||||
return get_data_error_result(
|
||||
retmsg="Duplicated file name in the same folder.")
|
||||
|
||||
if not FileService.update_by_id(
|
||||
req["file_id"], {"name": req["name"]}):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (File rename)!")
|
||||
|
||||
informs = File2DocumentService.get_by_file_id(req["file_id"])
|
||||
if informs:
|
||||
if not DocumentService.update_by_id(
|
||||
informs[0].document_id, {"name": req["name"]}):
|
||||
return get_data_error_result(
|
||||
retmsg="Database error (Document rename)!")
|
||||
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
|
||||
@manager.route('/get/<file_id>', methods=['GET'])
|
||||
# @login_required
|
||||
def get(file_id):
|
||||
try:
|
||||
e, file = FileService.get_by_id(file_id)
|
||||
if not e:
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
|
||||
response = flask.make_response(MINIO.get(file.parent_id, file.location))
|
||||
ext = re.search(r"\.([^.]+)$", file.name)
|
||||
if ext:
|
||||
if file.type == FileType.VISUAL.value:
|
||||
response.headers.set('Content-Type', 'image/%s' % ext.group(1))
|
||||
else:
|
||||
response.headers.set(
|
||||
'Content-Type',
|
||||
'application/%s' %
|
||||
ext.group(1))
|
||||
return response
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -28,7 +28,7 @@ from api.db.db_models import Knowledgebase
|
||||
from api.settings import stat_logger, RetCode
|
||||
from api.utils.api_utils import get_json_result
|
||||
from rag.nlp import search
|
||||
from rag.utils import ELASTICSEARCH
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
|
||||
|
||||
@manager.route('/create', methods=['post'])
|
||||
@ -111,7 +111,7 @@ def detail():
|
||||
@login_required
|
||||
def list():
|
||||
page_number = request.args.get("page", 1)
|
||||
items_per_page = request.args.get("page_size", 15)
|
||||
items_per_page = request.args.get("page_size", 150)
|
||||
orderby = request.args.get("orderby", "create_time")
|
||||
desc = request.args.get("desc", True)
|
||||
try:
|
||||
|
||||
@ -24,10 +24,11 @@ from api.db.db_models import TenantLLM
|
||||
from api.db.services.llm_service import TenantLLMService, LLMService
|
||||
from api.utils.api_utils import server_error_response, validate_request
|
||||
from api.utils import get_uuid, get_format_time, decrypt, download_img, current_timestamp, datetime_format
|
||||
from api.db import UserTenantRole, LLMType
|
||||
from api.db import UserTenantRole, LLMType, FileType
|
||||
from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
|
||||
LLM_FACTORY, LLM_BASE_URL
|
||||
from api.db.services.user_service import UserService, TenantService, UserTenantService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.settings import stat_logger
|
||||
from api.utils.api_utils import get_json_result, cors_reponse
|
||||
|
||||
@ -221,6 +222,17 @@ def user_register(user_id, user):
|
||||
"invited_by": user_id,
|
||||
"role": UserTenantRole.OWNER
|
||||
}
|
||||
file_id = get_uuid()
|
||||
file = {
|
||||
"id": file_id,
|
||||
"parent_id": file_id,
|
||||
"tenant_id": user_id,
|
||||
"created_by": user_id,
|
||||
"name": "/",
|
||||
"type": FileType.FOLDER.value,
|
||||
"size": 0,
|
||||
"location": "",
|
||||
}
|
||||
tenant_llm = []
|
||||
for llm in LLMService.query(fid=LLM_FACTORY):
|
||||
tenant_llm.append({"tenant_id": user_id,
|
||||
@ -236,6 +248,7 @@ def user_register(user_id, user):
|
||||
TenantService.insert(**tenant)
|
||||
UserTenantService.insert(**usr_tenant)
|
||||
TenantLLMService.insert_many(tenant_llm)
|
||||
FileService.insert(file)
|
||||
return UserService.query(email=user["email"])
|
||||
|
||||
|
||||
|
||||
@ -45,6 +45,8 @@ class FileType(StrEnum):
|
||||
VISUAL = 'visual'
|
||||
AURAL = 'aural'
|
||||
VIRTUAL = 'virtual'
|
||||
FOLDER = 'folder'
|
||||
OTHER = "other"
|
||||
|
||||
|
||||
class LLMType(StrEnum):
|
||||
@ -62,6 +64,7 @@ class ChatStyle(StrEnum):
|
||||
|
||||
|
||||
class TaskStatus(StrEnum):
|
||||
UNSTART = "0"
|
||||
RUNNING = "1"
|
||||
CANCEL = "2"
|
||||
DONE = "3"
|
||||
|
||||
@ -669,6 +669,61 @@ class Document(DataBaseModel):
|
||||
db_table = "document"
|
||||
|
||||
|
||||
class File(DataBaseModel):
|
||||
id = CharField(
|
||||
max_length=32,
|
||||
primary_key=True,
|
||||
)
|
||||
parent_id = CharField(
|
||||
max_length=32,
|
||||
null=False,
|
||||
help_text="parent folder id",
|
||||
index=True)
|
||||
tenant_id = CharField(
|
||||
max_length=32,
|
||||
null=False,
|
||||
help_text="tenant id",
|
||||
index=True)
|
||||
created_by = CharField(
|
||||
max_length=32,
|
||||
null=False,
|
||||
help_text="who created it")
|
||||
name = CharField(
|
||||
max_length=255,
|
||||
null=False,
|
||||
help_text="file name or folder name",
|
||||
index=True)
|
||||
location = CharField(
|
||||
max_length=255,
|
||||
null=True,
|
||||
help_text="where dose it store")
|
||||
size = IntegerField(default=0)
|
||||
type = CharField(max_length=32, null=False, help_text="file extension")
|
||||
|
||||
class Meta:
|
||||
db_table = "file"
|
||||
|
||||
|
||||
class File2Document(DataBaseModel):
|
||||
id = CharField(
|
||||
max_length=32,
|
||||
primary_key=True,
|
||||
)
|
||||
file_id = CharField(
|
||||
max_length=32,
|
||||
null=True,
|
||||
help_text="file id",
|
||||
index=True)
|
||||
document_id = CharField(
|
||||
max_length=32,
|
||||
null=True,
|
||||
help_text="document id",
|
||||
index=True)
|
||||
|
||||
class Meta:
|
||||
db_table = "file2document"
|
||||
|
||||
|
||||
class Task(DataBaseModel):
|
||||
id = CharField(max_length=32, primary_key=True)
|
||||
doc_id = CharField(max_length=32, null=False, index=True)
|
||||
|
||||
@ -124,6 +124,11 @@ factory_infos = [{
|
||||
"logo": "",
|
||||
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
||||
"status": "1",
|
||||
},{
|
||||
"name": "DeepSeek",
|
||||
"logo": "",
|
||||
"tags": "LLM",
|
||||
"status": "1",
|
||||
},
|
||||
# {
|
||||
# "name": "文心一言",
|
||||
@ -331,6 +336,21 @@ def init_llm_factory():
|
||||
"max_tokens": 512,
|
||||
"model_type": LLMType.EMBEDDING.value
|
||||
},
|
||||
# ------------------------ DeepSeek -----------------------
|
||||
{
|
||||
"fid": factory_infos[8]["name"],
|
||||
"llm_name": "deepseek-chat",
|
||||
"tags": "LLM,CHAT,",
|
||||
"max_tokens": 32768,
|
||||
"model_type": LLMType.CHAT.value
|
||||
},
|
||||
{
|
||||
"fid": factory_infos[8]["name"],
|
||||
"llm_name": "deepseek-coder",
|
||||
"tags": "LLM,CHAT,",
|
||||
"max_tokens": 16385,
|
||||
"model_type": LLMType.CHAT.value
|
||||
},
|
||||
]
|
||||
for info in factory_infos:
|
||||
try:
|
||||
@ -347,8 +367,8 @@ def init_llm_factory():
|
||||
LLMService.filter_delete([LLM.fid == "Local"])
|
||||
LLMService.filter_delete([LLM.fid == "Moonshot", LLM.llm_name == "flag-embedding"])
|
||||
TenantLLMService.filter_delete([TenantLLM.llm_factory == "Moonshot", TenantLLM.llm_name == "flag-embedding"])
|
||||
LLMFactoriesService.filter_update([LLMFactoriesService.model.name == "QAnything"], {"name": "Youdao"})
|
||||
LLMService.filter_update([LLMService.model.fid == "QAnything"], {"fid": "Youdao"})
|
||||
LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
|
||||
LLMService.filter_delete([LLMService.model.fid == "QAnything"])
|
||||
TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
|
||||
"""
|
||||
drop table llm;
|
||||
|
||||
@ -136,7 +136,7 @@ def chat(dialog, messages, **kwargs):
|
||||
chat_logger.info("User: {}|Assistant: {}".format(
|
||||
msg[-1]["content"], answer))
|
||||
|
||||
if knowledges and prompt_config.get("quote", True):
|
||||
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
|
||||
answer, idx = retrievaler.insert_citations(answer,
|
||||
[ck["content_ltks"]
|
||||
for ck in kbinfos["chunks"]],
|
||||
|
||||
@ -13,10 +13,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from peewee import Expression
|
||||
import random
|
||||
from datetime import datetime
|
||||
from elasticsearch_dsl import Q
|
||||
|
||||
from api.settings import stat_logger
|
||||
from api.utils import current_timestamp, get_format_time
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils.minio_conn import MINIO
|
||||
from rag.nlp import search
|
||||
|
||||
from api.db import FileType, TaskStatus
|
||||
from api.db.db_models import DB, Knowledgebase, Tenant
|
||||
from api.db.db_models import DB, Knowledgebase, Tenant, Task
|
||||
from api.db.db_models import Document
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
@ -71,7 +79,21 @@ class DocumentService(CommonService):
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
|
||||
def remove_document(cls, doc, tenant_id):
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
|
||||
cls.increment_chunk_num(
|
||||
doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, 0)
|
||||
if not cls.delete(doc):
|
||||
raise RuntimeError("Database error (Document removal)!")
|
||||
|
||||
MINIO.rm(doc.kb_id, doc.location)
|
||||
return cls.delete_by_id(doc.id)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_newly_uploaded(cls):
|
||||
fields = [
|
||||
cls.model.id,
|
||||
cls.model.kb_id,
|
||||
@ -93,11 +115,9 @@ class DocumentService(CommonService):
|
||||
cls.model.status == StatusEnum.VALID.value,
|
||||
~(cls.model.type == FileType.VIRTUAL.value),
|
||||
cls.model.progress == 0,
|
||||
cls.model.update_time >= tm,
|
||||
cls.model.run == TaskStatus.RUNNING.value,
|
||||
(Expression(cls.model.create_time, "%%", comm) == mod))\
|
||||
.order_by(cls.model.update_time.asc())\
|
||||
.paginate(1, items_per_page)
|
||||
cls.model.update_time >= current_timestamp() - 1000 * 600,
|
||||
cls.model.run == TaskStatus.RUNNING.value)\
|
||||
.order_by(cls.model.update_time.asc())
|
||||
return list(docs.dicts())
|
||||
|
||||
@classmethod
|
||||
@ -177,3 +197,55 @@ class DocumentService(CommonService):
|
||||
on=(Knowledgebase.id == cls.model.kb_id)).where(
|
||||
Knowledgebase.tenant_id == tenant_id)
|
||||
return len(docs)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def begin2parse(cls, docid):
|
||||
cls.update_by_id(
|
||||
docid, {"progress": random.random() * 1 / 100.,
|
||||
"progress_msg": "Task dispatched...",
|
||||
"process_begin_at": get_format_time()
|
||||
})
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_progress(cls):
|
||||
docs = cls.get_unfinished_docs()
|
||||
for d in docs:
|
||||
try:
|
||||
tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
|
||||
if not tsks:
|
||||
continue
|
||||
msg = []
|
||||
prg = 0
|
||||
finished = True
|
||||
bad = 0
|
||||
status = TaskStatus.RUNNING.value
|
||||
for t in tsks:
|
||||
if 0 <= t.progress < 1:
|
||||
finished = False
|
||||
prg += t.progress if t.progress >= 0 else 0
|
||||
msg.append(t.progress_msg)
|
||||
if t.progress == -1:
|
||||
bad += 1
|
||||
prg /= len(tsks)
|
||||
if finished and bad:
|
||||
prg = -1
|
||||
status = TaskStatus.FAIL.value
|
||||
elif finished:
|
||||
status = TaskStatus.DONE.value
|
||||
|
||||
msg = "\n".join(msg)
|
||||
info = {
|
||||
"process_duation": datetime.timestamp(
|
||||
datetime.now()) -
|
||||
d["process_begin_at"].timestamp(),
|
||||
"run": status}
|
||||
if prg != 0:
|
||||
info["progress"] = prg
|
||||
if msg:
|
||||
info["progress_msg"] = msg
|
||||
cls.update_by_id(d["id"], info)
|
||||
except Exception as e:
|
||||
stat_logger.error("fetch task exception:" + str(e))
|
||||
|
||||
|
||||
83
api/db/services/file2document_service.py
Normal file
@ -0,0 +1,83 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from datetime import datetime
|
||||
|
||||
from api.db.db_models import DB
|
||||
from api.db.db_models import File, Document, File2Document
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.utils import current_timestamp, datetime_format
|
||||
|
||||
|
||||
class File2DocumentService(CommonService):
|
||||
model = File2Document
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_by_file_id(cls, file_id):
|
||||
objs = cls.model.select().where(cls.model.file_id == file_id)
|
||||
return objs
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_by_document_id(cls, document_id):
|
||||
objs = cls.model.select().where(cls.model.document_id == document_id)
|
||||
return objs
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def insert(cls, obj):
|
||||
if not cls.save(**obj):
|
||||
raise RuntimeError("Database error (File)!")
|
||||
e, obj = cls.get_by_id(obj["id"])
|
||||
if not e:
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
return obj
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def delete_by_file_id(cls, file_id):
|
||||
return cls.model.delete().where(cls.model.file_id == file_id).execute()
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def delete_by_document_id(cls, doc_id):
|
||||
return cls.model.delete().where(cls.model.document_id == doc_id).execute()
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def update_by_file_id(cls, file_id, obj):
|
||||
obj["update_time"] = current_timestamp()
|
||||
obj["update_date"] = datetime_format(datetime.now())
|
||||
num = cls.model.update(obj).where(cls.model.id == file_id).execute()
|
||||
e, obj = cls.get_by_id(cls.model.id)
|
||||
return obj
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_minio_address(cls, doc_id=None, file_id=None):
|
||||
if doc_id:
|
||||
ids = File2DocumentService.get_by_document_id(doc_id)
|
||||
else:
|
||||
ids = File2DocumentService.get_by_file_id(file_id)
|
||||
if ids:
|
||||
e, file = FileService.get_by_id(ids[0].file_id)
|
||||
return file.parent_id, file.location
|
||||
else:
|
||||
assert doc_id, "please specify doc_id"
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
return doc.kb_id, doc.location
|
||||
243
api/db/services/file_service.py
Normal file
@ -0,0 +1,243 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from flask_login import current_user
|
||||
from peewee import fn
|
||||
|
||||
from api.db import FileType
|
||||
from api.db.db_models import DB, File2Document, Knowledgebase
|
||||
from api.db.db_models import File, Document
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.utils import get_uuid
|
||||
|
||||
|
||||
class FileService(CommonService):
|
||||
model = File
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_by_pf_id(cls, tenant_id, pf_id, page_number, items_per_page,
|
||||
orderby, desc, keywords):
|
||||
if keywords:
|
||||
files = cls.model.select().where(
|
||||
(cls.model.tenant_id == tenant_id)
|
||||
& (cls.model.parent_id == pf_id), (fn.LOWER(cls.model.name).like(f"%%{keywords.lower()}%%")))
|
||||
else:
|
||||
files = cls.model.select().where((cls.model.tenant_id == tenant_id)
|
||||
& (cls.model.parent_id == pf_id))
|
||||
count = files.count()
|
||||
if desc:
|
||||
files = files.order_by(cls.model.getter_by(orderby).desc())
|
||||
else:
|
||||
files = files.order_by(cls.model.getter_by(orderby).asc())
|
||||
|
||||
files = files.paginate(page_number, items_per_page)
|
||||
|
||||
res_files = list(files.dicts())
|
||||
for file in res_files:
|
||||
if file["type"] == FileType.FOLDER.value:
|
||||
file["size"] = cls.get_folder_size(file["id"])
|
||||
file['kbs_info'] = []
|
||||
continue
|
||||
kbs_info = cls.get_kb_id_by_file_id(file['id'])
|
||||
file['kbs_info'] = kbs_info
|
||||
|
||||
return res_files, count
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_kb_id_by_file_id(cls, file_id):
|
||||
kbs = (cls.model.select(*[Knowledgebase.id, Knowledgebase.name])
|
||||
.join(File2Document, on=(File2Document.file_id == file_id))
|
||||
.join(Document, on=(File2Document.document_id == Document.id))
|
||||
.join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
|
||||
.where(cls.model.id == file_id))
|
||||
if not kbs: return []
|
||||
kbs_info_list = []
|
||||
for kb in list(kbs.dicts()):
|
||||
kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
|
||||
return kbs_info_list
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_by_pf_id_name(cls, id, name):
|
||||
file = cls.model.select().where((cls.model.parent_id == id) & (cls.model.name == name))
|
||||
if file.count():
|
||||
e, file = cls.get_by_id(file[0].id)
|
||||
if not e:
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
return file
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_id_list_by_id(cls, id, name, count, res):
|
||||
if count < len(name):
|
||||
file = cls.get_by_pf_id_name(id, name[count])
|
||||
if file:
|
||||
res.append(file.id)
|
||||
return cls.get_id_list_by_id(file.id, name, count + 1, res)
|
||||
else:
|
||||
return res
|
||||
else:
|
||||
return res
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_all_innermost_file_ids(cls, folder_id, result_ids):
|
||||
subfolders = cls.model.select().where(cls.model.parent_id == folder_id)
|
||||
if subfolders.exists():
|
||||
for subfolder in subfolders:
|
||||
cls.get_all_innermost_file_ids(subfolder.id, result_ids)
|
||||
else:
|
||||
result_ids.append(folder_id)
|
||||
return result_ids
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def create_folder(cls, file, parent_id, name, count):
|
||||
if count > len(name) - 2:
|
||||
return file
|
||||
else:
|
||||
file = cls.insert({
|
||||
"id": get_uuid(),
|
||||
"parent_id": parent_id,
|
||||
"tenant_id": current_user.id,
|
||||
"created_by": current_user.id,
|
||||
"name": name[count],
|
||||
"location": "",
|
||||
"size": 0,
|
||||
"type": FileType.FOLDER.value
|
||||
})
|
||||
return cls.create_folder(file, file.id, name, count + 1)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def is_parent_folder_exist(cls, parent_id):
|
||||
parent_files = cls.model.select().where(cls.model.id == parent_id)
|
||||
if parent_files.count():
|
||||
return True
|
||||
cls.delete_folder_by_pf_id(parent_id)
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_root_folder(cls, tenant_id):
|
||||
file = cls.model.select().where(cls.model.tenant_id == tenant_id and
|
||||
cls.model.parent_id == cls.model.id)
|
||||
if not file:
|
||||
file_id = get_uuid()
|
||||
file = {
|
||||
"id": file_id,
|
||||
"parent_id": file_id,
|
||||
"tenant_id": tenant_id,
|
||||
"created_by": tenant_id,
|
||||
"name": "/",
|
||||
"type": FileType.FOLDER.value,
|
||||
"size": 0,
|
||||
"location": "",
|
||||
}
|
||||
cls.save(**file)
|
||||
else:
|
||||
file_id = file[0].id
|
||||
|
||||
e, file = cls.get_by_id(file_id)
|
||||
if not e:
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
return file
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_parent_folder(cls, file_id):
|
||||
file = cls.model.select().where(cls.model.id == file_id)
|
||||
if file.count():
|
||||
e, file = cls.get_by_id(file[0].parent_id)
|
||||
if not e:
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
else:
|
||||
raise RuntimeError("Database error (File doesn't exist)!")
|
||||
return file
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_all_parent_folders(cls, start_id):
|
||||
parent_folders = []
|
||||
current_id = start_id
|
||||
while current_id:
|
||||
e, file = cls.get_by_id(current_id)
|
||||
if file.parent_id != file.id and e:
|
||||
parent_folders.append(file)
|
||||
current_id = file.parent_id
|
||||
else:
|
||||
parent_folders.append(file)
|
||||
break
|
||||
return parent_folders
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def insert(cls, file):
|
||||
if not cls.save(**file):
|
||||
raise RuntimeError("Database error (File)!")
|
||||
e, file = cls.get_by_id(file["id"])
|
||||
if not e:
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
return file
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def delete(cls, file):
|
||||
return cls.delete_by_id(file.id)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def delete_by_pf_id(cls, folder_id):
|
||||
return cls.model.delete().where(cls.model.parent_id == folder_id).execute()
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def delete_folder_by_pf_id(cls, user_id, folder_id):
|
||||
try:
|
||||
files = cls.model.select().where((cls.model.tenant_id == user_id)
|
||||
& (cls.model.parent_id == folder_id))
|
||||
for file in files:
|
||||
cls.delete_folder_by_pf_id(user_id, file.id)
|
||||
return cls.model.delete().where((cls.model.tenant_id == user_id)
|
||||
& (cls.model.id == folder_id)).execute(),
|
||||
except Exception as e:
|
||||
print(e)
|
||||
raise RuntimeError("Database error (File retrieval)!")
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_file_count(cls, tenant_id):
|
||||
files = cls.model.select(cls.model.id).where(cls.model.tenant_id == tenant_id)
|
||||
return len(files)
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_folder_size(cls, folder_id):
|
||||
size = 0
|
||||
|
||||
def dfs(parent_id):
|
||||
nonlocal size
|
||||
for f in cls.model.select(*[cls.model.id, cls.model.size, cls.model.type]).where(
|
||||
cls.model.parent_id == parent_id, cls.model.id != parent_id):
|
||||
size += f.size
|
||||
if f.type == FileType.FOLDER.value:
|
||||
dfs(f.id)
|
||||
|
||||
dfs(folder_id)
|
||||
return size
|
||||
|
||||
@ -128,7 +128,9 @@ class TenantLLMService(CommonService):
|
||||
else:
|
||||
assert False, "LLM type error"
|
||||
|
||||
num = cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)\
|
||||
num = 0
|
||||
for u in cls.query(tenant_id = tenant_id, llm_name=mdlnm):
|
||||
num += cls.model.update(used_tokens = u.used_tokens + used_tokens)\
|
||||
.where(cls.model.tenant_id == tenant_id, cls.model.llm_name == mdlnm)\
|
||||
.execute()
|
||||
return num
|
||||
|
||||
@ -15,13 +15,19 @@
|
||||
#
|
||||
import random
|
||||
|
||||
from peewee import Expression
|
||||
from api.db.db_models import DB
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
from deepdoc.parser import PdfParser
|
||||
from peewee import JOIN
|
||||
from api.db.db_models import DB, File2Document, File
|
||||
from api.db import StatusEnum, FileType, TaskStatus
|
||||
from api.db.db_models import Task, Document, Knowledgebase, Tenant
|
||||
from api.db.services.common_service import CommonService
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.utils import current_timestamp
|
||||
from api.utils import current_timestamp, get_uuid
|
||||
from deepdoc.parser.excel_parser import RAGFlowExcelParser
|
||||
from rag.settings import SVR_QUEUE_NAME
|
||||
from rag.utils.minio_conn import MINIO
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
|
||||
|
||||
class TaskService(CommonService):
|
||||
@ -29,7 +35,7 @@ class TaskService(CommonService):
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
|
||||
def get_tasks(cls, task_id):
|
||||
fields = [
|
||||
cls.model.id,
|
||||
cls.model.doc_id,
|
||||
@ -48,26 +54,16 @@ class TaskService(CommonService):
|
||||
Tenant.img2txt_id,
|
||||
Tenant.asr_id,
|
||||
cls.model.update_time]
|
||||
with DB.lock("get_task", -1):
|
||||
docs = cls.model.select(*fields) \
|
||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
|
||||
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
|
||||
.where(
|
||||
Document.status == StatusEnum.VALID.value,
|
||||
Document.run == TaskStatus.RUNNING.value,
|
||||
~(Document.type == FileType.VIRTUAL.value),
|
||||
cls.model.progress == 0,
|
||||
#cls.model.update_time >= tm,
|
||||
#(Expression(cls.model.create_time, "%%", comm) == mod)
|
||||
)\
|
||||
.order_by(cls.model.update_time.asc())\
|
||||
.paginate(0, items_per_page)
|
||||
.where(cls.model.id == task_id)
|
||||
docs = list(docs.dicts())
|
||||
if not docs: return []
|
||||
if not takeit: return docs
|
||||
|
||||
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
|
||||
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
|
||||
progress=random.random() / 10.).where(
|
||||
cls.model.id == docs[0]["id"]).execute()
|
||||
return docs
|
||||
|
||||
@ -75,20 +71,21 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def get_ongoing_doc_name(cls):
|
||||
with DB.lock("get_task", -1):
|
||||
docs = cls.model.select(*[Document.kb_id, Document.location]) \
|
||||
docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
|
||||
.join(Document, on=(cls.model.doc_id == Document.id)) \
|
||||
.join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
|
||||
.join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
|
||||
.where(
|
||||
Document.status == StatusEnum.VALID.value,
|
||||
Document.run == TaskStatus.RUNNING.value,
|
||||
~(Document.type == FileType.VIRTUAL.value),
|
||||
cls.model.progress >= 0,
|
||||
cls.model.progress < 1,
|
||||
cls.model.create_time >= current_timestamp() - 180000
|
||||
cls.model.create_time >= current_timestamp() - 1000 * 600
|
||||
)
|
||||
docs = list(docs.dicts())
|
||||
if not docs: return []
|
||||
|
||||
return list(set([(d["kb_id"], d["location"]) for d in docs]))
|
||||
return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs]))
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
@ -111,3 +108,55 @@ class TaskService(CommonService):
|
||||
if "progress" in info:
|
||||
cls.model.update(progress=info["progress"]).where(
|
||||
cls.model.id == id).execute()
|
||||
|
||||
|
||||
def queue_tasks(doc, bucket, name):
|
||||
def new_task():
|
||||
nonlocal doc
|
||||
return {
|
||||
"id": get_uuid(),
|
||||
"doc_id": doc["id"]
|
||||
}
|
||||
tsks = []
|
||||
|
||||
if doc["type"] == FileType.PDF.value:
|
||||
file_bin = MINIO.get(bucket, name)
|
||||
do_layout = doc["parser_config"].get("layout_recognize", True)
|
||||
pages = PdfParser.total_page_number(doc["name"], file_bin)
|
||||
page_size = doc["parser_config"].get("task_page_size", 12)
|
||||
if doc["parser_id"] == "paper":
|
||||
page_size = doc["parser_config"].get("task_page_size", 22)
|
||||
if doc["parser_id"] == "one":
|
||||
page_size = 1000000000
|
||||
if not do_layout:
|
||||
page_size = 1000000000
|
||||
page_ranges = doc["parser_config"].get("pages")
|
||||
if not page_ranges:
|
||||
page_ranges = [(1, 100000)]
|
||||
for s, e in page_ranges:
|
||||
s -= 1
|
||||
s = max(0, s)
|
||||
e = min(e - 1, pages)
|
||||
for p in range(s, e, page_size):
|
||||
task = new_task()
|
||||
task["from_page"] = p
|
||||
task["to_page"] = min(p + page_size, e)
|
||||
tsks.append(task)
|
||||
|
||||
elif doc["parser_id"] == "table":
|
||||
file_bin = MINIO.get(bucket, name)
|
||||
rn = RAGFlowExcelParser.row_number(
|
||||
doc["name"], file_bin)
|
||||
for i in range(0, rn, 3000):
|
||||
task = new_task()
|
||||
task["from_page"] = i
|
||||
task["to_page"] = min(i + 3000, rn)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
||||
bulk_insert_into_db(Task, tsks, True)
|
||||
DocumentService.begin2parse(doc["id"])
|
||||
|
||||
for t in tsks:
|
||||
REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
|
||||
@ -18,10 +18,14 @@ import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from werkzeug.serving import run_simple
|
||||
from api.apps import app
|
||||
from api.db.runtime_config import RuntimeConfig
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.settings import (
|
||||
HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
|
||||
)
|
||||
@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
|
||||
from api.db.init_data import init_web_data
|
||||
from api.versions import get_versions
|
||||
|
||||
|
||||
def update_progress():
|
||||
while True:
|
||||
time.sleep(1)
|
||||
try:
|
||||
DocumentService.update_progress()
|
||||
except Exception as e:
|
||||
stat_logger.error("update_progress exception:" + str(e))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("""
|
||||
____ ______ __
|
||||
@ -71,6 +85,9 @@ if __name__ == '__main__':
|
||||
peewee_logger.addHandler(database_logger.handlers[0])
|
||||
peewee_logger.setLevel(database_logger.level)
|
||||
|
||||
thr = ThreadPoolExecutor(max_workers=1)
|
||||
thr.submit(update_progress)
|
||||
|
||||
# start http server
|
||||
try:
|
||||
stat_logger.info("RAG Flow http server start...")
|
||||
|
||||
@ -32,7 +32,7 @@ access_logger = getLogger("access")
|
||||
database_logger = getLogger("database")
|
||||
chat_logger = getLogger("chat")
|
||||
|
||||
from rag.utils import ELASTICSEARCH
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.nlp import search
|
||||
from api.utils import get_base_config, decrypt_database_config
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
import fitz
|
||||
import pdfplumber
|
||||
from PIL import Image
|
||||
from cachetools import LRUCache, cached
|
||||
from ruamel.yaml import YAML
|
||||
@ -66,6 +66,15 @@ def get_rag_python_directory(*args):
|
||||
return get_rag_directory("python", *args)
|
||||
|
||||
|
||||
def get_home_cache_dir():
|
||||
dir = os.path.join(os.path.expanduser('~'), ".ragflow")
|
||||
try:
|
||||
os.mkdir(dir)
|
||||
except OSError as error:
|
||||
pass
|
||||
return dir
|
||||
|
||||
|
||||
@cached(cache=LRUCache(maxsize=10))
|
||||
def load_json_conf(conf_path):
|
||||
if os.path.isabs(conf_path):
|
||||
@ -155,17 +164,17 @@ def filename_type(filename):
|
||||
return FileType.AURAL.value
|
||||
|
||||
if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
|
||||
return FileType.VISUAL
|
||||
return FileType.VISUAL.value
|
||||
|
||||
return FileType.OTHER.value
|
||||
|
||||
|
||||
def thumbnail(filename, blob):
|
||||
filename = filename.lower()
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
pdf = fitz.open(stream=blob, filetype="pdf")
|
||||
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
buffered = BytesIO()
|
||||
Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples).save(buffered, format="png")
|
||||
pdf.pages[0].to_image().annotated.save(buffered, format="png")
|
||||
return "data:image/png;base64," + \
|
||||
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
|
||||
|
||||
@ -13,12 +13,12 @@ minio:
|
||||
user: 'rag_flow'
|
||||
password: 'infini_rag_flow'
|
||||
host: 'minio:9000'
|
||||
es:
|
||||
hosts: 'http://es01:9200'
|
||||
redis:
|
||||
db: 1
|
||||
password: 'infini_rag_flow'
|
||||
host: 'redis:6379'
|
||||
es:
|
||||
hosts: 'http://es01:9200'
|
||||
user_default_llm:
|
||||
factory: 'Tongyi-Qianwen'
|
||||
api_key: 'sk-xxxxxxxxxxxxx'
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
|
||||
|
||||
from .pdf_parser import HuParser as PdfParser, PlainParser
|
||||
from .docx_parser import HuDocxParser as DocxParser
|
||||
from .excel_parser import HuExcelParser as ExcelParser
|
||||
from .ppt_parser import HuPptParser as PptParser
|
||||
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
|
||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
|
||||
@ -3,11 +3,11 @@ from docx import Document
|
||||
import re
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class HuDocxParser:
|
||||
class RAGFlowDocxParser:
|
||||
|
||||
def __extract_table_content(self, tb):
|
||||
df = []
|
||||
@ -35,14 +35,14 @@ class HuDocxParser:
|
||||
for p, n in patt:
|
||||
if re.search(p, b):
|
||||
return n
|
||||
tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
|
||||
tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
|
||||
if len(tks) > 3:
|
||||
if len(tks) < 12:
|
||||
return "Tx"
|
||||
else:
|
||||
return "Lx"
|
||||
|
||||
if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
|
||||
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
||||
return "Nr"
|
||||
|
||||
return "Ot"
|
||||
|
||||
@ -6,7 +6,7 @@ from io import BytesIO
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
class HuExcelParser:
|
||||
class RAGFlowExcelParser:
|
||||
def html(self, fnm):
|
||||
if isinstance(fnm, str):
|
||||
wb = load_workbook(fnm)
|
||||
@ -69,10 +69,10 @@ class HuExcelParser:
|
||||
|
||||
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
return len(txt.split("\n"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
psr = HuExcelParser()
|
||||
psr = RAGFlowExcelParser()
|
||||
psr(sys.argv[1])
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import fitz
|
||||
import xgboost as xgb
|
||||
from io import BytesIO
|
||||
import torch
|
||||
@ -16,14 +15,14 @@ from PyPDF2 import PdfReader as pdf2_read
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from copy import deepcopy
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
class HuParser:
|
||||
class RAGFlowPdfParser:
|
||||
def __init__(self):
|
||||
self.ocr = OCR()
|
||||
if hasattr(self, "model_speciess"):
|
||||
@ -95,13 +94,13 @@ class HuParser:
|
||||
h = max(self.__height(up), self.__height(down))
|
||||
y_dis = self._y_dis(up, down)
|
||||
LEN = 6
|
||||
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
|
||||
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
|
||||
tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
|
||||
tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
|
||||
tks_all = up["text"][-LEN:].strip() \
|
||||
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
||||
up["text"][-1] + down["text"][0]) else "") \
|
||||
+ down["text"][:LEN].strip()
|
||||
tks_all = huqie.qie(tks_all).split(" ")
|
||||
tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
|
||||
fea = [
|
||||
up.get("R", -1) == down.get("R", -1),
|
||||
y_dis / h,
|
||||
@ -142,8 +141,8 @@ class HuParser:
|
||||
tks_down[-1] == tks_up[-1],
|
||||
max(down["in_row"], up["in_row"]),
|
||||
abs(down["in_row"] - up["in_row"]),
|
||||
len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
|
||||
len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
|
||||
len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
|
||||
len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
|
||||
]
|
||||
return fea
|
||||
|
||||
@ -470,7 +469,8 @@ class HuParser:
|
||||
continue
|
||||
|
||||
if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
|
||||
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
|
||||
or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
|
||||
or not down["text"].strip():
|
||||
i += 1
|
||||
continue
|
||||
|
||||
@ -598,7 +598,7 @@ class HuParser:
|
||||
|
||||
if b["text"].strip()[0] != b_["text"].strip()[0] \
|
||||
or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
|
||||
or huqie.is_chinese(b["text"].strip()[0]) \
|
||||
or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
|
||||
or b["top"] > b_["bottom"]:
|
||||
i += 1
|
||||
continue
|
||||
@ -921,9 +921,7 @@ class HuParser:
|
||||
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
||||
return len(pdf.pages)
|
||||
except Exception as e:
|
||||
pdf = fitz.open(fnm) if not binary else fitz.open(
|
||||
stream=fnm, filetype="pdf")
|
||||
return len(pdf)
|
||||
logging.error(str(e))
|
||||
|
||||
def __images__(self, fnm, zoomin=3, page_from=0,
|
||||
page_to=299, callback=None):
|
||||
@ -945,23 +943,7 @@ class HuParser:
|
||||
self.pdf.pages[page_from:page_to]]
|
||||
self.total_page = len(self.pdf.pages)
|
||||
except Exception as e:
|
||||
self.pdf = fitz.open(fnm) if isinstance(
|
||||
fnm, str) else fitz.open(
|
||||
stream=fnm, filetype="pdf")
|
||||
self.page_images = []
|
||||
self.page_chars = []
|
||||
mat = fitz.Matrix(zoomin, zoomin)
|
||||
self.total_page = len(self.pdf)
|
||||
for i, page in enumerate(self.pdf):
|
||||
if i < page_from:
|
||||
continue
|
||||
if i >= page_to:
|
||||
break
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples)
|
||||
self.page_images.append(img)
|
||||
self.page_chars.append([])
|
||||
logging.error(str(e))
|
||||
|
||||
self.outlines = []
|
||||
try:
|
||||
|
||||
@ -14,7 +14,7 @@ from io import BytesIO
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class HuPptParser(object):
|
||||
class RAGFlowPptParser(object):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import re,json,os
|
||||
import pandas as pd
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from . import regions
|
||||
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||
GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
|
||||
@ -22,14 +22,14 @@ def baike(cid, default_v=0):
|
||||
def corpNorm(nm, add_region=True):
|
||||
global CORP_TKS
|
||||
if not nm or type(nm)!=type(""):return ""
|
||||
nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
|
||||
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
|
||||
nm = re.sub(r"&", "&", nm)
|
||||
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
|
||||
nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
|
||||
nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
|
||||
if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
|
||||
|
||||
tks = huqie.qie(nm).split(" ")
|
||||
tks = rag_tokenizer.tokenize(nm).split(" ")
|
||||
reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
|
||||
nm = ""
|
||||
for t in tks:
|
||||
|
||||
@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
|
||||
traceback, signal
|
||||
import numpy as np
|
||||
from deepdoc.parser.resume.entities import degrees, schools, corporations
|
||||
from rag.nlp import huqie, surname
|
||||
from rag.nlp import rag_tokenizer, surname
|
||||
from xpinyin import Pinyin
|
||||
from contextlib import contextmanager
|
||||
|
||||
@ -83,7 +83,7 @@ def forEdu(cv):
|
||||
if n.get("school_name") and isinstance(n["school_name"], str):
|
||||
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
|
||||
e["sch_nm_kwd"] = sch[-1]
|
||||
fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
|
||||
fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
|
||||
|
||||
if n.get("discipline_name") and isinstance(n["discipline_name"], str):
|
||||
maj.append(n["discipline_name"])
|
||||
@ -166,10 +166,10 @@ def forEdu(cv):
|
||||
if "tag_kwd" not in cv: cv["tag_kwd"] = []
|
||||
if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
|
||||
|
||||
if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
|
||||
if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
|
||||
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
|
||||
if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
|
||||
if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
|
||||
if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
|
||||
if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
|
||||
if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
|
||||
|
||||
return cv
|
||||
|
||||
@ -187,11 +187,11 @@ def forProj(cv):
|
||||
if n.get("achivement"): desc.append(str(n["achivement"]))
|
||||
|
||||
if pro_nms:
|
||||
# cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
|
||||
cv["project_name_tks"] = huqie.qie(pro_nms[0])
|
||||
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
|
||||
cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
|
||||
if desc:
|
||||
cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
|
||||
cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
|
||||
cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
|
||||
cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
|
||||
|
||||
return cv
|
||||
|
||||
@ -280,25 +280,25 @@ def forWork(cv):
|
||||
if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
|
||||
|
||||
if fea["position_name"]:
|
||||
cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
|
||||
cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
|
||||
cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
|
||||
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
|
||||
cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
|
||||
cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
|
||||
|
||||
if fea["industry_name"]:
|
||||
cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
|
||||
cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
|
||||
cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
|
||||
cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
|
||||
cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
|
||||
cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
|
||||
|
||||
if fea["corporation_name"]:
|
||||
cv["corporation_name_kwd"] = fea["corporation_name"][0]
|
||||
cv["corp_nm_kwd"] = fea["corporation_name"]
|
||||
cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
|
||||
cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
|
||||
cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
|
||||
cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
|
||||
cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
|
||||
cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
|
||||
|
||||
if fea["responsibilities"]:
|
||||
cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
|
||||
cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
|
||||
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
|
||||
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
|
||||
|
||||
if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
|
||||
re.match(r"[^0-9]+$", str(i))]
|
||||
@ -444,15 +444,15 @@ def parse(cv):
|
||||
if nms:
|
||||
t = k[:-4]
|
||||
cv[f"{t}_kwd"] = nms
|
||||
cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
|
||||
cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
|
||||
except Exception as e:
|
||||
print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
|
||||
cv[k] = []
|
||||
|
||||
# tokenize fields
|
||||
if k in tks_fld:
|
||||
cv[f"{k}_tks"] = huqie.qie(cv[k])
|
||||
if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
|
||||
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
|
||||
if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
|
||||
|
||||
# keyword fields
|
||||
if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
|
||||
@ -492,7 +492,7 @@ def parse(cv):
|
||||
cv["name_kwd"] = name
|
||||
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
|
||||
cv["name_tks"] = (
|
||||
huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
||||
rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
|
||||
) if name else ""
|
||||
else:
|
||||
cv["integerity_flt"] /= 2.
|
||||
@ -515,7 +515,7 @@ def parse(cv):
|
||||
cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
|
||||
# long text tokenize
|
||||
|
||||
if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
|
||||
if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
|
||||
|
||||
# for yes or no field
|
||||
fea = []
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
import pdfplumber
|
||||
|
||||
from .ocr import OCR
|
||||
from .recognizer import Recognizer
|
||||
from .layout_recognizer import LayoutRecognizer
|
||||
from .table_structure_recognizer import TableStructureRecognizer
|
||||
|
||||
|
||||
def init_in_out(args):
|
||||
from PIL import Image
|
||||
import fitz
|
||||
import os
|
||||
import traceback
|
||||
from api.utils.file_utils import traversal_files
|
||||
@ -18,13 +19,11 @@ def init_in_out(args):
|
||||
|
||||
def pdf_pages(fnm, zoomin=3):
|
||||
nonlocal outputs, images
|
||||
pdf = fitz.open(fnm)
|
||||
mat = fitz.Matrix(zoomin, zoomin)
|
||||
for i, page in enumerate(pdf):
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples)
|
||||
images.append(img)
|
||||
pdf = pdfplumber.open(fnm)
|
||||
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||
enumerate(pdf.pages)]
|
||||
|
||||
for i, page in enumerate(images):
|
||||
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
||||
|
||||
def images_and_outputs(fnm):
|
||||
|
||||
@ -11,10 +11,6 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from deepdoc.vision.seeit import draw_box
|
||||
from deepdoc.vision import OCR, init_in_out
|
||||
import argparse
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
sys.path.insert(
|
||||
@ -25,6 +21,11 @@ sys.path.insert(
|
||||
os.path.abspath(__file__)),
|
||||
'../../')))
|
||||
|
||||
from deepdoc.vision.seeit import draw_box
|
||||
from deepdoc.vision import OCR, init_in_out
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
|
||||
def main(args):
|
||||
ocr = OCR()
|
||||
|
||||
@ -10,17 +10,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from deepdoc.vision.seeit import draw_box
|
||||
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
|
||||
import os, sys
|
||||
sys.path.insert(
|
||||
0,
|
||||
os.path.abspath(
|
||||
@ -29,6 +19,13 @@ sys.path.insert(
|
||||
os.path.abspath(__file__)),
|
||||
'../../')))
|
||||
|
||||
from deepdoc.vision.seeit import draw_box
|
||||
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
import argparse
|
||||
import re
|
||||
import numpy as np
|
||||
|
||||
|
||||
def main(args):
|
||||
images, outputs = init_in_out(args)
|
||||
|
||||
@ -19,7 +19,7 @@ import numpy as np
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from .recognizer import Recognizer
|
||||
|
||||
|
||||
@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
|
||||
for p, n in patt:
|
||||
if re.search(p, b["text"].strip()):
|
||||
return n
|
||||
tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
|
||||
tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
|
||||
if len(tks) > 3:
|
||||
if len(tks) < 12:
|
||||
return "Tx"
|
||||
else:
|
||||
return "Lx"
|
||||
|
||||
if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
|
||||
if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
|
||||
return "Nr"
|
||||
|
||||
return "Ot"
|
||||
|
||||
@ -25,9 +25,11 @@ MINIO_PORT=9000
|
||||
MINIO_USER=rag_flow
|
||||
MINIO_PASSWORD=infini_rag_flow
|
||||
|
||||
REDIS_PASSWORD=infini_rag_flow
|
||||
|
||||
SVR_HTTP_PORT=9380
|
||||
|
||||
RAGFLOW_VERSION=v0.3.1
|
||||
RAGFLOW_VERSION=dev
|
||||
|
||||
TIMEZONE='Asia/Shanghai'
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ The serving port of mysql inside the container. The modification should be synch
|
||||
The max database connection.
|
||||
|
||||
### stale_timeout
|
||||
The timeout duation in seconds.
|
||||
The timeout duration in seconds.
|
||||
|
||||
## minio
|
||||
|
||||
|
||||
@ -29,24 +29,6 @@ services:
|
||||
- ragflow
|
||||
restart: always
|
||||
|
||||
#kibana:
|
||||
# depends_on:
|
||||
# es01:
|
||||
# condition: service_healthy
|
||||
# image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
|
||||
# container_name: ragflow-kibana
|
||||
# volumes:
|
||||
# - kibanadata:/usr/share/kibana/data
|
||||
# ports:
|
||||
# - ${KIBANA_PORT}:5601
|
||||
# environment:
|
||||
# - SERVERNAME=kibana
|
||||
# - ELASTICSEARCH_HOSTS=http://es01:9200
|
||||
# - TZ=${TIMEZONE}
|
||||
# mem_limit: ${MEM_LIMIT}
|
||||
# networks:
|
||||
# - ragflow
|
||||
|
||||
mysql:
|
||||
image: mysql:5.7.18
|
||||
container_name: ragflow-mysql
|
||||
@ -74,7 +56,6 @@ services:
|
||||
retries: 3
|
||||
restart: always
|
||||
|
||||
|
||||
minio:
|
||||
image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
|
||||
container_name: ragflow-minio
|
||||
@ -92,16 +73,27 @@ services:
|
||||
- ragflow
|
||||
restart: always
|
||||
|
||||
redis:
|
||||
image: redis:7.2.4
|
||||
container_name: ragflow-redis
|
||||
command: redis-server --requirepass ${REDIS_PASSWORD} --maxmemory 128mb --maxmemory-policy allkeys-lru
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
networks:
|
||||
- ragflow
|
||||
restart: always
|
||||
|
||||
|
||||
|
||||
volumes:
|
||||
esdata01:
|
||||
driver: local
|
||||
kibanadata:
|
||||
driver: local
|
||||
mysql_data:
|
||||
driver: local
|
||||
minio_data:
|
||||
driver: local
|
||||
redis_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
ragflow:
|
||||
|
||||
@ -12,28 +12,14 @@ function task_exe(){
|
||||
done
|
||||
}
|
||||
|
||||
function watch_broker(){
|
||||
while [ 1 -eq 1 ];do
|
||||
C=`ps aux|grep "task_broker.py"|grep -v grep|wc -l`;
|
||||
if [ $C -lt 1 ];then
|
||||
$PY rag/svr/task_broker.py &
|
||||
fi
|
||||
sleep 5;
|
||||
done
|
||||
}
|
||||
|
||||
function task_bro(){
|
||||
watch_broker;
|
||||
}
|
||||
|
||||
task_bro &
|
||||
|
||||
WS=1
|
||||
for ((i=0;i<WS;i++))
|
||||
do
|
||||
task_exe $i $WS &
|
||||
done
|
||||
|
||||
while [ 1 -eq 1 ];do
|
||||
$PY api/ragflow_server.py
|
||||
done
|
||||
|
||||
wait;
|
||||
@ -13,12 +13,12 @@ minio:
|
||||
user: 'rag_flow'
|
||||
password: 'infini_rag_flow'
|
||||
host: 'minio:9000'
|
||||
es:
|
||||
hosts: 'http://es01:9200'
|
||||
redis:
|
||||
db: 1
|
||||
password: 'infini_rag_flow'
|
||||
host: 'redis:6379'
|
||||
es:
|
||||
hosts: 'http://es01:9200'
|
||||
user_default_llm:
|
||||
factory: 'Tongyi-Qianwen'
|
||||
api_key: 'sk-xxxxxxxxxxxxx'
|
||||
|
||||
@ -221,6 +221,7 @@ This will be called to get the answer to users' questions.
|
||||
|------|-------|----|----|
|
||||
| conversation_id| string | No | This is from calling /new_conversation.|
|
||||
| messages| json | No | All the conversation history stored here including the latest user's question.|
|
||||
| quote | bool | Yes | Default: true |
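As a hedged sketch of how these parameters combine: the host, endpoint path, and API key below are placeholders and are not taken from this document; only the three body fields come from the table above, and the message shape is assumed to follow the usual role/content convention.

```bash
# Hypothetical request -- <HOST>, <ENDPOINT>, and <API_KEY> are placeholders, not real values.
curl -X POST "http://<HOST>/<ENDPOINT>" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer <API_KEY>" \
  -d '{
        "conversation_id": "<id returned by /new_conversation>",
        "messages": [{"role": "user", "content": "How do I configure the chunk method?"}],
        "quote": false
      }'
```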
|
||||
|
||||
### Response
|
||||
```json
|
||||
|
||||
86
docs/faq.md
@ -55,7 +55,7 @@ This feature and the related APIs are still in development. Contributions are we
|
||||
```
|
||||
$ git clone https://github.com/infiniflow/ragflow.git
|
||||
$ cd ragflow
|
||||
$ docker build -t infiniflow/ragflow:v0.3.1 .
|
||||
$ docker build -t infiniflow/ragflow:latest .
|
||||
$ cd ragflow/docker
|
||||
$ chmod +x ./entrypoint.sh
|
||||
$ docker compose up -d
|
||||
@ -193,18 +193,31 @@ docker logs -f ragflow-server
|
||||
2. Check if the **task_executor.py** process exists.
|
||||
3. Check if your RAGFlow server can access hf-mirror.com or huggingface.com.
|
||||
|
||||
#### 4.5 Why does my pdf parsing stall near completion, while the log does not show any error?
|
||||
|
||||
#### 4.5 `Index failure`
|
||||
If your RAGFlow is deployed *locally*, the parsing process is likely killed due to insufficient RAM. Try increasing your memory allocation by increasing the `MEM_LIMIT` value in **docker/.env**.
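For example, a sketch only (the value below is illustrative; pick one that fits your host's RAM):

```bash
# docker/.env -- raise the container memory cap, e.g. to roughly 8 GB (value in bytes):
MEM_LIMIT=8073741824
```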
|
||||
|
||||
> Ensure that you restart your RAGFlow server for your changes to take effect!
|
||||
> ```bash
|
||||
> docker compose stop
|
||||
> ```
|
||||
> ```bash
|
||||
> docker compose up -d
|
||||
> ```
|
||||
|
||||

|
||||
|
||||
#### 4.6 `Index failure`
|
||||
|
||||
An index failure usually indicates an unavailable Elasticsearch service.
|
||||
|
||||
#### 4.6 How to check the log of RAGFlow?
|
||||
#### 4.7 How to check the log of RAGFlow?
|
||||
|
||||
```bash
|
||||
tail -f path_to_ragflow/docker/ragflow-logs/rag/*.log
|
||||
```
|
||||
|
||||
#### 4.7 How to check the status of each component in RAGFlow?
|
||||
#### 4.8 How to check the status of each component in RAGFlow?
|
||||
|
||||
```bash
|
||||
$ docker ps
|
||||
@ -212,13 +225,13 @@ $ docker ps
|
||||
*The system displays the following if all your RAGFlow components are running properly:*
|
||||
|
||||
```
|
||||
5bc45806b680 infiniflow/ragflow:v0.3.1 "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
|
||||
5bc45806b680 infiniflow/ragflow:latest "./entrypoint.sh" 11 hours ago Up 11 hours 0.0.0.0:80->80/tcp, :::80->80/tcp, 0.0.0.0:443->443/tcp, :::443->443/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp ragflow-server
|
||||
91220e3285dd docker.elastic.co/elasticsearch/elasticsearch:8.11.3 "/bin/tini -- /usr/l…" 11 hours ago Up 11 hours (healthy) 9300/tcp, 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp ragflow-es-01
|
||||
d8c86f06c56b mysql:5.7.18 "docker-entrypoint.s…" 7 days ago Up 16 seconds (healthy) 0.0.0.0:3306->3306/tcp, :::3306->3306/tcp ragflow-mysql
|
||||
cd29bcb254bc quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z "/usr/bin/docker-ent…" 2 weeks ago Up 11 hours 0.0.0.0:9001->9001/tcp, :::9001->9001/tcp, 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp ragflow-minio
|
||||
```
|
||||
|
||||
#### 4.8 `Exception: Can't connect to ES cluster`
|
||||
#### 4.9 `Exception: Can't connect to ES cluster`
|
||||
|
||||
1. Check the status of your Elasticsearch component:
|
||||
|
||||
@ -245,23 +258,26 @@ $ docker ps
|
||||
curl http://<IP_OF_ES>:<PORT_OF_ES>
|
||||
```
|
||||
|
||||
#### 4.10 Can't start ES container and get `Elasticsearch did not exit normally`
|
||||
|
||||
#### 4.9 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
|
||||
This is because the `vm.max_map_count` change was not persisted in **/etc/sysctl.conf**, so it was reset after a system reboot.
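A minimal sketch of the fix (262144 is the commonly required Elasticsearch minimum; adjust upward if your setup needs more):

```bash
# Apply immediately (lost after reboot):
sudo sysctl -w vm.max_map_count=262144

# Persist across reboots:
echo 'vm.max_map_count=262144' | sudo tee -a /etc/sysctl.conf
```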
|
||||
|
||||
#### 4.11 `{"data":null,"retcode":100,"retmsg":"<NotFound '404: Not Found'>"}`
|
||||
|
||||
Your IP address or port number may be incorrect. If you are using the default configurations, enter http://<IP_OF_YOUR_MACHINE> (**NOT 9380, AND NO PORT NUMBER REQUIRED!**) in your browser. This should work.
|
||||
|
||||
#### 4.10 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`
|
||||
#### 4.12 `Ollama - Mistral instance running at 127.0.0.1:11434 but cannot add Ollama as model in RagFlow`
|
||||
|
||||
A correct Ollama IP address and port is crucial to adding models to Ollama:
|
||||
|
||||
- If you are on demo.ragflow.io, ensure that the server hosting Ollama has a publicly accessible IP address. Note that 127.0.0.1 is not a publicly accessible IP address.
|
||||
- If you deploy RAGFlow locally, ensure that Ollama and RAGFlow are in the same LAN and can communicate with each other.
|
||||
|
||||
#### 4.11 Do you offer examples of using deepdoc to parse PDF or other files?
|
||||
#### 4.13 Do you offer examples of using deepdoc to parse PDF or other files?
|
||||
|
||||
Yes, we do. See the Python files under the **rag/app** folder.
|
||||
|
||||
#### 4.12 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?
|
||||
#### 4.14 Why did I fail to upload a 10MB+ file to my locally deployed RAGFlow?
|
||||
|
||||
You probably forgot to update the **MAX_CONTENT_LENGTH** environment variable:
|
||||
|
||||
@ -280,7 +296,7 @@ docker compose up ragflow -d
|
||||
```
|
||||
*Now you should be able to upload files of sizes less than 100MB.*
|
||||
|
||||
#### 4.13 `Table 'rag_flow.document' doesn't exist`
|
||||
#### 4.15 `Table 'rag_flow.document' doesn't exist`
|
||||
|
||||
This exception occurs when starting up the RAGFlow server. Try the following:
|
||||
|
||||
@ -303,7 +319,7 @@ This exception occurs when starting up the RAGFlow server. Try the following:
|
||||
docker compose up
|
||||
```
|
||||
|
||||
#### 4.14 `hint : 102 Fail to access model Connection error`
|
||||
#### 4.16 `hint : 102 Fail to access model Connection error`
|
||||
|
||||

|
||||
|
||||
@ -311,6 +327,13 @@ This exception occurs when starting up the RAGFlow server. Try the following:
|
||||
2. Do not forget to append **/v1/** to **http://IP:port**:
|
||||
**http://IP:port/v1/**
|
||||
|
||||
#### 4.17 `FileNotFoundError: [Errno 2] No such file or directory`
|
||||
|
||||
1. Check if the status of your minio container is healthy:
|
||||
```bash
|
||||
docker ps
|
||||
```
|
||||
2. Ensure that the username and password settings of MySQL and MinIO in **docker/.env** are in line with those in **docker/service_conf.yml**.
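A rough way to cross-check both sides (a sketch; the paths and variable names follow those shown elsewhere in this diff, so adjust them if yours differ):

```bash
# Credentials passed to the containers:
grep -E '^(MYSQL|MINIO)' docker/.env

# Credentials the RAGFlow server reads -- compare the user/password entries:
grep -A 3 -E '^(mysql|minio):' docker/service_conf.yml
```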
|
||||
|
||||
## Usage
|
||||
|
||||
@ -340,10 +363,43 @@ You can use Ollama to deploy local LLM. See [here](https://github.com/infiniflow
|
||||
|
||||
### 6. How to configure RAGFlow to respond with 100% matched results, rather than utilizing LLM?
|
||||
|
||||
1. Click the **Knowledge Base** tab in the middle top of the page.
|
||||
1. Click **Knowledge Base** in the middle top of the page.
|
||||
2. Right click the desired knowledge base to display the **Configuration** dialogue.
|
||||
3. Choose **Q&A** as the chunk method and click **Save** to confirm your change.
|
||||
|
||||
### Do I need to connect to Redis?
|
||||
### 7 Do I need to connect to Redis?
|
||||
|
||||
No, connecting to Redis is not required to use RAGFlow.
|
||||
No, connecting to Redis is not required.
|
||||
|
||||
### 8 `Error: Range of input length should be [1, 30000]`
|
||||
|
||||
This error occurs because too many chunks match your search criteria. Try reducing the **TopN** and/or increasing the **Similarity threshold** to fix this issue:
|
||||
|
||||
1. Click **Chat** in the middle top of the page.
|
||||
2. Right click the desired conversation > **Edit** > **Prompt Engine**
|
||||
3. Reduce the **TopN** and/or raise the **Similarity threshold**.
|
||||
4. Click **OK** to confirm your changes.
|
||||
|
||||

|
||||
|
||||
### 9 How to update RAGFlow to the latest version?
|
||||
|
||||
1. Pull the latest source code
|
||||
```bash
|
||||
cd ragflow
|
||||
git pull
|
||||
```
|
||||
2. If you used `docker compose up -d` to start up RAGFlow server:
|
||||
```bash
|
||||
docker pull infiniflow/ragflow:dev
|
||||
```
|
||||
```bash
|
||||
docker compose up ragflow -d
|
||||
```
|
||||
3. If you used `docker compose -f docker-compose-CN.yml up -d` to start up RAGFlow server:
|
||||
```bash
|
||||
docker pull swr.cn-north-4.myhuaweicloud.com/infiniflow/ragflow:dev
|
||||
```
|
||||
```bash
|
||||
docker compose -f docker-compose-CN.yml up -d
|
||||
```
|
||||
|
||||
@ -18,7 +18,7 @@ from io import BytesIO
|
||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
|
||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
|
||||
tokenize_chunks, find_codec
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
||||
|
||||
|
||||
@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"""
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
pdf_parser = None
|
||||
sections, tbls = [], []
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
|
||||
@ -19,7 +19,7 @@ from docx import Document
|
||||
from api.db import ParserType
|
||||
from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
|
||||
make_colon_as_title, add_positions, tokenize_chunks, find_codec
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, DocxParser, PlainParser
|
||||
from rag.settings import cron_logger
|
||||
|
||||
@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"""
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
pdf_parser = None
|
||||
sections = []
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
@ -113,7 +113,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
|
||||
@ -2,7 +2,7 @@ import copy
|
||||
import re
|
||||
|
||||
from api.db import ParserType
|
||||
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from deepdoc.parser import PdfParser, PlainParser
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
doc = {
|
||||
"docnm_kwd": filename
|
||||
}
|
||||
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ from docx import Document
|
||||
from timeit import default_timer as timer
|
||||
import re
|
||||
from deepdoc.parser.pdf_parser import PlainParser
|
||||
from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
||||
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||
from rag.settings import cron_logger
|
||||
|
||||
@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
res = []
|
||||
pdf_parser = None
|
||||
sections = []
|
||||
@ -141,7 +141,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
|
||||
@ -14,7 +14,7 @@ from tika import parser
|
||||
from io import BytesIO
|
||||
import re
|
||||
from rag.app import laws
|
||||
from rag.nlp import huqie, tokenize, find_codec
|
||||
from rag.nlp import rag_tokenizer, tokenize, find_codec
|
||||
from deepdoc.parser import PdfParser, ExcelParser, PlainParser
|
||||
|
||||
|
||||
@ -85,7 +85,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
tokenize(doc, "\n".join(sections), eng)
|
||||
return [doc]
|
||||
|
||||
|
||||
@ -15,7 +15,7 @@ import re
|
||||
from collections import Counter
|
||||
|
||||
from api.db import ParserType
|
||||
from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
|
||||
from deepdoc.parser import PdfParser, PlainParser
|
||||
import numpy as np
|
||||
from rag.utils import num_tokens_from_string
|
||||
@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
|
||||
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
|
||||
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
|
||||
doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
|
||||
"title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
print("It's English.....", eng)
|
||||
|
||||
@ -17,7 +17,7 @@ from io import BytesIO
|
||||
from PIL import Image
|
||||
|
||||
from rag.nlp import tokenize, is_english
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||
from PyPDF2 import PdfReader as pdf2_read
|
||||
|
||||
@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
eng = lang.lower() == "english"
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
res = []
|
||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||
ppt_parser = Ppt()
|
||||
|
||||
@ -16,7 +16,7 @@ from io import BytesIO
|
||||
from nltk import word_tokenize
|
||||
from openpyxl import load_workbook
|
||||
from rag.nlp import is_english, random_choices, find_codec
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
|
||||
aprefix = "Answer: " if eng else "回答:"
|
||||
d["content_with_weight"] = "\t".join(
|
||||
[qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
|
||||
d["content_ltks"] = huqie.qie(q)
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(q)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
return d
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
res = []
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
@ -107,7 +107,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
@ -116,18 +116,31 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
break
|
||||
txt += l
|
||||
lines = txt.split("\n")
|
||||
#is_english([rmPrefix(l) for l in lines[:100]])
|
||||
comma, tab = 0, 0
|
||||
for l in lines:
|
||||
if len(l.split(",")) == 2: comma += 1
|
||||
if len(l.split("\t")) == 2: tab += 1
|
||||
delimiter = "\t" if tab >= comma else ","
|
||||
|
||||
fails = []
|
||||
for i, line in enumerate(lines):
|
||||
arr = [l for l in line.split("\t") if len(l) > 1]
|
||||
question, answer = "", ""
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
arr = lines[i].split(delimiter)
|
||||
if len(arr) != 2:
|
||||
fails.append(str(i))
|
||||
continue
|
||||
res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
|
||||
if question: answer += "\n" + lines[i]
|
||||
else:
|
||||
fails.append(str(i+1))
|
||||
elif len(arr) == 2:
|
||||
if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
question, answer = arr
|
||||
i += 1
|
||||
if len(res) % 999 == 0:
|
||||
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
|
||||
if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
|
||||
|
||||
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
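The Q&A chunker above no longer assumes one tab-separated pair per line: it first guesses the delimiter (tab vs. comma), then lets an answer continue across lines until the next question starts. A condensed sketch of that control flow, with `doc` and `beAdoc` passed in as the helpers defined earlier in the same file:

```python
from copy import deepcopy


def extract_qa_pairs(lines, doc, eng, beAdoc):
    """Condensed sketch of the delimiter detection and multi-line answer handling above."""
    comma = sum(1 for l in lines if len(l.split(",")) == 2)
    tab = sum(1 for l in lines if len(l.split("\t")) == 2)
    delimiter = "\t" if tab >= comma else ","

    res, fails = [], []
    question, answer = "", ""
    for i, line in enumerate(lines):
        arr = line.split(delimiter)
        if len(arr) != 2:
            # not a "question<delimiter>answer" line: treat it as answer continuation
            if question:
                answer += "\n" + line
            else:
                fails.append(str(i + 1))
        else:
            # a new pair starts, so flush the previous one first
            if question and answer:
                res.append(beAdoc(deepcopy(doc), question, answer, eng))
            question, answer = arr
    if question:
        res.append(beAdoc(deepcopy(doc), question, answer, eng))
    return res, fails
```
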
@ -18,7 +18,7 @@ import re
|
||||
import pandas as pd
|
||||
import requests
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser.resume import refactor
|
||||
from deepdoc.parser.resume import step_one, step_two
|
||||
from rag.settings import cron_logger
|
||||
@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
titles.append(str(v))
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie("-".join(titles) + "-简历")
|
||||
"title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
pairs = []
|
||||
for n, m in field_map.items():
|
||||
if not resume.get(n):
|
||||
@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
|
||||
doc["content_with_weight"] = "\n".join(
|
||||
["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
|
||||
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
|
||||
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
|
||||
doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
|
||||
doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
|
||||
for n, _ in field_map.items():
|
||||
if n not in resume:
|
||||
continue
|
||||
@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
||||
resume[n] = resume[n][0]
|
||||
if n.find("_tks") > 0:
|
||||
resume[n] = huqie.qieqie(resume[n])
|
||||
resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
|
||||
doc[n] = resume[n]
|
||||
|
||||
print(doc)
|
||||
|
||||
@ -20,7 +20,7 @@ from openpyxl import load_workbook
|
||||
from dateutil.parser import parse as datetime_parse
|
||||
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.nlp import huqie, is_english, tokenize, find_codec
|
||||
from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
|
||||
from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
@ -47,6 +47,7 @@ class Excel(ExcelParser):
|
||||
cell.value for i,
|
||||
cell in enumerate(
|
||||
rows[0]) if i not in missed]
|
||||
if not headers:continue
|
||||
data = []
|
||||
for i, r in enumerate(rows[1:]):
|
||||
rn += 1
|
||||
@ -148,7 +149,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
||||
txt = ""
|
||||
if binary:
|
||||
encoding = find_codec(binary)
|
||||
txt = binary.decode(encoding)
|
||||
txt = binary.decode(encoding, errors="ignore")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
@ -216,7 +217,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
||||
for ii, row in df.iterrows():
|
||||
d = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
row_txt = []
|
||||
for j in range(len(clmns)):
|
||||
@ -227,7 +228,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
||||
if pd.isna(row[clmns[j]]):
|
||||
continue
|
||||
fld = clmns_map[j][0]
|
||||
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
|
||||
d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
|
||||
row[clmns[j]])
|
||||
row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
|
||||
if not row_txt:
|
||||
|
||||
@ -22,7 +22,7 @@ EmbeddingModel = {
    "Ollama": OllamaEmbed,
    "OpenAI": OpenAIEmbed,
    "Xinference": XinferenceEmbed,
    "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
    "Tongyi-Qianwen": DefaultEmbedding, #QWenEmbed,
    "ZHIPU-AI": ZhipuEmbed,
    "FastEmbed": FastEmbed,
    "Youdao": YoudaoEmbed
@ -45,6 +45,7 @@ ChatModel = {
    "Tongyi-Qianwen": QWenChat,
    "Ollama": OllamaChat,
    "Xinference": XinferenceChat,
    "Moonshot": MoonshotChat
    "Moonshot": MoonshotChat,
    "DeepSeek": DeepSeekChat
}

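With `HuEmbedding` renamed to `DefaultEmbedding` and `DeepSeek` added to `ChatModel`, callers keep resolving providers through the same factory dictionaries. A hedged sketch of that lookup; the module path, API key, and model name are assumptions for illustration:

```python
from rag.llm import ChatModel, EmbeddingModel  # module path assumed

# "Tongyi-Qianwen" now resolves to DefaultEmbedding rather than the old HuEmbedding
default_embedding_cls = EmbeddingModel["Tongyi-Qianwen"]

# placeholder credentials and model name
chat_mdl = ChatModel["DeepSeek"](key="sk-xxxx", model_name="deepseek-chat")
answer, used_tokens = chat_mdl.chat(
    system="You are a helpful assistant.",
    history=[{"role": "user", "content": "Hello"}],
    gen_conf={"temperature": 0.1},
)
```
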
@ -24,16 +24,7 @@ from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
|
||||
pass
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
raise NotImplementedError("Please implement encode method!")
|
||||
|
||||
|
||||
class GptTurbo(Base):
|
||||
def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
|
||||
if not base_url: base_url="https://api.openai.com/v1"
|
||||
def __init__(self, key, model_name, base_url):
|
||||
self.client = OpenAI(api_key=key, base_url=base_url)
|
||||
self.model_name = model_name
|
||||
|
||||
@ -54,28 +45,28 @@ class GptTurbo(Base):
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
|
||||
class MoonshotChat(GptTurbo):
|
||||
class GptTurbo(Base):
|
||||
def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
|
||||
if not base_url: base_url="https://api.openai.com/v1"
|
||||
super().__init__(key, model_name, base_url)
|
||||
|
||||
|
||||
class MoonshotChat(Base):
|
||||
def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
|
||||
if not base_url: base_url="https://api.moonshot.cn/v1"
|
||||
self.client = OpenAI(
|
||||
api_key=key, base_url=base_url)
|
||||
self.model_name = model_name
|
||||
super().__init__(key, model_name, base_url)
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model_name,
|
||||
messages=history,
|
||||
**gen_conf)
|
||||
ans = response.choices[0].message.content.strip()
|
||||
if response.choices[0].finish_reason == "length":
|
||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
return ans, response.usage.total_tokens
|
||||
except openai.APIError as e:
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
class XinferenceChat(Base):
|
||||
def __init__(self, key=None, model_name="", base_url=""):
|
||||
key = "xxx"
|
||||
super().__init__(key, model_name, base_url)
|
||||
|
||||
|
||||
class DeepSeekChat(Base):
|
||||
def __init__(self, key, model_name="deepseek-chat", base_url="https://api.deepseek.com/v1"):
|
||||
if not base_url: base_url="https://api.deepseek.com/v1"
|
||||
super().__init__(key, model_name, base_url)
|
||||
|
||||
|
||||
class QWenChat(Base):
|
||||
@ -141,12 +132,12 @@ class OllamaChat(Base):
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
options = {"temperature": gen_conf.get("temperature", 0.1),
|
||||
"num_predict": gen_conf.get("max_tokens", 128),
|
||||
"top_k": gen_conf.get("top_p", 0.3),
|
||||
"presence_penalty": gen_conf.get("presence_penalty", 0.4),
|
||||
"frequency_penalty": gen_conf.get("frequency_penalty", 0.7),
|
||||
}
|
||||
options = {}
|
||||
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
|
||||
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
|
||||
if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
|
||||
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
|
||||
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
|
||||
response = self.client.chat(
|
||||
model=self.model_name,
|
||||
messages=history,
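The Ollama client above now forwards only the generation settings that were actually supplied instead of filling every option with a default. A small sketch of the same mapping, factored into a hypothetical helper:

```python
def gen_conf_to_ollama_options(gen_conf: dict) -> dict:
    """Map OpenAI-style generation settings onto Ollama option names, skipping absent keys."""
    mapping = {
        "temperature": "temperature",
        "max_tokens": "num_predict",
        "top_p": "top_k",  # the hunk above maps top_p onto Ollama's top_k
        "presence_penalty": "presence_penalty",
        "frequency_penalty": "frequency_penalty",
    }
    return {ollama_key: gen_conf[key] for key, ollama_key in mapping.items() if key in gen_conf}
```
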
@ -157,25 +148,3 @@ class OllamaChat(Base):
|
||||
except Exception as e:
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
|
||||
class XinferenceChat(Base):
|
||||
def __init__(self, key=None, model_name="", base_url=""):
|
||||
self.client = OpenAI(api_key="xxx", base_url=base_url)
|
||||
self.model_name = model_name
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model_name,
|
||||
messages=history,
|
||||
**gen_conf)
|
||||
ans = response.choices[0].message.content.strip()
|
||||
if response.choices[0].finish_reason == "length":
|
||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
return ans, response.usage.total_tokens
|
||||
except openai.APIError as e:
|
||||
return "**ERROR**: " + str(e), 0
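After this refactor, `Base.__init__` appears to own the OpenAI-compatible client, and `GptTurbo`, `MoonshotChat`, `XinferenceChat`, and `DeepSeekChat` only supply their default endpoints. A sketch, under that assumption, of what adding another OpenAI-compatible provider would look like; the class name and endpoint are hypothetical:

```python
from rag.llm.chat_model import Base  # module path assumed


class ExampleCompatChat(Base):
    """Hypothetical provider that differs from GptTurbo only in its default endpoint."""

    def __init__(self, key, model_name="example-chat", base_url="https://api.example.com/v1"):
        if not base_url:
            base_url = "https://api.example.com/v1"
        super().__init__(key, model_name, base_url)
```
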
@ -26,19 +26,17 @@ from FlagEmbedding import FlagModel
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.file_utils import get_project_base_directory, get_home_cache_dir
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
|
||||
try:
|
||||
flag_model = FlagModel(os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res/bge-large-zh-v1.5"),
|
||||
flag_model = FlagModel(os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
use_fp16=torch.cuda.is_available())
|
||||
except Exception as e:
|
||||
model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
|
||||
local_dir=os.path.join(get_project_base_directory(), "rag/res/bge-large-zh-v1.5"),
|
||||
local_dir=os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5"),
|
||||
local_dir_use_symlinks=False)
|
||||
flag_model = FlagModel(model_dir,
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
@ -56,7 +54,7 @@ class Base(ABC):
|
||||
raise NotImplementedError("Please implement encode method!")
|
||||
|
||||
|
||||
class HuEmbedding(Base):
|
||||
class DefaultEmbedding(Base):
|
||||
def __init__(self, *args, **kwargs):
|
||||
"""
|
||||
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
||||
@ -97,8 +95,7 @@ class OpenAIEmbed(Base):
|
||||
def encode(self, texts: list, batch_size=32):
|
||||
res = self.client.embeddings.create(input=texts,
|
||||
model=self.model_name)
|
||||
return np.array([d.embedding for d in res.data]
|
||||
), res.usage.total_tokens
|
||||
return np.array([d.embedding for d in res.data]), res.usage.total_tokens
|
||||
|
||||
def encode_queries(self, text):
|
||||
res = self.client.embeddings.create(input=[text],
|
||||
@ -238,8 +235,8 @@ class YoudaoEmbed(Base):
|
||||
try:
|
||||
print("LOADING BCE...")
|
||||
YoudaoEmbed._client = qanthing(model_name_or_path=os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res/bce-embedding-base_v1"))
|
||||
get_home_cache_dir(),
|
||||
"bce-embedding-base_v1"))
|
||||
except Exception as e:
|
||||
YoudaoEmbed._client = qanthing(
|
||||
model_name_or_path=model_name.replace(
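The built-in embedding models above (the default BGE model and the Youdao BCE model) are now loaded from the user's home cache directory instead of the repository tree, with a HuggingFace download as fallback for the default model. A hedged sketch of that pattern, following the names in the hunks above:

```python
import os

import torch
from FlagEmbedding import FlagModel
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_home_cache_dir

model_dir = os.path.join(get_home_cache_dir(), "bge-large-zh-v1.5")
try:
    # try the locally cached copy first
    flag_model = FlagModel(model_dir,
                           query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                           use_fp16=torch.cuda.is_available())
except Exception:
    # fall back to downloading the weights into the same cache directory
    model_dir = snapshot_download(repo_id="BAAI/bge-large-zh-v1.5",
                                  local_dir=model_dir,
                                  local_dir_use_symlinks=False)
    flag_model = FlagModel(model_dir,
                           query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
                           use_fp16=torch.cuda.is_available())
```
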
@ -2,7 +2,7 @@ import random
|
||||
from collections import Counter
|
||||
|
||||
from rag.utils import num_tokens_from_string
|
||||
from . import huqie
|
||||
from . import rag_tokenizer
|
||||
import re
|
||||
import copy
|
||||
|
||||
@ -28,11 +28,17 @@ all_codecs = [
def find_codec(blob):
    global all_codecs
    for c in all_codecs:
        try:
            blob[:1024].decode(c)
            return c
        except Exception as e:
            pass
        try:
            blob.decode(c)
            return c
        except Exception as e:
            pass

    return "utf-8"

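`find_codec` now verifies that the whole blob decodes, not just the first kilobyte, and the chunkers above pair it with `errors="ignore"` so a single bad byte cannot abort parsing. Usage as in those chunkers:

```python
from rag.nlp import find_codec


def decode_binary(binary: bytes) -> str:
    # detect the most plausible codec, then decode defensively
    encoding = find_codec(binary)
    return binary.decode(encoding, errors="ignore")
```
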
@ -109,8 +115,8 @@ def is_english(texts):
|
||||
def tokenize(d, t, eng):
|
||||
d["content_with_weight"] = t
|
||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
||||
d["content_ltks"] = huqie.qie(t)
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(t)
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
|
||||
|
||||
def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
||||
|
||||
@ -1,475 +0,0 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
import os
|
||||
import copy
|
||||
import base64
|
||||
import magic
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
import numpy as np
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class HuChunker:
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self):
|
||||
self.MAX_LVL = 12
|
||||
self.proj_patt = [
|
||||
(r"第[零一二三四五六七八九十百]+章", 1),
|
||||
(r"第[零一二三四五六七八九十百]+[条节]", 2),
|
||||
(r"[零一二三四五六七八九十百]+[、 ]", 3),
|
||||
(r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
|
||||
(r"[0-9]+(、|\.[ ]|\.[^0-9])", 5),
|
||||
(r"[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 6),
|
||||
(r"[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 7),
|
||||
(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ ]|[^0-9])", 8),
|
||||
(r".{,48}[::??]@", 9),
|
||||
(r"[0-9]+)", 10),
|
||||
(r"[\((][0-9]+[)\)]", 11),
|
||||
(r"[零一二三四五六七八九十百]+是", 12),
|
||||
(r"[⚫•➢✓ ]", 12)
|
||||
]
|
||||
self.lines = []
|
||||
|
||||
def _garbage(self, txt):
|
||||
patt = [
|
||||
r"(在此保证|不得以任何形式翻版|请勿传阅|仅供内部使用|未经事先书面授权)",
|
||||
r"(版权(归本公司)*所有|免责声明|保留一切权力|承担全部责任|特别声明|报告中涉及)",
|
||||
r"(不承担任何责任|投资者的通知事项:|任何机构和个人|本报告仅为|不构成投资)",
|
||||
r"(不构成对任何个人或机构投资建议|联系其所在国家|本报告由从事证券交易)",
|
||||
r"(本研究报告由|「认可投资者」|所有研究报告均以|请发邮件至)",
|
||||
r"(本报告仅供|市场有风险,投资需谨慎|本报告中提及的)",
|
||||
r"(本报告反映|此信息仅供|证券分析师承诺|具备证券投资咨询业务资格)",
|
||||
r"^(时间|签字|签章)[::]",
|
||||
r"(参考文献|目录索引|图表索引)",
|
||||
r"[ ]*年[ ]+月[ ]+日",
|
||||
r"^(中国证券业协会|[0-9]+年[0-9]+月[0-9]+日)$",
|
||||
r"\.{10,}",
|
||||
r"(———————END|帮我转发|欢迎收藏|快来关注我吧)"
|
||||
]
|
||||
return any([re.search(p, txt) for p in patt])
|
||||
|
||||
def _proj_match(self, line):
|
||||
for p, j in self.proj_patt:
|
||||
if re.match(p, line):
|
||||
return j
|
||||
return
|
||||
|
||||
def _does_proj_match(self):
|
||||
mat = [None for _ in range(len(self.lines))]
|
||||
for i in range(len(self.lines)):
|
||||
mat[i] = self._proj_match(self.lines[i])
|
||||
return mat
|
||||
|
||||
def naive_text_chunk(self, text, ti="", MAX_LEN=612):
|
||||
if text:
|
||||
self.lines = [l.strip().replace(u'\u3000', u' ')
|
||||
.replace(u'\xa0', u'')
|
||||
for l in text.split("\n\n")]
|
||||
self.lines = [l for l in self.lines if not self._garbage(l)]
|
||||
self.lines = [re.sub(r"([ ]+| )", " ", l)
|
||||
for l in self.lines if l]
|
||||
if not self.lines:
|
||||
return []
|
||||
arr = self.lines
|
||||
|
||||
res = [""]
|
||||
i = 0
|
||||
while i < len(arr):
|
||||
a = arr[i]
|
||||
if not a:
|
||||
i += 1
|
||||
continue
|
||||
if len(a) > MAX_LEN:
|
||||
a_ = a.split("\n")
|
||||
if len(a_) >= 2:
|
||||
arr.pop(i)
|
||||
for j in range(2, len(a_) + 1):
|
||||
if len("\n".join(a_[:j])) >= MAX_LEN:
|
||||
arr.insert(i, "\n".join(a_[:j - 1]))
|
||||
arr.insert(i + 1, "\n".join(a_[j - 1:]))
|
||||
break
|
||||
else:
|
||||
assert False, f"Can't split: {a}"
|
||||
continue
|
||||
|
||||
if len(res[-1]) < MAX_LEN / 3:
|
||||
res[-1] += "\n" + a
|
||||
else:
|
||||
res.append(a)
|
||||
i += 1
|
||||
|
||||
if ti:
|
||||
for i in range(len(res)):
|
||||
if res[i].find("——来自") >= 0:
|
||||
continue
|
||||
res[i] += f"\t——来自“{ti}”"
|
||||
|
||||
return res
|
||||
|
||||
def _merge(self):
|
||||
# merge continuous same level text
|
||||
lines = [self.lines[0]] if self.lines else []
|
||||
for i in range(1, len(self.lines)):
|
||||
if self.mat[i] == self.mat[i - 1] \
|
||||
and len(lines[-1]) < 256 \
|
||||
and len(self.lines[i]) < 256:
|
||||
lines[-1] += "\n" + self.lines[i]
|
||||
continue
|
||||
lines.append(self.lines[i])
|
||||
self.lines = lines
|
||||
self.mat = self._does_proj_match()
|
||||
return self.mat
|
||||
|
||||
def text_chunks(self, text):
|
||||
if text:
|
||||
self.lines = [l.strip().replace(u'\u3000', u' ')
|
||||
.replace(u'\xa0', u'')
|
||||
for l in re.split(r"[\r\n]", text)]
|
||||
self.lines = [l for l in self.lines if not self._garbage(l)]
|
||||
self.lines = [l for l in self.lines if l]
|
||||
self.mat = self._does_proj_match()
|
||||
mat = self._merge()
|
||||
|
||||
tree = []
|
||||
for i in range(len(self.lines)):
|
||||
tree.append({"proj": mat[i],
|
||||
"children": [],
|
||||
"read": False})
|
||||
# find all children
|
||||
for i in range(len(self.lines) - 1):
|
||||
if tree[i]["proj"] is None:
|
||||
continue
|
||||
ed = i + 1
|
||||
while ed < len(tree) and (tree[ed]["proj"] is None or
|
||||
tree[ed]["proj"] > tree[i]["proj"]):
|
||||
ed += 1
|
||||
|
||||
nxt = tree[i]["proj"] + 1
|
||||
st = set([p["proj"] for p in tree[i + 1: ed] if p["proj"]])
|
||||
while nxt not in st:
|
||||
nxt += 1
|
||||
if nxt > self.MAX_LVL:
|
||||
break
|
||||
if nxt <= self.MAX_LVL:
|
||||
for j in range(i + 1, ed):
|
||||
if tree[j]["proj"] is not None:
|
||||
break
|
||||
tree[i]["children"].append(j)
|
||||
for j in range(i + 1, ed):
|
||||
if tree[j]["proj"] != nxt:
|
||||
continue
|
||||
tree[i]["children"].append(j)
|
||||
else:
|
||||
for j in range(i + 1, ed):
|
||||
tree[i]["children"].append(j)
|
||||
|
||||
# get DFS combinations, find all the paths to leaf
|
||||
paths = []
|
||||
|
||||
def dfs(i, path):
|
||||
nonlocal tree, paths
|
||||
path.append(i)
|
||||
tree[i]["read"] = True
|
||||
if len(self.lines[i]) > 256:
|
||||
paths.append(path)
|
||||
return
|
||||
if not tree[i]["children"]:
|
||||
if len(path) > 1 or len(self.lines[i]) >= 32:
|
||||
paths.append(path)
|
||||
return
|
||||
for j in tree[i]["children"]:
|
||||
dfs(j, copy.deepcopy(path))
|
||||
|
||||
for i, t in enumerate(tree):
|
||||
if t["read"]:
|
||||
continue
|
||||
dfs(i, [])
|
||||
|
||||
# concat txt on the path for all paths
|
||||
res = []
|
||||
lines = np.array(self.lines)
|
||||
for p in paths:
|
||||
if len(p) < 2:
|
||||
tree[p[0]]["read"] = False
|
||||
continue
|
||||
txt = "\n".join(lines[p[:-1]]) + "\n" + lines[p[-1]]
|
||||
res.append(txt)
|
||||
# concat continuous orphans
|
||||
assert len(tree) == len(lines)
|
||||
ii = 0
|
||||
while ii < len(tree):
|
||||
if tree[ii]["read"]:
|
||||
ii += 1
|
||||
continue
|
||||
txt = lines[ii]
|
||||
e = ii + 1
|
||||
while e < len(tree) and not tree[e]["read"] and len(txt) < 256:
|
||||
txt += "\n" + lines[e]
|
||||
e += 1
|
||||
res.append(txt)
|
||||
ii = e
|
||||
|
||||
# if the node has not been read, find its daddy
|
||||
def find_daddy(st):
|
||||
nonlocal lines, tree
|
||||
proj = tree[st]["proj"]
|
||||
if len(self.lines[st]) > 512:
|
||||
return [st]
|
||||
if proj is None:
|
||||
proj = self.MAX_LVL + 1
|
||||
for i in range(st - 1, -1, -1):
|
||||
if tree[i]["proj"] and tree[i]["proj"] < proj:
|
||||
a = [st] + find_daddy(i)
|
||||
return a
|
||||
return []
|
||||
|
||||
return res
|
||||
|
||||
|
||||
class PdfChunker(HuChunker):
|
||||
|
||||
def __init__(self, pdf_parser):
|
||||
self.pdf = pdf_parser
|
||||
super().__init__()
|
||||
|
||||
def tableHtmls(self, pdfnm):
|
||||
_, tbls = self.pdf(pdfnm, return_html=True)
|
||||
res = []
|
||||
for img, arr in tbls:
|
||||
if arr[0].find("<table>") < 0:
|
||||
continue
|
||||
buffered = BytesIO()
|
||||
if img:
|
||||
img.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(
|
||||
buffered.getvalue()).decode('utf-8') if img else ""
|
||||
res.append({"table": arr[0], "image": img_str})
|
||||
return res
|
||||
|
||||
def html(self, pdfnm):
|
||||
txts, tbls = self.pdf(pdfnm, return_html=True)
|
||||
res = []
|
||||
txt_cks = self.text_chunks(txts)
|
||||
for txt, img in [(self.pdf.remove_tag(c), self.pdf.crop(c))
|
||||
for c in txt_cks]:
|
||||
buffered = BytesIO()
|
||||
if img:
|
||||
img.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(
|
||||
buffered.getvalue()).decode('utf-8') if img else ""
|
||||
res.append({"table": "<p>%s</p>" % txt.replace("\n", "<br/>"),
|
||||
"image": img_str})
|
||||
|
||||
for img, arr in tbls:
|
||||
if not arr:
|
||||
continue
|
||||
buffered = BytesIO()
|
||||
if img:
|
||||
img.save(buffered, format="JPEG")
|
||||
img_str = base64.b64encode(
|
||||
buffered.getvalue()).decode('utf-8') if img else ""
|
||||
res.append({"table": arr[0], "image": img_str})
|
||||
|
||||
return res
|
||||
|
||||
def __call__(self, pdfnm, return_image=True, naive_chunk=False):
|
||||
flds = self.Fields()
|
||||
text, tbls = self.pdf(pdfnm)
|
||||
fnm = pdfnm
|
||||
txt_cks = self.text_chunks(text) if not naive_chunk else \
|
||||
self.naive_text_chunk(text, ti=fnm if isinstance(fnm, str) else "")
|
||||
flds.text_chunks = [(self.pdf.remove_tag(c),
|
||||
self.pdf.crop(c) if return_image else None) for c in txt_cks]
|
||||
|
||||
flds.table_chunks = [(arr, img if return_image else None)
|
||||
for img, arr in tbls]
|
||||
return flds
|
||||
|
||||
|
||||
class DocxChunker(HuChunker):
|
||||
|
||||
def __init__(self, doc_parser):
|
||||
self.doc = doc_parser
|
||||
super().__init__()
|
||||
|
||||
def _does_proj_match(self):
|
||||
mat = []
|
||||
for s in self.styles:
|
||||
s = s.split(" ")[-1]
|
||||
try:
|
||||
mat.append(int(s))
|
||||
except Exception as e:
|
||||
mat.append(None)
|
||||
return mat
|
||||
|
||||
def _merge(self):
|
||||
i = 1
|
||||
while i < len(self.lines):
|
||||
if self.mat[i] == self.mat[i - 1] \
|
||||
and len(self.lines[i - 1]) < 256 \
|
||||
and len(self.lines[i]) < 256:
|
||||
self.lines[i - 1] += "\n" + self.lines[i]
|
||||
self.styles.pop(i)
|
||||
self.lines.pop(i)
|
||||
self.mat.pop(i)
|
||||
continue
|
||||
i += 1
|
||||
self.mat = self._does_proj_match()
|
||||
return self.mat
|
||||
|
||||
def __call__(self, fnm):
|
||||
flds = self.Fields()
|
||||
flds.title = os.path.splitext(
|
||||
os.path.basename(fnm))[0] if isinstance(
|
||||
fnm, type("")) else ""
|
||||
secs, tbls = self.doc(fnm)
|
||||
self.lines = [l for l, s in secs]
|
||||
self.styles = [s for l, s in secs]
|
||||
|
||||
txt_cks = self.text_chunks("")
|
||||
flds.text_chunks = [(t, None) for t in txt_cks if not self._garbage(t)]
|
||||
flds.table_chunks = [(tb, None) for tb in tbls for t in tb if t]
|
||||
return flds
|
||||
|
||||
|
||||
class ExcelChunker(HuChunker):
|
||||
|
||||
def __init__(self, excel_parser):
|
||||
self.excel = excel_parser
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, fnm):
|
||||
flds = self.Fields()
|
||||
flds.text_chunks = [(t, None) for t in self.excel(fnm)]
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
|
||||
class PptChunker(HuChunker):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __extract(self, shape):
|
||||
if shape.shape_type == 19:
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
if shape.has_text_frame:
|
||||
return shape.text_frame.text
|
||||
|
||||
if shape.shape_type == 6:
|
||||
texts = []
|
||||
for p in shape.shapes:
|
||||
t = self.__extract(p)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
def __call__(self, fnm):
|
||||
from pptx import Presentation
|
||||
ppt = Presentation(fnm) if isinstance(
|
||||
fnm, str) else Presentation(
|
||||
BytesIO(fnm))
|
||||
txts = []
|
||||
for slide in ppt.slides:
|
||||
texts = []
|
||||
for shape in slide.shapes:
|
||||
txt = self.__extract(shape)
|
||||
if txt:
|
||||
texts.append(txt)
|
||||
txts.append("\n".join(texts))
|
||||
|
||||
import aspose.slides as slides
|
||||
import aspose.pydrawing as drawing
|
||||
imgs = []
|
||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
||||
for slide in presentation.slides:
|
||||
buffered = BytesIO()
|
||||
slide.get_thumbnail(
|
||||
0.5, 0.5).save(
|
||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
imgs.append(buffered.getvalue())
|
||||
assert len(imgs) == len(
|
||||
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
|
||||
flds = self.Fields()
|
||||
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
flds.table_chunks = []
|
||||
|
||||
return flds
|
||||
|
||||
|
||||
class TextChunker(HuChunker):
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@staticmethod
|
||||
def is_binary_file(file_path):
|
||||
mime = magic.Magic(mime=True)
|
||||
if isinstance(file_path, str):
|
||||
file_type = mime.from_file(file_path)
|
||||
else:
|
||||
file_type = mime.from_buffer(file_path)
|
||||
if 'text' in file_type:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def __call__(self, fnm):
|
||||
flds = self.Fields()
|
||||
if self.is_binary_file(fnm):
|
||||
return flds
|
||||
txt = ""
|
||||
if isinstance(fnm, str):
|
||||
with open(fnm, "r") as f:
|
||||
txt = f.read()
|
||||
else:
|
||||
txt = fnm.decode("utf-8")
|
||||
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.path.append(os.path.dirname(__file__) + "/../")
|
||||
if sys.argv[1].split(".")[-1].lower() == "pdf":
|
||||
from deepdoc.parser import PdfParser
|
||||
ckr = PdfChunker(PdfParser())
|
||||
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
||||
from deepdoc.parser import DocxParser
|
||||
ckr = DocxChunker(DocxParser())
|
||||
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
||||
from deepdoc.parser import ExcelParser
|
||||
ckr = ExcelChunker(ExcelParser())
|
||||
|
||||
# ckr.html(sys.argv[1])
|
||||
print(ckr(sys.argv[1]))
|
||||
@ -7,14 +7,13 @@ import logging
|
||||
import copy
|
||||
from elasticsearch_dsl import Q
|
||||
|
||||
from rag.nlp import huqie, term_weight, synonym
|
||||
|
||||
from rag.nlp import rag_tokenizer, term_weight, synonym
|
||||
|
||||
class EsQueryer:
|
||||
def __init__(self, es):
|
||||
self.tw = term_weight.Dealer()
|
||||
self.es = es
|
||||
self.syn = synonym.Dealer(None)
|
||||
self.syn = synonym.Dealer()
|
||||
self.flds = ["ask_tks^10", "ask_small_tks"]
|
||||
|
||||
@staticmethod
|
||||
@ -47,13 +46,13 @@ class EsQueryer:
|
||||
txt = re.sub(
|
||||
r"[ \r\n\t,,。??/`!!&]+",
|
||||
" ",
|
||||
huqie.tradi2simp(
|
||||
huqie.strQ2B(
|
||||
rag_tokenizer.tradi2simp(
|
||||
rag_tokenizer.strQ2B(
|
||||
txt.lower()))).strip()
|
||||
txt = EsQueryer.rmWWW(txt)
|
||||
|
||||
if not self.isChinese(txt):
|
||||
tks = huqie.qie(txt).split(" ")
|
||||
tks = rag_tokenizer.tokenize(txt).split(" ")
|
||||
q = copy.deepcopy(tks)
|
||||
for i in range(1, len(tks)):
|
||||
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
|
||||
@ -65,7 +64,7 @@ class EsQueryer:
|
||||
boost=1)#, minimum_should_match=min_match)
|
||||
), tks
|
||||
|
||||
def needQieqie(tk):
|
||||
def need_fine_grained_tokenize(tk):
|
||||
if len(tk) < 4:
|
||||
return False
|
||||
if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
|
||||
@ -81,7 +80,7 @@ class EsQueryer:
|
||||
logging.info(json.dumps(twts, ensure_ascii=False))
|
||||
tms = []
|
||||
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
|
||||
sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
|
||||
sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
|
||||
sm = [
|
||||
re.sub(
|
||||
r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
|
||||
@ -110,10 +109,10 @@ class EsQueryer:
|
||||
if len(twts) > 1:
|
||||
tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
|
||||
if re.match(r"[0-9a-z ]+$", tt):
|
||||
tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
|
||||
tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)
|
||||
|
||||
syns = " OR ".join(
|
||||
["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
|
||||
["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
|
||||
if syns:
|
||||
tms = f"({tms})^5 OR ({syns})^0.7"
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ from nltk.stem import PorterStemmer, WordNetLemmatizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
|
||||
|
||||
class Huqie:
|
||||
class RagTokenizer:
|
||||
def key_(self, line):
|
||||
return str(line.lower().encode("utf-8"))[2:-1]
|
||||
|
||||
@ -241,7 +241,7 @@ class Huqie:
|
||||
|
||||
return self.score_(res[::-1])
|
||||
|
||||
def qie(self, line):
|
||||
def tokenize(self, line):
|
||||
line = self._strQ2B(line).lower()
|
||||
line = self._tradi2simp(line)
|
||||
zh_num = len([1 for c in line if is_chinese(c)])
|
||||
@ -298,7 +298,7 @@ class Huqie:
|
||||
print("[TKS]", self.merge_(res))
|
||||
return self.merge_(res)
|
||||
|
||||
def qieqie(self, tks):
|
||||
def fine_grained_tokenize(self, tks):
|
||||
tks = tks.split(" ")
|
||||
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
|
||||
if zh_num < len(tks) * 0.2:
|
||||
@ -371,53 +371,53 @@ def naiveQie(txt):
|
||||
return tks
|
||||
|
||||
|
||||
hq = Huqie()
|
||||
qie = hq.qie
|
||||
qieqie = hq.qieqie
|
||||
tag = hq.tag
|
||||
freq = hq.freq
|
||||
loadUserDict = hq.loadUserDict
|
||||
addUserDict = hq.addUserDict
|
||||
tradi2simp = hq._tradi2simp
|
||||
strQ2B = hq._strQ2B
|
||||
tokenizer = RagTokenizer()
|
||||
tokenize = tokenizer.tokenize
|
||||
fine_grained_tokenize = tokenizer.fine_grained_tokenize
|
||||
tag = tokenizer.tag
|
||||
freq = tokenizer.freq
|
||||
loadUserDict = tokenizer.loadUserDict
|
||||
addUserDict = tokenizer.addUserDict
|
||||
tradi2simp = tokenizer._tradi2simp
|
||||
strQ2B = tokenizer._strQ2B
|
||||
|
||||
if __name__ == '__main__':
|
||||
huqie = Huqie(debug=True)
|
||||
tknzr = RagTokenizer(debug=True)
|
||||
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
|
||||
tks = huqie.qie(
|
||||
tks = tknzr.tokenize(
|
||||
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie(
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie(
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie(
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie("虽然我不怎么玩")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie(
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("虽然我不怎么玩")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie("这周日你去吗?这周日你有空吗?")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
|
||||
print(huqie.qieqie(tks))
|
||||
tks = huqie.qie(
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
tks = tknzr.tokenize(
|
||||
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
|
||||
print(huqie.qieqie(tks))
|
||||
print(tknzr.fine_grained_tokenize(tks))
|
||||
if len(sys.argv) < 2:
|
||||
sys.exit()
|
||||
huqie.DEBUG = False
|
||||
huqie.loadUserDict(sys.argv[1])
|
||||
tknzr.DEBUG = False
|
||||
tknzr.loadUserDict(sys.argv[1])
|
||||
of = open(sys.argv[2], "r")
|
||||
while True:
|
||||
line = of.readline()
|
||||
if not line:
|
||||
break
|
||||
print(huqie.qie(line))
|
||||
print(tknzr.tokenize(line))
|
||||
of.close()
|
||||
@ -9,7 +9,7 @@ from dataclasses import dataclass
|
||||
|
||||
from rag.settings import es_logger
|
||||
from rag.utils import rmSpace
|
||||
from rag.nlp import huqie, query
|
||||
from rag.nlp import rag_tokenizer, query
|
||||
import numpy as np
|
||||
|
||||
|
||||
@ -128,7 +128,7 @@ class Dealer:
|
||||
kwds = set([])
|
||||
for k in keywords:
|
||||
kwds.add(k)
|
||||
for kk in huqie.qieqie(k).split(" "):
|
||||
for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
|
||||
if len(kk) < 2:
|
||||
continue
|
||||
if kk in kwds:
|
||||
@ -243,7 +243,7 @@ class Dealer:
|
||||
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
|
||||
len(ans_v[0]), len(chunk_v[0]))
|
||||
|
||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
|
||||
chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
|
||||
for ck in chunks]
|
||||
cites = {}
|
||||
thr = 0.63
|
||||
@ -251,7 +251,7 @@ class Dealer:
|
||||
for i, a in enumerate(pieces_):
|
||||
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
||||
chunk_v,
|
||||
huqie.qie(
|
||||
rag_tokenizer.tokenize(
|
||||
self.qryr.rmWWW(pieces_[i])).split(" "),
|
||||
chunks_tks,
|
||||
tkweight, vtweight)
|
||||
@ -310,8 +310,8 @@ class Dealer:
|
||||
def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
|
||||
return self.qryr.hybrid_similarity(ans_embd,
|
||||
ins_embd,
|
||||
huqie.qie(ans).split(" "),
|
||||
huqie.qie(inst).split(" "))
|
||||
rag_tokenizer.tokenize(ans).split(" "),
|
||||
rag_tokenizer.tokenize(inst).split(" "))
|
||||
|
||||
def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
|
||||
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
|
||||
@ -385,7 +385,7 @@ class Dealer:
|
||||
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
|
||||
fld, v = r.group(1), r.group(3)
|
||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
|
||||
fld, huqie.qieqie(huqie.qie(v)))
|
||||
fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
|
||||
replaces.append(
|
||||
("{}{}'{}'".format(
|
||||
r.group(1),
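In the retrieval code above, highlight keywords are expanded with their fine-grained sub-tokens before being matched against Elasticsearch fields. A small sketch of that expansion:

```python
from rag.nlp import rag_tokenizer


def expand_keywords(keywords):
    """Add each keyword's fine-grained sub-tokens (length >= 2), as the search Dealer above does."""
    kwds = set()
    for k in keywords:
        kwds.add(k)
        for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
            if len(kk) < 2 or kk in kwds:
                continue
            kwds.add(kk)
    return kwds
```
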
@ -17,7 +17,7 @@ class Dealer:
        try:
            self.dictionary = json.load(open(path, 'r'))
        except Exception as e:
            logging.warn("Miss synonym.json")
            logging.warn("Missing synonym.json")
            self.dictionary = {}

        if not redis:

@ -4,7 +4,7 @@ import json
|
||||
import re
|
||||
import os
|
||||
import numpy as np
|
||||
from rag.nlp import huqie
|
||||
from rag.nlp import rag_tokenizer
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
|
||||
|
||||
@ -83,7 +83,7 @@ class Dealer:
|
||||
txt = re.sub(p, r, txt)
|
||||
|
||||
res = []
|
||||
for t in huqie.qie(txt).split(" "):
|
||||
for t in rag_tokenizer.tokenize(txt).split(" "):
|
||||
tk = t
|
||||
if (stpwd and tk in self.stop_words) or (
|
||||
re.match(r"[0-9]$", tk) and not num):
|
||||
@ -161,7 +161,7 @@ class Dealer:
|
||||
return m[self.ne[t]]
|
||||
|
||||
def postag(t):
|
||||
t = huqie.tag(t)
|
||||
t = rag_tokenizer.tag(t)
|
||||
if t in set(["r", "c", "d"]):
|
||||
return 0.3
|
||||
if t in set(["ns", "nt"]):
|
||||
@ -175,14 +175,14 @@ class Dealer:
|
||||
def freq(t):
|
||||
if re.match(r"[0-9. -]{2,}$", t):
|
||||
return 3
|
||||
s = huqie.freq(t)
|
||||
s = rag_tokenizer.freq(t)
|
||||
if not s and re.match(r"[a-z. -]+$", t):
|
||||
return 300
|
||||
if not s:
|
||||
s = 0
|
||||
|
||||
if not s and len(t) >= 4:
|
||||
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
|
||||
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
|
||||
if len(s) > 1:
|
||||
s = np.min([freq(tt) for tt in s]) / 6.
|
||||
else:
|
||||
@ -198,7 +198,7 @@ class Dealer:
|
||||
elif re.match(r"[a-z. -]+$", t):
|
||||
return 300
|
||||
elif len(t) >= 4:
|
||||
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
|
||||
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
|
||||
if len(s) > 1:
|
||||
return max(3, np.min([df(tt) for tt in s]) / 6.)
|
||||
|
||||
|
||||
@ -47,3 +47,9 @@ cron_logger = getLogger("cron_logger")
cron_logger.setLevel(20)
chunk_logger = getLogger("chunk_logger")
database_logger = getLogger("database")

SVR_QUEUE_NAME = "rag_flow_svr_queue"
SVR_QUEUE_RETENTION = 60*60
SVR_QUEUE_MAX_LEN = 1024
SVR_CONSUMER_NAME = "rag_flow_svr_consumer"
SVR_CONSUMER_GROUP_NAME = "rag_flow_svr_consumer_group"

@ -4,13 +4,14 @@ import traceback
|
||||
|
||||
from api.db.db_models import close_connection
|
||||
from api.db.services.task_service import TaskService
|
||||
from rag.utils import MINIO
|
||||
from rag.settings import cron_logger
|
||||
from rag.utils.minio_conn import MINIO
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
|
||||
|
||||
def collect():
|
||||
doc_locations = TaskService.get_ongoing_doc_name()
|
||||
#print(tasks)
|
||||
print(doc_locations)
|
||||
if len(doc_locations) == 0:
|
||||
time.sleep(1)
|
||||
return
|
||||
@ -28,7 +29,7 @@ def main():
|
||||
if REDIS_CONN.exist(key):continue
|
||||
file_bin = MINIO.get(kb_id, loc)
|
||||
REDIS_CONN.transaction(key, file_bin, 12 * 60)
|
||||
print("CACHE:", loc)
|
||||
cron_logger.info("CACHE: {}".format(loc))
|
||||
except Exception as e:
|
||||
traceback.print_stack(e)
|
||||
except Exception as e:
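The cache service above pre-loads Redis with the raw file bodies of documents that are still being parsed, so the task executors can skip a second MinIO round trip. A condensed sketch of one warming pass, keyed by `kb_id/location` as in the hunk:

```python
from rag.utils.minio_conn import MINIO
from rag.utils.redis_conn import REDIS_CONN


def warm_cache(kb_id: str, locations):
    """Cache each document blob in Redis (TTL of 12 * 60, as above) unless it is already there."""
    for loc in locations:
        key = "{}/{}".format(kb_id, loc)
        if REDIS_CONN.exist(key):
            continue
        file_bin = MINIO.get(kb_id, loc)
        REDIS_CONN.transaction(key, file_bin, 12 * 60)
```
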
@ -1,193 +0,0 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import random
|
||||
from datetime import datetime
|
||||
from api.db.db_models import Task
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
from api.db.services.task_service import TaskService
|
||||
from deepdoc.parser import PdfParser
|
||||
from deepdoc.parser.excel_parser import HuExcelParser
|
||||
from rag.settings import cron_logger
|
||||
from rag.utils import MINIO
|
||||
from rag.utils import findMaxTm
|
||||
import pandas as pd
|
||||
from api.db import FileType, TaskStatus
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.settings import database_logger
|
||||
from api.utils import get_format_time, get_uuid
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
from api.db.db_models import init_database_tables as init_web_db
|
||||
from api.db.init_data import init_web_data
|
||||
|
||||
|
||||
def collect(tm):
|
||||
docs = DocumentService.get_newly_uploaded(tm)
|
||||
if len(docs) == 0:
|
||||
return pd.DataFrame()
|
||||
docs = pd.DataFrame(docs)
|
||||
mtm = docs["update_time"].max()
|
||||
cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
|
||||
return docs
|
||||
|
||||
|
||||
def set_dispatching(docid):
|
||||
try:
|
||||
DocumentService.update_by_id(
|
||||
docid, {"progress": random.random() * 1 / 100.,
|
||||
"progress_msg": "Task dispatched...",
|
||||
"process_begin_at": get_format_time()
|
||||
})
|
||||
except Exception as e:
|
||||
cron_logger.error("set_dispatching:({}), {}".format(docid, str(e)))
|
||||
|
||||
|
||||
def dispatch():
|
||||
tm_fnm = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res",
|
||||
f"broker.tm")
|
||||
tm = findMaxTm(tm_fnm)
|
||||
rows = collect(tm)
|
||||
if len(rows) == 0:
|
||||
return
|
||||
|
||||
tmf = open(tm_fnm, "a+")
|
||||
for _, r in rows.iterrows():
|
||||
try:
|
||||
tsks = TaskService.query(doc_id=r["id"])
|
||||
if tsks:
|
||||
for t in tsks:
|
||||
TaskService.delete_by_id(t.id)
|
||||
except Exception as e:
|
||||
cron_logger.exception(e)
|
||||
|
||||
def new_task():
|
||||
nonlocal r
|
||||
return {
|
||||
"id": get_uuid(),
|
||||
"doc_id": r["id"]
|
||||
}
|
||||
|
||||
tsks = []
|
||||
try:
|
||||
file_bin = MINIO.get(r["kb_id"], r["location"])
|
||||
if REDIS_CONN.is_alive():
|
||||
try:
|
||||
REDIS_CONN.set("{}/{}".format(r["kb_id"], r["location"]), file_bin, 12*60)
|
||||
except Exception as e:
|
||||
cron_logger.warning("Put into redis[EXCEPTION]:" + str(e))
|
||||
|
||||
if r["type"] == FileType.PDF.value:
|
||||
do_layout = r["parser_config"].get("layout_recognize", True)
|
||||
pages = PdfParser.total_page_number(r["name"], file_bin)
|
||||
page_size = r["parser_config"].get("task_page_size", 12)
|
||||
if r["parser_id"] == "paper":
|
||||
page_size = r["parser_config"].get("task_page_size", 22)
|
||||
if r["parser_id"] == "one":
|
||||
page_size = 1000000000
|
||||
if not do_layout:
|
||||
page_size = 1000000000
|
||||
page_ranges = r["parser_config"].get("pages")
|
||||
if not page_ranges:
|
||||
page_ranges = [(1, 100000)]
|
||||
for s, e in page_ranges:
|
||||
s -= 1
|
||||
s = max(0, s)
|
||||
e = min(e - 1, pages)
|
||||
for p in range(s, e, page_size):
|
||||
task = new_task()
|
||||
task["from_page"] = p
|
||||
task["to_page"] = min(p + page_size, e)
|
||||
tsks.append(task)
|
||||
|
||||
elif r["parser_id"] == "table":
|
||||
rn = HuExcelParser.row_number(
|
||||
r["name"], file_bin)
|
||||
for i in range(0, rn, 3000):
|
||||
task = new_task()
|
||||
task["from_page"] = i
|
||||
task["to_page"] = min(i + 3000, rn)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
||||
bulk_insert_into_db(Task, tsks, True)
|
||||
set_dispatching(r["id"])
|
||||
except Exception as e:
|
||||
cron_logger.exception(e)
|
||||
|
||||
tmf.write(str(r["update_time"]) + "\n")
|
||||
tmf.close()
|
||||
|
||||
|
||||
def update_progress():
|
||||
docs = DocumentService.get_unfinished_docs()
|
||||
for d in docs:
|
||||
try:
|
||||
tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
|
||||
if not tsks:
|
||||
continue
|
||||
msg = []
|
||||
prg = 0
|
||||
finished = True
|
||||
bad = 0
|
||||
status = TaskStatus.RUNNING.value
|
||||
for t in tsks:
|
||||
if 0 <= t.progress < 1:
|
||||
finished = False
|
||||
prg += t.progress if t.progress >= 0 else 0
|
||||
msg.append(t.progress_msg)
|
||||
if t.progress == -1:
|
||||
bad += 1
|
||||
prg /= len(tsks)
|
||||
if finished and bad:
|
||||
prg = -1
|
||||
status = TaskStatus.FAIL.value
|
||||
elif finished:
|
||||
status = TaskStatus.DONE.value
|
||||
|
||||
msg = "\n".join(msg)
|
||||
info = {
|
||||
"process_duation": datetime.timestamp(
|
||||
datetime.now()) -
|
||||
d["process_begin_at"].timestamp(),
|
||||
"run": status}
|
||||
if prg != 0:
|
||||
info["progress"] = prg
|
||||
if msg:
|
||||
info["progress_msg"] = msg
|
||||
DocumentService.update_by_id(d["id"], info)
|
||||
except Exception as e:
|
||||
cron_logger.error("fetch task exception:" + str(e))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
peewee_logger = logging.getLogger('peewee')
|
||||
peewee_logger.propagate = False
|
||||
peewee_logger.addHandler(database_logger.handlers[0])
|
||||
peewee_logger.setLevel(database_logger.level)
|
||||
# init db
|
||||
init_web_db()
|
||||
init_web_data()
|
||||
|
||||
while True:
|
||||
dispatch()
|
||||
time.sleep(1)
|
||||
update_progress()
|
||||
@ -24,16 +24,18 @@ import sys
|
||||
import time
|
||||
import traceback
|
||||
from functools import partial
|
||||
from rag.utils import MINIO
|
||||
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from rag.utils.minio_conn import MINIO
|
||||
from api.db.db_models import close_connection
|
||||
from rag.settings import database_logger
|
||||
from rag.settings import database_logger, SVR_QUEUE_NAME
|
||||
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
|
||||
from multiprocessing import Pool
|
||||
import numpy as np
|
||||
from elasticsearch_dsl import Q
|
||||
from multiprocessing.context import TimeoutError
|
||||
from api.db.services.task_service import TaskService
|
||||
from rag.utils import ELASTICSEARCH
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from timeit import default_timer as timer
|
||||
from rag.utils import rmSpace, findMaxTm
|
||||
|
||||
@ -87,36 +89,34 @@ def set_progress(task_id, from_page=0, to_page=-1,
|
||||
except Exception as e:
|
||||
cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
|
||||
|
||||
close_connection()
|
||||
if cancel:
|
||||
sys.exit()
|
||||
|
||||
|
||||
def collect(comm, mod, tm):
|
||||
tasks = TaskService.get_tasks(tm, mod, comm)
|
||||
#print(tasks)
|
||||
if len(tasks) == 0:
|
||||
def collect():
|
||||
try:
|
||||
payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
|
||||
if not payload:
|
||||
time.sleep(1)
|
||||
return pd.DataFrame()
|
||||
except Exception as e:
|
||||
cron_logger.error("Get task event from queue exception:" + str(e))
|
||||
return pd.DataFrame()
|
||||
|
||||
msg = payload.get_message()
|
||||
payload.ack()
|
||||
if not msg: return pd.DataFrame()
|
||||
|
||||
if TaskService.do_cancel(msg["id"]):
|
||||
return pd.DataFrame()
|
||||
tasks = TaskService.get_tasks(msg["id"])
|
||||
assert tasks, "{} empty task!".format(msg["id"])
|
||||
tasks = pd.DataFrame(tasks)
|
||||
mtm = tasks["update_time"].max()
|
||||
cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
|
||||
return tasks
|
||||
|
||||
|
||||
def get_minio_binary(bucket, name):
|
||||
global MINIO
|
||||
if REDIS_CONN.is_alive():
|
||||
try:
|
||||
for _ in range(30):
|
||||
if REDIS_CONN.exist("{}/{}".format(bucket, name)):
|
||||
time.sleep(1)
|
||||
break
|
||||
time.sleep(1)
|
||||
r = REDIS_CONN.get("{}/{}".format(bucket, name))
|
||||
if r: return r
|
||||
cron_logger.warning("Cache missing: {}".format(name))
|
||||
except Exception as e:
|
||||
cron_logger.warning("Get redis[EXCEPTION]:" + str(e))
|
||||
return MINIO.get(bucket, name)
|
||||
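The executor above no longer polls the database on a timer: it blocks on the Redis task queue, acknowledges the message, and then resolves the file through the Redis cache before falling back to MinIO. A condensed sketch of that consume-and-fetch path, reusing the names from these hunks:

```python
import time

import pandas as pd

from api.db.services.task_service import TaskService
from rag.settings import SVR_QUEUE_NAME, cron_logger
from rag.utils.minio_conn import MINIO
from rag.utils.redis_conn import REDIS_CONN


def collect() -> pd.DataFrame:
    """Pull one task message from the Redis queue and return its task rows as a DataFrame."""
    payload = REDIS_CONN.queue_consumer(
        SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
    if not payload:
        time.sleep(1)
        return pd.DataFrame()
    msg = payload.get_message()
    payload.ack()
    if not msg or TaskService.do_cancel(msg["id"]):
        return pd.DataFrame()
    return pd.DataFrame(TaskService.get_tasks(msg["id"]))


def get_minio_binary(bucket: str, name: str) -> bytes:
    """Prefer the Redis-cached blob written by the cache service; fall back to MinIO."""
    if REDIS_CONN.is_alive():
        cached = REDIS_CONN.get("{}/{}".format(bucket, name))
        if cached:
            return cached
        cron_logger.warning("Cache missing: {}".format(name))
    return MINIO.get(bucket, name)
```
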
@ -132,12 +132,10 @@ def build(row):
|
||||
row["from_page"],
|
||||
row["to_page"])
|
||||
chunker = FACTORY[row["parser_id"].lower()]
|
||||
pool = Pool(processes=1)
|
||||
try:
|
||||
st = timer()
|
||||
thr = pool.apply_async(get_minio_binary, args=(row["kb_id"], row["location"]))
|
||||
binary = thr.get(timeout=90)
|
||||
pool.terminate()
|
||||
bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
|
||||
binary = get_minio_binary(bucket, name)
|
||||
cron_logger.info(
|
||||
"From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
|
||||
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
|
||||
@ -156,7 +154,6 @@ def build(row):
|
||||
else:
|
||||
callback(-1, f"Internal server error: %s" %
|
||||
str(e).replace("'", ""))
|
||||
pool.terminate()
|
||||
traceback.print_exc()
|
||||
|
||||
cron_logger.error(
|
||||
@ -247,20 +244,13 @@ def embedding(docs, mdl, parser_config={}, callback=None):
|
||||
return tk_count
|
||||
|
||||
|
||||
def main(comm, mod):
|
||||
tm_fnm = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res",
|
||||
f"{comm}-{mod}.tm")
|
||||
tm = findMaxTm(tm_fnm)
|
||||
rows = collect(comm, mod, tm)
|
||||
def main():
|
||||
rows = collect()
|
||||
if len(rows) == 0:
|
||||
return
|
||||
|
||||
tmf = open(tm_fnm, "a+")
|
||||
for _, r in rows.iterrows():
|
||||
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
|
||||
#callback(random.random()/10., "Task has been received.")
|
||||
try:
|
||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
||||
except Exception as e:
|
||||
@ -274,7 +264,6 @@ def main(comm, mod):
|
||||
if cks is None:
|
||||
continue
|
||||
if not cks:
|
||||
tmf.write(str(r["update_time"]) + "\n")
|
||||
callback(1., "No chunk! Done!")
|
||||
continue
|
||||
# TODO: exception handler
|
||||
@ -314,8 +303,6 @@ def main(comm, mod):
|
||||
"Chunk doc({}), token({}), chunks({}), elapsed:{}".format(
|
||||
r["id"], tk_count, len(cks), timer()-st))
|
||||
|
||||
tmf.write(str(r["update_time"]) + "\n")
|
||||
tmf.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -324,8 +311,5 @@ if __name__ == "__main__":
|
||||
peewee_logger.addHandler(database_logger.handlers[0])
|
||||
peewee_logger.setLevel(database_logger.level)
|
||||
|
||||
#from mpi4py import MPI
|
||||
#comm = MPI.COMM_WORLD
|
||||
while True:
|
||||
main(int(sys.argv[2]), int(sys.argv[1]))
|
||||
close_connection()
|
||||
main()
|
||||
|
||||
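
The hunks above drop the old MPI/polling workflow (the comm/mod arguments and the *.tm timestamp files) in favour of a Redis-stream queue: collect() now blocks on REDIS_CONN.queue_consumer(...) and each message carries the id of a task to chunk and embed. A minimal, hypothetical producer-side sketch of how a task id could be pushed onto that queue, assuming the queue_product helper shown in the redis_conn hunks further down and that SVR_QUEUE_NAME and REDIS_CONN are importable from the paths used here:

# Hypothetical sketch only; module paths and the task id are assumptions.
from rag.settings import SVR_QUEUE_NAME        # stream name the executor listens on
from rag.utils.redis_conn import REDIS_CONN    # assumed location of the REDIS_CONN singleton

# queue_product() JSON-encodes the message and XADDs it to the stream; the executor's
# collect() reads it, resolves the task via TaskService.get_tasks(msg["id"]),
# and acknowledges the entry once it has been picked up.
ok = REDIS_CONN.queue_product(SVR_QUEUE_NAME, message={"id": "<task-id>"})
if not ok:
    raise RuntimeError("failed to enqueue task")
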
@@ -15,9 +15,6 @@ def singleton(cls, *args, **kw):
return _singleton

from .minio_conn import MINIO
from .es_conn import ELASTICSEARCH

def rmSpace(txt):
txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
@@ -15,7 +15,7 @@ es_logger.info("Elasticsearch version: "+str(elasticsearch.__version__))

@singleton
class HuEs:
class ESConnection:
def __init__(self):
self.info = {}
self.conn()

@@ -454,4 +454,4 @@ class HuEs:
scroll_size = len(page['hits']['hits'])

ELASTICSEARCH = HuEs()
ELASTICSEARCH = ESConnection()
@@ -8,7 +8,7 @@ from rag.utils import singleton

@singleton
class HuMinio(object):
class RAGFlowMinio(object):
def __init__(self):
self.conn = None
self.__open__()

@@ -35,7 +35,7 @@ class HuMinio(object):
self.conn = None

def put(self, bucket, fnm, binary):
for _ in range(10):
for _ in range(3):
try:
if not self.conn.bucket_exists(bucket):
self.conn.make_bucket(bucket)

@@ -86,10 +86,12 @@ class HuMinio(object):
time.sleep(1)
return

MINIO = HuMinio()
MINIO = RAGFlowMinio()

if __name__ == "__main__":
conn = HuMinio()
conn = RAGFlowMinio()
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
from PIL import Image
img = Image.open(fnm)
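
The storage wrapper is renamed from HuMinio to RAGFlowMinio but keeps the same bucket/object interface the task executor relies on (MINIO.get(bucket, name), and put() with a few retries). A small illustrative sketch under that assumption; the bucket and object names below are made up:

# Illustrative only; bucket/object names are placeholders, module path is assumed.
from rag.utils.minio_conn import MINIO    # assumed location of the MINIO singleton

with open("sample.pdf", "rb") as f:
    MINIO.put("kb-bucket", "sample.pdf", f.read())   # upload bytes as bucket/object

binary = MINIO.get("kb-bucket", "sample.pdf")        # read the same object back as bytes
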
@@ -5,6 +5,27 @@ import logging
from rag import settings
from rag.utils import singleton

class Payload:
def __init__(self, consumer, queue_name, group_name, msg_id, message):
self.__consumer = consumer
self.__queue_name = queue_name
self.__group_name = group_name
self.__msg_id = msg_id
self.__message = json.loads(message['message'])

def ack(self):
try:
self.__consumer.xack(self.__queue_name, self.__group_name, self.__msg_id)
return True
except Exception as e:
logging.warning("[EXCEPTION]ack" + str(self.__queue_name) + "||" + str(e))
return False

def get_message(self):
return self.__message

@singleton
class RedisDB:
def __init__(self):

@@ -14,10 +35,11 @@ class RedisDB:

def __open__(self):
try:
self.REDIS = redis.Redis(host=self.config.get("host", "redis").split(":")[0],
self.REDIS = redis.StrictRedis(host=self.config["host"].split(":")[0],
port=int(self.config.get("host", ":6379").split(":")[1]),
db=int(self.config.get("db", 1)),
password=self.config.get("password"))
password=self.config.get("password"),
decode_responses=True)
except Exception as e:
logging.warning("Redis can't be connected.")
return self.REDIS

@@ -70,5 +92,48 @@ class RedisDB:
self.__open__()
return False

def queue_product(self, queue, message, exp=settings.SVR_QUEUE_RETENTION) -> bool:
try:
payload = {"message": json.dumps(message)}
pipeline = self.REDIS.pipeline()
pipeline.xadd(queue, payload)
pipeline.expire(queue, exp)
pipeline.execute()
return True
except Exception as e:
logging.warning("[EXCEPTION]producer" + str(queue) + "||" + str(e))
return False

def queue_consumer(self, queue_name, group_name, consumer_name, msg_id=b">") -> Payload:
try:
group_info = self.REDIS.xinfo_groups(queue_name)
if not any(e["name"] == group_name for e in group_info):
self.REDIS.xgroup_create(
queue_name,
group_name,
id="$",
mkstream=True
)
args = {
"groupname": group_name,
"consumername": consumer_name,
"count": 1,
"block": 10000,
"streams": {queue_name: msg_id},
}
messages = self.REDIS.xreadgroup(**args)
if not messages:
return None
stream, element_list = messages[0]
msg_id, payload = element_list[0]
res = Payload(self.REDIS, queue_name, group_name, msg_id, payload)
return res
except Exception as e:
if 'key' in str(e):
pass
else:
logging.warning("[EXCEPTION]consumer" + str(queue_name) + "||" + str(e))
return None

REDIS_CONN = RedisDB()
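
queue_consumer() implements a standard Redis Streams consumer-group hand-off: it creates the group on first use (XGROUP CREATE ... MKSTREAM), blocks up to 10 seconds on XREADGROUP for a single new entry, and wraps the result in a Payload so the caller can read and then XACK it. A minimal consumer loop built only on the methods shown above; the queue, group and consumer names are illustrative:

# Sketch of a consumer loop over the RedisDB API shown above; names are illustrative.
import time

while True:
    payload = REDIS_CONN.queue_consumer("demo_queue", "demo_group", "worker-1")
    if not payload:               # nothing arrived within the 10 s blocking window
        time.sleep(1)
        continue
    msg = payload.get_message()   # the dict originally passed to queue_product()
    payload.ack()                 # XACK so the entry is not redelivered to the group
    print("received task:", msg)
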
@@ -50,7 +50,6 @@ joblib==1.3.2
lxml==5.1.0
MarkupSafe==2.1.5
minio==7.2.4
mpi4py==3.1.5
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16

@@ -69,6 +68,7 @@ nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.3.101
nvidia-nvtx-cu12==12.1.105
ollama==0.1.9
onnxruntime-gpu==1.17.1
openai==1.12.0
opencv-python==4.9.0.80

@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
pydantic==2.6.2
pydantic_core==2.16.3
PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0
PyPDF2==3.0.1
pypdfium2==4.27.0

@@ -102,6 +100,7 @@ python-dotenv==1.0.1
python-pptx==0.6.23
pytz==2024.1
PyYAML==6.0.1
redis==5.0.3
regex==2023.12.25
requests==2.31.0
ruamel.yaml==0.18.6
126 requirements_dev.txt Normal file
@@ -0,0 +1,126 @@
accelerate==0.27.2
aiohttp==3.9.3
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
Aspose.Slides==24.2.0
attrs==23.2.0
blinker==1.7.0
cachelib==0.12.0
cachetools==5.3.3
certifi==2024.2.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
cryptography==42.0.5
dashscope==1.14.1
datasets==2.17.1
datrie==0.8.2
demjson3==3.0.6
dill==0.3.8
distro==1.9.0
elastic-transport==8.12.0
elasticsearch==8.12.1
elasticsearch-dsl==8.12.0
et-xmlfile==1.1.0
filelock==3.13.1
fastembed==0.2.6
FlagEmbedding==1.2.5
Flask==3.0.2
Flask-Cors==4.0.0
Flask-Login==0.6.3
Flask-Session==0.6.0
flatbuffers==23.5.26
frozenlist==1.4.1
fsspec==2023.10.0
h11==0.14.0
hanziconv==0.3.2
httpcore==1.0.4
httpx==0.27.0
huggingface-hub==0.20.3
humanfriendly==10.0
idna==3.6
install==1.3.5
itsdangerous==2.1.2
Jinja2==3.1.3
joblib==1.3.2
lxml==5.1.0
MarkupSafe==2.1.5
minio==7.2.4
mpi4py==3.1.5
mpmath==1.3.0
multidict==6.0.5
multiprocess==0.70.16
networkx==3.2.1
nltk==3.8.1
numpy==1.26.4
openai==1.12.0
opencv-python==4.9.0.80
openpyxl==3.1.2
packaging==23.2
pandas==2.2.1
pdfminer.six==20221105
pdfplumber==0.10.4
peewee==3.17.1
pillow==10.2.0
protobuf==4.25.3
psutil==5.9.8
pyarrow==15.0.0
pyarrow-hotfix==0.6
pyclipper==1.3.0.post5
pycparser==2.21
pycryptodome==3.20.0
pycryptodome-test-vectors==1.0.14
pycryptodomex==3.20.0
pydantic==2.6.2
pydantic_core==2.16.3
PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0
PyPDF2==3.0.1
pypdfium2==4.27.0
python-dateutil==2.8.2
python-docx==1.1.0
python-dotenv==1.0.1
python-pptx==0.6.23
pytz==2024.1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
safetensors==0.4.2
scikit-learn==1.4.1.post1
scipy==1.12.0
sentence-transformers==2.4.0
shapely==2.0.3
six==1.16.0
sniffio==1.3.1
StrEnum==0.4.15
sympy==1.12
threadpoolctl==3.3.0
tika==2.6.0
tiktoken==0.6.0
tokenizers==0.15.2
torch==2.2.1
tqdm==4.66.2
transformers==4.38.1
triton==2.2.0
typing_extensions==4.10.0
tzdata==2024.1
urllib3==2.2.1
Werkzeug==3.0.1
xgboost==2.0.3
XlsxWriter==3.2.0
xpinyin==0.7.6
xxhash==3.4.1
yarl==1.9.4
zhipuai==2.0.1
BCEmbedding
loguru==0.7.2
ollama==0.1.8
redis==5.0.4
@@ -27,7 +27,7 @@ export default defineConfig({
devtool: 'source-map',
proxy: {
'/v1': {
target: 'http://192.168.200.233:9380/',
target: 'http://123.60.95.134:9380/',
changeOrigin: true,
// pathRewrite: { '^/v1': '/v1' },
},
2384 web/package-lock.json generated
File diff suppressed because it is too large
@@ -3,7 +3,7 @@
"author": "zhaofengchao <13723060510@163.com>",
"scripts": {
"build": "umi build",
"dev": "cross-env PORT=9000 umi dev",
"dev": "cross-env PORT=9200 umi dev",
"postinstall": "umi setup",
"lint": "umi lint --eslint-only",
"setup": "umi setup",

@@ -13,6 +13,7 @@
"@ant-design/icons": "^5.2.6",
"@ant-design/pro-components": "^2.6.46",
"@ant-design/pro-layout": "^7.17.16",
"@js-preview/excel": "^1.7.8",
"ahooks": "^3.7.10",
"antd": "^5.12.7",
"axios": "^1.6.3",

@@ -25,12 +26,14 @@
"rc-tween-one": "^3.0.6",
"react-chat-elements": "^12.0.13",
"react-copy-to-clipboard": "^5.1.0",
"react-file-viewer": "^1.2.1",
"react-i18next": "^14.0.0",
"react-infinite-scroll-component": "^6.1.0",
"react-markdown": "^9.0.1",
"react-pdf-highlighter": "^6.1.0",
"react-string-replace": "^1.1.1",
"react-syntax-highlighter": "^15.5.0",
"reactflow": "^11.11.2",
"recharts": "^2.12.4",
"remark-gfm": "^4.0.0",
"umi": "^4.0.90",
6 web/src/assets/svg/llm/deepseek.svg Normal file
@@ -0,0 +1,6 @@
<svg t="1715133624982" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="4263"
width="200" height="200">
<path
d="M320.512 804.864C46.08 676.864 77.824 274.432 362.496 274.432c34.816 0 86.016-7.168 114.688-14.336 59.392-16.384 99.328-10.24 69.632 10.24-9.216 7.168-15.36 19.456-13.312 28.672 5.12 20.48 158.72 161.792 177.152 161.792 27.648 0 27.648-32.768 1.024-57.344-43.008-38.912-55.296-90.112-35.84-141.312l9.216-26.624 54.272 52.224c35.84 34.816 58.368 49.152 68.608 44.032 9.216-4.096 30.72-9.216 49.152-12.288 18.432-2.048 38.912-10.24 45.056-18.432 19.456-23.552 43.008-17.408 35.84 9.216-3.072 12.288-6.144 27.648-6.144 34.816 0 23.552-62.464 83.968-92.16 90.112-23.552 5.12-30.72 12.288-30.72 30.72 0 46.08-38.912 148.48-75.776 198.656l-37.888 51.2 36.864 15.36c56.32 23.552 40.96 41.984-37.888 43.008-43.008 1.024-75.776 7.168-92.16 18.432-68.608 45.056-198.656 50.176-281.6 12.288z m251.904-86.016c-24.576-27.648-66.56-79.872-93.184-117.76-69.632-98.304-158.72-150.528-256-150.528-37.888 0-38.912 1.024-38.912 34.816 0 94.208 99.328 240.64 175.104 257.024 38.912 9.216 59.392-7.168 39.936-29.696-7.168-9.216-10.24-23.552-6.144-31.744 5.12-14.336 9.216-14.336 38.912 1.024 18.432 9.216 50.176 29.696 69.632 45.056 35.84 27.648 58.368 37.888 96.256 39.936 14.336 1.024 9.216-10.24-25.6-48.128z m88.064-145.408c8.192-13.312-31.744-78.848-56.32-92.16-10.24-6.144-26.624-10.24-34.816-10.24-23.552 0-20.48 27.648 4.096 33.792 13.312 3.072 20.48 14.336 20.48 29.696 0 13.312 5.12 29.696 12.288 36.864 15.36 15.36 46.08 16.384 54.272 2.048z"
fill="#4D6BFE" p-id="4264"></path>
</svg>
@@ -74,9 +74,9 @@ export const useFetchParserListOnMount = (
setSelectedTag(parserId);
}, [parserId, documentId]);

const handleChange = (tag: string, checked: boolean) => {
const nextSelectedTag = checked ? tag : selectedTag;
setSelectedTag(nextSelectedTag);
const handleChange = (tag: string) => {
// const nextSelectedTag = checked ? tag : selectedTag;
setSelectedTag(tag);
};

return { parserList: nextParserList, handleChange, selectedTag };
@@ -8,3 +8,7 @@
cursor: help;
writing-mode: horizontal-tb;
}

.chunkMethod {
margin-bottom: 0;
}
@@ -13,9 +13,9 @@ import {
Form,
InputNumber,
Modal,
Select,
Space,
Switch,
Tag,
Tooltip,
} from 'antd';
import omit from 'lodash/omit';

@@ -25,8 +25,6 @@ import { useFetchParserListOnMount } from './hooks';
import { useTranslate } from '@/hooks/commonHooks';
import styles from './index.less';

const { CheckableTag } = Tag;

interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
loading: boolean;
onOk: (

@@ -50,6 +48,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
visible,
documentExtension,
parserConfig,
loading,
}) => {
const { parserList, handleChange, selectedTag } = useFetchParserListOnMount(
documentId,

@@ -111,23 +110,17 @@ const ChunkMethodModal: React.FC<IProps> = ({
onOk={handleOk}
onCancel={hideModal}
afterClose={afterClose}
confirmLoading={loading}
>
<Space size={[0, 8]} wrap>
<div className={styles.tags}>
{parserList.map((x) => {
return (
<CheckableTag
key={x.value}
checked={selectedTag === x.value}
onChange={(checked) => {
handleChange(x.value, checked);
}}
>
{x.label}
</CheckableTag>
);
})}
</div>
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
<Select
style={{ width: 120 }}
onChange={handleChange}
value={selectedTag}
options={parserList}
/>
</Form.Item>
</Space>
{hideDivider || <Divider></Divider>}
<Form name="dynamic_form_nest_item" autoComplete="off" form={form}>
8 web/src/components/file-upload-modal/index.less Normal file
@@ -0,0 +1,8 @@
.uploader {
:global {
.ant-upload-list {
max-height: 40vh;
overflow-y: auto;
}
}
}
@@ -1,3 +1,4 @@
import { useTranslate } from '@/hooks/commonHooks';
import { IModalProps } from '@/interfaces/common';
import { InboxOutlined } from '@ant-design/icons';
import {

@@ -12,6 +13,8 @@ import {
} from 'antd';
import { Dispatch, SetStateAction, useState } from 'react';

import styles from './index.less';

const { Dragger } = Upload;

const FileUpload = ({

@@ -23,6 +26,7 @@ const FileUpload = ({
fileList: UploadFile[];
setFileList: Dispatch<SetStateAction<UploadFile[]>>;
}) => {
const { t } = useTranslate('fileManager');
const props: UploadProps = {
multiple: true,
onRemove: (file) => {

@@ -43,17 +47,12 @@ const FileUpload = ({
};

return (
<Dragger {...props}>
<Dragger {...props} className={styles.uploader}>
<p className="ant-upload-drag-icon">
<InboxOutlined />
</p>
<p className="ant-upload-text">
Click or drag file to this area to upload
</p>
<p className="ant-upload-hint">
Support for a single or bulk upload. Strictly prohibited from uploading
company data or other banned files.
</p>
<p className="ant-upload-text">{t('uploadTitle')}</p>
<p className="ant-upload-hint">{t('uploadDescription')}</p>
</Dragger>
);
};

@@ -64,18 +63,29 @@ const FileUploadModal = ({
loading,
onOk: onFileUploadOk,
}: IModalProps<UploadFile[]>) => {
const { t } = useTranslate('fileManager');
const [value, setValue] = useState<string | number>('local');
const [fileList, setFileList] = useState<UploadFile[]>([]);
const [directoryFileList, setDirectoryFileList] = useState<UploadFile[]>([]);

const onOk = () => {
return onFileUploadOk?.([...fileList, ...directoryFileList]);
const clearFileList = () => {
setFileList([]);
setDirectoryFileList([]);
};

const onOk = async () => {
const ret = await onFileUploadOk?.([...fileList, ...directoryFileList]);
return ret;
};

const afterClose = () => {
clearFileList();
};

const items: TabsProps['items'] = [
{
key: '1',
label: 'File',
label: t('file'),
children: (
<FileUpload
directory={false}

@@ -86,7 +96,7 @@ const FileUploadModal = ({
},
{
key: '2',
label: 'Directory',
label: t('directory'),
children: (
<FileUpload
directory

@@ -100,17 +110,18 @@ const FileUploadModal = ({
return (
<>
<Modal
title="File upload"
title={t('uploadFile')}
open={visible}
onOk={onOk}
onCancel={hideModal}
confirmLoading={loading}
afterClose={afterClose}
>
<Flex gap={'large'} vertical>
<Segmented
options={[
{ label: 'Local uploads', value: 'local' },
{ label: 'S3 uploads', value: 's3' },
{ label: t('local'), value: 'local' },
{ label: t('s3'), value: 's3' },
]}
block
value={value}

@@ -119,7 +130,7 @@ const FileUploadModal = ({
{value === 'local' ? (
<Tabs defaultActiveKey="1" items={items} />
) : (
'coming soon'
t('comingSoon', { keyPrefix: 'common' })
)}
</Flex>
</Modal>
@@ -46,3 +46,25 @@ export const LanguageTranslationMap = {
Chinese: 'zh',
'Traditional Chinese': 'zh-TRADITIONAL',
};

export const FileMimeTypeMap = {
bmp: 'image/bmp',
csv: 'text/csv',
odt: 'application/vnd.oasis.opendocument.text',
doc: 'application/msword',
docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
gif: 'image/gif',
htm: 'text/htm',
html: 'text/html',
jpg: 'image/jpg',
jpeg: 'image/jpeg',
pdf: 'application/pdf',
png: 'image/png',
ppt: 'application/vnd.ms-powerpoint',
pptx: 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
tiff: 'image/tiff',
txt: 'text/plain',
xls: 'application/vnd.ms-excel',
xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
mp4: 'video/mp4',
};
@@ -7,6 +7,7 @@ import { useCallback, useMemo, useState } from 'react';
import { IHighlight } from 'react-pdf-highlighter';
import { useDispatch, useSelector } from 'umi';
import { useGetKnowledgeSearchParams } from './routeHook';
import { useOneNamespaceEffectsLoading } from './storeHooks';

export const useGetDocumentUrl = (documentId: string) => {
const url = useMemo(() => {

@@ -160,12 +161,12 @@ export const useRemoveDocument = () => {
const { knowledgeId } = useGetKnowledgeSearchParams();

const removeDocument = useCallback(
(documentId: string) => {
(documentIds: string[]) => {
try {
return dispatch<any>({
type: 'kFModel/document_rm',
payload: {
doc_id: documentId,
doc_id: documentIds,
kb_id: knowledgeId,
},
});

@@ -184,12 +185,12 @@ export const useUploadDocument = () => {
const { knowledgeId } = useGetKnowledgeSearchParams();

const uploadDocument = useCallback(
(file: UploadFile) => {
(fileList: UploadFile[]) => {
try {
return dispatch<any>({
type: 'kFModel/upload_document',
payload: {
file,
fileList,
kb_id: knowledgeId,
},
});

@@ -222,3 +223,8 @@ export const useRunDocument = () => {

return runDocumentByIds;
};

export const useSelectRunDocumentLoading = () => {
const loading = useOneNamespaceEffectsLoading('kFModel', ['document_run']);
return loading;
};
@@ -125,13 +125,19 @@ export const useFetchKnowledgeBaseConfiguration = () => {
}, [fetchKnowledgeBaseConfiguration]);
};

export const useSelectKnowledgeList = () => {
const knowledgeModel = useSelector((state) => state.knowledgeModel);
const { data = [] } = knowledgeModel;
return data;
};

export const useFetchKnowledgeList = (
shouldFilterListWithoutDocument: boolean = false,
) => {
const dispatch = useDispatch();
const loading = useOneNamespaceEffectsLoading('knowledgeModel', ['getList']);

const knowledgeModel = useSelector((state: any) => state.knowledgeModel);
const knowledgeModel = useSelector((state) => state.knowledgeModel);
const { data = [] } = knowledgeModel;
const list: IKnowledge[] = useMemo(() => {
return shouldFilterListWithoutDocument
@@ -5,6 +5,7 @@ import {
IThirdOAIModelCollection,
} from '@/interfaces/database/llm';
import { IAddLlmRequestBody } from '@/interfaces/request/llm';
import { sortLLmFactoryListBySpecifiedOrder } from '@/utils/commonUtil';
import { useCallback, useEffect, useMemo } from 'react';
import { useDispatch, useSelector } from 'umi';

@@ -110,13 +111,12 @@ export const useFetchLlmFactoryListOnMount = () => {
const factoryList = useSelectLlmFactoryList();
const myLlmList = useSelectMyLlmList();

const list = useMemo(
() =>
factoryList.filter((x) =>
const list = useMemo(() => {
const currentList = factoryList.filter((x) =>
Object.keys(myLlmList).every((y) => y !== x.name),
),
[factoryList, myLlmList],
);
return sortLLmFactoryListBySpecifiedOrder(currentList);
}, [factoryList, myLlmList]);

const fetchLlmFactoryList = useCallback(() => {
dispatch({
@@ -14,5 +14,5 @@ export interface IModalProps<T> {
hideModal(): void;
visible: boolean;
loading?: boolean;
onOk?(payload?: T): Promise<void> | void;
onOk?(payload?: T): Promise<any> | void;
}
@@ -21,11 +21,11 @@ export interface LlmSetting {
}

export interface Variable {
frequency_penalty: number;
max_tokens: number;
presence_penalty: number;
temperature: number;
top_p: number;
frequency_penalty?: number;
max_tokens?: number;
presence_penalty?: number;
temperature?: number;
top_p?: number;
}

export interface IDialog {

@@ -38,7 +38,7 @@ export interface IDialog {
kb_names: string[];
language: string;
llm_id: string;
llm_setting: LlmSetting;
llm_setting: Variable;
llm_setting_type: string;
name: string;
prompt_config: PromptConfig;
@@ -1,5 +1,5 @@
import { ReactComponent as StarIon } from '@/assets/svg/chat-star.svg';
// import { ReactComponent as FileIcon } from '@/assets/svg/file-management.svg';
import { ReactComponent as FileIcon } from '@/assets/svg/file-management.svg';
import { ReactComponent as KnowledgeBaseIcon } from '@/assets/svg/knowledge-base.svg';
import { ReactComponent as Logo } from '@/assets/svg/logo.svg';
import { useTranslate } from '@/hooks/commonHooks';

@@ -25,7 +25,7 @@ const RagHeader = () => {
() => [
{ path: '/knowledge', name: t('knowledgeBase'), icon: KnowledgeBaseIcon },
{ path: '/chat', name: t('chat'), icon: StarIon },
// { path: '/file', name: 'File Management', icon: FileIcon },
{ path: '/file', name: t('fileManager'), icon: FileIcon },
],
[t],
);
@@ -15,3 +15,7 @@
vertical-align: middle;
cursor: pointer;
}

.language {
cursor: pointer;
}
@@ -1,6 +1,5 @@
import { ReactComponent as TranslationIcon } from '@/assets/svg/translation.svg';
import { useTranslate } from '@/hooks/commonHooks';
import { GithubOutlined } from '@ant-design/icons';
import { DownOutlined, GithubOutlined } from '@ant-design/icons';
import { Dropdown, MenuProps, Space } from 'antd';
import camelCase from 'lodash/camelCase';
import React from 'react';

@@ -8,6 +7,7 @@ import User from '../user';

import { LanguageList } from '@/constants/common';
import { useChangeLanguage } from '@/hooks/logicHooks';
import { useSelector } from 'umi';
import styled from './index.less';

const Circle = ({ children, ...restProps }: React.PropsWithChildren) => {

@@ -25,6 +25,7 @@ const handleGithubCLick = () => {
const RightToolBar = () => {
const { t } = useTranslate('common');
const changeLanguage = useChangeLanguage();
const { language = '' } = useSelector((state) => state.settingModel.userInfo);

const handleItemClick: MenuProps['onClick'] = ({ key }) => {
changeLanguage(key);

@@ -40,14 +41,15 @@ const RightToolBar = () => {
return (
<div className={styled.toolbarWrapper}>
<Space wrap size={16}>
<Dropdown menu={{ items, onClick: handleItemClick }} placement="bottom">
<Space className={styled.language}>
<b>{t(camelCase(language))}</b>
<DownOutlined />
</Space>
</Dropdown>
<Circle>
<GithubOutlined onClick={handleGithubCLick} />
</Circle>
<Dropdown menu={{ items, onClick: handleItemClick }} placement="bottom">
<Circle>
<TranslationIcon />
</Circle>
</Dropdown>
{/* <Circle>
<MonIcon />
</Circle> */}
Some files were not shown because too many files have changed in this diff.