mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
add auto keywords and auto-question (#2965)
### What problem does this PR solve? #2687 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -25,7 +25,7 @@ from api.db import FileType, LLMType, ParserType, FileSource
|
||||
from api.db.db_models import APIToken, Task, File
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.api_service import APITokenService, API4ConversationService
|
||||
from api.db.services.dialog_service import DialogService, chat
|
||||
from api.db.services.dialog_service import DialogService, chat, keyword_extraction
|
||||
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
@ -38,7 +38,6 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
|
||||
generate_confirmation_token
|
||||
|
||||
from api.utils.file_utils import filename_type, thumbnail
|
||||
from rag.nlp import keyword_extraction
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
from api.db.services.canvas_service import UserCanvasService
|
||||
|
||||
@ -21,8 +21,9 @@ from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
from elasticsearch_dsl import Q
|
||||
|
||||
from api.db.services.dialog_service import keyword_extraction
|
||||
from rag.app.qa import rmPrefix, beAdoc
|
||||
from rag.nlp import search, rag_tokenizer, keyword_extraction
|
||||
from rag.nlp import search, rag_tokenizer
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils import rmSpace
|
||||
from api.db import LLMType, ParserType
|
||||
|
||||
@ -16,16 +16,15 @@
|
||||
from flask import request
|
||||
|
||||
from api.db import StatusEnum
|
||||
from api.db.db_models import TenantLLM
|
||||
from api.db.services.dialog_service import DialogService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMService, TenantLLMService
|
||||
from api.db.services.llm_service import TenantLLMService
|
||||
from api.db.services.user_service import TenantService
|
||||
from api.settings import RetCode
|
||||
from api.utils import get_uuid
|
||||
from api.utils.api_utils import get_error_data_result, token_required
|
||||
from api.utils.api_utils import get_result
|
||||
|
||||
|
||||
@manager.route('/chat', methods=['POST'])
|
||||
@token_required
|
||||
def create(tenant_id):
|
||||
|
||||
@ -1,10 +1,25 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from flask import request, jsonify
|
||||
|
||||
from db import LLMType, ParserType
|
||||
from db.services.knowledgebase_service import KnowledgebaseService
|
||||
from db.services.llm_service import LLMBundle
|
||||
from settings import retrievaler, kg_retrievaler, RetCode
|
||||
from utils.api_utils import validate_request, build_error_result, apikey_required
|
||||
from api.db import LLMType, ParserType
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.settings import retrievaler, kg_retrievaler, RetCode
|
||||
from api.utils.api_utils import validate_request, build_error_result, apikey_required
|
||||
|
||||
|
||||
@manager.route('/dify/retrieval', methods=['POST'])
|
||||
|
||||
@ -1,48 +1,37 @@
|
||||
#
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import pathlib
|
||||
import re
|
||||
import datetime
|
||||
import json
|
||||
import traceback
|
||||
|
||||
from botocore.docs.method import document_model_driven_method
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
from elasticsearch_dsl import Q
|
||||
from pygments import highlight
|
||||
from sphinx.addnodes import document
|
||||
|
||||
from api.db.services.dialog_service import keyword_extraction
|
||||
from rag.app.qa import rmPrefix, beAdoc
|
||||
from rag.nlp import search, rag_tokenizer, keyword_extraction
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils import rmSpace
|
||||
from rag.nlp import rag_tokenizer
|
||||
from api.db import LLMType, ParserType
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import TenantLLMService
|
||||
from api.db.services.user_service import UserTenantService
|
||||
from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.settings import RetCode, retrievaler, kg_retrievaler
|
||||
from api.utils.api_utils import get_result
|
||||
from api.settings import kg_retrievaler
|
||||
import hashlib
|
||||
import re
|
||||
from api.utils.api_utils import get_result, token_required, get_error_data_result
|
||||
|
||||
from api.db.db_models import Task, File
|
||||
|
||||
from api.utils.api_utils import token_required
|
||||
from api.db.db_models import Task
|
||||
from api.db.services.task_service import TaskService, queue_tasks
|
||||
from api.db.services.user_service import TenantService, UserTenantService
|
||||
|
||||
from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
|
||||
|
||||
from api.utils.api_utils import get_result, get_result, get_error_data_result
|
||||
|
||||
from functools import partial
|
||||
from api.utils.api_utils import server_error_response
|
||||
from api.utils.api_utils import get_result, get_error_data_result
|
||||
from io import BytesIO
|
||||
|
||||
from elasticsearch_dsl import Q
|
||||
from flask import request, send_file
|
||||
from flask_login import login_required
|
||||
|
||||
from api.db import FileSource, TaskStatus, FileType
|
||||
from api.db.db_models import File
|
||||
from api.db.services.document_service import DocumentService
|
||||
@ -50,8 +39,7 @@ from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.settings import RetCode, retrievaler
|
||||
from api.utils.api_utils import construct_json_result, construct_error_response
|
||||
from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
|
||||
from api.utils.api_utils import construct_json_result
|
||||
from rag.nlp import search
|
||||
from rag.utils import rmSpace
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
@ -365,7 +353,6 @@ def list_chunks(tenant_id,dataset_id,document_id):
|
||||
return get_result(data=res)
|
||||
|
||||
|
||||
|
||||
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
|
||||
@token_required
|
||||
def create(tenant_id,dataset_id,document_id):
|
||||
@ -454,7 +441,6 @@ def rm_chunk(tenant_id,dataset_id,document_id):
|
||||
return get_result()
|
||||
|
||||
|
||||
|
||||
@manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
|
||||
@token_required
|
||||
def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
|
||||
@ -512,7 +498,6 @@ def update_chunk(tenant_id,dataset_id,document_id,chunk_id):
|
||||
return get_result()
|
||||
|
||||
|
||||
|
||||
@manager.route('/retrieval', methods=['POST'])
|
||||
@token_required
|
||||
def retrieval_test(tenant_id):
|
||||
|
||||
@ -28,7 +28,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
|
||||
from api.settings import chat_logger, retrievaler, kg_retrievaler
|
||||
from rag.app.resume import forbidden_select_fields4resume
|
||||
from rag.nlp import keyword_extraction
|
||||
from rag.nlp.search import index_name
|
||||
from rag.utils import rmSpace, num_tokens_from_string, encoder
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
@ -80,6 +79,7 @@ class ConversationService(CommonService):
|
||||
|
||||
return list(sessions.dicts())
|
||||
|
||||
|
||||
def message_fit_in(msg, max_length=4000):
|
||||
def count():
|
||||
nonlocal msg
|
||||
@ -456,6 +456,58 @@ def rewrite(tenant_id, llm_id, question):
|
||||
return ans
|
||||
|
||||
|
||||
def keyword_extraction(chat_mdl, content, topn=3):
|
||||
prompt = f"""
|
||||
Role: You're a text analyzer.
|
||||
Task: extract the most important keywords/phrases of a given piece of text content.
|
||||
Requirements:
|
||||
- Summarize the text content, and give top {topn} important keywords/phrases.
|
||||
- The keywords MUST be in language of the given piece of text content.
|
||||
- The keywords are delimited by ENGLISH COMMA.
|
||||
- Keywords ONLY in output.
|
||||
|
||||
### Text Content
|
||||
{content}
|
||||
|
||||
"""
|
||||
msg = [
|
||||
{"role": "system", "content": prompt},
|
||||
{"role": "user", "content": "Output: "}
|
||||
]
|
||||
_, msg = message_fit_in(msg, chat_mdl.max_length)
|
||||
kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
|
||||
if isinstance(kwd, tuple): kwd = kwd[0]
|
||||
if kwd.find("**ERROR**") >=0: return ""
|
||||
return kwd
|
||||
|
||||
|
||||
def question_proposal(chat_mdl, content, topn=3):
|
||||
prompt = f"""
|
||||
Role: You're a text analyzer.
|
||||
Task: propose {topn} questions about a given piece of text content.
|
||||
Requirements:
|
||||
- Understand and summarize the text content, and propose top {topn} important questions.
|
||||
- The questions SHOULD NOT have overlapping meanings.
|
||||
- The questions SHOULD cover the main content of the text as much as possible.
|
||||
- The questions MUST be in language of the given piece of text content.
|
||||
- One question per line.
|
||||
- Question ONLY in output.
|
||||
|
||||
### Text Content
|
||||
{content}
|
||||
|
||||
"""
|
||||
msg = [
|
||||
{"role": "system", "content": prompt},
|
||||
{"role": "user", "content": "Output: "}
|
||||
]
|
||||
_, msg = message_fit_in(msg, chat_mdl.max_length)
|
||||
kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
|
||||
if isinstance(kwd, tuple): kwd = kwd[0]
|
||||
if kwd.find("**ERROR**") >= 0: return ""
|
||||
return kwd
|
||||
|
||||
|
||||
def full_question(tenant_id, llm_id, messages):
|
||||
if llm_id2llm_type(llm_id) == "image2text":
|
||||
chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
|
||||
|
||||
Reference in New Issue
Block a user