Remove unused code, separate layout detection out as a new API. Add new RAG method 'table' (#55)

This commit is contained in:
KevinHuSh
2024-02-05 18:08:17 +08:00
committed by GitHub
parent f305776217
commit 407b2523b6
33 changed files with 306 additions and 505 deletions

View File

@ -77,3 +77,4 @@ class ParserType(StrEnum):
RESUME = "resume"
BOOK = "book"
QA = "qa"
TABLE = "table"

View File

@ -29,7 +29,7 @@ from peewee import (
)
from playhouse.pool import PooledMySQLDatabase
from api.db import SerializedType
from api.db import SerializedType, ParserType
from api.settings import DATABASE, stat_logger, SECRET_KEY
from api.utils.log_utils import getLogger
from api import utils
@ -381,7 +381,8 @@ class Tenant(DataBaseModel):
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
asr_id = CharField(max_length=128, null=False, help_text="default ASR model ID")
img2txt_id = CharField(max_length=128, null=False, help_text="default image to text model ID")
parser_ids = CharField(max_length=128, null=False, help_text="default image to text model ID")
parser_ids = CharField(max_length=128, null=False, help_text="document processors")
credit = IntegerField(default=512)
status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted1: validate)", default="1")
class Meta:
@ -472,7 +473,8 @@ class Knowledgebase(DataBaseModel):
similarity_threshold = FloatField(default=0.2)
vector_similarity_weight = FloatField(default=0.3)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID")
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.GENERAL.value)
parser_config = JSONField(null=False, default={"from_page":0, "to_page": 100000})
status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted1: validate)", default="1")
def __str__(self):
@ -487,6 +489,7 @@ class Document(DataBaseModel):
thumbnail = TextField(null=True, help_text="thumbnail base64 string")
kb_id = CharField(max_length=256, null=False, index=True)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID")
parser_config = JSONField(null=False, default={"from_page":0, "to_page": 100000})
source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document from")
type = CharField(max_length=32, null=False, help_text="file extension")
created_by = CharField(max_length=32, null=False, help_text="who created it")

View File

@ -1,157 +0,0 @@
#
# Copyright 2021 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import abc
import json
import time
from functools import wraps
from shortuuid import ShortUUID
from api.versions import get_rag_version
from api.errors.error_services import *
from api.settings import (
GRPC_PORT, HOST, HTTP_PORT,
RANDOM_INSTANCE_ID, stat_logger,
)
# Identifier for this server instance: a random 8-character short UUID when
# RANDOM_INSTANCE_ID is set, otherwise a name built from the host and HTTP port.
if RANDOM_INSTANCE_ID:
    instance_id = ShortUUID().random(length=8)
else:
    instance_id = f'flow-{HOST}-{HTTP_PORT}'

# Pair of (address, JSON metadata) for this server. The metadata is computed
# once, when the module is imported; `timestamp` is in milliseconds.
server_instance = (
    f'{HOST}:{GRPC_PORT}',
    json.dumps(dict(
        instance_id=instance_id,
        timestamp=round(time.time() * 1000),
        version=get_rag_version() or '',
        host=HOST,
        grpc_port=GRPC_PORT,
        http_port=HTTP_PORT,
    )),
)
def check_service_supported(method):
    """Guard a class method so it rejects unsupported service names.

    The owning class MUST define a `supported_services` attribute, and the
    first two arguments of `method` MUST be `self` and `service_name`.

    :param Callable method: The class method to guard.
    :return: A wrapper that raises `ServiceNotSupported` when `service_name`
        is not in `self.supported_services`, and otherwise calls `method`
        with the same arguments and returns its result.
    :rtype: Callable
    """
    @wraps(method)
    def guarded(self, service_name, *args, **kwargs):
        # Known service: forward the call untouched.
        if service_name in self.supported_services:
            return method(self, service_name, *args, **kwargs)
        raise ServiceNotSupported(service_name=service_name)

    return guarded
class ServicesDB(abc.ABC):
    """Abstract store of service urls.

    Real backends subclass this and implement the underscore-prefixed hooks.
    The public methods wrap those hooks: they log a `ServicesError` through
    `stat_logger` instead of letting it propagate.
    """

    @property
    @abc.abstractmethod
    def supported_services(self):
        """Names of the services accepted by this backend.

        The returned list SHOULD contain `ragflow` (model download) and
        `servings` (RAG-Serving).

        :return: The service names.
        :rtype: list
        """

    @abc.abstractmethod
    def _get_serving(self):
        """Backend hook used by `get_serving`."""

    def get_serving(self):
        """Return the result of `_get_serving`, or `[]` on `ServicesError`.

        :return: Whatever the backend hook returns; an empty list when the
            hook raised `ServicesError` (the error is logged).
        """
        try:
            servings = self._get_serving()
        except ServicesError as err:
            stat_logger.exception(err)
            servings = []
        return servings

    @abc.abstractmethod
    def _insert(self, service_name, service_url, value=''):
        """Backend hook used by `insert`."""

    @check_service_supported
    def insert(self, service_name, service_url, value=''):
        """Store a service url in the database.

        A `ServicesError` raised by the backend is logged, not re-raised.

        :param str service_name: The service name.
        :param str service_url: The service url.
        :param value: Passed through to `_insert` unchanged.
        :return: None
        """
        try:
            self._insert(service_name, service_url, value)
        except ServicesError as err:
            stat_logger.exception(err)

    @abc.abstractmethod
    def _delete(self, service_name, service_url):
        """Backend hook used by `delete`."""

    @check_service_supported
    def delete(self, service_name, service_url):
        """Remove a service url from the database.

        A `ServicesError` raised by the backend is logged, not re-raised.

        :param str service_name: The service name.
        :param str service_url: The service url.
        :return: None
        """
        try:
            self._delete(service_name, service_url)
        except ServicesError as err:
            stat_logger.exception(err)

    def register_flow(self):
        """Insert this flow server's address into the database via `insert`.

        :return: None
        """
        service_url, value = server_instance
        self.insert('flow-server', service_url, value)

    def unregister_flow(self):
        """Delete this flow server's address from the database via `delete`.

        :return: None
        """
        service_url = server_instance[0]
        self.delete('flow-server', service_url)

    @abc.abstractmethod
    def _get_urls(self, service_name, with_values=False):
        """Backend hook used by `get_urls`."""

    @check_service_supported
    def get_urls(self, service_name, with_values=False):
        """Look up service urls in the database; they may belong to other nodes.

        Currently, only `ragflow` (model download) urls and `servings`
        (RAG-Serving) urls are supported. A `ragflow` url contains scheme,
        host, port and path, while a `servings` url only contains host and
        port.

        :param str service_name: The service name.
        :param with_values: Passed through to `_get_urls` unchanged.
        :return: The service urls; an empty list when the backend raised
            `ServicesError` (the error is logged).
        :rtype: list
        """
        try:
            urls = self._get_urls(service_name, with_values)
        except ServicesError as err:
            stat_logger.exception(err)
            urls = []
        return urls

View File

@ -63,7 +63,7 @@ class DocumentService(CommonService):
@classmethod
@DB.connection_context()
def get_newly_uploaded(cls, tm, mod=0, comm=1, items_per_page=64):
fields = [cls.model.id, cls.model.kb_id, cls.model.parser_id, cls.model.name, cls.model.type, cls.model.location, cls.model.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
fields = [cls.model.id, cls.model.kb_id, cls.model.parser_id, cls.model.parser_config, cls.model.name, cls.model.type, cls.model.location, cls.model.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
docs = cls.model.select(*fields) \
.join(Knowledgebase, on=(cls.model.kb_id == Knowledgebase.id)) \
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\

View File

@ -52,7 +52,8 @@ class KnowledgebaseService(CommonService):
cls.model.doc_num,
cls.model.token_num,
cls.model.chunk_num,
cls.model.parser_id]
cls.model.parser_id,
cls.model.parser_config]
kbs = cls.model.select(*fields).join(Tenant, on=((Tenant.id == cls.model.tenant_id)&(Tenant.status== StatusEnum.VALID.value))).where(
(cls.model.id == kb_id),
(cls.model.status == StatusEnum.VALID.value)

View File

@ -27,7 +27,7 @@ class TaskService(CommonService):
@classmethod
@DB.connection_context()
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=64):
fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
fields = [cls.model.id, cls.model.doc_id, cls.model.from_page,cls.model.to_page, Document.kb_id, Document.parser_id, Document.parser_config, Document.name, Document.type, Document.location, Document.size, Knowledgebase.tenant_id, Tenant.embd_id, Tenant.img2txt_id, Tenant.asr_id, cls.model.update_time]
docs = cls.model.select(*fields) \
.join(Document, on=(cls.model.doc_id == Document.id)) \
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
@ -53,3 +53,13 @@ class TaskService(CommonService):
except Exception as e:
pass
return True
@classmethod
@DB.connection_context()
def update_progress(cls, id, info):
    """Append a progress message to a task row and optionally set its progress.

    The message and the progress value are written in a single UPDATE, so
    the row never holds the new message together with a stale progress
    value, and only one statement is sent to the database.

    :param id: Value matched against `cls.model.id` to select the row.
    :param dict info: Must contain `"progress_msg"`, which is appended to the
        stored message after a newline. May contain `"progress"`, which
        replaces the stored progress value when present.
    :raises KeyError: If `info` has no `"progress_msg"` key.
    :return: None
    """
    changes = {
        "progress_msg": cls.model.progress_msg + "\n" + info["progress_msg"],
    }
    # `progress` is optional: leave the stored value alone when it is absent.
    if "progress" in info:
        changes["progress"] = info["progress"]
    cls.model.update(**changes).where(cls.model.id == id).execute()

View File

@ -92,6 +92,12 @@ class TenantService(CommonService):
.join(UserTenant, on=((cls.model.id == UserTenant.tenant_id) & (UserTenant.user_id==user_id) & (UserTenant.status == StatusEnum.VALID.value) & (UserTenant.role==UserTenantRole.NORMAL.value)))\
.where(cls.model.status == StatusEnum.VALID.value).dicts())
@classmethod
@DB.connection_context()
def decrease(cls, user_id, num):
    """Subtract `num` from the `credit` of the row whose id equals `user_id`.

    :param user_id: Value matched against `cls.model.id`.
    :param num: Amount subtracted from the stored credit.
    :raises LookupError: If the update statement reports 0 affected rows.
    :return: None
    """
    deduction = cls.model.update(credit=cls.model.credit - num).where(
        cls.model.id == user_id)
    affected_rows = deduction.execute()
    if affected_rows == 0:
        raise LookupError("Tenant not found which is supposed to be there")
class UserTenantService(CommonService):
model = UserTenant