Supports obtaining PDF documents from web pages (#1107)

### What problem does this PR solve?

Knowledge base management supports crawling information from web pages
and generating PDF documents

### Type of change
- [x] New Feature (Support document from web pages)
This commit is contained in:
Fakai Zhao
2024-06-11 10:45:19 +08:00
committed by GitHub
parent 68a698655a
commit 7eb69fe6d9
14 changed files with 336 additions and 17 deletions

View File

@ -39,6 +39,7 @@ from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.utils.minio_conn import MINIO
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import html2pdf, is_valid_url
@manager.route('/upload', methods=['POST'])
@ -289,7 +290,7 @@ def run():
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
e, doc = DocumentService.get_by_id(id)
@ -416,3 +417,69 @@ def get_image(image_id):
return response
except Exception as e:
return server_error_response(e)
@manager.route('/web_crawl', methods=['POST'])
@login_required
def web_crawl():
kb_id = request.form.get("kb_id")
if not kb_id:
return get_json_result(
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
name = request.form.get("name")
url = request.form.get("url")
if not name:
return get_json_result(
data=False, retmsg='Lack of "name"', retcode=RetCode.ARGUMENT_ERROR)
if not url:
return get_json_result(
data=False, retmsg='Lack of "url"', retcode=RetCode.ARGUMENT_ERROR)
if not is_valid_url(url):
return get_json_result(
data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this knowledgebase!")
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, current_user.id)
kb_root_folder = FileService.get_kb_folder(current_user.id)
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(
DocumentService.query,
name=name+".pdf",
kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
location = filename
while MINIO.obj_exist(kb_id, location):
location += "_"
blob = html2pdf(url)
MINIO.put(kb_id, location, blob)
doc = {
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": filetype,
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob)
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
DocumentService.insert(doc)
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
except Exception as e:
return get_json_result(
data=False, retmsg=e, retcode=RetCode.SERVER_ERROR)
return get_json_result(data=True)