Supports obtaining PDF documents from web pages (#1107)

### What problem does this PR solve?

Knowledge base management now supports crawling web pages and generating
PDF documents from the crawled content.

### Type of change
- [x] New Feature (Support document from web pages)
This commit is contained in:
Fakai Zhao
2024-06-11 10:45:19 +08:00
committed by GitHub
parent 68a698655a
commit 7eb69fe6d9
14 changed files with 336 additions and 17 deletions

View File

@ -1,13 +1,13 @@
import { IChunk, IKnowledgeFile } from '@/interfaces/database/knowledge';
import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
import { api_host } from '@/utils/api';
import { buildChunkHighlights } from '@/utils/documentUtils';
import { UploadFile } from 'antd';
import { useCallback, useMemo, useState } from 'react';
import { IHighlight } from 'react-pdf-highlighter';
import { useDispatch, useSelector } from 'umi';
import { useGetKnowledgeSearchParams } from './routeHook';
import { useOneNamespaceEffectsLoading } from './storeHooks';
import {IChunk, IKnowledgeFile} from '@/interfaces/database/knowledge';
import {IChangeParserConfigRequestBody} from '@/interfaces/request/document';
import {api_host} from '@/utils/api';
import {buildChunkHighlights} from '@/utils/documentUtils';
import {UploadFile} from 'antd';
import {useCallback, useMemo, useState} from 'react';
import {IHighlight} from 'react-pdf-highlighter';
import {useDispatch, useSelector} from 'umi';
import {useGetKnowledgeSearchParams} from './routeHook';
import {useOneNamespaceEffectsLoading} from './storeHooks';
export const useGetDocumentUrl = (documentId?: string) => {
const getDocumentUrl = useCallback(
@ -207,6 +207,28 @@ export const useUploadDocument = () => {
return uploadDocument;
};
/**
 * Builds a callback that dispatches the `kFModel/web_crawl` effect for the
 * knowledge base id returned by `useGetKnowledgeSearchParams`.
 *
 * @returns A memoized function `(name, url)` that returns whatever `dispatch`
 * returns, or `undefined` if `dispatch` throws synchronously.
 */
export const useWebCrawl = () => {
  const dispatch = useDispatch();
  const { knowledgeId } = useGetKnowledgeSearchParams();

  return useCallback(
    (name: string, url: string) => {
      // NOTE: nothing is awaited here, so this try/catch only covers errors
      // thrown synchronously by dispatch itself.
      try {
        return dispatch<any>({
          type: 'kFModel/web_crawl',
          payload: {
            name,
            url,
            kb_id: knowledgeId,
          },
        });
      } catch (errorInfo) {
        console.log('Failed:', errorInfo);
      }
    },
    // knowledgeId is read inside the callback; without it in the dependency
    // list the memoized callback would keep sending a stale kb_id.
    [dispatch, knowledgeId],
  );
};
export const useRunDocument = () => {
const dispatch = useDispatch();

View File

@ -81,6 +81,7 @@ export default {
searchFiles: 'Search your files',
localFiles: 'Local files',
emptyFiles: 'Create empty file',
webCrawl: 'Web Crawl',
chunkNumber: 'Chunk Number',
uploadDate: 'Upload Date',
chunkMethod: 'Chunk Method',

View File

@ -80,6 +80,7 @@ export default {
searchFiles: '搜索文件',
localFiles: '本地文件',
emptyFiles: '新建空文件',
webCrawl: '網頁抓取',
chunkNumber: '分塊數',
uploadDate: '上傳日期',
chunkMethod: '解析方法',

View File

@ -80,6 +80,7 @@ export default {
searchFiles: '搜索文件',
localFiles: '本地文件',
emptyFiles: '新建空文件',
webCrawl: '网页抓取',
chunkNumber: '分块数',
uploadDate: '上传日期',
chunkMethod: '解析方法',

View File

@ -29,13 +29,15 @@ import styles from './index.less';
// Props accepted by the document toolbar component.
interface IProps {
// Keys of the rows currently selected in the document table.
selectedRowKeys: string[];
// Click handler wired to the "create file" menu entry.
showCreateModal(): void;
// Click handler wired to the "web crawl" menu entry.
showWebCrawlModal(): void;
// Callback supplied by the parent for the document upload modal.
showDocumentUploadModal(): void;
}
const DocumentToolbar = ({
selectedRowKeys,
showCreateModal,
showDocumentUploadModal,
selectedRowKeys,
showCreateModal,
showWebCrawlModal,
showDocumentUploadModal,
}: IProps) => {
const { t } = useTranslate('knowledgeDetails');
const { fetchDocumentList } = useFetchDocumentListOnMount();
@ -66,6 +68,19 @@ const DocumentToolbar = ({
{ type: 'divider' },
{
key: '2',
onClick: showWebCrawlModal,
label: (
<div>
<Button type="link">
<FileTextOutlined />
{t('webCrawl')}
</Button>
</div>
),
},
{ type: 'divider' },
{
key: '3',
onClick: showCreateModal,
label: (
<div>
@ -77,7 +92,7 @@ const DocumentToolbar = ({
),
},
];
}, [showDocumentUploadModal, showCreateModal, t]);
}, [showDocumentUploadModal, showWebCrawlModal, showCreateModal, t]);
const handleDelete = useCallback(() => {
showDeleteConfirm({

View File

@ -7,6 +7,7 @@ import {
useSelectRunDocumentLoading,
useSetDocumentParser,
useUploadDocument,
useWebCrawl,
} from '@/hooks/documentHooks';
import { useGetKnowledgeSearchParams } from '@/hooks/routeHook';
import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks';
@ -286,6 +287,37 @@ export const useHandleUploadDocument = () => {
};
};
/**
 * Bundles everything the web crawl modal needs: its visibility state, the
 * show/hide handlers, the submit handler and the loading flag of the
 * `kFModel/web_crawl` effect.
 *
 * The submit handler resolves to `0` (and closes the modal) when the crawl
 * call returns `0`; otherwise it resolves to `-1` and leaves the modal open.
 */
export const useHandleWebCrawl = () => {
  const { visible, hideModal, showModal } = useSetModalState();
  const crawl = useWebCrawl();

  const onWebCrawlUploadOk = useCallback(
    async (name: string, url: string) => {
      const retcode = await crawl(name, url);
      // Anything other than 0 is reported as failure; the modal stays open.
      if (retcode !== 0) {
        return -1;
      }
      hideModal();
      return 0;
    },
    [crawl, hideModal],
  );

  const webCrawlUploadLoading = useOneNamespaceEffectsLoading('kFModel', [
    'web_crawl',
  ]);

  return {
    webCrawlUploadLoading,
    onWebCrawlUploadOk,
    webCrawlUploadVisible: visible,
    hideWebCrawlUploadModal: hideModal,
    showWebCrawlUploadModal: showModal,
  };
};
export const useHandleRunDocumentByIds = (id: string) => {
const loading = useSelectRunDocumentLoading();
const runDocumentByIds = useRunDocument();

View File

@ -12,6 +12,7 @@ import { Divider, Flex, Switch, Table, Typography } from 'antd';
import type { ColumnsType } from 'antd/es/table';
import { useTranslation } from 'react-i18next';
import CreateFileModal from './create-file-modal';
import WebCrawlModal from './web-crawl-modal';
import DocumentToolbar from './document-toolbar';
import {
useChangeDocumentParser,
@ -19,7 +20,7 @@ import {
useFetchDocumentListOnMount,
useGetPagination,
useGetRowSelection,
useHandleUploadDocument,
useHandleUploadDocument, useHandleWebCrawl,
useNavigateToOtherPage,
useRenameDocument,
} from './hooks';
@ -69,6 +70,13 @@ const KnowledgeFile = () => {
onDocumentUploadOk,
documentUploadLoading,
} = useHandleUploadDocument();
const {
webCrawlUploadVisible,
hideWebCrawlUploadModal,
showWebCrawlUploadModal,
onWebCrawlUploadOk,
webCrawlUploadLoading,
} = useHandleWebCrawl();
const { t } = useTranslation('translation', {
keyPrefix: 'knowledgeDetails',
});
@ -170,6 +178,7 @@ const KnowledgeFile = () => {
<DocumentToolbar
selectedRowKeys={rowSelection.selectedRowKeys as string[]}
showCreateModal={showCreateModal}
showWebCrawlModal={showWebCrawlUploadModal}
showDocumentUploadModal={showDocumentUploadModal}
></DocumentToolbar>
<Table
@ -211,6 +220,12 @@ const KnowledgeFile = () => {
loading={documentUploadLoading}
onOk={onDocumentUploadOk}
></FileUploadModal>
<WebCrawlModal
visible={webCrawlUploadVisible}
hideModal={hideWebCrawlUploadModal}
loading={webCrawlUploadLoading}
onOk={onWebCrawlUploadOk}
></WebCrawlModal>
</div>
);
};

View File

@ -232,6 +232,27 @@ const model: DvaModel<KFModelState> = {
}
return data;
},
/**
 * Effect: posts `name`, `url` and `kb_id` as form data to
 * `kbService.web_crawl`, then refreshes the document list.
 *
 * A success toast is shown when the returned retcode is 0. The list is
 * re-fetched when the retcode is 0 or 500. Returns the retcode to the caller.
 */
*web_crawl({ payload = {} }, { call, put }) {
  const { name, url, kb_id: kbId } = payload;

  const body = new FormData();
  body.append('name', name);
  body.append('url', url);
  body.append('kb_id', kbId);

  const { data: response } = yield call(kbService.web_crawl, body);
  const { retcode } = response;

  if (retcode === 0) {
    message.success(i18n.t('message.uploaded'));
  }

  // The list is refreshed on success and also on retcode 500.
  const shouldRefreshList = retcode === 0 || retcode === 500;
  if (shouldRefreshList) {
    yield put({
      type: 'getKfList',
      payload: { kb_id: kbId },
    });
  }

  return retcode;
},
},
subscriptions: {
setup({ dispatch, history }) {

View File

@ -0,0 +1,54 @@
import { IModalManagerChildrenProps } from '@/components/modal-manager';
import { Form, Input, Modal } from 'antd';
import React from 'react';
import {useTranslate} from "@/hooks/commonHooks";
// Props for the web crawl modal; `showModal` from the modal-manager props is
// re-declared below as optional.
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
// Loading flag supplied by the parent for the web crawl request.
loading: boolean;
// Called with the validated form values when the user confirms the dialog.
onOk: (name: string, url: string) => void;
// Optional opener; callers may omit it.
showModal?(): void;
}
/**
 * Modal dialog that collects a document name and a page URL, validates both
 * fields and hands them to `onOk`.
 *
 * - `visible` / `hideModal` control the dialog's open state and cancel action.
 * - `loading` puts the OK button into its pending state while a request runs.
 * - `onOk(name, url)` is called only after the form validates.
 */
const WebCrawlModal: React.FC<IProps> = ({
  visible,
  hideModal,
  loading,
  onOk,
}) => {
  const [form] = Form.useForm();
  const { t } = useTranslate('knowledgeDetails');

  const handleOk = async () => {
    // validateFields rejects when a rule fails, so onOk is never reached
    // with invalid input.
    const values = await form.validateFields();
    onOk(values.name, values.url);
  };

  return (
    <Modal
      title={t('webCrawl')}
      open={visible}
      onOk={handleOk}
      onCancel={hideModal}
      // Reflect the in-flight request on the OK button so the user gets
      // feedback and cannot submit the same crawl repeatedly.
      confirmLoading={loading}
    >
      <Form
        form={form}
        name="validateOnly"
        labelCol={{ span: 4 }}
        wrapperCol={{ span: 20 }}
        style={{ maxWidth: 600 }}
        autoComplete="off"
      >
        <Form.Item
          label="Name"
          name="name"
          rules={[
            { required: true, message: 'Please input name!' },
            // The limit must match the number quoted in the message.
            {
              max: 128,
              message: 'The maximum length of name is 128 characters',
            },
          ]}
        >
          <Input placeholder="Document name" />
        </Form.Item>
        <Form.Item
          label="URL"
          name="url"
          rules={[
            { required: true, message: 'Please input url!' },
            {
              pattern: new RegExp(
                '(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
              ),
              message: 'Please enter a valid URL!',
            },
          ]}
        >
          <Input placeholder="https://www.baidu.com" />
        </Form.Item>
      </Form>
    </Modal>
  );
};
export default WebCrawlModal;

View File

@ -26,6 +26,7 @@ const {
document_run,
get_document_file,
document_upload,
web_crawl,
} = api;
const methods = {
@ -87,6 +88,10 @@ const methods = {
url: document_upload,
method: 'post',
},
web_crawl: {
url: web_crawl,
method: 'post',
},
// chunk management
chunk_list: {
url: chunk_list,

View File

@ -48,6 +48,7 @@ export default {
document_thumbnails: `${api_host}/document/thumbnails`,
get_document_file: `${api_host}/document/get`,
document_upload: `${api_host}/document/upload`,
web_crawl: `${api_host}/document/web_crawl`,
// chat
setDialog: `${api_host}/dialog/set`,