mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Supports obtaining PDF documents from web pages (#1107)
### What problem does this PR solve?

Knowledge base management now supports crawling information from a web page and generating a PDF document from it.

### Type of change

- [x] New Feature (support creating documents from web pages)
This commit is contained in:
@ -1,13 +1,13 @@
|
||||
import { IChunk, IKnowledgeFile } from '@/interfaces/database/knowledge';
|
||||
import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
|
||||
import { api_host } from '@/utils/api';
|
||||
import { buildChunkHighlights } from '@/utils/documentUtils';
|
||||
import { UploadFile } from 'antd';
|
||||
import { useCallback, useMemo, useState } from 'react';
|
||||
import { IHighlight } from 'react-pdf-highlighter';
|
||||
import { useDispatch, useSelector } from 'umi';
|
||||
import { useGetKnowledgeSearchParams } from './routeHook';
|
||||
import { useOneNamespaceEffectsLoading } from './storeHooks';
|
||||
import {IChunk, IKnowledgeFile} from '@/interfaces/database/knowledge';
|
||||
import {IChangeParserConfigRequestBody} from '@/interfaces/request/document';
|
||||
import {api_host} from '@/utils/api';
|
||||
import {buildChunkHighlights} from '@/utils/documentUtils';
|
||||
import {UploadFile} from 'antd';
|
||||
import {useCallback, useMemo, useState} from 'react';
|
||||
import {IHighlight} from 'react-pdf-highlighter';
|
||||
import {useDispatch, useSelector} from 'umi';
|
||||
import {useGetKnowledgeSearchParams} from './routeHook';
|
||||
import {useOneNamespaceEffectsLoading} from './storeHooks';
|
||||
|
||||
export const useGetDocumentUrl = (documentId?: string) => {
|
||||
const getDocumentUrl = useCallback(
|
||||
@ -207,6 +207,28 @@ export const useUploadDocument = () => {
|
||||
return uploadDocument;
|
||||
};
|
||||
|
||||
/**
 * Returns a memoized callback that dispatches the `kFModel/web_crawl` action
 * for the knowledge base identified by the current route search params.
 *
 * The callback takes the document `name` and the page `url` and returns
 * whatever `dispatch` returns for that action. If `dispatch` throws
 * synchronously, the error is logged and the callback returns `undefined`.
 */
export const useWebCrawl = () => {
  const dispatch = useDispatch();
  const { knowledgeId } = useGetKnowledgeSearchParams();

  return useCallback(
    (name: string, url: string) => {
      try {
        return dispatch<any>({
          type: 'kFModel/web_crawl',
          payload: {
            name,
            url,
            kb_id: knowledgeId,
          },
        });
      } catch (errorInfo) {
        console.log('Failed:', errorInfo);
      }
    },
    // knowledgeId is read inside the callback, so it has to be a dependency;
    // otherwise the memoized callback keeps sending the kb_id it captured
    // when it was first created.
    [dispatch, knowledgeId],
  );
};
|
||||
|
||||
export const useRunDocument = () => {
|
||||
const dispatch = useDispatch();
|
||||
|
||||
|
||||
@ -81,6 +81,7 @@ export default {
|
||||
searchFiles: 'Search your files',
|
||||
localFiles: 'Local files',
|
||||
emptyFiles: 'Create empty file',
|
||||
webCrawl: 'Web Crawl',
|
||||
chunkNumber: 'Chunk Number',
|
||||
uploadDate: 'Upload Date',
|
||||
chunkMethod: 'Chunk Method',
|
||||
|
||||
@ -80,6 +80,7 @@ export default {
|
||||
searchFiles: '搜索文件',
|
||||
localFiles: '本地文件',
|
||||
emptyFiles: '新建空文件',
|
||||
webCrawl: '網頁抓取',
|
||||
chunkNumber: '分塊數',
|
||||
uploadDate: '上傳日期',
|
||||
chunkMethod: '解析方法',
|
||||
|
||||
@ -80,6 +80,7 @@ export default {
|
||||
searchFiles: '搜索文件',
|
||||
localFiles: '本地文件',
|
||||
emptyFiles: '新建空文件',
|
||||
webCrawl: '网页抓取',
|
||||
chunkNumber: '分块数',
|
||||
uploadDate: '上传日期',
|
||||
chunkMethod: '解析方法',
|
||||
|
||||
@ -29,13 +29,15 @@ import styles from './index.less';
|
||||
interface IProps {
|
||||
selectedRowKeys: string[];
|
||||
showCreateModal(): void;
|
||||
showWebCrawlModal(): void;
|
||||
showDocumentUploadModal(): void;
|
||||
}
|
||||
|
||||
const DocumentToolbar = ({
|
||||
selectedRowKeys,
|
||||
showCreateModal,
|
||||
showDocumentUploadModal,
|
||||
selectedRowKeys,
|
||||
showCreateModal,
|
||||
showWebCrawlModal,
|
||||
showDocumentUploadModal,
|
||||
}: IProps) => {
|
||||
const { t } = useTranslate('knowledgeDetails');
|
||||
const { fetchDocumentList } = useFetchDocumentListOnMount();
|
||||
@ -66,6 +68,19 @@ const DocumentToolbar = ({
|
||||
{ type: 'divider' },
|
||||
{
|
||||
key: '2',
|
||||
onClick: showWebCrawlModal,
|
||||
label: (
|
||||
<div>
|
||||
<Button type="link">
|
||||
<FileTextOutlined />
|
||||
{t('webCrawl')}
|
||||
</Button>
|
||||
</div>
|
||||
),
|
||||
},
|
||||
{ type: 'divider' },
|
||||
{
|
||||
key: '3',
|
||||
onClick: showCreateModal,
|
||||
label: (
|
||||
<div>
|
||||
@ -77,7 +92,7 @@ const DocumentToolbar = ({
|
||||
),
|
||||
},
|
||||
];
|
||||
}, [showDocumentUploadModal, showCreateModal, t]);
|
||||
}, [showDocumentUploadModal, showWebCrawlModal, showCreateModal, t]);
|
||||
|
||||
const handleDelete = useCallback(() => {
|
||||
showDeleteConfirm({
|
||||
|
||||
@ -7,6 +7,7 @@ import {
|
||||
useSelectRunDocumentLoading,
|
||||
useSetDocumentParser,
|
||||
useUploadDocument,
|
||||
useWebCrawl,
|
||||
} from '@/hooks/documentHooks';
|
||||
import { useGetKnowledgeSearchParams } from '@/hooks/routeHook';
|
||||
import { useOneNamespaceEffectsLoading } from '@/hooks/storeHooks';
|
||||
@ -286,6 +287,37 @@ export const useHandleUploadDocument = () => {
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Bundles everything the web crawl dialog needs: its visibility state, the
 * show/hide handlers, the submit handler and the loading flag of the
 * `kFModel` `web_crawl` effect.
 *
 * `onWebCrawlUploadOk` resolves to `0` (and hides the dialog) when the crawl
 * call resolves to `0`; in every other case it resolves to `-1` and leaves
 * the dialog open.
 */
export const useHandleWebCrawl = () => {
  const {
    visible: webCrawlUploadVisible,
    hideModal: hideWebCrawlUploadModal,
    showModal: showWebCrawlUploadModal,
  } = useSetModalState();
  const webCrawl = useWebCrawl();

  const onWebCrawlUploadOk = useCallback(
    async (name: string, url: string) => {
      const retcode = await webCrawl(name, url);

      // Anything other than 0 is reported as a failure and keeps the dialog open.
      if (retcode !== 0) {
        return -1;
      }

      hideWebCrawlUploadModal();
      return 0;
    },
    [webCrawl, hideWebCrawlUploadModal],
  );

  const webCrawlUploadLoading = useOneNamespaceEffectsLoading('kFModel', [
    'web_crawl',
  ]);

  return {
    webCrawlUploadLoading,
    onWebCrawlUploadOk,
    webCrawlUploadVisible,
    hideWebCrawlUploadModal,
    showWebCrawlUploadModal,
  };
};
|
||||
|
||||
export const useHandleRunDocumentByIds = (id: string) => {
|
||||
const loading = useSelectRunDocumentLoading();
|
||||
const runDocumentByIds = useRunDocument();
|
||||
|
||||
@ -12,6 +12,7 @@ import { Divider, Flex, Switch, Table, Typography } from 'antd';
|
||||
import type { ColumnsType } from 'antd/es/table';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
import CreateFileModal from './create-file-modal';
|
||||
import WebCrawlModal from './web-crawl-modal';
|
||||
import DocumentToolbar from './document-toolbar';
|
||||
import {
|
||||
useChangeDocumentParser,
|
||||
@ -19,7 +20,7 @@ import {
|
||||
useFetchDocumentListOnMount,
|
||||
useGetPagination,
|
||||
useGetRowSelection,
|
||||
useHandleUploadDocument,
|
||||
useHandleUploadDocument, useHandleWebCrawl,
|
||||
useNavigateToOtherPage,
|
||||
useRenameDocument,
|
||||
} from './hooks';
|
||||
@ -69,6 +70,13 @@ const KnowledgeFile = () => {
|
||||
onDocumentUploadOk,
|
||||
documentUploadLoading,
|
||||
} = useHandleUploadDocument();
|
||||
const {
|
||||
webCrawlUploadVisible,
|
||||
hideWebCrawlUploadModal,
|
||||
showWebCrawlUploadModal,
|
||||
onWebCrawlUploadOk,
|
||||
webCrawlUploadLoading,
|
||||
} = useHandleWebCrawl();
|
||||
const { t } = useTranslation('translation', {
|
||||
keyPrefix: 'knowledgeDetails',
|
||||
});
|
||||
@ -170,6 +178,7 @@ const KnowledgeFile = () => {
|
||||
<DocumentToolbar
|
||||
selectedRowKeys={rowSelection.selectedRowKeys as string[]}
|
||||
showCreateModal={showCreateModal}
|
||||
showWebCrawlModal={showWebCrawlUploadModal}
|
||||
showDocumentUploadModal={showDocumentUploadModal}
|
||||
></DocumentToolbar>
|
||||
<Table
|
||||
@ -211,6 +220,12 @@ const KnowledgeFile = () => {
|
||||
loading={documentUploadLoading}
|
||||
onOk={onDocumentUploadOk}
|
||||
></FileUploadModal>
|
||||
<WebCrawlModal
|
||||
visible={webCrawlUploadVisible}
|
||||
hideModal={hideWebCrawlUploadModal}
|
||||
loading={webCrawlUploadLoading}
|
||||
onOk={onWebCrawlUploadOk}
|
||||
></WebCrawlModal>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
@ -232,6 +232,27 @@ const model: DvaModel<KFModelState> = {
|
||||
}
|
||||
return data;
|
||||
},
|
||||
// Effect: posts the crawl request (name, url, kb_id) as FormData through
// kbService.web_crawl. Shows a success message when retcode is 0, reloads the
// document list via `getKfList` when retcode is 0 or 500, and returns retcode.
*web_crawl({ payload = {} }, { call, put }) {
  const { name, url, kb_id } = payload;

  const formData = new FormData();
  formData.append('name', name);
  formData.append('url', url);
  formData.append('kb_id', kb_id);

  const { data } = yield call(kbService.web_crawl, formData);
  const { retcode } = data;

  if (retcode === 0) {
    message.success(i18n.t('message.uploaded'));
  }

  // The list is refreshed on success and also when the service answers 500.
  if (retcode === 0 || retcode === 500) {
    yield put({
      type: 'getKfList',
      payload: { kb_id },
    });
  }

  return retcode;
},
|
||||
},
|
||||
subscriptions: {
|
||||
setup({ dispatch, history }) {
|
||||
|
||||
@ -0,0 +1,54 @@
|
||||
import { IModalManagerChildrenProps } from '@/components/modal-manager';
|
||||
import { Form, Input, Modal } from 'antd';
|
||||
import React from 'react';
|
||||
import {useTranslate} from "@/hooks/commonHooks";
|
||||
|
||||
|
||||
/**
 * Props of the web crawl dialog.
 */
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
  /** Loading flag supplied by the parent; shown on the dialog's OK button. */
  loading: boolean;
  /** Called with the validated document name and page URL. */
  onOk: (name: string, url: string) => void;
  showModal?(): void;
}

// Upper bound for the document name; kept in sync with the validation message
// shown to the user ("... is 128 characters").
const MAX_NAME_LENGTH = 128;

// Built once instead of on every render. Anchored at the start so that text
// preceding the scheme (e.g. "see http://...") no longer passes validation.
const URL_PATTERN = new RegExp(
  '^(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]',
);

/**
 * Dialog with a two-field form (document name and page URL). On OK the form is
 * validated and, if valid, `onOk(name, url)` is invoked.
 */
const WebCrawlModal: React.FC<IProps> = ({
  visible,
  hideModal,
  loading,
  onOk,
}) => {
  const [form] = Form.useForm();
  const { t } = useTranslate('knowledgeDetails');

  const handleOk = async () => {
    let values: { name: string; url: string };
    try {
      values = await form.validateFields();
    } catch (errorInfo) {
      // validateFields rejects when a rule fails. The form already renders the
      // rule messages, so just log instead of leaking an unhandled rejection.
      console.log('Failed:', errorInfo);
      return;
    }
    onOk(values.name, values.url);
  };

  return (
    <Modal
      title={t('webCrawl')}
      open={visible}
      onOk={handleOk}
      onCancel={hideModal}
      confirmLoading={loading}
    >
      <Form
        form={form}
        name="validateOnly"
        labelCol={{ span: 4 }}
        wrapperCol={{ span: 20 }}
        style={{ maxWidth: 600 }}
        autoComplete="off"
      >
        <Form.Item
          label="Name"
          name="name"
          rules={[
            { required: true, message: 'Please input name!' },
            {
              max: MAX_NAME_LENGTH,
              message: 'The maximum length of name is 128 characters',
            },
          ]}
        >
          <Input placeholder="Document name" />
        </Form.Item>
        <Form.Item
          label="URL"
          name="url"
          rules={[
            { required: true, message: 'Please input url!' },
            { pattern: URL_PATTERN, message: 'Please enter a valid URL!' },
          ]}
        >
          <Input placeholder="https://www.baidu.com" />
        </Form.Item>
      </Form>
    </Modal>
  );
};
|
||||
export default WebCrawlModal;
|
||||
@ -26,6 +26,7 @@ const {
|
||||
document_run,
|
||||
get_document_file,
|
||||
document_upload,
|
||||
web_crawl,
|
||||
} = api;
|
||||
|
||||
const methods = {
|
||||
@ -87,6 +88,10 @@ const methods = {
|
||||
url: document_upload,
|
||||
method: 'post',
|
||||
},
|
||||
web_crawl: {
|
||||
url: web_crawl,
|
||||
method: 'post',
|
||||
},
|
||||
// chunk管理
|
||||
chunk_list: {
|
||||
url: chunk_list,
|
||||
|
||||
@ -48,6 +48,7 @@ export default {
|
||||
document_thumbnails: `${api_host}/document/thumbnails`,
|
||||
get_document_file: `${api_host}/document/get`,
|
||||
document_upload: `${api_host}/document/upload`,
|
||||
web_crawl: `${api_host}/document/web_crawl`,
|
||||
|
||||
// chat
|
||||
setDialog: `${api_host}/dialog/set`,
|
||||
|
||||
Reference in New Issue
Block a user