From eb8feaf20a4c308f0631b6e7c68319f5007182aa Mon Sep 17 00:00:00 2001
From: balibabu
Supported file formats are DOCX, EXCEL, PPT, IMAGE, PDF, TXT.
+ naive: `Supported file formats are DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML.
This method apply the naive ways to chunk files:
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
`, + knowledgeGraph: `Supported file formats are DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML + +
After files are chunked, the chunks are used to extract a knowledge graph and a mind map of the entire document. This method applies the naive ways to chunk files: +Successive text will be sliced into pieces, each of around 512 tokens.
+Next, chunks will be transmitted to the LLM to extract nodes and relationships of a knowledge graph, and a mind map.
+ +Mind the entity type you need to specify.`, useRaptor: 'Use RAPTOR to enhance retrieval', useRaptorTip: 'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 200b0b231..27887d82a 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -190,7 +190,7 @@ export default { 我們假設手冊具有分層部分結構。我們使用最低的部分標題作為對文檔進行切片的樞軸。 因此,同一部分中的圖和表不會被分割,並且塊大小可能會很大。 `, - naive: `支持的文件格式為DOCX、EXCEL、PPT、IMAGE、PDF、TXT。
+ naive: `支持的文件格式為DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML。
此方法將簡單的方法應用於塊文件:
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
`, + knowledgeGraph: `支援的檔案格式為DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML + +
文件分塊後,使用分塊擷取整個文件的知識圖譜和心智圖。此方法將簡單的方法應用於區塊檔案: +連續的文字將被分割成多個片段,每個片段大約有 512 個令牌數。 +
接下來,區塊將傳送到LLM以提取知識圖譜和思維導圖的節點和關係。 + +
請注意您需要指定的條目類型。
`, useRaptor: '使用RAPTOR文件增強策略', useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059', prompt: '提示詞', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 730e9038e..e0ee6d157 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -191,7 +191,7 @@ export default { 我们假设手册具有分层部分结构。 我们使用最低的部分标题作为对文档进行切片的枢轴。 因此,同一部分中的图和表不会被分割,并且块大小可能会很大。 `, - naive: `支持的文件格式为DOCX、EXCEL、PPT、IMAGE、PDF、TXT。
+ naive: `支持的文件格式为DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML。
此方法将简单的方法应用于块文件:
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
`, + knowledgeGraph: `支持的文件格式为DOCX、EXCEL、PPT、IMAGE、PDF、TXT、MD、JSON、EML + +
文件分块后,使用分块提取整个文档的知识图谱和思维导图。此方法将简单的方法应用于分块文件: +连续的文本将被切成大约 512 个 token 数的块。
+接下来,将分块传输到 LLM 以提取知识图谱和思维导图的节点和关系。
+ +注意您需要指定的条目类型。`, useRaptor: '使用召回增强RAPTOR策略', useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059', prompt: '提示词', diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx b/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx index bf5552909..c4eb8ab65 100644 --- a/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx +++ b/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx @@ -3,6 +3,7 @@ import { useTranslate } from '@/hooks/common-hooks'; import { useSelectParserList } from '@/hooks/user-setting-hooks'; import { Col, Divider, Empty, Row, Typography } from 'antd'; import DOMPurify from 'dompurify'; +import camelCase from 'lodash/camelCase'; import { useMemo } from 'react'; import styles from './index.less'; import { ImageMap } from './utils'; @@ -18,7 +19,7 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => { if (item) { return { title: item.label, - description: t(item.value), + description: t(camelCase(item.value)), }; } return { title: '', description: '' }; diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts index 303f804a2..725c780eb 100644 --- a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts +++ b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts @@ -37,6 +37,9 @@ export const useSubmitKnowledgeConfiguration = (form: FormInstance) => { }; }; +// The value that does not need to be displayed in the analysis method Select +const HiddenFields = ['email', 'picture', 'audio']; + export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => { const parserList = useSelectParserList(); const allOptions = useSelectLlmOptionsByModelType(); @@ -62,7 +65,9 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => { }, [form, knowledgeDetails]); return { - parserList, + 
parserList: parserList.filter( + (x) => !HiddenFields.some((y) => y === x.value), + ), embeddingModelOptions: allOptions[LlmModelType.Embedding], disabled: knowledgeDetails.chunk_num > 0, }; diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/utils.ts b/web/src/pages/add-knowledge/components/knowledge-setting/utils.ts index 57d74c548..3c4f94f43 100644 --- a/web/src/pages/add-knowledge/components/knowledge-setting/utils.ts +++ b/web/src/pages/add-knowledge/components/knowledge-setting/utils.ts @@ -15,6 +15,7 @@ export const ImageMap = { resume: getImageName('resume', 2), table: getImageName('table', 2), one: getImageName('one', 2), + knowledge_graph: getImageName('knowledge-graph', 2), }; export const TextMap = {