From 1edbd36baf1c30e8b67d2264e0b42b95d367a2a3 Mon Sep 17 00:00:00 2001
From: KevinHuSh
Date: Fri, 22 Mar 2024 15:35:06 +0800
Subject: [PATCH] add help info (#142)
---
.../components/similarity-slider/index.tsx | 10 +-
.../knowledge-setting/category-panel.tsx | 6 +-
.../knowledge-setting/configuration.tsx | 14 +--
.../components/knowledge-setting/utils.ts | 113 ++++++++++++------
.../testing-control/index.tsx | 5 +-
.../assistant-setting.tsx | 10 +-
.../model-setting.tsx | 16 +--
.../prompt-engine.tsx | 11 +-
.../setting-model/api-key-modal/index.tsx | 2 +-
.../system-model-setting-modal/index.tsx | 22 ++--
.../user-setting/setting-profile/index.tsx | 6 -
11 files changed, 131 insertions(+), 84 deletions(-)
diff --git a/web/src/components/similarity-slider/index.tsx b/web/src/components/similarity-slider/index.tsx
index 7f6019991..a70dfcb56 100644
--- a/web/src/components/similarity-slider/index.tsx
+++ b/web/src/components/similarity-slider/index.tsx
@@ -15,7 +15,10 @@ const SimilaritySlider = ({ isTooltipShown = false }: IProps) => {
label="Similarity threshold"
name={'similarity_threshold'}
- tooltip={isTooltipShown && 'coming soon'}
+ tooltip={isTooltipShown && `We use a hybrid similarity score to evaluate the distance between two lines of text.
+ It's a weighted combination of keywords similarity and vector cosine similarity.
+ If the similarity between the query and a chunk is less than this threshold, the chunk will be filtered out.`
+ }
initialValue={0.2}
>
@@ -24,7 +27,10 @@ const SimilaritySlider = ({ isTooltipShown = false }: IProps) => {
label="Vector similarity weight"
name={'vector_similarity_weight'}
initialValue={0.3}
- tooltip={isTooltipShown && 'coming soon'}
+ tooltip={isTooltipShown && `We use a hybrid similarity score to evaluate the distance between two lines of text.
+ It's a weighted combination of keywords similarity and vector cosine similarity.
+ The sum of both weights is 1.0.
+ `}
>
diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx b/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx
index 84af184f1..8b6168e81 100644
--- a/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/category-panel.tsx
@@ -33,16 +33,16 @@ const CategoryPanel = ({ chunkMethod }: { chunkMethod: string }) => {
{imageList.length > 0 ? (
<>
- {item.title} Category
+ "{item.title}" Chunking Method Description
- {item.title} Image Examples
+ "{item.title}" Examples
- We've prepared detailed visual guides to make understanding easier
+ These visual guides are intended to make understanding easier
for you.
diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
index f69f631f7..fc8d056aa 100644
--- a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
@@ -83,7 +83,7 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
@@ -93,22 +93,22 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
-
Since a book is long and not all the parts are useful, if it's a PDF,
- please setup the page ranges for every book in order eliminate negative effects and save computing time for analyzing.`,
+ please set up the page ranges for every book in order to eliminate negative effects and save computing time for analysis.
+ We assume the manual has a hierarchical section structure. We use the lowest section titles as pivots to slice documents.
+ So, the figures and tables within the same section will not be sliced apart, and the chunk size might be large.
+
` },
naive: {
title: '',
- description: `Supported file formats are docx, pdf, txt.
- This method apply the naive ways to chunk files.
- Successive text will be sliced into pieces using 'delimiter'.
- Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.`,
+ description: `
Supported file formats are DOCX, EXCEL, PPT, IMAGE, PDF, TXT.
+
This method apply the naive ways to chunk files:
+
+
Successive text will be sliced into pieces using vision detection model.
+
Next, these successive pieces are merge into chunks whose token number is no more than 'Token number'.
`,
},
paper: {
title: '',
- description: `Only pdf is supported.
- The special part is that, the abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.`,
+ description: `
Only PDF file is supported.
+ If our model works well, the paper will be sliced by its sections, like abstract, 1.1, 1.2, etc.
+ The benefit of doing this is that LLM can better summarize the content of relevant sections in the paper,
+ resulting in more comprehensive answers that help readers better understand the paper.
+ The downside is that it increases the context of the LLM conversation and adds computational cost,
+ so during the conversation, you can consider reducing the ‘topN’ setting.
`,
},
presentation: {
title: '',
- description: `The supported file formats are pdf, pptx.
- Every page will be treated as a chunk. And the thumbnail of every page will be stored.
- PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.`,
+ description: `
The supported file formats are PDF, PPTX.
+ Every page will be treated as a chunk. And the thumbnail of every page will be stored.
+ All the PPT files you uploaded will be chunked by using this method automatically, setting-up for every PPT file is not necessary.
`,
},
qa: {
title: '',
- description: `Excel and csv(txt) format files are supported.
- If the file is in excel format, there should be 2 column question and answer without header.
+ description: `
EXCEL and CSV/TXT files are supported.
+ If the file is in excel format, there should be 2 columns, question and answer, without headers.
And question column is ahead of answer column.
- And it's O.K if it has multiple sheets as long as the columns are rightly composed.
+ And it's O.K if it has multiple sheets as long as the columns are rightly composed.
- If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
+ If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
- All the deformed lines will be ignored.
- Every pair of Q&A will be treated as a chunk.`,
+ All the deformed lines will be ignored.
+ Every pair of Q&A will be treated as a chunk.
`,
},
resume: {
title: '',
- description: `The supported file formats are pdf, docx and txt.`,
+ description: `
The supported file formats are DOCX, PDF, TXT.
+
+ Résumés come in a variety of formats, just like a person's personality, but we often have to organize them into structured data that makes them easy to search.
+
+ Instead of chunking the résumé, we parse the résumé into structured data. As an HR, you can dump all the résumés you have,
+ then you can list all the candidates that match the qualifications just by talking with 'RagFlow'.
+
+ `,
},
table: {
title: '',
- description: `Excel and csv(txt) format files are supported.
- For csv or txt file, the delimiter between columns is TAB.
- The first line must be column headers.
- Column headers must be meaningful terms inorder to make our NLP model understanding.
- It's good to enumerate some synonyms using slash '/' to separate, and even better to
- enumerate values using brackets like 'gender/sex(male, female)'.
- Here are some examples for headers:
- 1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
- 2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
- Every row in table will be treated as a chunk.
-
-visual:
- Image files are supported. Video is comming soon.
- If the picture has text in it, OCR is applied to extract the text as a description of it.
- If the text extracted by OCR is not enough, visual LLM is used to get the descriptions.`,
+ description: `
EXCEL and CSV/TXT format files are supported.
+ Here are some tips:
+
+
For csv or txt file, the delimiter between columns is TAB.
+
The first line must be column headers.
+
Column headers must be meaningful terms in order to make our LLM understanding.
+ It's good to enumerate some synonyms using slash '/' to separate, and even better to
+ enumerate values using brackets like 'gender/sex(male, female)'.