Compare commits

...

14 Commits

Author SHA1 Message Date
cd77425b87 Fix: potential negative max_tokens in RAPTOR (#10701)
### What problem does this PR solve?

Fix potential negative max_tokens in RAPTOR. #10235.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue
2025-10-21 15:49:51 +08:00
544c9990e3 Feat: Move the pipeline translation field to flow #9869 (#10697)
### What problem does this PR solve?

Feat: Move the pipeline translation field to flow #9869

### Type of change


- [X] New Feature (non-breaking change which adds functionality)
2025-10-21 15:23:37 +08:00
41a647fe32 Feat: A pipeline's child node can only have one node #9869 (#10695)
### What problem does this PR solve?

Feat: A pipeline's child node can only have one node #9869
### Type of change


- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 13:55:46 +08:00
594bf485d4 Test: update test cases for chunk retrieval pagination (#10694)
### What problem does this PR solve?

Updated test cases in test_retrieval_chunks.py to:
- Remove skip mark from page pagination test case (issues/6646 resolved)
- Add skip marks for page_size=1 tests due to new issue (issues/10692)

### Type of change

- [x] Test
2025-10-21 13:02:29 +08:00
863c3e3d9c Fix: tree merge (#10691)
### What problem does this PR solve?

Fix: Fix tree merge, solved #10636

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-10-21 13:02:01 +08:00
1767039be3 Feat: Display the pipeline operation sheet on the agent page #9869 (#10690)
### What problem does this PR solve?

Feat: Display the pipeline operation sheet on the agent page #9869

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 12:59:30 +08:00
cd75fa02b1 Feat: Make knowledge base renaming automatically reflected in agent discussions, solved #10597 (#10680)
### What problem does this PR solve?
Feat: Make knowledge base renaming automatically reflected in agent
discussions, solved #10597

### Type of change
- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 10:42:05 +08:00
cfdd37820a Feat: Support attribute filtering #8703 (#10670)
### What problem does this PR solve?

Feat: Support attribute filtering #8703

### Type of change

- [X] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: writinwaters <cai.keith@gmail.com>
2025-10-21 10:38:40 +08:00
9d12380806 Fix: Excel2HTML can't support XLS(Excel 97-2003) (#10660)
### What problem does this PR solve?
https://github.com/infiniflow/ragflow/issues/10602

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-10-21 09:52:59 +08:00
866098634b Feat:setting metadata in the retrieval (#10682)
### What problem does this PR solve?
issue:
[#9272](https://github.com/infiniflow/ragflow/issues/9272)
change:
setting metadata in the retrieval

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 09:52:26 +08:00
8013505daf Fix(edit-tag): Fix the bug that the edit-tag tag cannot be deleted #9869 (#10679)
### What problem does this PR solve?

fix(edit-tag): Fix the bug that the edit-tag tag cannot be deleted #9869

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-10-21 09:38:36 +08:00
deb81810e9 Update message printout when start ingestion server (#10677)
### What problem does this PR solve?

```
     ____                                  __     _                                                                  
    /  _/   ____    ____ _  ___    _____  / /_   (_)  ____    ____           _____  ___    _____ _   __  ___    _____
    / /    / __ \  / __ `/ / _ \  / ___/ / __/  / /  / __ \  / __ \         / ___/ / _ \  / ___/| | / / / _ \  / ___/
 _/ /    / / / / / /_/ / /  __/ (__  ) / /_   / /  / /_/ / / / / /        (__  ) /  __/ / /    | |/ / /  __/ / /    
/___/   /_/ /_/  \__, /  \___/ /____/  \__/  /_/   \____/ /_/ /_/        /____/  \___/ /_/     |___/  \___/ /_/     
                /____/          
```

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2025-10-21 09:38:20 +08:00
6ab96287c9 Feat:Vision Model Image Enhancement in Manual/Paper/Book/One chunker (#10640)
### What problem does this PR solve?
issue:
[#7472](https://github.com/infiniflow/ragflow/issues/7472)
change:
Vision Model Image Enhancement in Manual chunker
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 09:36:27 +08:00
aaa4776657 Feat: Qwen-VL series supports video parsing (#10676)
### What problem does this PR solve?

Qwen-VL series supports video parsing. #10617.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-10-21 09:36:13 +08:00
48 changed files with 924 additions and 462 deletions

View File

@ -18,12 +18,14 @@ import re
from abc import ABC
from agent.tools.base import ToolParamBase, ToolBase, ToolMeta
from api.db import LLMType
from api.db.services.document_service import DocumentService
from api.db.services.dialog_service import meta_filter
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api import settings
from api.utils.api_utils import timeout
from rag.app.tag import label_question
from rag.prompts.generator import cross_languages, kb_prompt
from rag.prompts.generator import cross_languages, kb_prompt, gen_meta_filter
class RetrievalParam(ToolParamBase):
@ -58,6 +60,7 @@ class RetrievalParam(ToolParamBase):
self.use_kg = False
self.cross_languages = []
self.toc_enhance = False
self.meta_data_filter={}
def check(self):
self.check_decimal_float(self.similarity_threshold, "[Retrieval] Similarity threshold")
@ -117,6 +120,21 @@ class Retrieval(ToolBase, ABC):
vars = self.get_input_elements_from_text(kwargs["query"])
vars = {k:o["value"] for k,o in vars.items()}
query = self.string_format(kwargs["query"], vars)
doc_ids=[]
if self._param.meta_data_filter!={}:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if self._param.meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
filters = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters))
if not doc_ids:
doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, self._param.meta_data_filter["manual"]))
if not doc_ids:
doc_ids = None
if self._param.cross_languages:
query = cross_languages(kbs[0].tenant_id, None, query, self._param.cross_languages)
@ -131,6 +149,7 @@ class Retrieval(ToolBase, ABC):
self._param.top_n,
self._param.similarity_threshold,
1 - self._param.keywords_similarity_weight,
doc_ids=doc_ids,
aggs=False,
rerank_mdl=rerank_mdl,
rank_feature=label_question(query, kbs),

View File

@ -45,7 +45,7 @@ from api.utils.api_utils import (
from api.utils.file_utils import filename_type, get_project_base_directory, thumbnail
from api.utils.web_utils import CONTENT_TYPE_MAP, html2pdf, is_valid_url
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
from rag.nlp import search, rag_tokenizer
from rag.utils.storage_factory import STORAGE_IMPL
@ -524,6 +524,21 @@ def rename():
e, file = FileService.get_by_id(informs[0].file_id)
FileService.update_by_id(file.id, {"name": req["name"]})
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
title_tks = rag_tokenizer.tokenize(req["name"])
es_body = {
"docnm_kwd": req["name"],
"title_tks": title_tks,
"title_sm_tks": rag_tokenizer.fine_grained_tokenize(title_tks),
}
if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.update(
{"doc_id": req["doc_id"]},
es_body,
search.index_name(tenant_id),
doc.kb_id,
)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)

View File

@ -470,6 +470,20 @@ def list_docs(dataset_id, tenant_id):
required: false
default: 0
description: Unix timestamp for filtering documents created before this time. 0 means no filter.
- in: query
name: suffix
type: array
items:
type: string
required: false
description: Filter by file suffix (e.g., ["pdf", "txt", "docx"]).
- in: query
name: run
type: array
items:
type: string
required: false
description: Filter by document run status. Supports both numeric ("0", "1", "2", "3", "4") and text formats ("UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL").
- in: header
name: Authorization
type: string
@ -512,63 +526,62 @@ def list_docs(dataset_id, tenant_id):
description: Processing status.
"""
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
id = request.args.get("id")
name = request.args.get("name")
return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
if id and not DocumentService.query(id=id, kb_id=dataset_id):
return get_error_data_result(message=f"You don't own the document {id}.")
q = request.args
document_id = q.get("id")
name = q.get("name")
if document_id and not DocumentService.query(id=document_id, kb_id=dataset_id):
return get_error_data_result(message=f"You don't own the document {document_id}.")
if name and not DocumentService.query(name=name, kb_id=dataset_id):
return get_error_data_result(message=f"You don't own the document {name}.")
page = int(request.args.get("page", 1))
keywords = request.args.get("keywords", "")
page_size = int(request.args.get("page_size", 30))
orderby = request.args.get("orderby", "create_time")
if request.args.get("desc") == "False":
desc = False
else:
desc = True
docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name)
page = int(q.get("page", 1))
page_size = int(q.get("page_size", 30))
orderby = q.get("orderby", "create_time")
desc = str(q.get("desc", "true")).strip().lower() != "false"
keywords = q.get("keywords", "")
create_time_from = int(request.args.get("create_time_from", 0))
create_time_to = int(request.args.get("create_time_to", 0))
# filters - align with OpenAPI parameter names
suffix = q.getlist("suffix")
run_status = q.getlist("run")
create_time_from = int(q.get("create_time_from", 0))
create_time_to = int(q.get("create_time_to", 0))
# map run status (accept text or numeric) - align with API parameter
run_status_text_to_numeric = {"UNSTART": "0", "RUNNING": "1", "CANCEL": "2", "DONE": "3", "FAIL": "4"}
run_status_converted = [run_status_text_to_numeric.get(v, v) for v in run_status]
docs, total = DocumentService.get_list(
dataset_id, page, page_size, orderby, desc, keywords, document_id, name, suffix, run_status_converted
)
# time range filter (0 means no bound)
if create_time_from or create_time_to:
filtered_docs = []
for doc in docs:
doc_create_time = doc.get("create_time", 0)
if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
filtered_docs.append(doc)
docs = filtered_docs
docs = [
d for d in docs
if (create_time_from == 0 or d.get("create_time", 0) >= create_time_from)
and (create_time_to == 0 or d.get("create_time", 0) <= create_time_to)
]
# rename key's name
renamed_doc_list = []
# rename keys + map run status back to text for output
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method",
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL",
}
for doc in docs:
renamed_doc = {}
for key, value in doc.items():
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
new_key = key_mapping.get(key, key)
renamed_doc[new_key] = value
if key == "run":
renamed_doc["run"] = run_mapping.get(value)
renamed_doc_list.append(renamed_doc)
return get_result(data={"total": tol, "docs": renamed_doc_list})
run_status_numeric_to_text = {"0": "UNSTART", "1": "RUNNING", "2": "CANCEL", "3": "DONE", "4": "FAIL"}
output_docs = []
for d in docs:
renamed_doc = {key_mapping.get(k, k): v for k, v in d.items()}
if "run" in d:
renamed_doc["run"] = run_status_numeric_to_text.get(str(d["run"]), d["run"])
output_docs.append(renamed_doc)
return get_result(data={"total": total, "docs": output_docs})
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
@token_required

View File

@ -79,7 +79,7 @@ class DocumentService(CommonService):
@classmethod
@DB.connection_context()
def get_list(cls, kb_id, page_number, items_per_page,
orderby, desc, keywords, id, name):
orderby, desc, keywords, id, name, suffix=None, run = None):
fields = cls.get_cls_model_fields()
docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
.join(File, on = (File.id == File2Document.file_id))\
@ -96,6 +96,10 @@ class DocumentService(CommonService):
docs = docs.where(
fn.LOWER(cls.model.name).contains(keywords.lower())
)
if suffix:
docs = docs.where(cls.model.suffix.in_(suffix))
if run:
docs = docs.where(cls.model.run.in_(run))
if desc:
docs = docs.order_by(cls.model.getter_by(orderby).desc())
else:

View File

@ -54,8 +54,8 @@ class RAGFlowExcelParser:
try:
file_like_object.seek(0)
try:
df = pd.read_excel(file_like_object)
return RAGFlowExcelParser._dataframe_to_workbook(df)
dfs = pd.read_excel(file_like_object, sheet_name=None)
return RAGFlowExcelParser._dataframe_to_workbook(dfs)
except Exception as ex:
logging.info(f"pandas with default engine load error: {ex}, try calamine instead")
file_like_object.seek(0)
@ -75,6 +75,10 @@ class RAGFlowExcelParser:
@staticmethod
def _dataframe_to_workbook(df):
# if contains multiple sheets use _dataframes_to_workbook
if isinstance(df, dict) and len(df) > 1:
return RAGFlowExcelParser._dataframes_to_workbook(df)
df = RAGFlowExcelParser._clean_dataframe(df)
wb = Workbook()
ws = wb.active
@ -88,6 +92,22 @@ class RAGFlowExcelParser:
ws.cell(row=row_num, column=col_num, value=value)
return wb
@staticmethod
def _dataframes_to_workbook(dfs: dict):
wb = Workbook()
default_sheet = wb.active
wb.remove(default_sheet)
for sheet_name, df in dfs.items():
df = RAGFlowExcelParser._clean_dataframe(df)
ws = wb.create_sheet(title=sheet_name)
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
return wb
def html(self, fnm, chunk_rows=256):
from html import escape

View File

@ -17,6 +17,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.api_utils import timeout
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt
@ -32,6 +34,43 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
if isinstance(figure_data[1], Image.Image)
]
def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,**kwargs):
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
def vision_figure_parser_pdf_wrapper(tbls,callback=None,**kwargs):
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
def is_figure_item(item):
return (
isinstance(item[0][0], Image.Image) and
isinstance(item[0][1], list)
)
figures_data = [item for item in tbls if is_figure_item(item)]
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls = [item for item in tbls if not is_figure_item(item)]
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
shared_executor = ThreadPoolExecutor(max_workers=10)

View File

@ -0,0 +1,17 @@
---
sidebar_position: 32
slug: /chunker_token_component
---
# Parser component
A component that sets the parsing rules for your dataset.
---
A **Parser** component defines how various file types should be parsed, including parsing methods for PDFs , fields to parse for Emails, and OCR methods for images.
## Scenario
A **Parser** component is auto-populated on the ingestion pipeline canvas and required in all ingestion pipeline workflows.

View File

@ -1198,23 +1198,24 @@ Failure:
### List documents
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
Lists documents in a specified dataset.
#### Request
- Method: GET
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}`
- Headers:
- `'content-Type: application/json'`
- `'Authorization: Bearer <YOUR_API_KEY>'`
##### Request example
##### Request examples
**A basic request with pagination:**
```bash
curl --request GET \
--url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp} \
--url http://{address}/api/v1/datasets/{dataset_id}/documents?page=1&page_size=10 \
--header 'Authorization: Bearer <YOUR_API_KEY>'
```
@ -1236,10 +1237,34 @@ curl --request GET \
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
- `id`: (*Filter parameter*), `string`
The ID of the document to retrieve.
- `create_time_from`: (*Filter parameter*), `integer`
- `create_time_from`: (*Filter parameter*), `integer`
Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to `0`.
- `create_time_to`: (*Filter parameter*), `integer`
- `create_time_to`: (*Filter parameter*), `integer`
Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to `0`.
- `suffix`: (*Filter parameter*), `array[string]`
Filter by file suffix. Supports multiple values, e.g., `pdf`, `txt`, and `docx`. Defaults to all suffixes.
- `run`: (*Filter parameter*), `array[string]`
Filter by document processing status. Supports numeric, text, and mixed formats:
- Numeric format: `["0", "1", "2", "3", "4"]`
- Text format: `[UNSTART, RUNNING, CANCEL, DONE, FAIL]`
- Mixed format: `[UNSTART, 1, DONE]` (mixing numeric and text formats)
- Status mapping:
- `0` / `UNSTART`: Document not yet processed
- `1` / `RUNNING`: Document is currently being processed
- `2` / `CANCEL`: Document processing was cancelled
- `3` / `DONE`: Document processing completed successfully
- `4` / `FAIL`: Document processing failed
Defaults to all statuses.
##### Usage examples
**A request with multiple filtering parameters**
```bash
curl --request GET \
--url 'http://{address}/api/v1/datasets/{dataset_id}/documents?suffix=pdf&run=DONE&page=1&page_size=10' \
--header 'Authorization: Bearer <YOUR_API_KEY>'
```
#### Response
@ -1270,7 +1295,7 @@ Success:
"process_duration": 0.0,
"progress": 0.0,
"progress_msg": "",
"run": "0",
"run": "UNSTART",
"size": 7,
"source_type": "local",
"status": "1",

View File

@ -20,11 +20,14 @@ import re
from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
from deepdoc.parser import PdfParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from PIL import Image
class Pdf(PdfParser):
@ -81,13 +84,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
doc_parser = naive.Docx()
# TODO: table of contents need to be removed
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
filename, binary=binary, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls]
tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections=[(item[0],item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -96,6 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")

View File

@ -23,6 +23,7 @@ from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from docx import Document
from PIL import Image
@ -252,7 +253,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
@ -261,6 +262,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)

View File

@ -32,7 +32,7 @@ from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.file_utils import extract_embed_file
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@ -475,24 +475,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@ -521,25 +510,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf()
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
try:
pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
boosted_figures = pdf_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables.extend(figures)
else:
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")

View File

@ -23,6 +23,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
class Pdf(PdfParser):
@ -57,13 +58,8 @@ class Pdf(PdfParser):
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
@ -80,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary)
tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
sections = [s for s, _ in sections if s]
for (_, html), _ in tbls:
sections.append(html)
@ -89,8 +86,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, _ = pdf_parser(
sections, tbls = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):

View File

@ -18,12 +18,12 @@ import logging
import copy
import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser
import numpy as np
class Pdf(PdfParser):
def __init__(self):
self.model_speciess = ParserType.PAPER.value
@ -160,6 +160,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=paper["tables"]
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
paper["tables"] = tbls
else:
raise NotImplementedError("file type not supported yet(pdf supported)")

View File

@ -13,13 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import base64
import json
import os
import tempfile
import logging
from abc import ABC
from copy import deepcopy
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin
import requests
from openai import OpenAI
@ -171,6 +174,7 @@ class GptV4(Base):
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
if not base_url:
base_url = "https://api.openai.com/v1"
self.api_key = key
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
self.lang = lang
@ -224,6 +228,61 @@ class QWenCV(GptV4):
base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
def chat(self, system, history, gen_conf, images=[], video_bytes=None, filename=""):
if video_bytes:
try:
summary, summary_num_tokens = self._process_video(video_bytes, filename)
return summary, summary_num_tokens
except Exception as e:
return "**ERROR**: " + str(e), 0
return "**ERROR**: Method chat not supported yet.", 0
def _process_video(self, video_bytes, filename):
from dashscope import MultiModalConversation
video_suffix = Path(filename).suffix or ".mp4"
with tempfile.NamedTemporaryFile(delete=False, suffix=video_suffix) as tmp:
tmp.write(video_bytes)
tmp_path = tmp.name
video_path = f"file://{tmp_path}"
messages = [
{
"role": "user",
"content": [
{
"video": video_path,
"fps": 2,
},
{
"text": "Please summarize this video in proper sentences.",
},
],
}
]
def call_api():
response = MultiModalConversation.call(
api_key=self.api_key,
model=self.model_name,
messages=messages,
)
summary = response["output"]["choices"][0]["message"].content[0]["text"]
return summary, num_tokens_from_string(summary)
try:
return call_api()
except Exception as e1:
import dashscope
dashscope.base_http_api_url = "https://dashscope-intl.aliyuncs.com/api/v1"
try:
return call_api()
except Exception as e2:
raise RuntimeError(f"Both default and intl endpoint failed.\nFirst error: {e1}\nSecond error: {e2}")
class HunyuanCV(GptV4):
_FACTORY_NAME = "Tencent Hunyuan"
@ -616,8 +675,6 @@ class GeminiCV(Base):
def _process_video(self, video_bytes, filename):
from google import genai
from google.genai import types
import tempfile
from pathlib import Path
video_size_mb = len(video_bytes) / (1024 * 1024)
client = genai.Client(api_key=self.api_key)

View File

@ -459,12 +459,10 @@ def tree_merge(bull, sections, depth):
return len(BULLET_PATTERN[bull])+1, text
else:
return len(BULLET_PATTERN[bull])+2, text
level_set = set()
lines = []
for section in sections:
level, text = get_level(bull, section)
if not text.strip("\n"):
continue
@ -797,8 +795,8 @@ class Node:
def __init__(self, level, depth=-1, texts=None):
self.level = level
self.depth = depth
self.texts = texts if texts is not None else [] # 存放内容
self.children = [] # 子节点
self.texts = texts or []
self.children = []
def add_child(self, child_node):
self.children.append(child_node)
@ -825,35 +823,51 @@ class Node:
return f"Node(level={self.level}, texts={self.texts}, children={len(self.children)})"
def build_tree(self, lines):
stack = [self]
for line in lines:
level, text = line
node = Node(level=level, texts=[text])
if level <= self.depth or self.depth == -1:
while stack and level <= stack[-1].get_level():
stack.pop()
stack[-1].add_child(node)
stack.append(node)
else:
stack = [self]
for level, text in lines:
if self.depth != -1 and level > self.depth:
# Beyond target depth: merge content into the current leaf instead of creating deeper nodes
stack[-1].add_text(text)
return self
continue
# Move up until we find the proper parent whose level is strictly smaller than current
while len(stack) > 1 and level <= stack[-1].get_level():
stack.pop()
node = Node(level=level, texts=[text])
# Attach as child of current parent and descend
stack[-1].add_child(node)
stack.append(node)
return self
def get_tree(self):
tree_list = []
self._dfs(self, tree_list, 0, [])
self._dfs(self, tree_list, [])
return tree_list
def _dfs(self, node, tree_list, current_depth, titles):
def _dfs(self, node, tree_list, titles):
level = node.get_level()
texts = node.get_texts()
child = node.get_children()
if node.get_texts():
if 0 < node.get_level() < self.depth:
titles.extend(node.get_texts())
else:
combined_text = ["\n".join(titles + node.get_texts())]
tree_list.append(combined_text)
if level == 0 and texts:
tree_list.append("\n".join(titles+texts))
# Titles within configured depth are accumulated into the current path
if 1 <= level <= self.depth:
path_titles = titles + texts
else:
path_titles = titles
for child in node.get_children():
self._dfs(child, tree_list, current_depth + 1, titles.copy())
# Body outside the depth limit becomes its own chunk under the current title path
if level > self.depth and texts:
tree_list.append("\n".join(path_titles + texts))
# A leaf title within depth emits its title path as a chunk (header-only section)
elif not child and (1 <= level <= self.depth):
tree_list.append("\n".join(path_titles))
# Recurse into children with the updated title path
for c in child:
self._dfs(c, tree_list, path_titles)

View File

@ -114,7 +114,7 @@ class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
),
}
],
{"max_tokens": self._max_token},
{"max_tokens": max(self._max_token, 512)}, # fix issue: #10235
)
cnt = re.sub(
"(······\n由于长度的原因,回答被截断了,要继续吗?|For the content length reason, it stopped, continue?)",

View File

@ -1052,13 +1052,14 @@ async def task_manager():
async def main():
logging.info(r"""
______ __ ______ __
/_ __/___ ______/ /__ / ____/ _____ _______ __/ /_____ _____
/ / / __ `/ ___/ //_/ / __/ | |/_/ _ \/ ___/ / / / __/ __ \/ ___/
/ / / /_/ (__ ) ,< / /____> </ __/ /__/ /_/ / /_/ /_/ / /
/_/ \__,_/____/_/|_| /_____/_/|_|\___/\___/\__,_/\__/\____/_/
____ __ _
/ _/___ ____ ____ _____/ /_(_)___ ____ ________ ______ _____ _____
/ // __ \/ __ `/ _ \/ ___/ __/ / __ \/ __ \ / ___/ _ \/ ___/ | / / _ \/ ___/
_/ // / / / /_/ / __(__ ) /_/ / /_/ / / / / (__ ) __/ / | |/ / __/ /
/___/_/ /_/\__, /\___/____/\__/_/\____/_/ /_/ /____/\___/_/ |___/\___/_/
/____/
""")
logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
logging.info(f'RAGFlow version: {get_ragflow_version()}')
settings.init_settings()
print_rag_settings()
if sys.platform != "win32":

View File

@ -83,7 +83,7 @@ class TestChunksRetrieval:
"ValueError('Search does not support negative slicing.')",
marks=pytest.mark.skip,
),
pytest.param({"page": 2, "page_size": 2}, 0, 2, "", marks=pytest.mark.skip(reason="issues/6646")),
({"page": 2, "page_size": 2}, 0, 2, ""),
({"page": 3, "page_size": 2}, 0, 0, ""),
({"page": "3", "page_size": 2}, 0, 0, ""),
pytest.param(
@ -124,9 +124,9 @@ class TestChunksRetrieval:
marks=pytest.mark.skip,
),
# ({"page_size": 0}, 0, 0, ""),
({"page_size": 1}, 0, 1, ""),
pytest.param({"page_size": 1}, 0, 1, "", marks=pytest.mark.skip(reason="issues/10692")),
({"page_size": 5}, 0, 4, ""),
({"page_size": "1"}, 0, 1, ""),
pytest.param({"page_size": "1"}, 0, 1, "", marks=pytest.mark.skip(reason="issues/10692")),
# ({"page_size": -1}, 0, 0, ""),
pytest.param(
{"page_size": "a"},

View File

@ -1,5 +1,4 @@
import { PlusOutlined } from '@ant-design/icons';
import { TweenOneGroup } from 'rc-tween-one';
import React, { useEffect, useRef, useState } from 'react';
import { X } from 'lucide-react';
@ -57,7 +56,7 @@ const EditTag = React.forwardRef<HTMLDivElement, EditTagsProps>(
<HoverCard key={tag}>
<HoverCardContent side="top">{tag}</HoverCardContent>
<HoverCardTrigger asChild>
<div className="w-fit flex items-center justify-center gap-2 border-dashed border px-1 rounded-sm bg-bg-card">
<div className="w-fit flex items-center justify-center gap-2 border-dashed border px-2 py-1 rounded-sm bg-bg-card">
<div className="flex gap-2 items-center">
<div className="max-w-80 overflow-hidden text-ellipsis">
{tag}
@ -84,11 +83,11 @@ const EditTag = React.forwardRef<HTMLDivElement, EditTagsProps>(
return (
<div>
{inputVisible ? (
{inputVisible && (
<Input
ref={inputRef}
type="text"
className="h-8 bg-bg-card"
className="h-8 bg-bg-card mb-1"
value={inputValue}
onChange={handleInputChange}
onBlur={handleInputConfirm}
@ -98,36 +97,20 @@ const EditTag = React.forwardRef<HTMLDivElement, EditTagsProps>(
}
}}
/>
) : (
<Button
variant="dashed"
className="w-fit flex items-center justify-center gap-2 bg-bg-card"
onClick={showInput}
style={tagPlusStyle}
>
<PlusOutlined />
</Button>
)}
{Array.isArray(tagChild) && tagChild.length > 0 && (
<TweenOneGroup
className="flex gap-2 flex-wrap mt-2"
enter={{
scale: 0.8,
opacity: 0,
type: 'from',
duration: 100,
}}
onEnd={(e) => {
if (e.type === 'appear' || e.type === 'enter') {
(e.target as any).style = 'display: inline-block';
}
}}
leave={{ opacity: 0, width: 0, scale: 0, duration: 200 }}
appear={false}
>
{tagChild}
</TweenOneGroup>
)}
<div className="flex gap-2 py-1">
{Array.isArray(tagChild) && tagChild.length > 0 && <>{tagChild}</>}
{!inputVisible && (
<Button
variant="dashed"
className="w-fit flex items-center justify-center gap-2 bg-bg-card"
onClick={showInput}
style={tagPlusStyle}
>
<PlusOutlined />
</Button>
)}
</div>
</div>
);
},

View File

@ -1533,8 +1533,8 @@ This delimiter is used to split the input text into several text pieces echo of
'Your users will see this welcome message at the beginning.',
modeTip: 'The mode defines how the workflow is initiated.',
mode: 'Mode',
conversational: 'conversational',
task: 'task',
conversational: 'Conversational',
task: 'Task',
beginInputTip:
'By defining input parameters, this content can be accessed by other components in subsequent processes.',
query: 'Query variables',
@ -1605,6 +1605,119 @@ This delimiter is used to split the input text into several text pieces echo of
ceateAgent: 'Agent flow',
createPipeline: 'Ingestion pipeline',
chooseAgentType: 'Choose Agent Type',
parser: 'Parser',
parserDescription:
'Extracts raw text and structure from files for downstream processing.',
tokenizer: 'Indexer',
tokenizerRequired: 'Please add the Indexer node first',
tokenizerDescription:
'Transforms text into the required data structure (e.g., vector embeddings for Embedding Search) depending on the chosen search method.',
splitter: 'Token',
splitterDescription:
'Split text into chunks by token length with optional delimiters and overlap.',
hierarchicalMergerDescription:
'Split documents into sections by title hierarchy with regex rules for finer control.',
hierarchicalMerger: 'Title',
extractor: 'Transformer',
extractorDescription:
'Use an LLM to extract structured insights from document chunks—such as summaries, classifications, etc.',
outputFormat: 'Output format',
fileFormats: 'File format',
fileFormatOptions: {
pdf: 'PDF',
spreadsheet: 'Spreadsheet',
image: 'Image',
email: 'Email',
'text&markdown': 'Text & Markup',
word: 'Word',
slides: 'PPT',
audio: 'Audio',
},
fields: 'Field',
addParser: 'Add Parser',
hierarchy: 'Hierarchy',
regularExpressions: 'Regular Expressions',
overlappedPercent: 'Overlapped percent (%)',
searchMethod: 'Search method',
searchMethodTip: `Defines how the content can be searched — by full-text, embedding, or both.
The Indexer will store the content in the corresponding data structures for the selected methods.`,
// file: 'File',
parserMethod: 'Parsing method',
// systemPrompt: 'System Prompt',
systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used',
exportJson: 'Export JSON',
viewResult: 'View result',
running: 'Running',
summary: 'Summary',
keywords: 'Keywords',
questions: 'Questions',
metadata: 'Metadata',
fieldName: 'Result destination',
prompts: {
system: {
keywords: `Role
You are a text analyzer.
Task
Extract the most important keywords/phrases of a given piece of text content.
Requirements
- Summarize the text content, and give the top 5 important keywords/phrases.
- The keywords MUST be in the same language as the given piece of text content.
- The keywords are delimited by ENGLISH COMMA.
- Output keywords ONLY.`,
questions: `Role
You are a text analyzer.
Task
Propose 3 questions about a given piece of text content.
Requirements
- Understand and summarize the text content, and propose the top 3 important questions.
- The questions SHOULD NOT have overlapping meanings.
- The questions SHOULD cover the main content of the text as much as possible.
- The questions MUST be in the same language as the given piece of text content.
- One question per line.
- Output questions ONLY.`,
summary: `Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.
Key Instructions:
1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.
2. Language: Write the summary in the same language as the source text.
3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.
4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.`,
metadata: `Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. If no important structured information is found, output an empty JSON object: {}.
Important structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.`,
},
user: {
keywords: `Text Content
[Insert text here]`,
questions: `Text Content
[Insert text here]`,
summary: `Text to Summarize:
[Insert text here]`,
metadata: `Content: [INSERT CONTENT HERE]`,
},
},
cancel: 'Cancel',
swicthPromptMessage:
'The prompt word will change. Please confirm whether to abandon the existing prompt word?',
tokenizerSearchMethodOptions: {
full_text: 'Full-text',
embedding: 'Embedding',
},
filenameEmbeddingWeight: 'Filename embedding weight',
tokenizerFieldsOptions: {
text: 'Processed Text',
keywords: 'Keywords',
questions: 'Questions',
summary: 'Augmented Context',
},
imageParseMethodOptions: {
ocr: 'OCR',
},
},
llmTools: {
bad_calculator: {
@ -1705,125 +1818,6 @@ This delimiter is used to split the input text into several text pieces echo of
<p>Are you sure you want to proceed?</p> `,
unlinkPipelineModalConfirmText: 'Unlink',
},
dataflow: {
parser: 'Parser',
parserDescription:
'Extracts raw text and structure from files for downstream processing.',
tokenizer: 'Indexer',
tokenizerRequired: 'Please add the Indexer node first',
tokenizerDescription:
'Transforms text into the required data structure (e.g., vector embeddings for Embedding Search) depending on the chosen search method.',
splitter: 'Token',
splitterDescription:
'Split text into chunks by token length with optional delimiters and overlap.',
hierarchicalMergerDescription:
'Split documents into sections by title hierarchy with regex rules for finer control.',
hierarchicalMerger: 'Title',
extractor: 'Transformer',
extractorDescription:
'Use an LLM to extract structured insights from document chunks—such as summaries, classifications, etc.',
outputFormat: 'Output format',
lang: 'Language',
fileFormats: 'File format',
fileFormatOptions: {
pdf: 'PDF',
spreadsheet: 'Spreadsheet',
image: 'Image',
email: 'Email',
'text&markdown': 'Text & Markup',
word: 'Word',
slides: 'PPT',
audio: 'Audio',
},
fields: 'Field',
addParser: 'Add Parser',
hierarchy: 'Hierarchy',
regularExpressions: 'Regular Expressions',
overlappedPercent: 'Overlapped percent (%)',
searchMethod: 'Search method',
searchMethodTip: `Defines how the content can be searched — by full-text, embedding, or both.
The Indexer will store the content in the corresponding data structures for the selected methods.`,
begin: 'File',
parserMethod: 'Parsing method',
systemPrompt: 'System Prompt',
systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used',
exportJson: 'Export JSON',
viewResult: 'View result',
running: 'Running',
summary: 'Summary',
keywords: 'Keywords',
questions: 'Questions',
metadata: 'Metadata',
fieldName: 'Result destination',
prompts: {
system: {
keywords: `Role
You are a text analyzer.
Task
Extract the most important keywords/phrases of a given piece of text content.
Requirements
- Summarize the text content, and give the top 5 important keywords/phrases.
- The keywords MUST be in the same language as the given piece of text content.
- The keywords are delimited by ENGLISH COMMA.
- Output keywords ONLY.`,
questions: `Role
You are a text analyzer.
Task
Propose 3 questions about a given piece of text content.
Requirements
- Understand and summarize the text content, and propose the top 3 important questions.
- The questions SHOULD NOT have overlapping meanings.
- The questions SHOULD cover the main content of the text as much as possible.
- The questions MUST be in the same language as the given piece of text content.
- One question per line.
- Output questions ONLY.`,
summary: `Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.
Key Instructions:
1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.
2. Language: Write the summary in the same language as the source text.
3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.
4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.`,
metadata: `Extract important structured information from the given content. Output ONLY a valid JSON string with no additional text. If no important structured information is found, output an empty JSON object: {}.
Important structured information may include: names, dates, locations, events, key facts, numerical data, or other extractable entities.`,
},
user: {
keywords: `Text Content
[Insert text here]`,
questions: `Text Content
[Insert text here]`,
summary: `Text to Summarize:
[Insert text here]`,
metadata: `Content: [INSERT CONTENT HERE]`,
},
},
cancel: 'Cancel',
swicthPromptMessage:
'The prompt word will change. Please confirm whether to abandon the existing prompt word?',
tokenizerSearchMethodOptions: {
full_text: 'Full-text',
embedding: 'Embedding',
},
filenameEmbeddingWeight: 'Filename embedding weight',
tokenizerFieldsOptions: {
text: 'Processed Text',
keywords: 'Keywords',
questions: 'Questions',
summary: 'Augmented Context',
},
imageParseMethodOptions: {
ocr: 'OCR',
},
note: 'Note',
noteDescription: 'Note',
notePlaceholder: 'Please enter a note',
},
datasetOverview: {
downloadTip: 'Files being downloaded from data sources. ',
processingTip: 'Files being processed by Ingestion pipeline.',

View File

@ -1511,6 +1511,93 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
createFromTemplate: '从模板创建',
importJsonFile: '导入 JSON 文件',
chooseAgentType: '选择智能体类型',
parser: '解析器',
parserDescription: '从文件中提取原始文本和结构以供下游处理。',
tokenizer: '分词器',
tokenizerRequired: '请先添加Tokenizer节点',
tokenizerDescription:
'根据所选的搜索方法,将文本转换为所需的数据结构(例如,用于嵌入搜索的向量嵌入)。',
splitter: '按字符分割',
splitterDescription:
'根据分词器长度将文本拆分成块,并带有可选的分隔符和重叠。',
hierarchicalMergerDescription:
'使用正则表达式规则按标题层次结构将文档拆分成多个部分,以实现更精细的控制。',
hierarchicalMerger: '按标题分割',
extractor: '提取器',
extractorDescription:
'使用 LLM 从文档块(例如摘要、分类等)中提取结构化见解。',
outputFormat: '输出格式',
fileFormats: '文件格式',
fields: '字段',
addParser: '增加解析器',
hierarchy: '层次结构',
regularExpressions: '正则表达式',
overlappedPercent: '重叠百分比(%',
searchMethod: '搜索方法',
searchMethodTip: `决定该数据集启用的搜索方式,可选择全文、向量,或两者兼有。
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重',
parserMethod: '解析方法',
systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON',
viewResult: '查看结果',
running: '运行中',
summary: '增强上下文',
keywords: '关键词',
questions: '问题',
metadata: '元数据',
fieldName: '结果目的地',
prompts: {
system: {
keywords: `角色
你是一名文本分析员。
任务
从给定的文本内容中提取最重要的关键词/短语。
要求
- 总结文本内容并给出最重要的5个关键词/短语。
- 关键词必须与给定的文本内容使用相同的语言。
- 关键词之间用英文逗号分隔。
- 仅输出关键词。`,
questions: `角色
你是一名文本分析员。
任务
针对给定的文本内容提出3个问题。
要求
- 理解并总结文本内容并提出最重要的3个问题。
- 问题的含义不应重叠。
- 问题应尽可能涵盖文本的主要内容。
- 问题必须与给定的文本内容使用相同的语言。
- 每行一个问题。
- 仅输出问题。`,
summary: `扮演一个精准的摘要者。你的任务是为提供的内容创建一个简洁且忠实于原文的摘要。
关键说明:
1. 准确性:摘要必须严格基于所提供的信息。请勿引入任何未明确说明的新事实、结论或解释。
2. 语言:摘要必须使用与原文相同的语言。
3. 客观性:不带偏见地呈现要点,保留内容的原始意图和语气。请勿进行编辑。
4. 简洁性:专注于最重要的思想,省略细节和多余的内容。`,
metadata: `从给定内容中提取重要的结构化信息。仅输出有效的 JSON 字符串,不包含任何附加文本。如果未找到重要的结构化信息,则输出一个空的 JSON 对象:{}。
重要的结构化信息可能包括:姓名、日期、地点、事件、关键事实、数字数据或其他可提取实体。`,
},
user: {
keywords: `文本内容
[在此处插入文本]`,
questions: `文本内容
[在此处插入文本]`,
summary: `要总结的文本:
[在此处插入文本]`,
metadata: `内容:[在此处插入内容]`,
},
},
cancel: '取消',
filenameEmbeddingWeight: '文件名嵌入权重',
switchPromptMessage: '提示词将发生变化,请确认是否放弃已有提示词?',
},
footer: {
profile: 'All rights reserved @ React',
@ -1618,101 +1705,6 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
<p>你确定要继续吗?</p> `,
unlinkPipelineModalConfirmText: '解绑',
},
dataflow: {
parser: '解析器',
parserDescription: '从文件中提取原始文本和结构以供下游处理。',
tokenizer: '分词器',
tokenizerRequired: '请先添加Tokenizer节点',
tokenizerDescription:
'根据所选的搜索方法,将文本转换为所需的数据结构(例如,用于嵌入搜索的向量嵌入)。',
splitter: '按字符分割',
splitterDescription:
'根据分词器长度将文本拆分成块,并带有可选的分隔符和重叠。',
hierarchicalMergerDescription:
'使用正则表达式规则按标题层次结构将文档拆分成多个部分,以实现更精细的控制。',
hierarchicalMerger: '按标题分割',
extractor: '提取器',
extractorDescription:
'使用 LLM 从文档块(例如摘要、分类等)中提取结构化见解。',
outputFormat: '输出格式',
lang: '语言',
fileFormats: '文件格式',
fields: '字段',
addParser: '增加解析器',
hierarchy: '层次结构',
regularExpressions: '正则表达式',
overlappedPercent: '重叠百分比(%',
searchMethod: '搜索方法',
searchMethodTip: `决定该数据集启用的搜索方式,可选择全文、向量,或两者兼有。
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重',
begin: '文件',
parserMethod: '解析方法',
systemPrompt: '系统提示词',
systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON',
viewResult: '查看结果',
running: '运行中',
summary: '增强上下文',
keywords: '关键词',
questions: '问题',
metadata: '元数据',
fieldName: '结果目的地',
prompts: {
system: {
keywords: `角色
你是一名文本分析员。
任务
从给定的文本内容中提取最重要的关键词/短语。
要求
- 总结文本内容并给出最重要的5个关键词/短语。
- 关键词必须与给定的文本内容使用相同的语言。
- 关键词之间用英文逗号分隔。
- 仅输出关键词。`,
questions: `角色
你是一名文本分析员。
任务
针对给定的文本内容提出3个问题。
要求
- 理解并总结文本内容并提出最重要的3个问题。
- 问题的含义不应重叠。
- 问题应尽可能涵盖文本的主要内容。
- 问题必须与给定的文本内容使用相同的语言。
- 每行一个问题。
- 仅输出问题。`,
summary: `扮演一个精准的摘要者。你的任务是为提供的内容创建一个简洁且忠实于原文的摘要。
关键说明:
1. 准确性:摘要必须严格基于所提供的信息。请勿引入任何未明确说明的新事实、结论或解释。
2. 语言:摘要必须使用与原文相同的语言。
3. 客观性:不带偏见地呈现要点,保留内容的原始意图和语气。请勿进行编辑。
4. 简洁性:专注于最重要的思想,省略细节和多余的内容。`,
metadata: `从给定内容中提取重要的结构化信息。仅输出有效的 JSON 字符串,不包含任何附加文本。如果未找到重要的结构化信息,则输出一个空的 JSON 对象:{}。
重要的结构化信息可能包括:姓名、日期、地点、事件、关键事实、数字数据或其他可提取实体。`,
},
user: {
keywords: `文本内容
[在此处插入文本]`,
questions: `文本内容
[在此处插入文本]`,
summary: `要总结的文本:
[在此处插入文本]`,
metadata: `内容:[在此处插入内容]`,
},
},
cancel: '取消',
filenameEmbeddingWeight: '文件名嵌入权重',
switchPromptMessage: '提示词将发生变化,请确认是否放弃已有提示词?',
note: '注释',
noteDescription: '注释',
notePlaceholder: '请输入注释',
},
datasetOverview: {
downloadTip: '正在从数据源下载文件。',
processingTip: '正在由pipeline处理文件。',

View File

@ -36,7 +36,7 @@ function InnerFileNode({ data, id, selected }: NodeProps<IBeginNode>) {
<section className="flex items-center gap-2">
<OperatorIcon name={data.label as Operator}></OperatorIcon>
<div className="truncate text-center font-semibold text-sm">
{t(`dataflow.begin`)}
{t(`flow.begin`)}
</div>
</section>
<section className={cn(styles.generateParameters, 'flex gap-2 flex-col')}>

View File

@ -5,6 +5,8 @@ import { Plus } from 'lucide-react';
import { useMemo } from 'react';
import { NodeHandleId } from '../../constant';
import { HandleContext } from '../../context';
import { useIsPipeline } from '../../hooks/use-is-pipeline';
import useGraphStore from '../../store';
import { useDropdownManager } from '../context';
import { NextStepDropdown } from './dropdown/next-step-dropdown';
@ -14,9 +16,12 @@ export function CommonHandle({
...props
}: HandleProps & { nodeId: string }) {
const { visible, hideModal, showModal } = useSetModalState();
const { canShowDropdown, setActiveDropdown, clearActiveDropdown } =
useDropdownManager();
const { hasChildNode } = useGraphStore((state) => state);
const isPipeline = useIsPipeline();
const isConnectable = !(isPipeline && hasChildNode(nodeId)); // Using useMemo will cause isConnectable to not be updated when the subsequent connection line is deleted
const value = useMemo(
() => ({
@ -33,6 +38,7 @@ export function CommonHandle({
<HandleContext.Provider value={value}>
<Handle
{...props}
isConnectable={isConnectable}
className={cn(
'inline-flex justify-center items-center !bg-accent-primary !border-none group-hover:!size-4 group-hover:!rounded-sm',
className,
@ -40,6 +46,10 @@ export function CommonHandle({
onClick={(e) => {
e.stopPropagation();
if (!isConnectable) {
return;
}
if (!canShowDropdown()) {
return;
}

View File

@ -46,7 +46,7 @@ function ParserNode({
className="flex flex-col text-text-primary gap-1"
>
<span className="text-text-secondary">Parser {idx + 1}</span>
{t(`dataflow.fileFormatOptions.${x.fileFormat}`)}
{t(`flow.fileFormatOptions.${x.fileFormat}`)}
</LabelCard>
)}
</NodeCollapsible>

View File

@ -38,12 +38,10 @@ function TokenizerNode({
></CommonHandle>
<NodeHeader id={id} name={data.name} label={data.label}></NodeHeader>
<LabelCard className="text-text-primary flex justify-between flex-col gap-1">
<span className="text-text-secondary">
{t('dataflow.searchMethod')}
</span>
<span className="text-text-secondary">{t('flow.searchMethod')}</span>
<ul className="space-y-1">
{data.form?.search_method.map((x) => (
<li key={x}>{t(`dataflow.tokenizerSearchMethodOptions.${x}`)}</li>
<li key={x}>{t(`flow.tokenizerSearchMethodOptions.${x}`)}</li>
))}
</ul>
</LabelCard>

View File

@ -48,13 +48,3 @@ export type HandleContextType = {
export const HandleContext = createContext<HandleContextType>(
{} as HandleContextType,
);
export type PipelineLogContextType = {
messageId: string;
setMessageId: (messageId: string) => void;
setUploadedFileData: (data: Record<string, any>) => void;
};
export const PipelineLogContext = createContext<PipelineLogContextType>(
{} as PipelineLogContextType,
);

View File

@ -47,7 +47,7 @@ const ExtractorForm = ({ node }: INextOperatorForm) => {
const promptOptions = useBuildNodeOutputOptions(node?.id);
const options = buildOptions(ContextGeneratorFieldName, t, 'dataflow');
const options = buildOptions(ContextGeneratorFieldName, t, 'flow');
const {
handleFieldNameChange,
@ -63,7 +63,7 @@ const ExtractorForm = ({ node }: INextOperatorForm) => {
<Form {...form}>
<FormWrapper>
<LargeModelFormField></LargeModelFormField>
<RAGFlowFormItem label={t('dataflow.fieldName')} name="field_name">
<RAGFlowFormItem label={t('flow.fieldName')} name="field_name">
{(field) => (
<SelectWithSearch
onChange={(value) => {
@ -93,7 +93,7 @@ const ExtractorForm = ({ node }: INextOperatorForm) => {
</FormWrapper>
{visible && (
<ConfirmDeleteDialog
title={t('dataflow.switchPromptMessage')}
title={t('flow.switchPromptMessage')}
open
onOpenChange={hideModal}
onOk={confirmSwitch}

View File

@ -21,7 +21,7 @@ export function useSwitchPrompt(form: UseFormReturn<ExtractorFormSchemaType>) {
const setPromptValue = useCallback(
(field: keyof ExtractorFormSchemaType, key: string, value: string) => {
form.setValue(field, t(`dataflow.prompts.${key}.${value}`), {
form.setValue(field, t(`flow.prompts.${key}.${value}`), {
shouldDirty: true,
shouldValidate: true,
});

View File

@ -98,7 +98,7 @@ export function RegularExpressions({
</CardHeader>
<CardContent>
<FormLabel required className="mb-2 text-text-secondary">
{t('dataflow.regularExpressions')}
{t('flow.regularExpressions')}
</FormLabel>
<section className="space-y-4">
{fields.map((field, index) => (
@ -158,7 +158,7 @@ const HierarchicalMergerForm = ({ node }: INextOperatorForm) => {
return (
<Form {...form}>
<FormWrapper>
<RAGFlowFormItem name={'hierarchy'} label={t('dataflow.hierarchy')}>
<RAGFlowFormItem name={'hierarchy'} label={t('flow.hierarchy')}>
<SelectWithSearch options={HierarchyOptions}></SelectWithSearch>
</RAGFlowFormItem>
{fields.map((field, index) => (

View File

@ -50,7 +50,7 @@ export function OutputFormatFormField({
return (
<RAGFlowFormItem
name={buildFieldNameWithPrefix(`output_format`, prefix)}
label={t('dataflow.outputFormat')}
label={t('flow.outputFormat')}
>
<SelectWithSearch
options={buildOutputOptionsFormatMap()[fileType]}
@ -69,7 +69,7 @@ export function ParserMethodFormField({
name={buildFieldNameWithPrefix(`parse_method`, prefix)}
horizontal={false}
optionsWithoutLLM={optionsWithoutLLM}
label={t('dataflow.parserMethod')}
label={t('flow.parserMethod')}
></LayoutRecognizeFormField>
);
}
@ -92,7 +92,7 @@ export function LanguageFormField({ prefix }: CommonProps) {
return (
<RAGFlowFormItem
name={buildFieldNameWithPrefix(`lang`, prefix)}
label={t('dataflow.lang')}
label={t('flow.lang')}
>
{(field) => (
<SelectWithSearch

View File

@ -14,7 +14,7 @@ export function EmailFormFields({ prefix }: CommonProps) {
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(`fields`, prefix)}
label={t('dataflow.fields')}
label={t('flow.fields')}
>
{(field) => (
<MultiSelect

View File

@ -17,7 +17,7 @@ export function ImageFormFields({ prefix }: CommonProps) {
const options = buildOptions(
ImageParseMethod,
t,
'dataflow.imageParseMethodOptions',
'flow.imageParseMethodOptions',
);
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@ -50,9 +50,9 @@ export function ImageFormFields({ prefix }: CommonProps) {
{languageShown && (
<RAGFlowFormItem
name={buildFieldNameWithPrefix('system_prompt', prefix)}
label={t('dataflow.systemPrompt')}
label={t('flow.systemPrompt')}
>
<Textarea placeholder={t('dataflow.systemPromptPlaceholder')} />
<Textarea placeholder={t('flow.systemPromptPlaceholder')} />
</RAGFlowFormItem>
)}
</>

View File

@ -133,7 +133,7 @@ function ParserItem({
</div>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(`fileFormat`, prefix)}
label={t('dataflow.fileFormats')}
label={t('flow.fileFormats')}
>
{(field) => (
<SelectWithSearch
@ -165,7 +165,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
const FileFormatOptions = buildOptions(
FileType,
t,
'dataflow.fileFormatOptions',
'flow.fileFormatOptions',
).filter(
(x) => x.value !== FileType.Video, // Temporarily hide the video option
);
@ -212,7 +212,7 @@ const ParserForm = ({ node }: INextOperatorForm) => {
})}
{fields.length < FileFormatOptions.length && (
<BlockButton onClick={add} type="button" className="mt-2.5">
{t('dataflow.addParser')}
{t('flow.addParser')}
</BlockButton>
)}
</form>

View File

@ -2,6 +2,10 @@ import { Collapse } from '@/components/collapse';
import { CrossLanguageFormField } from '@/components/cross-language-form-field';
import { FormContainer } from '@/components/form-container';
import { KnowledgeBaseFormField } from '@/components/knowledge-base-item';
import {
MetadataFilter,
MetadataFilterSchema,
} from '@/components/metadata-filter';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { RerankFormFields } from '@/components/rerank';
import { SimilaritySliderFormField } from '@/components/similarity-slider';
@ -41,6 +45,7 @@ export const RetrievalPartialSchema = {
cross_languages: z.array(z.string()),
use_kg: z.boolean(),
toc_enhance: z.boolean(),
...MetadataFilterSchema,
};
export const FormSchema = z.object({
@ -118,6 +123,7 @@ function RetrievalForm({ node }: INextOperatorForm) {
></SimilaritySliderFormField>
<TopNFormField></TopNFormField>
<RerankFormFields></RerankFormFields>
<MetadataFilter></MetadataFilter>
<EmptyResponseField></EmptyResponseField>
<CrossLanguageFormField name="cross_languages"></CrossLanguageFormField>
<UseKnowledgeGraphFormField name="use_kg"></UseKnowledgeGraphFormField>

View File

@ -60,7 +60,7 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
name="overlapped_percent"
max={30}
min={0}
label={t('dataflow.overlappedPercent')}
label={t('flow.overlappedPercent')}
></SliderInputFormField>
<section>
<span className="mb-2 inline-block">{t('flow.delimiters')}</span>

View File

@ -38,12 +38,12 @@ const TokenizerForm = ({ node }: INextOperatorForm) => {
const SearchMethodOptions = buildOptions(
TokenizerSearchMethod,
t,
`dataflow.tokenizerSearchMethodOptions`,
`flow.tokenizerSearchMethodOptions`,
);
const FieldsOptions = buildOptions(
TokenizerFields,
t,
'dataflow.tokenizerFieldsOptions',
'flow.tokenizerFieldsOptions',
);
const form = useForm<TokenizerFormSchemaType>({
@ -59,8 +59,8 @@ const TokenizerForm = ({ node }: INextOperatorForm) => {
<FormWrapper>
<RAGFlowFormItem
name="search_method"
label={t('dataflow.searchMethod')}
tooltip={t('dataflow.searchMethodTip')}
label={t('flow.searchMethod')}
tooltip={t('flow.searchMethodTip')}
>
{(field) => (
<MultiSelect
@ -73,11 +73,11 @@ const TokenizerForm = ({ node }: INextOperatorForm) => {
</RAGFlowFormItem>
<SliderInputFormField
name="filename_embd_weight"
label={t('dataflow.filenameEmbeddingWeight')}
label={t('flow.filenameEmbeddingWeight')}
max={0.5}
step={0.01}
></SliderInputFormField>
<RAGFlowFormItem name="fields" label={t('dataflow.fields')}>
<RAGFlowFormItem name="fields" label={t('flow.fields')}>
{(field) => <SelectWithSearch options={FieldsOptions} {...field} />}
</RAGFlowFormItem>
</FormWrapper>

View File

@ -2,6 +2,7 @@ import { Collapse } from '@/components/collapse';
import { CrossLanguageFormField } from '@/components/cross-language-form-field';
import { FormContainer } from '@/components/form-container';
import { KnowledgeBaseFormField } from '@/components/knowledge-base-item';
import { MetadataFilter } from '@/components/metadata-filter';
import { RerankFormFields } from '@/components/rerank';
import { SimilaritySliderFormField } from '@/components/similarity-slider';
import { TOCEnhanceFormField } from '@/components/toc-enhance-form-field';
@ -51,6 +52,7 @@ const RetrievalForm = () => {
></SimilaritySliderFormField>
<TopNFormField></TopNFormField>
<RerankFormFields></RerankFormFields>
<MetadataFilter></MetadataFilter>
<EmptyResponseField></EmptyResponseField>
<CrossLanguageFormField name="cross_languages"></CrossLanguageFormField>
<UseKnowledgeGraphFormField name="use_kg"></UseKnowledgeGraphFormField>

View File

@ -128,8 +128,8 @@ export const useInitializeOperatorParams = () => {
[Operator.Extractor]: {
...initialExtractorValues,
llm_id: llmId,
sys_prompt: t('dataflow.prompts.system.summary'),
prompts: t('dataflow.prompts.user.summary'),
sys_prompt: t('flow.prompts.system.summary'),
prompts: t('flow.prompts.user.summary'),
},
};
}, [llmId]);

View File

@ -0,0 +1,55 @@
import message from '@/components/ui/message';
import { useSendMessageBySSE } from '@/hooks/use-send-message';
import api from '@/utils/api';
import { get } from 'lodash';
import { useCallback, useState } from 'react';
import { useParams } from 'umi';
import { UseFetchLogReturnType } from './use-fetch-pipeline-log';
import { useSaveGraph } from './use-save-graph';
export function useRunDataflow({
showLogSheet,
setMessageId,
}: {
showLogSheet: () => void;
} & Pick<UseFetchLogReturnType, 'setMessageId'>) {
const { send } = useSendMessageBySSE(api.runCanvas);
const { id } = useParams();
const { saveGraph, loading } = useSaveGraph();
const [uploadedFileData, setUploadedFileData] =
useState<Record<string, any>>();
const run = useCallback(
async (fileResponseData: Record<string, any>) => {
const saveRet = await saveGraph();
const success = saveRet?.code === 0;
if (!success) return;
showLogSheet();
const res = await send({
id,
query: '',
session_id: null,
files: [fileResponseData.file],
});
if (res && res?.response.status === 200 && get(res, 'data.code') === 0) {
// fetch canvas
setUploadedFileData(fileResponseData.file);
const msgId = get(res, 'data.data.message_id');
if (msgId) {
setMessageId(msgId);
}
return msgId;
} else {
message.error(get(res, 'data.message', ''));
}
},
[id, saveGraph, send, setMessageId, setUploadedFileData, showLogSheet],
);
return { run, loading: loading, uploadedFileData };
}
export type RunDataflowType = ReturnType<typeof useRunDataflow>;

View File

@ -61,7 +61,7 @@ export const useShowSingleDebugDrawer = () => {
};
};
const ExcludedNodes = [Operator.Note, Operator.Placeholder];
const ExcludedNodes = [Operator.Note, Operator.Placeholder, Operator.File];
export function useShowDrawer({
drawerVisible,

View File

@ -32,25 +32,26 @@ import {
Settings,
Upload,
} from 'lucide-react';
import { ComponentPropsWithoutRef, useCallback, useState } from 'react';
import { ComponentPropsWithoutRef, useCallback } from 'react';
import { useTranslation } from 'react-i18next';
import { useParams } from 'umi';
import AgentCanvas from './canvas';
import { DropdownProvider } from './canvas/context';
import { Operator } from './constant';
import { PipelineLogContext } from './context';
import { useCancelCurrentDataflow } from './hooks/use-cancel-dataflow';
import { useHandleExportJsonFile } from './hooks/use-export-json';
import { useFetchDataOnMount } from './hooks/use-fetch-data';
import { useFetchPipelineLog } from './hooks/use-fetch-pipeline-log';
import { useGetBeginNodeDataInputs } from './hooks/use-get-begin-query';
import { useIsPipeline } from './hooks/use-is-pipeline';
import { useRunDataflow } from './hooks/use-run-dataflow';
import {
useSaveGraph,
useSaveGraphBeforeOpeningDebugDrawer,
useWatchAgentChange,
} from './hooks/use-save-graph';
import { PipelineLogSheet } from './pipeline-log-sheet';
import PipelineRunSheet from './pipeline-run-sheet';
import { SettingDialog } from './setting-dialog';
import useGraphStore from './store';
import { useAgentHistoryManager } from './use-agent-history-manager';
@ -110,6 +111,12 @@ export default function Agent() {
// pipeline
const {
visible: pipelineRunSheetVisible,
hideModal: hidePipelineRunSheet,
showModal: showPipelineRunSheet,
} = useSetModalState();
const {
visible: pipelineLogSheetVisible,
showModal: showPipelineLogSheet,
@ -126,13 +133,11 @@ export default function Agent() {
isLogEmpty,
} = useFetchPipelineLog(pipelineLogSheetVisible);
const [uploadedFileData, setUploadedFileData] =
useState<Record<string, any>>();
const findNodeByName = useGraphStore((state) => state.findNodeByName);
const handleRunPipeline = useCallback(() => {
if (!findNodeByName(Operator.Tokenizer)) {
message.warning(t('dataflow.tokenizerRequired'));
message.warning(t('flow.tokenizerRequired'));
return;
}
@ -141,14 +146,15 @@ export default function Agent() {
showPipelineLogSheet();
} else {
hidePipelineLogSheet();
handleRun();
// handleRun();
showPipelineRunSheet();
}
}, [
findNodeByName,
handleRun,
hidePipelineLogSheet,
isParsing,
showPipelineLogSheet,
showPipelineRunSheet,
t,
]);
@ -157,7 +163,7 @@ export default function Agent() {
stopFetchTrace,
});
const run = useCallback(() => {
const handleButtonRunClick = useCallback(() => {
if (isPipeline) {
handleRunPipeline();
} else {
@ -165,6 +171,12 @@ export default function Agent() {
}
}, [handleRunAgent, handleRunPipeline, isPipeline]);
const {
run: runPipeline,
loading: pipelineRunning,
uploadedFileData,
} = useRunDataflow({ showLogSheet: showPipelineLogSheet, setMessageId });
return (
<section className="h-full">
<PageHeader>
@ -194,7 +206,7 @@ export default function Agent() {
>
<LaptopMinimalCheck /> {t('flow.save')}
</ButtonLoading>
<Button variant={'secondary'} onClick={run}>
<Button variant={'secondary'} onClick={handleButtonRunClick}>
<CirclePlay />
{t('flow.run')}
</Button>
@ -241,18 +253,14 @@ export default function Agent() {
</DropdownMenu>
</div>
</PageHeader>
<PipelineLogContext.Provider
value={{ messageId, setMessageId, setUploadedFileData }}
>
<ReactFlowProvider>
<DropdownProvider>
<AgentCanvas
drawerVisible={chatDrawerVisible}
hideDrawer={hideChatDrawer}
></AgentCanvas>
</DropdownProvider>
</ReactFlowProvider>
</PipelineLogContext.Provider>
<ReactFlowProvider>
<DropdownProvider>
<AgentCanvas
drawerVisible={chatDrawerVisible}
hideDrawer={hideChatDrawer}
></AgentCanvas>
</DropdownProvider>
</ReactFlowProvider>
{embedVisible && (
<EmbedDialog
visible={embedVisible}
@ -284,6 +292,13 @@ export default function Agent() {
uploadedFileData={uploadedFileData}
></PipelineLogSheet>
)}
{pipelineRunSheetVisible && (
<PipelineRunSheet
hideModal={hidePipelineRunSheet}
run={runPipeline}
loading={pipelineRunning}
></PipelineRunSheet>
)}
</section>
);
}

View File

@ -77,7 +77,7 @@ export function PipelineLogSheet({
uploadedFileData?.extension,
})}
>
{t('dataflow.viewResult')} <ArrowUpRight />
{t('flow.viewResult')} <ArrowUpRight />
</Button>
)}
</SheetTitle>
@ -95,7 +95,7 @@ export function PipelineLogSheet({
className="w-full mt-8 bg-state-error/10 text-state-error hover:bg-state-error hover:text-bg-base"
onClick={handleCancel}
>
<CirclePause /> {t('dataflow.cancel')}
<CirclePause /> {t('flow.cancel')}
</Button>
) : (
<Button
@ -104,7 +104,7 @@ export function PipelineLogSheet({
className="w-full mt-8 bg-accent-primary-5 text-text-secondary hover:bg-accent-primary-5 hover:text-accent-primary hover:border-accent-primary hover:border"
>
<SquareArrowOutUpRight />
{t('dataflow.exportJson')}
{t('flow.exportJson')}
</Button>
)}
</div>

View File

@ -0,0 +1,31 @@
import {
Sheet,
SheetContent,
SheetHeader,
SheetTitle,
} from '@/components/ui/sheet';
import { IModalProps } from '@/interfaces/common';
import { cn } from '@/lib/utils';
import { useTranslation } from 'react-i18next';
import { RunDataflowType } from '../hooks/use-run-dataflow';
import { UploaderForm } from './uploader';
type RunSheetProps = IModalProps<any> &
Pick<RunDataflowType, 'run' | 'loading'>;
const PipelineRunSheet = ({ hideModal, run, loading }: RunSheetProps) => {
const { t } = useTranslation();
return (
<Sheet onOpenChange={hideModal} open modal={false}>
<SheetContent className={cn('top-20 p-2')}>
<SheetHeader>
<SheetTitle>{t('flow.testRun')}</SheetTitle>
<UploaderForm ok={run} loading={loading}></UploaderForm>
</SheetHeader>
</SheetContent>
</Sheet>
);
};
export default PipelineRunSheet;

View File

@ -0,0 +1,57 @@
'use client';
import { z } from 'zod';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { ButtonLoading } from '@/components/ui/button';
import { Form } from '@/components/ui/form';
import { FileUploadDirectUpload } from '@/pages/agent/debug-content/uploader';
import { zodResolver } from '@hookform/resolvers/zod';
import { useForm } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
const formSchema = z.object({
file: z.record(z.any()),
});
export type FormSchemaType = z.infer<typeof formSchema>;
type UploaderFormProps = {
ok: (values: FormSchemaType) => void;
loading: boolean;
};
export function UploaderForm({ ok, loading }: UploaderFormProps) {
const { t } = useTranslation();
const form = useForm<FormSchemaType>({
resolver: zodResolver(formSchema),
defaultValues: {},
});
return (
<Form {...form}>
<form onSubmit={form.handleSubmit(ok)} className="space-y-8">
<RAGFlowFormItem name="file">
{(field) => {
return (
<FileUploadDirectUpload
value={field.value}
onChange={field.onChange}
></FileUploadDirectUpload>
);
}}
</RAGFlowFormItem>
<div>
<ButtonLoading
type="submit"
loading={loading}
className="w-full mt-1"
>
{t('flow.run')}
</ButtonLoading>
</div>
</form>
</Form>
);
}

View File

@ -89,6 +89,7 @@ export type RFState = {
) => void; // Deleting a condition of a classification operator will delete the related edge
findAgentToolNodeById: (id: string | null) => string | undefined;
selectNodeIds: (nodeIds: string[]) => void;
hasChildNode: (nodeId: string) => boolean;
};
// this is our useStore hook that we can use in our components to get parts of the store and call actions
@ -527,6 +528,10 @@ const useGraphStore = create<RFState>()(
})),
);
},
hasChildNode: (nodeId) => {
const { edges } = get();
return edges.some((edge) => edge.source === nodeId);
},
})),
{ name: 'graph', trace: true },
),

View File

@ -9,16 +9,30 @@ import { removeUselessFieldsFromValues } from '@/utils/form';
import { Edge, Node, XYPosition } from '@xyflow/react';
import { FormInstance, FormListFieldData } from 'antd';
import { humanId } from 'human-id';
import { curry, get, intersectionWith, isEqual, omit, sample } from 'lodash';
import {
curry,
get,
intersectionWith,
isEmpty,
isEqual,
omit,
sample,
} from 'lodash';
import pipe from 'lodash/fp/pipe';
import isObject from 'lodash/isObject';
import {
CategorizeAnchorPointPositions,
FileType,
FileTypeSuffixMap,
NoCopyOperatorsList,
NoDebugOperatorsList,
NodeHandleId,
Operator,
} from './constant';
import { ExtractorFormSchemaType } from './form/extractor-form';
import { HierarchicalMergerFormSchemaType } from './form/hierarchical-merger-form';
import { ParserFormSchemaType } from './form/parser-form';
import { SplitterFormSchemaType } from './form/splitter-form';
import { BeginQuery, IPosition } from './interface';
function buildAgentExceptionGoto(edges: Edge[], nodeId: string) {
@ -170,6 +184,92 @@ export function hasSubAgent(edges: Edge[], nodeId?: string) {
return !!edge;
}
// Because the array of react-hook-form must be object data,
// it needs to be converted into a simple data type array required by the backend
function transformObjectArrayToPureArray(
list: Array<Record<string, any>>,
field: string,
) {
return Array.isArray(list)
? list.filter((x) => !isEmpty(x[field])).map((y) => y[field])
: [];
}
function transformParserParams(params: ParserFormSchemaType) {
const setups = params.setups.reduce<
Record<string, ParserFormSchemaType['setups'][0]>
>((pre, cur) => {
if (cur.fileFormat) {
let filteredSetup: Partial<
ParserFormSchemaType['setups'][0] & { suffix: string[] }
> = {
output_format: cur.output_format,
suffix: FileTypeSuffixMap[cur.fileFormat as FileType],
};
switch (cur.fileFormat) {
case FileType.PDF:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
lang: cur.lang,
};
break;
case FileType.Image:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
lang: cur.lang,
system_prompt: cur.system_prompt,
};
break;
case FileType.Email:
filteredSetup = {
...filteredSetup,
fields: cur.fields,
};
break;
case FileType.Video:
case FileType.Audio:
filteredSetup = {
...filteredSetup,
llm_id: cur.llm_id,
};
break;
default:
break;
}
pre[cur.fileFormat] = filteredSetup;
}
return pre;
}, {});
return { ...params, setups };
}
function transformSplitterParams(params: SplitterFormSchemaType) {
return {
...params,
overlapped_percent: Number(params.overlapped_percent) / 100,
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
};
}
function transformHierarchicalMergerParams(
params: HierarchicalMergerFormSchemaType,
) {
const levels = params.levels.map((x) =>
transformObjectArrayToPureArray(x.expressions, 'expression'),
);
return { ...params, hierarchy: Number(params.hierarchy), levels };
}
function transformExtractorParams(params: ExtractorFormSchemaType) {
return { ...params, prompts: [{ content: params.prompts, role: 'user' }] };
}
// construct a dsl based on the node information of the graph
export const buildDslComponentsByGraph = (
nodes: RAGFlowNodeType[],
@ -202,6 +302,21 @@ export const buildDslComponentsByGraph = (
params = buildCategorize(edges, nodes, id);
break;
case Operator.Parser:
params = transformParserParams(params);
break;
case Operator.Splitter:
params = transformSplitterParams(params);
break;
case Operator.HierarchicalMerger:
params = transformHierarchicalMergerParams(params);
break;
case Operator.Extractor:
params = transformExtractorParams(params);
break;
default:
break;
}

View File

@ -148,6 +148,6 @@ export interface NavigateToDataflowResultProps {
[PipelineResultSearchParams.AgentTitle]?: string;
[PipelineResultSearchParams.IsReadOnly]?: string;
[PipelineResultSearchParams.Type]: string;
[PipelineResultSearchParams.CreatedBy]: string;
[PipelineResultSearchParams.DocumentExtension]: string;
[PipelineResultSearchParams.CreatedBy]?: string;
[PipelineResultSearchParams.DocumentExtension]?: string;
}

View File

@ -311,7 +311,6 @@ const FileLogsTable: FC<FileLogsTableProps> = ({
data,
pagination,
setPagination,
loading,
active = LogTabs.FILE_LOGS,
}) => {
const [sorting, setSorting] = useState<SortingState>([]);
@ -328,13 +327,13 @@ const FileLogsTable: FC<FileLogsTableProps> = ({
fileName: row.original.document_name,
source: row.original.source_from,
task: row.original?.task_type,
status: row.original.statusName,
status: row.original.status as RunningStatus,
startDate: formatDate(row.original.process_begin_at),
duration: formatSecondsToHumanReadable(
row.original.process_duration || 0,
),
details: row.original.progress_msg,
};
} as unknown as IFileLogItem;
console.log('logDetail', logDetail);
setLogInfo(logDetail);
setIsModalVisible(true);