Feature/mineru improvements (#11938)

我已在下面的评论中用中文重复说明。

### What problem does this PR solve?

## Summary
This PR enhances the MinerU document parser with additional
configuration options, giving users more control over PDF parsing
behavior and improving support for multilingual documents.

## Changes

### Backend (`deepdoc/parser/mineru_parser.py`)
- Added configurable parsing options:
  - **Parse Method**: `auto`, `txt`, or `ocr` — allows users to choose the
    extraction strategy
  - **Formula Recognition**: Toggle for enabling/disabling formula
    extraction (useful to disable for Cyrillic documents where it may cause
    issues)
  - **Table Recognition**: Toggle for enabling/disabling table extraction
- Added language code mapping (`LANGUAGE_TO_MINERU_MAP`) to translate
RAGFlow language settings to MinerU-compatible language codes for better
OCR accuracy
- Improved parser configuration handling to pass these options through
the processing pipeline

### Frontend (`web/`)
- Created new `MinerUOptionsFormField` component that conditionally
renders when MinerU is selected as the layout recognition engine
- Added UI controls for:
  - Parse method selection (dropdown)
  - Formula recognition toggle (switch)
  - Table recognition toggle (switch)
- Added i18n translations for English and Chinese
- Integrated the options into both the dataset creation dialog and
dataset settings page

### Integration
- Updated `rag/app/naive.py` to forward MinerU options to the parser
- Updated task service to handle the new configuration parameters

## Why
MinerU is a powerful document parser, but the default settings don't
work well for all document types. This PR allows users to:
1. Choose the best parsing method for their documents
2. Disable formula recognition for Cyrillic/non-Latin scripts where it
causes issues
3. Control table extraction based on document needs
4. Benefit from automatic mapping of the dataset language to a MinerU OCR language code for better OCR results

## Testing
- [x] Tested MinerU parsing with different parse methods
- [x] Verified UI renders correctly when MinerU is selected/deselected
- [x] Confirmed settings persist correctly in dataset configuration

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: user210 <user210@rt>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
concertdictate
2025-12-16 07:15:25 +02:00
committed by GitHub
parent 1112b6291b
commit 49c74d08e8
13 changed files with 371 additions and 46 deletions

View File

@ -121,6 +121,13 @@ class TaskService(CommonService):
.where(cls.model.id == task_id) .where(cls.model.id == task_id)
) )
docs = list(docs.dicts()) docs = list(docs.dicts())
# Log the dataset-level MinerU options for diagnostics. The values are not
# used for anything else in this hunk, so they go to the debug log rather
# than being printed to stdout.
if docs:
    import logging

    # A missing or None "kb_parser_config" falls back to an empty dict so
    # the lookups below return their defaults instead of raising.
    kb_config = docs[0].get("kb_parser_config") or {}
    logging.debug(
        "MinerU options: parse_method=%s, formula_enable=%s, table_enable=%s",
        kb_config.get("mineru_parse_method", "auto"),
        kb_config.get("mineru_formula_enable", True),
        kb_config.get("mineru_table_enable", True),
    )
if not docs: if not docs:
return None return None

View File

@ -24,6 +24,7 @@ import tempfile
import threading import threading
import time import time
import zipfile import zipfile
from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
@ -53,6 +54,87 @@ class MinerUContentType(StrEnum):
DISCARDED = "discarded" DISCARDED = "discarded"
# Lookup table: RAGFlow language display name -> MinerU OCR language code.
# Several display names share one MinerU code because MinerU groups its OCR
# models by script family (e.g. "latin", "east_slavic").
LANGUAGE_TO_MINERU_MAP = {
    "English": "en",
    "Chinese": "ch",
    "Traditional Chinese": "chinese_cht",
    # East Slavic script family
    "Russian": "east_slavic",
    "Ukrainian": "east_slavic",
    # Latin script family
    "Indonesian": "latin",
    "Spanish": "latin",
    "Vietnamese": "latin",
    "Japanese": "japan",
    "Korean": "korean",
    # Latin script family (continued)
    "Portuguese BR": "latin",
    "German": "latin",
    "French": "latin",
    "Italian": "latin",
    # Languages with a dedicated MinerU code
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "ka",
    "Thai": "th",
    "Greek": "el",
    "Hindi": "devanagari",
}
class MinerUBackend(StrEnum):
"""MinerU processing backend options."""
PIPELINE = "pipeline" # Traditional multimodel pipeline (default)
VLM_TRANSFORMERS = "vlm-transformers" # Vision-language model using HuggingFace Transformers
VLM_MLX_ENGINE = "vlm-mlx-engine" # Faster, requires Apple Silicon and macOS 13.5+
VLM_VLLM_ENGINE = "vlm-vllm-engine" # Local vLLM engine, requires local GPU
VLM_VLLM_ASYNC_ENGINE = "vlm-vllm-async-engine" # Asynchronous vLLM engine, new in MinerU API
VLM_LMDEPLOY_ENGINE = "vlm-lmdeploy-engine" # LMDeploy engine
VLM_HTTP_CLIENT = "vlm-http-client" # HTTP client for remote vLLM server (CPU only)
class MinerULanguage(StrEnum):
"""MinerU supported languages for OCR (pipeline backend only)."""
CH = "ch" # Chinese
CH_SERVER = "ch_server" # Chinese (server)
CH_LITE = "ch_lite" # Chinese (lite)
EN = "en" # English
KOREAN = "korean" # Korean
JAPAN = "japan" # Japanese
CHINESE_CHT = "chinese_cht" # Chinese Traditional
TA = "ta" # Tamil
TE = "te" # Telugu
KA = "ka" # Kannada
TH = "th" # Thai
EL = "el" # Greek
LATIN = "latin" # Latin
ARABIC = "arabic" # Arabic
EAST_SLAVIC = "east_slavic" # East Slavic
CYRILLIC = "cyrillic" # Cyrillic
DEVANAGARI = "devanagari" # Devanagari
class MinerUParseMethod(StrEnum):
"""MinerU PDF parsing methods (pipeline backend only)."""
AUTO = "auto" # Automatically determine the method based on the file type
TXT = "txt" # Use text extraction method
OCR = "ocr" # Use OCR method for image-based PDFs
@dataclass
class MinerUParseOptions:
    """Bundle of settings for a single MinerU PDF parsing run."""

    # Processing backend to request from MinerU.
    backend: MinerUBackend = MinerUBackend.PIPELINE
    # OCR language (pipeline backend only); None means no language is given.
    lang: Optional[MinerULanguage] = None
    # MinerU parsing strategy: auto / txt / ocr.
    method: MinerUParseMethod = MinerUParseMethod.AUTO
    # Remote server URL; the executable path forwards it only for the
    # "vlm-http-client" backend.
    server_url: Optional[str] = None
    # NOTE(review): carried on the options object; presumably controls removal
    # of MinerU's output directory after parsing — confirm against parse_pdf.
    delete_output: bool = True
    # NOTE(review): distinct from `method`; mirrors parse_pdf's `parse_method`
    # argument — confirm what "raw" selects.
    parse_method: str = "raw"
    # Sent to the MinerU API as "formula_enable".
    formula_enable: bool = True
    # Sent to the MinerU API as "table_enable".
    table_enable: bool = True
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""): def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path) self.mineru_path = Path(mineru_path)
@ -96,7 +178,8 @@ class MinerUParser(RAGFlowPdfParser):
with open(full_path, "wb") as f: with open(full_path, "wb") as f:
f.write(zip_ref.read(filename)) f.write(zip_ref.read(filename))
def _is_http_endpoint_valid(self, url, timeout=5): @staticmethod
def _is_http_endpoint_valid(url, timeout=5):
try: try:
response = requests.head(url, timeout=timeout, allow_redirects=True) response = requests.head(url, timeout=timeout, allow_redirects=True)
return response.status_code in [200, 301, 302, 307, 308] return response.status_code in [200, 301, 302, 307, 308]
@ -141,7 +224,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}") self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try: try:
response = requests.get(server_url, timeout=5) response = requests.get(server_url, timeout=5)
self.logger.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}") self.logger.info(
f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False self.using_api = False
return True, reason return True, reason
except Exception as e: except Exception as e:
@ -185,14 +269,15 @@ class MinerUParser(RAGFlowPdfParser):
return False, reason return False, reason
def _run_mineru( def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
): ):
if self.using_api: if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback) self._run_mineru_api(input_path, output_dir, options, callback)
else: else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback) self._run_mineru_executable(input_path, output_dir, options, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): def _run_mineru_api(self, input_path: Path, output_dir: Path, options: MinerUParseOptions,
callback: Optional[Callable] = None):
output_zip_path = os.path.join(str(output_dir), "output.zip") output_zip_path = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path) pdf_file_path = str(input_path)
@ -201,18 +286,18 @@ class MinerUParser(RAGFlowPdfParser):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip() pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method) output_path = os.path.join(str(output_dir), pdf_file_name, options.method)
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = { data = {
"output_dir": "./output", "output_dir": "./output",
"lang_list": lang, "lang_list": options.lang,
"backend": backend, "backend": options.backend,
"parse_method": method, "parse_method": options.method,
"formula_enable": True, "formula_enable": options.formula_enable,
"table_enable": True, "table_enable": options.table_enable,
"server_url": None, "server_url": None,
"return_md": True, "return_md": True,
"return_middle_json": True, "return_middle_json": True,
@ -229,7 +314,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
if callback: if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800) response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
timeout=1800)
response.raise_for_status() response.raise_for_status()
if response.headers.get("Content-Type") == "application/zip": if response.headers.get("Content-Type") == "application/zip":
@ -253,15 +339,15 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info("[MinerU] Api completed successfully.") self.logger.info("[MinerU] Api completed successfully.")
def _run_mineru_executable( def _run_mineru_executable(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
): ):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", options.method]
if backend: if options.backend:
cmd.extend(["-b", backend]) cmd.extend(["-b", options.backend])
if lang: if options.lang:
cmd.extend(["-l", lang]) cmd.extend(["-l", options.lang])
if server_url and backend == "vlm-http-client": if options.server_url and options.backend == "vlm-http-client":
cmd.extend(["-u", server_url]) cmd.extend(["-u", options.server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}") self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
@ -313,7 +399,8 @@ class MinerUParser(RAGFlowPdfParser):
try: try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf: with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])] self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in
enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e: except Exception as e:
self.page_images = None self.page_images = None
self.total_page = 0 self.total_page = 0
@ -375,7 +462,8 @@ class MinerUParser(RAGFlowPdfParser):
pos = poss[-1] pos = poss[-1]
last_page_idx = pos[0][-1] last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count): if not (0 <= last_page_idx < page_count):
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.") self.logger.warning(
f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position: if need_position:
return None, None return None, None
return return
@ -401,10 +489,12 @@ class MinerUParser(RAGFlowPdfParser):
if 0 <= pn - 1 < page_count: if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1] bottom += self.page_images[pn - 1].size[1]
else: else:
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.") self.logger.warning(
f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count): if not (0 <= pns[0] < page_count):
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.") self.logger.warning(
f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue continue
img0 = self.page_images[pns[0]] img0 = self.page_images[pns[0]]
@ -417,7 +507,8 @@ class MinerUParser(RAGFlowPdfParser):
bottom -= img0.size[1] bottom -= img0.size[1]
for pn in pns[1:]: for pn in pns[1:]:
if not (0 <= pn < page_count): if not (0 <= pn < page_count):
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.") self.logger.warning(
f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue continue
page = self.page_images[pn] page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1])) x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
@ -461,7 +552,8 @@ class MinerUParser(RAGFlowPdfParser):
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
dict[str, Any]]:
candidates = [] candidates = []
seen = set() seen = set()
@ -543,11 +635,13 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.TEXT: case MinerUContentType.TEXT:
section = output["text"] section = output["text"]
case MinerUContentType.TABLE: case MinerUContentType.TABLE:
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", [])) section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(
output.get("table_footnote", []))
if not section.strip(): if not section.strip():
section = "FAILED TO PARSE TABLE" section = "FAILED TO PARSE TABLE"
case MinerUContentType.IMAGE: case MinerUContentType.IMAGE:
section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", [])) section = "".join(output.get("image_caption", [])) + "\n" + "".join(
output.get("image_footnote", []))
case MinerUContentType.EQUATION: case MinerUContentType.EQUATION:
section = output["text"] section = output["text"]
case MinerUContentType.CODE: case MinerUContentType.CODE:
@ -569,24 +663,30 @@ class MinerUParser(RAGFlowPdfParser):
return [] return []
def parse_pdf( def parse_pdf(
self, self,
filepath: str | PathLike[str], filepath: str | PathLike[str],
binary: BytesIO | bytes, binary: BytesIO | bytes,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
*, *,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
backend: str = "pipeline", backend: str = "pipeline",
lang: Optional[str] = None, server_url: Optional[str] = None,
method: str = "auto", delete_output: bool = True,
server_url: Optional[str] = None, parse_method: str = "raw",
delete_output: bool = True, **kwargs,
parse_method: str = "raw",
) -> tuple: ) -> tuple:
import shutil import shutil
temp_pdf = None temp_pdf = None
created_tmp_dir = False created_tmp_dir = False
# Translate the RAGFlow language name into a MinerU OCR code; names that are
# not in LANGUAGE_TO_MINERU_MAP fall back to "ch".
lang = kwargs.get("lang", "English")
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, "ch")
# Read the MinerU options once; a missing or None parser_config yields the
# defaults instead of raising AttributeError on `.get`.
mineru_cfg = kwargs.get("parser_config") or {}
mineru_method_raw_str = mineru_cfg.get("mineru_parse_method", "auto")
enable_formula = mineru_cfg.get("mineru_formula_enable", True)
# Fix: the table toggle is stored under "mineru_table_enable". The previous
# "mineru_enable" key never matched, so the setting was always True.
enable_table = mineru_cfg.get("mineru_table_enable", True)
# remove spaces, or mineru crash, and _read_output fail too # remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath) file_path = Path(filepath)
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf" pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
@ -625,8 +725,18 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback) options = MinerUParseOptions(
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) backend=MinerUBackend(backend),
lang=MinerULanguage(mineru_lang_code),
method=MinerUParseMethod(mineru_method_raw_str),
server_url=server_url,
delete_output=delete_output,
parse_method=parse_method,
formula_enable=enable_formula,
table_enable=enable_table,
)
self._run_mineru(pdf, out_dir, options, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View File

@ -85,6 +85,8 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
binary=binary, binary=binary,
callback=callback, callback=callback,
parse_method=parse_method, parse_method=parse_method,
lang=lang,
**kwargs
) )
return sections, tables, pdf_parser return sections, tables, pdf_parser
except Exception as e: except Exception as e:
@ -94,6 +96,9 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback(-1, "MinerU not found.") callback(-1, "MinerU not found.")
return None, None, None return None, None, None
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
pdf_parser = DoclingParser() pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw") parse_method = kwargs.get("parse_method", "raw")

View File

@ -67,7 +67,7 @@ class MinerUOcrModel(Base, MinerUParser):
server_url = server_url or self.mineru_server_url server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url) return self.check_installation(backend=backend, server_url=server_url)
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs): def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
ok, reason = self.check_available() ok, reason = self.check_available()
if not ok: if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.") raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
@ -82,5 +82,6 @@ class MinerUOcrModel(Base, MinerUParser):
server_url=self.mineru_server_url, server_url=self.mineru_server_url,
delete_output=self.mineru_delete_output, delete_output=self.mineru_delete_output,
parse_method=parse_method, parse_method=parse_method,
**kwargs
) )
return sections, tables return sections, tables

View File

@ -0,0 +1,97 @@
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { RAGFlowSelect } from '@/components/ui/select';
import { Switch } from '@/components/ui/switch';
import { LLMFactory } from '@/constants/llm';
import { buildOptions } from '@/utils/form';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']);
/**
 * MinerU-specific parser options (parse method, formula recognition, table
 * recognition). Renders nothing unless the watched
 * `parser_config.layout_recognize` value names MinerU.
 */
export function MinerUOptionsFormField() {
  const form = useFormContext();
  const { t } = useTranslation();

  const layoutRecognize = useWatch({
    control: form.control,
    name: 'parser_config.layout_recognize',
  });

  // MinerU counts as selected when the value contains the factory name or
  // 'mineru' in any letter case. The typeof guard matters because optional
  // chaining only protects against null/undefined: any other non-string value
  // would throw on `.includes` / `.toLowerCase` during render.
  const isMinerUSelected =
    typeof layoutRecognize === 'string' &&
    (layoutRecognize.includes(LLMFactory.MinerU) ||
      layoutRecognize.toLowerCase().includes('mineru'));

  if (!isMinerUSelected) {
    return null;
  }

  return (
    <div className="space-y-4 border-l-2 border-primary/30 pl-4 ml-2">
      <div className="text-sm font-medium text-text-secondary">
        {t('knowledgeConfiguration.mineruOptions', 'MinerU Options')}
      </div>

      {/* Parse method: falls back to 'auto' when the field has no value. */}
      <RAGFlowFormItem
        name="parser_config.mineru_parse_method"
        label={t('knowledgeConfiguration.mineruParseMethod', 'Parse Method')}
        tooltip={t(
          'knowledgeConfiguration.mineruParseMethodTip',
          'Method for parsing PDF: auto (automatic detection), txt (text extraction), ocr (optical character recognition)',
        )}
        horizontal={true}
      >
        {(field) => (
          <RAGFlowSelect
            value={field.value || 'auto'}
            onChange={field.onChange}
            options={parseMethodOptions}
            placeholder={t('common.selectPlaceholder', 'Select value')}
          />
        )}
      </RAGFlowFormItem>

      {/* Formula recognition: shown as enabled when the value is unset. */}
      <RAGFlowFormItem
        name="parser_config.mineru_formula_enable"
        label={t(
          'knowledgeConfiguration.mineruFormulaEnable',
          'Formula Recognition',
        )}
        tooltip={t(
          'knowledgeConfiguration.mineruFormulaEnableTip',
          'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
        )}
        horizontal={true}
        labelClassName="!mb-0"
      >
        {(field) => (
          <Switch
            checked={field.value ?? true}
            onCheckedChange={field.onChange}
          />
        )}
      </RAGFlowFormItem>

      {/* Table recognition: shown as enabled when the value is unset. */}
      <RAGFlowFormItem
        name="parser_config.mineru_table_enable"
        label={t(
          'knowledgeConfiguration.mineruTableEnable',
          'Table Recognition',
        )}
        tooltip={t(
          'knowledgeConfiguration.mineruTableEnableTip',
          'Enable table recognition and extraction.',
        )}
        horizontal={true}
        labelClassName="!mb-0"
      >
        {(field) => (
          <Switch
            checked={field.value ?? true}
            onCheckedChange={field.onChange}
          />
        )}
      </RAGFlowFormItem>
    </div>
  );
}

View File

@ -103,14 +103,22 @@ export const LanguageTranslationMap = {
Chinese: 'zh', Chinese: 'zh',
'Traditional Chinese': 'zh-TRADITIONAL', 'Traditional Chinese': 'zh-TRADITIONAL',
Russian: 'ru', Russian: 'ru',
Indonesia: 'id', Indonesian: 'id',
Spanish: 'es', Spanish: 'es',
Vietnamese: 'vi', Vietnamese: 'vi',
Japanese: 'ja', Japanese: 'ja',
Korean: 'ko',
'Portuguese BR': 'pt-br', 'Portuguese BR': 'pt-br',
German: 'de', German: 'de',
French: 'fr', French: 'fr',
Italian: 'it', Italian: 'it',
Tamil: 'ta',
Telugu: 'te',
Kannada: 'ka',
Thai: 'th',
Greek: 'el',
Hindi: 'hi',
Ukrainian: 'uk',
}; };
export enum FileMimeType { export enum FileMimeType {

View File

@ -330,6 +330,15 @@ export default {
reRankModelWaring: 'Re-rank model is very time consuming.', reRankModelWaring: 'Re-rank model is very time consuming.',
}, },
knowledgeConfiguration: { knowledgeConfiguration: {
mineruOptions: 'MinerU Options',
mineruParseMethod: 'Parse Method',
mineruParseMethodTip:
'Method for parsing PDF: auto (automatic detection), txt (text extraction), ocr (optical character recognition)',
mineruFormulaEnable: 'Formula Recognition',
mineruFormulaEnableTip:
'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
mineruTableEnable: 'Table Recognition',
mineruTableEnableTip: 'Enable table recognition and extraction.',
overlappedPercent: 'Overlapped percent(%)', overlappedPercent: 'Overlapped percent(%)',
generationScopeTip: generationScopeTip:
'Determines whether RAPTOR is generated for the entire dataset or for a single file.', 'Determines whether RAPTOR is generated for the entire dataset or for a single file.',

View File

@ -265,6 +265,15 @@ export default {
theDocumentBeingParsedCannotBeDeleted: '正在解析的文档不能被删除', theDocumentBeingParsedCannotBeDeleted: '正在解析的文档不能被删除',
}, },
knowledgeConfiguration: { knowledgeConfiguration: {
mineruOptions: 'MinerU 选项',
mineruParseMethod: '解析方法',
// Separators added between the label, the mode names and their glosses so the
// tip reads like its English counterpart.
mineruParseMethodTip:
  'PDF 解析方法：auto（自动检测）、txt（文本提取）、ocr（光学字符识别）',
mineruFormulaEnable: '公式识别',
mineruFormulaEnableTip:
'启用公式识别。注意:对于西里尔文档可能无法正常工作。',
mineruTableEnable: '表格识别',
mineruTableEnableTip: '启用表格识别和提取。',
generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。', generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。',
generationScope: '生成范围', generationScope: '生成范围',
scopeSingleFile: '单文件', scopeSingleFile: '单文件',

View File

@ -6,6 +6,7 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field'; import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
import { MinerUOptionsFormField } from '@/components/mineru-options-form-field';
import { import {
ConfigurationFormContainer, ConfigurationFormContainer,
MainContainer, MainContainer,
@ -17,6 +18,7 @@ export function NaiveConfiguration() {
<MainContainer> <MainContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField></LayoutRecognizeFormField>
<MinerUOptionsFormField></MinerUOptionsFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField> <MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField> <DelimiterFormField></DelimiterFormField>
<EnableTocToggle /> <EnableTocToggle />

View File

@ -13,6 +13,7 @@ export const formSchema = z
// avatar: z.instanceof(File), // avatar: z.instanceof(File),
avatar: z.any().nullish(), avatar: z.any().nullish(),
permission: z.string().optional(), permission: z.string().optional(),
language: z.string().optional(),
parser_id: z.string(), parser_id: z.string(),
pipeline_id: z.string().optional(), pipeline_id: z.string().optional(),
pipeline_name: z.string().optional(), pipeline_name: z.string().optional(),
@ -30,6 +31,10 @@ export const formSchema = z
topn_tags: z.number().optional(), topn_tags: z.number().optional(),
toc_extraction: z.boolean().optional(), toc_extraction: z.boolean().optional(),
overlapped_percent: z.number().optional(), overlapped_percent: z.number().optional(),
// MinerU-specific options
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
raptor: z raptor: z
.object({ .object({
use_raptor: z.boolean().optional(), use_raptor: z.boolean().optional(),

View File

@ -1,5 +1,7 @@
import { AvatarUpload } from '@/components/avatar-upload'; import { AvatarUpload } from '@/components/avatar-upload';
import { SelectWithSearch } from '@/components/originui/select-with-search';
import PageRankFormField from '@/components/page-rank-form-field'; import PageRankFormField from '@/components/page-rank-form-field';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { import {
FormControl, FormControl,
FormField, FormField,
@ -8,6 +10,8 @@ import {
FormMessage, FormMessage,
} from '@/components/ui/form'; } from '@/components/ui/form';
import { Input } from '@/components/ui/input'; import { Input } from '@/components/ui/input';
import { LanguageTranslationMap } from '@/constants/common';
import { useMemo } from 'react';
import { useFormContext } from 'react-hook-form'; import { useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { TagItems } from './components/tag-item'; import { TagItems } from './components/tag-item';
@ -18,6 +22,13 @@ export function GeneralForm() {
const form = useFormContext(); const form = useFormContext();
const { t } = useTranslation(); const { t } = useTranslation();
const languageOptions = useMemo(() => {
return Object.keys(LanguageTranslationMap).map((x) => ({
label: x,
value: x,
}));
}, []);
return ( return (
<> <>
<FormField <FormField
@ -41,6 +52,18 @@ export function GeneralForm() {
</FormItem> </FormItem>
)} )}
/> />
<div className="items-center">
<RAGFlowFormItem
name="language"
label={t('common.language')}
horizontal={true}
>
<SelectWithSearch
options={languageOptions}
triggerClassName="w-full"
></SelectWithSearch>
</RAGFlowFormItem>
</div>
<FormField <FormField
control={form.control} control={form.control}
name="avatar" name="avatar"

View File

@ -58,6 +58,7 @@ export default function DatasetSettings() {
name: '', name: '',
parser_id: DocumentParserType.Naive, parser_id: DocumentParserType.Naive,
permission: PermissionRole.Me, permission: PermissionRole.Me,
language: 'English',
parser_config: { parser_config: {
layout_recognize: DocumentType.DeepDOC, layout_recognize: DocumentType.DeepDOC,
chunk_token_num: 512, chunk_token_num: 512,
@ -68,6 +69,10 @@ export default function DatasetSettings() {
topn_tags: 3, topn_tags: 3,
toc_extraction: false, toc_extraction: false,
overlapped_percent: 0, overlapped_percent: 0,
// MinerU-specific defaults
mineru_parse_method: 'auto',
mineru_formula_enable: true,
mineru_table_enable: true,
raptor: { raptor: {
use_raptor: true, use_raptor: true,
max_token: 256, max_token: 256,

View File

@ -16,10 +16,18 @@ import {
FormMessage, FormMessage,
} from '@/components/ui/form'; } from '@/components/ui/form';
import { Input } from '@/components/ui/input'; import { Input } from '@/components/ui/input';
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from '@/components/ui/select';
import { LanguageTranslationMap } from '@/constants/common';
import { FormLayout } from '@/constants/form'; import { FormLayout } from '@/constants/form';
import { IModalProps } from '@/interfaces/common'; import { IModalProps } from '@/interfaces/common';
import { zodResolver } from '@hookform/resolvers/zod'; import { zodResolver } from '@hookform/resolvers/zod';
import { useEffect } from 'react'; import { useEffect, useMemo } from 'react';
import { useForm, useWatch } from 'react-hook-form'; import { useForm, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { z } from 'zod'; import { z } from 'zod';
@ -34,6 +42,13 @@ const FormId = 'dataset-creating-form';
export function InputForm({ onOk }: IModalProps<any>) { export function InputForm({ onOk }: IModalProps<any>) {
const { t } = useTranslation(); const { t } = useTranslation();
const languageOptions = useMemo(() => {
return Object.keys(LanguageTranslationMap).map((x) => ({
label: x,
value: x,
}));
}, []);
const FormSchema = z const FormSchema = z
.object({ .object({
name: z name: z
@ -51,6 +66,7 @@ export function InputForm({ onOk }: IModalProps<any>) {
.trim(), .trim(),
parser_id: z.string().optional(), parser_id: z.string().optional(),
pipeline_id: z.string().optional(), pipeline_id: z.string().optional(),
language: z.string().optional(),
}) })
.superRefine((data, ctx) => { .superRefine((data, ctx) => {
// When parseType === 1, parser_id is required // When parseType === 1, parser_id is required
@ -83,6 +99,7 @@ export function InputForm({ onOk }: IModalProps<any>) {
parseType: 1, parseType: 1,
parser_id: '', parser_id: '',
embd_id: '', embd_id: '',
language: 'English',
}, },
}); });
@ -130,6 +147,33 @@ export function InputForm({ onOk }: IModalProps<any>) {
)} )}
/> />
<FormField
control={form.control}
name="language"
render={({ field }) => (
<FormItem>
<FormLabel>{t('common.language')}</FormLabel>
<Select onValueChange={field.onChange} defaultValue={field.value}>
<FormControl>
<SelectTrigger>
<SelectValue
placeholder={t('common.languagePlaceholder')}
/>
</SelectTrigger>
</FormControl>
<SelectContent>
{languageOptions.map((option) => (
<SelectItem key={option.value} value={option.value}>
{option.label}
</SelectItem>
))}
</SelectContent>
</Select>
<FormMessage />
</FormItem>
)}
/>
<EmbeddingModelItem line={2} isEdit={false} /> <EmbeddingModelItem line={2} isEdit={false} />
<ParseTypeItem /> <ParseTypeItem />
{parseType === 1 && <ChunkMethodItem></ChunkMethodItem>} {parseType === 1 && <ChunkMethodItem></ChunkMethodItem>}