mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-06 18:45:08 +08:00
Compare commits
3 Commits
d38f8a1562
...
672958a192
| Author | SHA1 | Date | |
|---|---|---|---|
| 672958a192 | |||
| 3820de916c | |||
| ef44979b5c |
@ -207,7 +207,7 @@ releases! 🌟
|
||||
> Note: Prior to `v0.22.0`, we provided both images with embedding models and slim images without embedding models. Details as follows:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -207,7 +207,7 @@ Coba demo kami di [https://demo.ragflow.io](https://demo.ragflow.io).
|
||||
> Catatan: Sebelum `v0.22.0`, kami menyediakan image dengan model embedding dan image slim tanpa model embedding. Detailnya sebagai berikut:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -187,7 +187,7 @@
|
||||
> 注意:`v0.22.0` より前のバージョンでは、embedding モデルを含むイメージと、embedding モデルを含まない slim イメージの両方を提供していました。詳細は以下の通りです:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -189,7 +189,7 @@
|
||||
> 참고: `v0.22.0` 이전 버전에서는 embedding 모델이 포함된 이미지와 embedding 모델이 포함되지 않은 slim 이미지를 모두 제공했습니다. 자세한 내용은 다음과 같습니다:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -207,7 +207,7 @@ Experimente nossa demo em [https://demo.ragflow.io](https://demo.ragflow.io).
|
||||
> Nota: Antes da `v0.22.0`, fornecíamos imagens com modelos de embedding e imagens slim sem modelos de embedding. Detalhes a seguir:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -206,7 +206,7 @@
|
||||
> 注意:在 `v0.22.0` 之前的版本,我們會同時提供包含 embedding 模型的映像和不含 embedding 模型的 slim 映像。具體如下:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -207,7 +207,7 @@
|
||||
> 注意:在 `v0.22.0` 之前的版本,我们会同时提供包含 embedding 模型的镜像和不含 embedding 模型的 slim 镜像。具体如下:
|
||||
|
||||
| RAGFlow image tag | Image size (GB) | Has embedding models? | Stable? |
|
||||
| ----------------- | --------------- | --------------------- | ------------------------ |
|
||||
|-------------------|-----------------|-----------------------|----------------|
|
||||
| v0.21.1 | ≈9 | ✔️ | Stable release |
|
||||
| v0.21.1-slim | ≈2 | ❌ | Stable release |
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ Use this section to tell people about which versions of your project are
|
||||
currently being supported with security updates.
|
||||
|
||||
| Version | Supported |
|
||||
| ------- | ------------------ |
|
||||
|---------|--------------------|
|
||||
| <=0.7.0 | :white_check_mark: |
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
@ -252,7 +252,6 @@ async def delete_chats(tenant_id):
|
||||
continue
|
||||
temp_dict = {"status": StatusEnum.INVALID.value}
|
||||
success_count += DialogService.update_by_id(id, temp_dict)
|
||||
print(success_count, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$", flush=True)
|
||||
|
||||
if errors:
|
||||
if success_count > 0:
|
||||
|
||||
30
common/parser_config_utils.py
Normal file
30
common/parser_config_utils.py
Normal file
@ -0,0 +1,30 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Any
|
||||
|
||||
|
||||
def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | None]:
|
||||
parser_model_name: str | None = None
|
||||
layout_recognizer = layout_recognizer_raw
|
||||
|
||||
if isinstance(layout_recognizer_raw, str):
|
||||
lowered = layout_recognizer_raw.lower()
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
layout_recognizer = "MinerU"
|
||||
|
||||
return layout_recognizer, parser_model_name
|
||||
@ -262,10 +262,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
elif self.mineru_server_url:
|
||||
data["server_url"] = self.mineru_server_url
|
||||
|
||||
print("--------------------------------", flush=True)
|
||||
print(f"{data=}", flush=True)
|
||||
print(f"{options=}", flush=True)
|
||||
print("--------------------------------", flush=True)
|
||||
self.logger.info(f"[MinerU] request {data=}")
|
||||
self.logger.info(f"[MinerU] request {options=}")
|
||||
|
||||
headers = {"Accept": "application/json"}
|
||||
try:
|
||||
|
||||
@ -14,7 +14,7 @@ To access the RAGFlow admin UI, append `/admin` to the web UI's address, e.g. `h
|
||||
|
||||
### Default Credentials
|
||||
| Username | Password |
|
||||
|----------|----------|
|
||||
|--------------------|----------|
|
||||
| `admin@ragflow.io` | `admin` |
|
||||
|
||||
## Admin UI Overview
|
||||
|
||||
@ -158,7 +158,7 @@ Optional. Text to display as a diagonal watermark across each page. Useful for m
|
||||
The **Docs Generator** component provides the following output variables:
|
||||
|
||||
| Variable name | Type | Description |
|
||||
| ------------- | --------- | --------------------------------------------------------------------------- |
|
||||
|---------------|-----------|--------------------------------------------------------------|
|
||||
| `file_path` | `string` | The server path where the generated document is saved. |
|
||||
| `pdf_base64` | `string` | The document content encoded in base64 format. |
|
||||
| `download` | `string` | JSON containing download information for the chat interface. |
|
||||
@ -190,7 +190,7 @@ The **Docs Generator** includes intelligent font handling for international cont
|
||||
### Supported scripts
|
||||
|
||||
| Script | Unicode Range | Font Used |
|
||||
| ------ | ------------- | --------- |
|
||||
|------------------------------|---------------|--------------------|
|
||||
| Chinese (CJK) | U+4E00–U+9FFF | STSong-Light |
|
||||
| Japanese (Hiragana/Katakana) | U+3040–U+30FF | HeiseiMin-W3 |
|
||||
| Korean (Hangul) | U+AC00–U+D7AF | HYSMyeongJo-Medium |
|
||||
|
||||
@ -18,7 +18,7 @@ Within the configuration panel, you can add multiple parsers and set the corresp
|
||||
The **Parser** component supports parsing the following file types:
|
||||
|
||||
| File type | File format |
|
||||
| ------------- | ------------------------ |
|
||||
|---------------|--------------------------|
|
||||
| PDF | PDF |
|
||||
| Spreadsheet | XLSX, XLS, CSV |
|
||||
| Image | PNG, JPG, JPEG, GIF, TIF |
|
||||
@ -98,7 +98,7 @@ A Video parser transcribes video files to text. To use this parser, you must fir
|
||||
The following are the global variable names for the output of the **Parser** component, which can be referenced by subsequent components in the ingestion pipeline.
|
||||
|
||||
| Variable name | Type |
|
||||
| ------------- | ------------------------ |
|
||||
|---------------|-----------------|
|
||||
| `markdown` | `string` |
|
||||
| `text` | `string` |
|
||||
| `html` | `string` |
|
||||
|
||||
@ -45,7 +45,7 @@ Click the light bulb icon above the *current* dialogue and scroll down the popup
|
||||
|
||||
|
||||
| Item name | Description |
|
||||
| ----------------- |-----------------------------------------------------------------------------------------------|
|
||||
|-------------------|-----------------------------------------------------------------------------------------------|
|
||||
| Total | Total time spent on this conversation round, including chunk retrieval and answer generation. |
|
||||
| Check LLM | Time to validate the specified LLM. |
|
||||
| Create retriever | Time to create a chunk retriever. |
|
||||
|
||||
@ -40,7 +40,7 @@ This section covers the following topics:
|
||||
RAGFlow offers multiple built-in chunking templates to facilitate chunking files of different layouts and ensure semantic integrity. From the **Built-in** chunking method dropdown under **Parse type**, you can choose the default template that suits the layouts and formats of your files. The following table shows the descriptions and the compatible file formats of each supported chunk template:
|
||||
|
||||
| **Template** | Description | File format |
|
||||
|--------------|-----------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|
|
||||
|--------------|-------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------|
|
||||
| General | Files are consecutively chunked based on a preset chunk token number. | MD, MDX, DOCX, XLSX, XLS (Excel 97-2003), PPT, PDF, TXT, JPEG, JPG, PNG, TIF, GIF, CSV, JSON, EML, HTML |
|
||||
| Q&A | Retrieves relevant information and generates answers to respond to questions. | XLSX, XLS (Excel 97-2003), CSV/TXT |
|
||||
| Resume | Enterprise edition only. You can also try it out on demo.ragflow.io. | DOCX, PDF, TXT |
|
||||
|
||||
@ -14,7 +14,7 @@ A complete reference for RAGFlow's RESTful API. Before proceeding, please ensure
|
||||
---
|
||||
|
||||
| Code | Message | Description |
|
||||
| ---- | --------------------- | -------------------------- |
|
||||
|------|-----------------------|----------------------------|
|
||||
| 400 | Bad Request | Invalid request parameters |
|
||||
| 401 | Unauthorized | Unauthorized access |
|
||||
| 403 | Forbidden | Access denied |
|
||||
|
||||
@ -23,7 +23,7 @@ pip install ragflow-sdk
|
||||
---
|
||||
|
||||
| Code | Message | Description |
|
||||
|------|----------------------|-----------------------------|
|
||||
|------|-----------------------|----------------------------|
|
||||
| 400 | Bad Request | Invalid request parameters |
|
||||
| 401 | Unauthorized | Unauthorized access |
|
||||
| 403 | Forbidden | Access denied |
|
||||
|
||||
@ -82,7 +82,7 @@ pip install ragflow-firecrawl-integration
|
||||
## Configuration Options
|
||||
|
||||
| Option | Description | Default | Required |
|
||||
|--------|-------------|---------|----------|
|
||||
|--------------------|----------------------------------|-----------------------------|----------|
|
||||
| `api_key` | Your Firecrawl API key | - | Yes |
|
||||
| `api_url` | Firecrawl API endpoint | `https://api.firecrawl.dev` | No |
|
||||
| `max_retries` | Maximum retry attempts | 3 | No |
|
||||
|
||||
@ -100,7 +100,7 @@ intergrations/firecrawl/
|
||||
## 🔧 **Configuration Options**
|
||||
|
||||
| Option | Description | Default | Required |
|
||||
|--------|-------------|---------|----------|
|
||||
|--------------------|----------------------------------|-----------------------------|----------|
|
||||
| `api_key` | Your Firecrawl API key | - | Yes |
|
||||
| `api_url` | Firecrawl API endpoint | `https://api.firecrawl.dev` | No |
|
||||
| `max_retries` | Maximum retry attempts | 3 | No |
|
||||
|
||||
@ -21,6 +21,7 @@ from io import BytesIO
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.app import naive
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import bullets_category, is_english,remove_contents_table, \
|
||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
|
||||
tokenize_chunks, attach_media_context
|
||||
@ -96,7 +97,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -114,6 +117,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
layout_recognizer = layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ from rag.nlp import bullets_category, remove_contents_table, \
|
||||
from rag.nlp import rag_tokenizer, Node
|
||||
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
|
||||
|
||||
@ -155,7 +156,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return tokenize_chunks(chunks, doc, eng, None)
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -173,6 +176,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
layout_recognizer = layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
@ -27,6 +27,7 @@ from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
@ -196,7 +197,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -205,6 +208,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
pdf_parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
kwargs.pop("parse_method", None)
|
||||
kwargs.pop("mineru_llm_name", None)
|
||||
sections, tbls, pdf_parser = pdf_parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
@ -214,6 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
layout_recognizer = layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
parse_method = "manual",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
||||
|
||||
|
||||
@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
mineru_llm_name = kwargs.get("mineru_llm_name")
|
||||
tenant_id = kwargs.get("tenant_id")
|
||||
|
||||
def by_mineru(
|
||||
filename,
|
||||
binary=None,
|
||||
from_page=0,
|
||||
to_page=100000,
|
||||
lang="Chinese",
|
||||
callback=None,
|
||||
pdf_cls=None,
|
||||
parse_method: str = "raw",
|
||||
mineru_llm_name: str | None = None,
|
||||
tenant_id: str | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
pdf_parser = None
|
||||
if tenant_id:
|
||||
if not mineru_llm_name:
|
||||
@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||
callback=callback,
|
||||
parse_method=parse_method,
|
||||
lang=lang,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
except Exception as e:
|
||||
@ -97,8 +106,6 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
||||
return None, None, None
|
||||
|
||||
|
||||
|
||||
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
|
||||
pdf_parser = DoclingParser()
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
|
||||
|
||||
|
||||
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
if kwargs.get("layout_recognizer", "") == "Plain Text":
|
||||
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
|
||||
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
|
||||
pdf_parser = PlainParser()
|
||||
else:
|
||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
||||
tenant_id = kwargs.get("tenant_id")
|
||||
if not tenant_id:
|
||||
raise ValueError("tenant_id is required when using vision layout recognizer")
|
||||
vision_model = LLMBundle(
|
||||
tenant_id,
|
||||
LLMType.IMAGE2TEXT,
|
||||
llm_name=layout_recognizer,
|
||||
lang=kwargs.get("lang", "Chinese"),
|
||||
)
|
||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||
|
||||
sections, tables = pdf_parser(
|
||||
@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC")
|
||||
parser_model_name = None
|
||||
layout_recognizer = layout_recognizer_raw
|
||||
if isinstance(layout_recognizer_raw, str):
|
||||
lowered = layout_recognizer_raw.lower()
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = layout_recognizer_raw.split("@", 1)[0]
|
||||
layout_recognizer = "MinerU"
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||
urls = extract_links_from_pdf(binary)
|
||||
|
||||
@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize
|
||||
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
@ -82,7 +83,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -100,6 +103,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
layout_recognizer = layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
@ -24,6 +24,7 @@ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bull
|
||||
from deepdoc.parser import PdfParser
|
||||
import numpy as np
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -149,7 +150,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -163,6 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
paper = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
else:
|
||||
kwargs.pop("parse_method", None)
|
||||
kwargs.pop("mineru_llm_name", None)
|
||||
sections, tables, pdf_parser = pdf_parser(
|
||||
filename=filename,
|
||||
binary=binary,
|
||||
@ -171,6 +176,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang=lang,
|
||||
callback=callback,
|
||||
pdf_cls=Pdf,
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
parse_method="paper",
|
||||
**kwargs
|
||||
)
|
||||
|
||||
@ -24,6 +24,7 @@ from PyPDF2 import PdfReader as pdf2_read
|
||||
|
||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import rag_tokenizer
|
||||
from rag.nlp import tokenize, is_english
|
||||
|
||||
@ -195,7 +196,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
res.append(d)
|
||||
return res
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -213,6 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback=callback,
|
||||
pdf_cls=Pdf,
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
@ -121,7 +121,7 @@ make logs # With Make
|
||||
### 🧰 Makefile Toolbox
|
||||
|
||||
| Command | Description |
|
||||
| ----------------- | ------------------------------------------------ |
|
||||
|-------------------|--------------------------------------------------|
|
||||
| `make` | Setup, build, launch and test all at once |
|
||||
| `make setup` | Initialize environment and install uv |
|
||||
| `make ensure_env` | Auto-create `.env` if missing |
|
||||
@ -183,7 +183,7 @@ This security model strikes a balance between **robust isolation** and **develop
|
||||
Currently, the following languages are officially supported:
|
||||
|
||||
| Language | Priority |
|
||||
| -------- | -------- |
|
||||
|----------|----------|
|
||||
| Python | High |
|
||||
| Node.js | Medium |
|
||||
|
||||
|
||||
@ -42,6 +42,7 @@ import { ExcelToHtmlFormField } from '../excel-to-html-form-field';
|
||||
import { FormContainer } from '../form-container';
|
||||
import { LayoutRecognizeFormField } from '../layout-recognize-form-field';
|
||||
import { MaxTokenNumberFormField } from '../max-token-number-from-field';
|
||||
import { MinerUOptionsFormField } from '../mineru-options-form-field';
|
||||
import { ButtonLoading } from '../ui/button';
|
||||
import { Input } from '../ui/input';
|
||||
import { DynamicPageRange } from './dynamic-page-range';
|
||||
@ -335,7 +336,10 @@ export function ChunkMethodDialog({
|
||||
className="space-y-3"
|
||||
>
|
||||
{showOne && (
|
||||
<>
|
||||
<LayoutRecognizeFormField showMineruOptions={false} />
|
||||
{isMineruSelected && <MinerUOptionsFormField />}
|
||||
</>
|
||||
)}
|
||||
{showMaxTokenNumber && (
|
||||
<>
|
||||
@ -359,9 +363,6 @@ export function ChunkMethodDialog({
|
||||
}
|
||||
className="space-y-3"
|
||||
>
|
||||
{isMineruSelected && (
|
||||
<LayoutRecognizeFormField showMineruOptions />
|
||||
)}
|
||||
{selectedTag === DocumentParserType.Naive && (
|
||||
<EnableTocToggle />
|
||||
)}
|
||||
|
||||
Reference in New Issue
Block a user