mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-02 08:35:08 +08:00
Fix: overlap cannot be properly applied (#12828)
### What problem does this PR solve?

Overlap cannot be properly applied.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -14,6 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
def get_float(v):
|
def get_float(v):
|
||||||
"""
|
"""
|
||||||
Convert a value to float, handling None and exceptions gracefully.
|
Convert a value to float, handling None and exceptions gracefully.
|
||||||
@ -39,8 +40,19 @@ def get_float(v):
|
|||||||
42.0
|
42.0
|
||||||
"""
|
"""
|
||||||
if v is None:
|
if v is None:
|
||||||
return float('-inf')
|
return float("-inf")
|
||||||
try:
|
try:
|
||||||
return float(v)
|
return float(v)
|
||||||
except Exception:
|
except Exception:
|
||||||
return float('-inf')
|
return float("-inf")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_overlapped_percent(overlapped_percent):
    """
    Normalize a chunk-overlap setting to an integer percentage in [0, 90].

    The input may be given either as a percentage (e.g. ``30`` or ``"30"``)
    or as a ratio strictly between 0 and 1 (e.g. ``0.3``), which is scaled
    to a percentage. Anything that cannot be interpreted as a finite number
    (``None``, non-numeric strings, NaN, infinity) yields ``0``.

    Args:
        overlapped_percent: The raw configured value (number, numeric
            string, or None).

    Returns:
        int: The overlap percentage, truncated toward zero and clamped to
        the range 0..90.
    """
    try:
        value = float(overlapped_percent)
        if 0 < value < 1:
            # Ratios are scaled to percentages. Rounding to a few decimals
            # removes binary floating-point noise (0.29 * 100 evaluates to
            # 28.999999999999996) so truncation below does not drop a
            # whole percent.
            value = round(value * 100, 6)
        # int() raises ValueError for NaN and OverflowError for infinity;
        # both are treated as invalid input, like unparsable values.
        value = int(value)
    except (TypeError, ValueError, OverflowError):
        return 0
    return max(0, min(value, 90))
|
||||||
|
|||||||
@ -38,6 +38,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parse
|
|||||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||||
from deepdoc.parser.docling_parser import DoclingParser
|
from deepdoc.parser.docling_parser import DoclingParser
|
||||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||||
|
from common.float_utils import normalize_overlapped_percent
|
||||||
from common.parser_config_utils import normalize_layout_recognizer
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
from rag.nlp import (
|
from rag.nlp import (
|
||||||
concat_img,
|
concat_img,
|
||||||
@ -983,12 +984,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||||
|
|
||||||
st = timer()
|
st = timer()
|
||||||
|
overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0))
|
||||||
if is_markdown:
|
if is_markdown:
|
||||||
merged_chunks = []
|
merged_chunks = []
|
||||||
merged_images = []
|
merged_images = []
|
||||||
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
|
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
|
||||||
overlapped_percent = int(parser_config.get("overlapped_percent", 0))
|
|
||||||
overlapped_percent = max(0, min(overlapped_percent, 90))
|
|
||||||
|
|
||||||
current_text = ""
|
current_text = ""
|
||||||
current_tokens = 0
|
current_tokens = 0
|
||||||
@ -1037,10 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
section_images = None
|
section_images = None
|
||||||
|
|
||||||
if section_images:
|
if section_images:
|
||||||
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
|
||||||
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
||||||
else:
|
else:
|
||||||
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
|
||||||
|
|
||||||
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
||||||
|
|
||||||
|
|||||||
@ -23,6 +23,7 @@ from rag.utils.base64_image import id2image, image2id
|
|||||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||||
from rag.flow.splitter.schema import SplitterFromUpstream
|
from rag.flow.splitter.schema import SplitterFromUpstream
|
||||||
|
from common.float_utils import normalize_overlapped_percent
|
||||||
from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images
|
from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images
|
||||||
from common import settings
|
from common import settings
|
||||||
|
|
||||||
@ -68,6 +69,7 @@ class Splitter(ProcessBase):
|
|||||||
|
|
||||||
self.set_output("output_format", "chunks")
|
self.set_output("output_format", "chunks")
|
||||||
self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
|
self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
|
||||||
|
overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent)
|
||||||
if from_upstream.output_format in ["markdown", "text", "html"]:
|
if from_upstream.output_format in ["markdown", "text", "html"]:
|
||||||
if from_upstream.output_format == "markdown":
|
if from_upstream.output_format == "markdown":
|
||||||
payload = from_upstream.markdown_result
|
payload = from_upstream.markdown_result
|
||||||
@ -83,7 +85,7 @@ class Splitter(ProcessBase):
|
|||||||
payload,
|
payload,
|
||||||
self._param.chunk_token_size,
|
self._param.chunk_token_size,
|
||||||
deli,
|
deli,
|
||||||
self._param.overlapped_percent,
|
overlapped_percent,
|
||||||
)
|
)
|
||||||
if custom_pattern:
|
if custom_pattern:
|
||||||
docs = []
|
docs = []
|
||||||
@ -129,7 +131,7 @@ class Splitter(ProcessBase):
|
|||||||
section_images,
|
section_images,
|
||||||
self._param.chunk_token_size,
|
self._param.chunk_token_size,
|
||||||
deli,
|
deli,
|
||||||
self._param.overlapped_percent,
|
overlapped_percent,
|
||||||
)
|
)
|
||||||
cks = [
|
cks = [
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user