From f096917eebc4c53959a37363822c0503e2233231 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Tue, 27 Jan 2026 12:43:01 +0800 Subject: [PATCH] Fix: overlap cannot be properly applied (#12828) ### What problem does this PR solve? Chunk overlap was not applied properly: `rag/app/naive.py` did not pass `overlapped_percent` to `naive_merge` / `naive_merge_with_images` outside the Markdown path, and `rag/flow/splitter/splitter.py` passed the raw parameter without normalizing it. This PR adds `normalize_overlapped_percent` (values between 0 and 1 are scaled to a percentage, and the result is clamped to 0-90) and uses it in both modules. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- common/float_utils.py | 16 ++++++++++++++-- rag/app/naive.py | 8 ++++---- rag/flow/splitter/splitter.py | 6 ++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/common/float_utils.py b/common/float_utils.py index 74db3b1cf..d7ef42fbb 100644 --- a/common/float_utils.py +++ b/common/float_utils.py @@ -14,6 +14,7 @@ # limitations under the License. # + def get_float(v): """ Convert a value to float, handling None and exceptions gracefully. @@ -39,8 +40,19 @@ def get_float(v): 42.0 """ if v is None: - return float('-inf') + return float("-inf") try: return float(v) except Exception: - return float('-inf') \ No newline at end of file + return float("-inf") + + +def normalize_overlapped_percent(overlapped_percent): + try: + value = float(overlapped_percent) + except (TypeError, ValueError): + return 0 + if 0 < value < 1: + value *= 100 + value = int(value) + return max(0, min(value, 90)) diff --git a/rag/app/naive.py b/rag/app/naive.py index b793b9fdc..6c49d53bf 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -38,6 +38,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parse from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser +from common.float_utils import normalize_overlapped_percent from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import ( concat_img, @@ -983,12 +984,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, 
docx, txt supported)") st = timer() + overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0)) if is_markdown: merged_chunks = [] merged_images = [] chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128))) - overlapped_percent = int(parser_config.get("overlapped_percent", 0)) - overlapped_percent = max(0, min(overlapped_percent, 90)) current_text = "" current_tokens = 0 @@ -1037,10 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca section_images = None if section_images: - chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?")) + chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent) res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli)) else: - chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?")) + chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent) res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli)) diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py index 4fc4e544c..309968117 100644 --- a/rag/flow/splitter/splitter.py +++ b/rag/flow/splitter/splitter.py @@ -23,6 +23,7 @@ from rag.utils.base64_image import id2image, image2id from deepdoc.parser.pdf_parser import RAGFlowPdfParser from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.splitter.schema import SplitterFromUpstream +from common.float_utils import normalize_overlapped_percent from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images from common import settings @@ -68,6 
+69,7 @@ class Splitter(ProcessBase): self.set_output("output_format", "chunks") self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.") + overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent) if from_upstream.output_format in ["markdown", "text", "html"]: if from_upstream.output_format == "markdown": payload = from_upstream.markdown_result @@ -83,7 +85,7 @@ class Splitter(ProcessBase): payload, self._param.chunk_token_size, deli, - self._param.overlapped_percent, + overlapped_percent, ) if custom_pattern: docs = [] @@ -129,7 +131,7 @@ class Splitter(ProcessBase): section_images, self._param.chunk_token_size, deli, - self._param.overlapped_percent, + overlapped_percent, ) cks = [ {