mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-01 16:15:07 +08:00
Fix: overlap cannot be properly applied (#12828)
### What problem does this PR solve? Overlap cannot be properly applied. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -38,6 +38,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parse
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from common.float_utils import normalize_overlapped_percent
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import (
|
||||
concat_img,
|
||||
@ -983,12 +984,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||
|
||||
st = timer()
|
||||
overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0))
|
||||
if is_markdown:
|
||||
merged_chunks = []
|
||||
merged_images = []
|
||||
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
|
||||
overlapped_percent = int(parser_config.get("overlapped_percent", 0))
|
||||
overlapped_percent = max(0, min(overlapped_percent, 90))
|
||||
|
||||
current_text = ""
|
||||
current_tokens = 0
|
||||
@ -1037,10 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
section_images = None
|
||||
|
||||
if section_images:
|
||||
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
||||
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
|
||||
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
||||
else:
|
||||
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
||||
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user