From 1033a3ae268367dbd2715ddc522ff6ee9a05064c Mon Sep 17 00:00:00 2001 From: FallingSnowFlake Date: Fri, 21 Nov 2025 14:33:29 +0800 Subject: [PATCH] Fix: improve PDF text type detection by expanding regex content (#11432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add whitespace validation to the PDF English text checking regex - Reduce false negatives in English PDF content recognition ### What problem does this PR solve? The core idea is to **expand the regex content used for English text detection** so it can accommodate more valid characters commonly found in English PDFs. The modifications include: - Adding support for **space** in the regex. - Ensuring the update does not reduce existing detection accuracy. ### Type of change - [✅] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 6d8431c82..f6613c2f5 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1091,7 +1091,7 @@ class RAGFlowPdfParser: logging.debug("Images converted.") self.is_english = [ - re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) + re.search(r"[ a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars)) ] if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2: @@ -1148,7 +1148,7 @@ class RAGFlowPdfParser: if not self.is_english and not any([c for c in self.page_chars]) and self.boxes: bxes = [b for bxs in self.boxes for b in bxs] - self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) + self.is_english = re.search(r"[ \na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])) logging.debug(f"Is it English: {self.is_english}")