From bc9e1e3b9ab5367699aafb17616bf2c316b34ba2 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 26 Dec 2025 18:57:16 +0800 Subject: [PATCH] Fix: parent-children pipeline bad case. (#12246) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 2 +- rag/flow/splitter/splitter.py | 8 ++++---- rag/prompts/meta_data.md | 22 +++++++++++++--------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b704d25ea..5328eae47 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1206,7 +1206,7 @@ class RAGFlowPdfParser: start = timer() self._text_merge() self._concat_downward() - self._naive_vertical_merge(zoomin) + #self._naive_vertical_merge(zoomin) if callback: callback(0.92, "Text merged ({:.2f}s)".format(timer() - start)) diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py index 0aec023d1..343241ab3 100644 --- a/rag/flow/splitter/splitter.py +++ b/rag/flow/splitter/splitter.py @@ -92,9 +92,9 @@ class Splitter(ProcessBase): continue split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL) if split_sec: - for txt in split_sec: + for j in range(0, len(split_sec), 2): docs.append({ - "text": txt, + "text": split_sec[j], "mom": c }) else: @@ -155,9 +155,9 @@ class Splitter(ProcessBase): split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL) if split_sec: c["mom"] = c["text"] - for txt in split_sec: + for j in range(0, len(split_sec), 2): cc = deepcopy(c) - cc["text"] = txt + cc["text"] = split_sec[j] docs.append(cc) else: docs.append(c) diff --git a/rag/prompts/meta_data.md b/rag/prompts/meta_data.md index 440396df4..cf4dd0e37 100644 --- a/rag/prompts/meta_data.md +++ b/rag/prompts/meta_data.md @@ -1,13 +1,17 @@ -Extract important structured information from the given content. -Output ONLY a valid JSON string with no additional text. 
-If no important structured information is found, output an empty JSON object: {}. +## Role: Metadata extraction expert +## Constraints: + - Core Directive: Extract important structured information from the given content. Output ONLY a valid JSON string. No Markdown (e.g., ```json), no explanations, and no notes. + - Schema Parsing: In the `properties` object provided in Schema, the attribute name (e.g., 'author') is the target Key. Extract values based on the `description`; if no `description` is provided, refer to the key's literal meaning. + - Extraction Rules: Extract only when there is an explicit semantic correlation. If multiple values or data points match a field's definition, extract and include all of them. Strictly follow the Schema below and only output matched key-value pairs. If the content is irrelevant or no matching information is identified, you **MUST** output {}. + - Data Source: Extraction must be based solely on content below. Semantic mapping (synonyms) is allowed, but strictly prohibit hallucinations or fabricated facts. -Important structured information structure as following: +## Enum Rules (Triggered ONLY if an enum list is present): + - Value Lock: All extracted values MUST strictly match the provided enum list. + - Normalization: Map synonyms or variants in the text back to the standard enum value (e.g., "Dec" to "December"). + - Fallback: Output {} if no explicit match or synonym is identified. +## Schema for extraction: {{ schema }} ---------------------------- -The given content as following: - -{{ content }} - +## Content to analyze: +{{ content }} \ No newline at end of file