From bc9e1e3b9ab5367699aafb17616bf2c316b34ba2 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 26 Dec 2025 18:57:16 +0800 Subject: [PATCH] Fix: parent-children pipeline bad case. (#12246) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 2 +- rag/flow/splitter/splitter.py | 8 ++++---- rag/prompts/meta_data.md | 22 +++++++++++++--------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b704d25ea..5328eae47 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -1206,7 +1206,7 @@ class RAGFlowPdfParser: start = timer() self._text_merge() self._concat_downward() - self._naive_vertical_merge(zoomin) + #self._naive_vertical_merge(zoomin) if callback: callback(0.92, "Text merged ({:.2f}s)".format(timer() - start)) diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py index 0aec023d1..343241ab3 100644 --- a/rag/flow/splitter/splitter.py +++ b/rag/flow/splitter/splitter.py @@ -92,9 +92,9 @@ class Splitter(ProcessBase): continue split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL) if split_sec: - for txt in split_sec: + for j in range(0, len(split_sec), 2): docs.append({ - "text": txt, + "text": split_sec[j], "mom": c }) else: @@ -155,9 +155,9 @@ class Splitter(ProcessBase): split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL) if split_sec: c["mom"] = c["text"] - for txt in split_sec: + for j in range(0, len(split_sec), 2): cc = deepcopy(c) - cc["text"] = txt + cc["text"] = split_sec[j] docs.append(cc) else: docs.append(c) diff --git a/rag/prompts/meta_data.md b/rag/prompts/meta_data.md index 440396df4..cf4dd0e37 100644 --- a/rag/prompts/meta_data.md +++ b/rag/prompts/meta_data.md @@ -1,13 +1,17 @@ -Extract important structured information from the given content. -Output ONLY a valid JSON string with no additional text. 
-If no important structured information is found, output an empty JSON object: {}. +## Role: Metadata extraction expert +## Constraints: + - Core Directive: Extract important structured information from the given content. Output ONLY a valid JSON string. No Markdown (e.g., ```json), no explanations, and no notes. + - Schema Parsing: In the `properties` object provided in Schema, the attribute name (e.g., 'author') is the target Key. Extract values based on the `description`; if no `description` is provided, refer to the key's literal meaning. + - Extraction Rules: Extract only when there is an explicit semantic correlation. If multiple values or data points match a field's definition, extract and include all of them. Strictly follow the Schema below and only output matched key-value pairs. If the content is irrelevant or no matching information is identified, you **MUST** output {}. + - Data Source: Extraction must be based solely on content below. Semantic mapping (synonyms) is allowed, but strictly prohibit hallucinations or fabricated facts. -Important structured information structure as following: +## Enum Rules (Triggered ONLY if an enum list is present): + - Value Lock: All extracted values MUST strictly match the provided enum list. + - Normalization: Map synonyms or variants in the text back to the standard enum value (e.g., "Dec" to "December"). + - Fallback: Output {} if no explicit match or synonym is identified. +## Schema for extraction: {{ schema }} ---------------------------- -The given content as following: - -{{ content }} - +## Content to analyze: +{{ content }} \ No newline at end of file