From d8192f8f1736e2a01db50e68d947a1f3b74e4a1c Mon Sep 17 00:00:00 2001 From: Pegasus <42954461+leonace924@users.noreply.github.com> Date: Thu, 15 Jan 2026 01:24:51 -0500 Subject: [PATCH] Fix: validate regex pattern in split_with_pattern to prevent crash (#12633) ### What problem does this PR solve? Fix regex pattern validation in split_with_pattern (#12605) - Add a try-except block to validate user-provided regex patterns before use - Gracefully fall back to a single chunk when an invalid regex is provided - Prevent server crash during DOCX parsing with malformed delimiters ## Problem Parsing DOCX files with custom regex delimiters crashes with `re.error: nothing to repeat at position 9` when users provide invalid regex patterns. Closes #12605 ## Solution Validate and compile the regex pattern before use. On an invalid pattern, log a warning and return the content as a single chunk instead of crashing. ## Changes - `rag/nlp/__init__.py`: Add regex validation in `split_with_pattern()` function ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Contribution by Gittensor; see my contribution statistics at https://gittensor.io/miners/details?githubId=42954461 --- rag/nlp/__init__.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index e4cefd993..bc033dac4 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -275,7 +275,18 @@ def tokenize(d, txt, eng): def split_with_pattern(d, pattern: str, content: str, eng) -> list: docs = [] - txts = [txt for txt in re.split(r"(%s)" % pattern, content, flags=re.DOTALL)] + + # Validate and compile regex pattern before use + try: + compiled_pattern = re.compile(r"(%s)" % pattern, flags=re.DOTALL) + except re.error as e: + logging.warning(f"Invalid delimiter regex pattern '{pattern}': {e}. 
Falling back to no split.") + # Fallback: return content as single chunk + dd = copy.deepcopy(d) + tokenize(dd, content, eng) + return [dd] + + txts = [txt for txt in compiled_pattern.split(content)] for j in range(0, len(txts), 2): txt = txts[j] if not txt: