From 9e323a93510d12a52e90a72b0ddf338512993092 Mon Sep 17 00:00:00 2001 From: Liu An Date: Thu, 25 Sep 2025 16:47:56 +0800 Subject: [PATCH] =?UTF-8?q?Feat(nlp):=20add=20"=E6=80=8E=E4=B9=88=E5=8A=9E?= =?UTF-8?q?"=20pattern=20to=20question=20word=20removal=20(#10284)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Added "怎么办" to the regex pattern in rmWWW method to improve query cleaning by removing this common question phrase along with other question words. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/nlp/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index b708ff490..68d2d2979 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -56,7 +56,7 @@ class FulltextQueryer: def rmWWW(txt): patts = [ ( - r"是*(什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*", + r"是*(怎么办|什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*", "", ), (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),