Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2026-02-01 08:05:07 +08:00 · 2025-10-28 09:46:32 +08:00
parent e59458c36b
commit 766d900a41
17 changed files with 604 additions and 75 deletions
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -24,7 +24,7 @@ from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from deepdoc.vision import OCR
 from rag.nlp import rag_tokenizer, tokenize
-from rag.utils import clean_markdown_block
+from common.string_utils import clean_markdown_block

 ocr = OCR()

--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -30,7 +30,7 @@ from docx import Document
 from PIL import Image
 from markdown import markdown

-from rag.utils import get_float
+from common.float_utils import get_float


 class Excel(ExcelParser):
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -25,7 +25,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from rag.nlp import rag_tokenizer
 from deepdoc.parser.resume import refactor
 from deepdoc.parser.resume import step_one, step_two
-from rag.utils import rmSpace
+from common.string_utils import remove_redundant_spaces

 forbidden_select_fields4resume = [
    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
@@ -130,7 +130,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
        if isinstance(v, list):
            v = v[0]
        if n.find("tks") > 0:
-            v = rmSpace(v)
+            v = remove_redundant_spaces(v)
        titles.append(str(v))
    doc = {
        "docnm_kwd": filename,
@@ -145,7 +145,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
        if isinstance(v, list):
            v = " ".join(v)
        if n.find("tks") > 0:
-            v = rmSpace(v)
+            v = remove_redundant_spaces(v)
        pairs.append((m, str(v)))

    doc["content_with_weight"] = "\n".join(
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -23,10 +23,11 @@ from dataclasses import dataclass

 from rag.prompts.generator import relevant_chunks_with_toc
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import rmSpace, get_float
 from rag.nlp import rag_tokenizer, query
 import numpy as np
 from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
+from common.string_utils import remove_redundant_spaces
+from common.float_utils import get_float


 def index_name(uid): return f"ragflow_{uid}"
@@ -342,7 +343,7 @@ class Dealer:
            ins_tw.append(tks)

        tksim = self.qryr.token_similarity(keywords, ins_tw)
-        vtsim, _ = rerank_mdl.similarity(query, [rmSpace(" ".join(tks)) for tks in ins_tw])
+        vtsim, _ = rerank_mdl.similarity(query, [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw])
        ## For rank feature(tag_fea) scores.
        rank_fea = self._rank_feature_scores(rank_feature, sres)

@@ -442,7 +443,7 @@ class Dealer:
            }
            if highlight and sres.highlight:
                if id in sres.highlight:
-                    d["highlight"] = rmSpace(sres.highlight[id])
+                    d["highlight"] = remove_redundant_spaces(sres.highlight[id])
                else:
                    d["highlight"] = d["content_with_weight"]
            ranks["chunks"].append(d)
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -15,7 +15,6 @@
 #

 import os
-import re

 import tiktoken

@@ -33,48 +32,6 @@ def singleton(cls, *args, **kw):

    return _singleton

-
-def rmSpace(txt):
-    txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
-    return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)
-
-
-def findMaxDt(fnm):
-    m = "1970-01-01 00:00:00"
-    try:
-        with open(fnm, "r") as f:
-            while True:
-                line = f.readline()
-                if not line:
-                    break
-                line = line.strip("\n")
-                if line == 'nan':
-                    continue
-                if line > m:
-                    m = line
-    except Exception:
-        pass
-    return m
-
-
-def findMaxTm(fnm):
-    m = 0
-    try:
-        with open(fnm, "r") as f:
-            while True:
-                line = f.readline()
-                if not line:
-                    break
-                line = line.strip("\n")
-                if line == 'nan':
-                    continue
-                if int(line) > m:
-                    m = int(line)
-    except Exception:
-        pass
-    return m
-
-
 tiktoken_cache_dir = get_project_base_directory()
 os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
 # encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
@@ -113,18 +70,4 @@ def truncate(string: str, max_len: int) -> str:
    """Returns truncated text if the length of text exceed max_len."""
    return encoder.decode(encoder.encode(string)[:max_len])

-  
-def clean_markdown_block(text):
-    text = re.sub(r'^\s*```markdown\s*\n?', '', text)
-    text = re.sub(r'\n?\s*```\s*$', '', text)
-    return text.strip()
-
-  
-def get_float(v):
-    if v is None:
-        return float('-inf')
-    try:
-        return float(v)
-    except Exception:
-        return float('-inf')

--- a/rag/utils/es_conn.py
+++ b/rag/utils/es_conn.py
@@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
 from elastic_transport import ConnectionTimeout
 from rag import settings
 from rag.settings import TAG_FLD, PAGERANK_FLD
-from rag.utils import singleton, get_float
+from rag.utils import singleton
 from api.utils.file_utils import get_project_base_directory
 from api.utils.common import convert_bytes
 from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
    FusionExpr
 from rag.nlp import is_english, rag_tokenizer
+from common.float_utils import get_float

 ATTEMPT_TIME = 2

@@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
                if not isinstance(v, str):
                    m[n] = str(m[n])
                # if n.find("tks") > 0:
-                #     m[n] = rmSpace(m[n])
+                #     m[n] = remove_redundant_spaces(m[n])

            if m:
                res_fields[d["id"]] = m
--- a/rag/utils/opensearch_conn.py
+++ b/rag/utils/opensearch_conn.py
@@ -484,7 +484,7 @@ class OSConnection(DocStoreConnection):
                if not isinstance(v, str):
                    m[n] = str(m[n])
                # if n.find("tks") > 0:
-                #     m[n] = rmSpace(m[n])
+                #     m[n] = remove_redundant_spaces(m[n])

            if m:
                res_fields[d["id"]] = m