Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- Rename `rmSpace` to `remove_redundant_spaces` and move it to `common.string_utils`
- Move `clean_markdown_block` to `common.string_utils` and `get_float` to `common.float_utils`
- Add unit tests for `remove_redundant_spaces` and `clean_markdown_block`

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-10-28 09:46:32 +08:00
committed by GitHub
parent e59458c36b
commit 766d900a41
17 changed files with 604 additions and 75 deletions

View File

@ -24,7 +24,7 @@ from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import rag_tokenizer, tokenize
from rag.utils import clean_markdown_block
from common.string_utils import clean_markdown_block
ocr = OCR()

View File

@ -30,7 +30,7 @@ from docx import Document
from PIL import Image
from markdown import markdown
from rag.utils import get_float
from common.float_utils import get_float
class Excel(ExcelParser):

View File

@ -25,7 +25,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.utils import rmSpace
from common.string_utils import remove_redundant_spaces
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
@ -130,7 +130,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if isinstance(v, list):
v = v[0]
if n.find("tks") > 0:
v = rmSpace(v)
v = remove_redundant_spaces(v)
titles.append(str(v))
doc = {
"docnm_kwd": filename,
@ -145,7 +145,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if isinstance(v, list):
v = " ".join(v)
if n.find("tks") > 0:
v = rmSpace(v)
v = remove_redundant_spaces(v)
pairs.append((m, str(v)))
doc["content_with_weight"] = "\n".join(

View File

@ -23,10 +23,11 @@ from dataclasses import dataclass
from rag.prompts.generator import relevant_chunks_with_toc
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import rmSpace, get_float
from rag.nlp import rag_tokenizer, query
import numpy as np
from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
from common.string_utils import remove_redundant_spaces
from common.float_utils import get_float
def index_name(uid):
    """Return the index name for *uid*: the literal prefix ``ragflow_`` followed by the formatted *uid*."""
    return f"ragflow_{uid}"
@ -342,7 +343,7 @@ class Dealer:
ins_tw.append(tks)
tksim = self.qryr.token_similarity(keywords, ins_tw)
vtsim, _ = rerank_mdl.similarity(query, [rmSpace(" ".join(tks)) for tks in ins_tw])
vtsim, _ = rerank_mdl.similarity(query, [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw])
## For rank feature(tag_fea) scores.
rank_fea = self._rank_feature_scores(rank_feature, sres)
@ -442,7 +443,7 @@ class Dealer:
}
if highlight and sres.highlight:
if id in sres.highlight:
d["highlight"] = rmSpace(sres.highlight[id])
d["highlight"] = remove_redundant_spaces(sres.highlight[id])
else:
d["highlight"] = d["content_with_weight"]
ranks["chunks"].append(d)

View File

@ -15,7 +15,6 @@
#
import os
import re
import tiktoken
@ -33,48 +32,6 @@ def singleton(cls, *args, **kw):
return _singleton
def rmSpace(txt):
    """Remove runs of spaces that are not needed to separate ASCII-style tokens.

    Two passes are applied in order:

    1. A run of spaces is dropped when the character *before* it is not a
       letter a-z (case-insensitive), a digit, ``.``, ``,``, ``)`` or ``>``.
    2. A run of spaces is dropped when the character *after* it is not a
       letter a-z (case-insensitive), a digit, ``.``, ``,``, ``(`` or ``<``.

    Spaces between two alphanumeric words are therefore kept, while spaces
    adjacent to other characters (e.g. CJK text) are removed.
    """
    # Pass 1: the left neighbour decides whether the spaces are removed.
    left_rule = re.compile(r"([^a-z0-9.,\)>]) +([^ ])", re.IGNORECASE)
    # Pass 2: the right neighbour decides whether the spaces are removed.
    right_rule = re.compile(r"([^ ]) +([^a-z0-9.,\(<])", re.IGNORECASE)

    after_left_pass = left_rule.sub(r"\1\2", txt)
    return right_rule.sub(r"\1\2", after_left_pass)
def findMaxDt(fnm):
    """Return the greatest line (by string comparison) found in the file *fnm*.

    Each line has its trailing newline stripped; lines equal to ``'nan'`` are
    ignored. The starting value is ``"1970-01-01 00:00:00"``, which is also
    what is returned when no line compares greater than it.

    This is best-effort: any exception (for example a missing file) is
    swallowed and the greatest value seen so far is returned.
    """
    latest = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as handle:
            for raw in handle:
                entry = raw.strip("\n")
                # 'nan' would otherwise win the string comparison against digits.
                if entry != 'nan' and entry > latest:
                    latest = entry
    except Exception:
        pass
    return latest
def findMaxTm(fnm):
    """Return the largest integer found among the lines of the file *fnm*.

    Each line has its trailing newline stripped; lines equal to ``'nan'`` are
    ignored and every other line is parsed with ``int()``. The starting value
    is ``0``, so ``0`` is returned when no line exceeds it.

    This is best-effort: any exception (a missing file, a line that is not an
    integer, ...) stops the scan and the largest value seen so far is returned.
    """
    largest = 0
    try:
        with open(fnm, "r") as handle:
            for raw in handle:
                entry = raw.strip("\n")
                if entry == 'nan':
                    continue
                value = int(entry)
                if value > largest:
                    largest = value
    except Exception:
        pass
    return largest
# Export the value returned by get_project_base_directory() through the
# TIKTOKEN_CACHE_DIR environment variable.
# NOTE(review): presumably tiktoken reads this variable to locate its cache - confirm.
tiktoken_cache_dir = get_project_base_directory()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
@ -113,18 +70,4 @@ def truncate(string: str, max_len: int) -> str:
"""Returns truncated text if the length of text exceed max_len."""
return encoder.decode(encoder.encode(string)[:max_len])
def clean_markdown_block(text):
    """Strip a surrounding ```` ```markdown ```` code fence from *text*.

    A leading ```` ```markdown ```` marker (with any whitespace around it) and a
    trailing ```` ``` ```` marker are removed if present, then the result is
    stripped of leading and trailing whitespace. Text without fences is only
    stripped.
    """
    # Drop the opening fence, if the text starts with one.
    without_opening = re.sub(r'^\s*```markdown\s*\n?', '', text)
    # Drop the closing fence, if the text ends with one.
    without_fences = re.sub(r'\n?\s*```\s*$', '', without_opening)
    return without_fences.strip()
def get_float(v):
    """Convert *v* to ``float``, returning negative infinity when that is not possible.

    ``None`` and any value for which ``float(v)`` raises both yield
    ``float('-inf')``.
    """
    fallback = float('-inf')
    if v is None:
        return fallback
    try:
        converted = float(v)
    except Exception:
        return fallback
    return converted

View File

@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import singleton, get_float
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
from api.utils.common import convert_bytes
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
FusionExpr
from rag.nlp import is_english, rag_tokenizer
from common.float_utils import get_float
ATTEMPT_TIME = 2
@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
if not isinstance(v, str):
m[n] = str(m[n])
# if n.find("tks") > 0:
# m[n] = rmSpace(m[n])
# m[n] = remove_redundant_spaces(m[n])
if m:
res_fields[d["id"]] = m

View File

@ -484,7 +484,7 @@ class OSConnection(DocStoreConnection):
if not isinstance(v, str):
m[n] = str(m[n])
# if n.find("tks") > 0:
# m[n] = rmSpace(m[n])
# m[n] = remove_redundant_spaces(m[n])
if m:
res_fields[d["id"]] = m