mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refactor: rename rmSpace to remove_redundant_spaces (#10796)
### What problem does this PR solve?

- Rename `rmSpace` to `remove_redundant_spaces`
- Move `clean_markdown_block` to the common module
- Add unit tests for `remove_redundant_spaces` and `clean_markdown_block`

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -15,7 +15,6 @@
|
||||
#
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
import tiktoken
|
||||
|
||||
@ -33,48 +32,6 @@ def singleton(cls, *args, **kw):
|
||||
|
||||
return _singleton
|
||||
|
||||
|
||||
def rmSpace(txt):
    """Strip runs of spaces that touch a character outside a small ASCII set.

    Two case-insensitive substitutions are applied in order:

    1. Spaces are removed when the character before them is not an ASCII
       letter, a digit, or one of ``. , ) >`` and a non-space follows.
    2. Spaces are removed when the character after them is not an ASCII
       letter, a digit, or one of ``. , ( <`` and a non-space precedes.

    Spaces between two ASCII letters/digits are therefore kept.

    Args:
        txt: The string to process.

    Returns:
        The string after both substitutions.
    """
    # Pass 1: the character in front of the spaces decides the removal.
    after_left_side = re.compile(
        r"([^a-z0-9.,\)>]) +([^ ])", re.IGNORECASE
    ).sub(r"\1\2", txt)

    # Pass 2: the character behind the spaces decides the removal.
    right_side = re.compile(r"([^ ]) +([^a-z0-9.,\(<])", re.IGNORECASE)
    return right_side.sub(r"\1\2", after_left_side)
|
||||
|
||||
|
||||
def findMaxDt(fnm):
    """Return the greatest line of a text file under string comparison.

    The running maximum starts at ``"1970-01-01 00:00:00"``. Each line has
    its newline characters stripped; lines equal to ``'nan'`` are ignored.

    Args:
        fnm: Path of the file to scan.

    Returns:
        The largest line seen, or the starting value when no line compares
        greater. Any exception (e.g. the file cannot be opened) is swallowed
        and the maximum found so far is returned.
    """
    latest = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as handle:
            for raw in handle:
                entry = raw.strip("\n")
                # Plain string comparison: 'nan' placeholders never win.
                if entry != 'nan' and entry > latest:
                    latest = entry
    except Exception:
        # Best effort: fall through with whatever maximum was found.
        pass
    return latest
|
||||
|
||||
|
||||
def findMaxTm(fnm):
    """Return the largest integer found among the lines of a text file.

    The running maximum starts at ``0``. Each line has its newline
    characters stripped; lines equal to ``'nan'`` are ignored. Lines that
    cannot be parsed as an integer (blank lines, stray text) are skipped
    individually, so one malformed line no longer hides larger values that
    appear after it.

    Args:
        fnm: Path of the file to scan.

    Returns:
        The largest integer seen, or ``0`` when none is greater than zero.
        Failures to open or read the file are swallowed and the maximum
        found so far is returned.
    """
    largest = 0
    try:
        with open(fnm, "r") as handle:
            for raw in handle:
                entry = raw.strip("\n")
                if entry == 'nan':
                    continue
                try:
                    value = int(entry)
                except ValueError:
                    # Malformed line: skip it instead of aborting the scan.
                    continue
                if value > largest:
                    largest = value
    except Exception:
        # Deliberate best effort: an unreadable file yields the value so far.
        pass
    return largest
|
||||
|
||||
|
||||
tiktoken_cache_dir = get_project_base_directory()
|
||||
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
|
||||
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
||||
@ -113,18 +70,4 @@ def truncate(string: str, max_len: int) -> str:
|
||||
"""Returns truncated text if the length of text exceed max_len."""
|
||||
return encoder.decode(encoder.encode(string)[:max_len])
|
||||
|
||||
|
||||
def clean_markdown_block(text):
    """Remove a surrounding ```markdown fence from *text*.

    A leading ```markdown marker (with any whitespace around it) and a
    trailing ``` marker (with any whitespace around it) are deleted when
    present; the remainder is returned with outer whitespace stripped.

    Args:
        text: The string that may be wrapped in a markdown code fence.

    Returns:
        The unwrapped, stripped string.
    """
    opening_fence = re.compile(r'^\s*```markdown\s*\n?')
    closing_fence = re.compile(r'\n?\s*```\s*$')

    without_opening = opening_fence.sub('', text)
    return closing_fence.sub('', without_opening).strip()
|
||||
|
||||
|
||||
def get_float(v):
    """Convert *v* to ``float``, falling back to negative infinity.

    Args:
        v: Any value; ``None`` is accepted.

    Returns:
        ``float(v)`` when the conversion succeeds, otherwise
        ``float('-inf')`` (also for ``None``).
    """
    fallback = float('-inf')
    if v is not None:
        try:
            return float(v)
        except Exception:
            # Unconvertible input is treated the same as a missing value.
            return fallback
    return fallback
|
||||
|
||||
|
||||
@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
|
||||
from elastic_transport import ConnectionTimeout
|
||||
from rag import settings
|
||||
from rag.settings import TAG_FLD, PAGERANK_FLD
|
||||
from rag.utils import singleton, get_float
|
||||
from rag.utils import singleton
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.common import convert_bytes
|
||||
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
|
||||
FusionExpr
|
||||
from rag.nlp import is_english, rag_tokenizer
|
||||
from common.float_utils import get_float
|
||||
|
||||
ATTEMPT_TIME = 2
|
||||
|
||||
@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
|
||||
if not isinstance(v, str):
|
||||
m[n] = str(m[n])
|
||||
# if n.find("tks") > 0:
|
||||
# m[n] = rmSpace(m[n])
|
||||
# m[n] = remove_redundant_spaces(m[n])
|
||||
|
||||
if m:
|
||||
res_fields[d["id"]] = m
|
||||
|
||||
@ -484,7 +484,7 @@ class OSConnection(DocStoreConnection):
|
||||
if not isinstance(v, str):
|
||||
m[n] = str(m[n])
|
||||
# if n.find("tks") > 0:
|
||||
# m[n] = rmSpace(m[n])
|
||||
# m[n] = remove_redundant_spaces(m[n])
|
||||
|
||||
if m:
|
||||
res_fields[d["id"]] = m
|
||||
|
||||
Reference in New Issue
Block a user