mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-31 23:55:06 +08:00
Refactor: rename rmSpace to remove_redundant_spaces (#10796)
### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -24,7 +24,7 @@ from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from deepdoc.vision import OCR
|
||||
from rag.nlp import rag_tokenizer, tokenize
|
||||
from rag.utils import clean_markdown_block
|
||||
from common.string_utils import clean_markdown_block
|
||||
|
||||
ocr = OCR()
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ from docx import Document
|
||||
from PIL import Image
|
||||
from markdown import markdown
|
||||
|
||||
from rag.utils import get_float
|
||||
from common.float_utils import get_float
|
||||
|
||||
|
||||
class Excel(ExcelParser):
|
||||
|
||||
@ -25,7 +25,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser.resume import refactor
|
||||
from deepdoc.parser.resume import step_one, step_two
|
||||
from rag.utils import rmSpace
|
||||
from common.string_utils import remove_redundant_spaces
|
||||
|
||||
forbidden_select_fields4resume = [
|
||||
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
|
||||
@ -130,7 +130,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
if isinstance(v, list):
|
||||
v = v[0]
|
||||
if n.find("tks") > 0:
|
||||
v = rmSpace(v)
|
||||
v = remove_redundant_spaces(v)
|
||||
titles.append(str(v))
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
@ -145,7 +145,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
if isinstance(v, list):
|
||||
v = " ".join(v)
|
||||
if n.find("tks") > 0:
|
||||
v = rmSpace(v)
|
||||
v = remove_redundant_spaces(v)
|
||||
pairs.append((m, str(v)))
|
||||
|
||||
doc["content_with_weight"] = "\n".join(
|
||||
|
||||
@ -23,10 +23,11 @@ from dataclasses import dataclass
|
||||
|
||||
from rag.prompts.generator import relevant_chunks_with_toc
|
||||
from rag.settings import TAG_FLD, PAGERANK_FLD
|
||||
from rag.utils import rmSpace, get_float
|
||||
from rag.nlp import rag_tokenizer, query
|
||||
import numpy as np
|
||||
from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
|
||||
from common.string_utils import remove_redundant_spaces
|
||||
from common.float_utils import get_float
|
||||
|
||||
|
||||
def index_name(uid):
    """Return the index name for *uid*: the string ``ragflow_`` followed by *uid*.

    *uid* is interpolated with an f-string, so any value with a string
    representation is accepted.
    """
    name = f"ragflow_{uid}"
    return name
|
||||
@ -342,7 +343,7 @@ class Dealer:
|
||||
ins_tw.append(tks)
|
||||
|
||||
tksim = self.qryr.token_similarity(keywords, ins_tw)
|
||||
vtsim, _ = rerank_mdl.similarity(query, [rmSpace(" ".join(tks)) for tks in ins_tw])
|
||||
vtsim, _ = rerank_mdl.similarity(query, [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw])
|
||||
## For rank feature(tag_fea) scores.
|
||||
rank_fea = self._rank_feature_scores(rank_feature, sres)
|
||||
|
||||
@ -442,7 +443,7 @@ class Dealer:
|
||||
}
|
||||
if highlight and sres.highlight:
|
||||
if id in sres.highlight:
|
||||
d["highlight"] = rmSpace(sres.highlight[id])
|
||||
d["highlight"] = remove_redundant_spaces(sres.highlight[id])
|
||||
else:
|
||||
d["highlight"] = d["content_with_weight"]
|
||||
ranks["chunks"].append(d)
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
#
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
import tiktoken
|
||||
|
||||
@ -33,48 +32,6 @@ def singleton(cls, *args, **kw):
|
||||
|
||||
return _singleton
|
||||
|
||||
|
||||
def rmSpace(txt):
    """Remove runs of spaces that are not needed between two characters.

    Two case-insensitive regex passes are applied in order:

    1. A run of spaces is dropped when the character before it is NOT an
       ASCII letter, a digit, or one of ``. , ) >`` and some non-space
       character follows.
    2. A run of spaces is dropped when the character after it is NOT an
       ASCII letter, a digit, or one of ``. , ( <`` and some non-space
       character precedes.

    Spaces between two alphanumeric words are therefore kept (a longer run
    collapses to one space, because a space itself can act as the "before"
    character in pass 1).

    Args:
        txt: The string to clean.

    Returns:
        The string with the redundant spaces removed.
    """
    space_after_non_word = r"([^a-z0-9.,\)>]) +([^ ])"
    space_before_non_word = r"([^ ]) +([^a-z0-9.,\(<])"
    # Both passes glue the two captured neighbours together.
    glued = r"\1\2"
    for pattern in (space_after_non_word, space_before_non_word):
        txt = re.sub(pattern, glued, txt, flags=re.IGNORECASE)
    return txt
|
||||
|
||||
|
||||
def findMaxDt(fnm):
    """Return the greatest line of the text file *fnm*, compared as strings.

    The running maximum starts at ``"1970-01-01 00:00:00"``. Each line has
    its trailing newline stripped; lines equal to ``'nan'`` are ignored and
    any line that compares greater (plain string comparison) replaces the
    maximum.

    This is best-effort: any exception raised while opening or reading the
    file is swallowed and the maximum found so far is returned.

    Args:
        fnm: Path of the file to scan.

    Returns:
        The largest line seen, or the 1970 default if none exceeded it.
    """
    latest = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as f:
            for raw in f:
                entry = raw.strip("\n")
                if entry != 'nan' and entry > latest:
                    latest = entry
    except Exception:
        # Deliberately tolerant: a missing or unreadable file yields the
        # value gathered so far.
        pass
    return latest
|
||||
|
||||
|
||||
def findMaxTm(fnm):
    """Return the largest integer found among the lines of the text file *fnm*.

    The running maximum starts at ``0``. Each line has its trailing newline
    stripped; lines equal to ``'nan'`` are ignored. Lines that cannot be
    parsed by ``int()`` (for example blank lines) are skipped individually,
    so one malformed line no longer stops the scan and hides larger values
    that follow it.

    This is best-effort: any exception raised while opening or reading the
    file is swallowed and the maximum found so far is returned.

    Args:
        fnm: Path of the file to scan.

    Returns:
        The largest integer seen, or ``0`` if none exceeded it.
    """
    largest = 0
    try:
        with open(fnm, "r") as f:
            for raw in f:
                entry = raw.strip("\n")
                if entry == 'nan':
                    continue
                try:
                    value = int(entry)
                except ValueError:
                    # Skip just this line; keep scanning the rest.
                    continue
                if value > largest:
                    largest = value
    except Exception:
        # Deliberately tolerant: a missing or unreadable file yields the
        # value gathered so far.
        pass
    return largest
|
||||
|
||||
|
||||
tiktoken_cache_dir = get_project_base_directory()
|
||||
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
|
||||
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
||||
@ -113,18 +70,4 @@ def truncate(string: str, max_len: int) -> str:
|
||||
"""Returns truncated text if the length of text exceed max_len."""
|
||||
return encoder.decode(encoder.encode(string)[:max_len])
|
||||
|
||||
|
||||
def clean_markdown_block(text):
    """Strip a surrounding markdown code fence from *text*.

    A leading fence tagged ``markdown`` (with any whitespace before it and
    after the tag) is removed from the start of the string, then a closing
    triple-backtick fence (with surrounding whitespace) is removed from the
    end. The result is finally stripped of leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The text without the opening and closing fence.
    """
    fence_patterns = (
        r'^\s*```markdown\s*\n?',  # opening fence at the very start
        r'\n?\s*```\s*$',          # closing fence at the very end
    )
    cleaned = text
    for pattern in fence_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def get_float(v):
    """Convert *v* to ``float``, falling back to negative infinity.

    Args:
        v: The value to convert.

    Returns:
        ``float(v)`` when the conversion succeeds; ``float('-inf')`` when
        *v* is ``None`` or the conversion raises any exception.
    """
    fallback = float('-inf')
    if v is None:
        return fallback
    try:
        converted = float(v)
    except Exception:
        return fallback
    else:
        return converted
|
||||
|
||||
|
||||
@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
|
||||
from elastic_transport import ConnectionTimeout
|
||||
from rag import settings
|
||||
from rag.settings import TAG_FLD, PAGERANK_FLD
|
||||
from rag.utils import singleton, get_float
|
||||
from rag.utils import singleton
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.common import convert_bytes
|
||||
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
|
||||
FusionExpr
|
||||
from rag.nlp import is_english, rag_tokenizer
|
||||
from common.float_utils import get_float
|
||||
|
||||
ATTEMPT_TIME = 2
|
||||
|
||||
@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
|
||||
if not isinstance(v, str):
|
||||
m[n] = str(m[n])
|
||||
# if n.find("tks") > 0:
|
||||
# m[n] = rmSpace(m[n])
|
||||
# m[n] = remove_redundant_spaces(m[n])
|
||||
|
||||
if m:
|
||||
res_fields[d["id"]] = m
|
||||
|
||||
@ -484,7 +484,7 @@ class OSConnection(DocStoreConnection):
|
||||
if not isinstance(v, str):
|
||||
m[n] = str(m[n])
|
||||
# if n.find("tks") > 0:
|
||||
# m[n] = rmSpace(m[n])
|
||||
# m[n] = remove_redundant_spaces(m[n])
|
||||
|
||||
if m:
|
||||
res_fields[d["id"]] = m
|
||||
|
||||
Reference in New Issue
Block a user