Fix error and format issue (#11975)

### What problem does this PR solve?

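1. Fix an error in book chunking: `chunk_token_num` and `delimiter` are now read from `parser_config` instead of `kwargs` (the old lookup also misspelled the key as `delimer`).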
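2. Fix formatting and naming issues (e.g. collapse multi-line `raise ValueError(...)` calls, rename `descr` to `description`, and correct the `retrival` typo to `retrieval`).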

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-12-16 19:29:37 +08:00
committed by GitHub
parent 344a106eba
commit 0e8b9588ba
9 changed files with 51 additions and 84 deletions

View File

@ -105,10 +105,10 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt update && \ apt update && \
arch="$(uname -m)"; \ arch="$(uname -m)"; \
if [ "$arch" = "arm64" ] || [ "$arch" = "aarch64" ]; then \ if [ "$arch" = "arm64" ] || [ "$arch" = "aarch64" ]; then \
# ARM64 (macOS/Apple Silicon or Linux aarch64) # ARM64 (macOS/Apple Silicon or Linux aarch64) \
ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql18; \ ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql18; \
else \ else \
# x86_64 or others # x86_64 or others \
ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql17; \ ACCEPT_EULA=Y apt install -y unixodbc-dev msodbcsql17; \
fi || \ fi || \
{ echo "Failed to install ODBC driver"; exit 1; } { echo "Failed to install ODBC driver"; exit 1; }

View File

@ -13,6 +13,3 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
# from beartype.claw import beartype_this_package
# beartype_this_package()

View File

@ -27,7 +27,6 @@ import pandas as pd
from agent import settings from agent import settings
from common.connection_utils import timeout from common.connection_utils import timeout
_FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params" _FEEDED_DEPRECATED_PARAMS = "_feeded_deprecated_params"
_DEPRECATED_PARAMS = "_deprecated_params" _DEPRECATED_PARAMS = "_deprecated_params"
_USER_FEEDED_PARAMS = "_user_feeded_params" _USER_FEEDED_PARAMS = "_user_feeded_params"
@ -253,96 +252,65 @@ class ComponentParamBase(ABC):
self._validate_param(attr, validation_json) self._validate_param(attr, validation_json)
@staticmethod @staticmethod
def check_string(param, descr): def check_string(param, description):
if type(param).__name__ not in ["str"]: if type(param).__name__ not in ["str"]:
raise ValueError( raise ValueError(description + " {} not supported, should be string type".format(param))
descr + " {} not supported, should be string type".format(param)
)
@staticmethod @staticmethod
def check_empty(param, descr): def check_empty(param, description):
if not param: if not param:
raise ValueError( raise ValueError(description + " does not support empty value.")
descr + " does not support empty value."
)
@staticmethod @staticmethod
def check_positive_integer(param, descr): def check_positive_integer(param, description):
if type(param).__name__ not in ["int", "long"] or param <= 0: if type(param).__name__ not in ["int", "long"] or param <= 0:
raise ValueError( raise ValueError(description + " {} not supported, should be positive integer".format(param))
descr + " {} not supported, should be positive integer".format(param)
)
@staticmethod @staticmethod
def check_positive_number(param, descr): def check_positive_number(param, description):
if type(param).__name__ not in ["float", "int", "long"] or param <= 0: if type(param).__name__ not in ["float", "int", "long"] or param <= 0:
raise ValueError( raise ValueError(description + " {} not supported, should be positive numeric".format(param))
descr + " {} not supported, should be positive numeric".format(param)
)
@staticmethod @staticmethod
def check_nonnegative_number(param, descr): def check_nonnegative_number(param, description):
if type(param).__name__ not in ["float", "int", "long"] or param < 0: if type(param).__name__ not in ["float", "int", "long"] or param < 0:
raise ValueError( raise ValueError(description + " {} not supported, should be non-negative numeric".format(param))
descr
+ " {} not supported, should be non-negative numeric".format(param)
)
@staticmethod @staticmethod
def check_decimal_float(param, descr): def check_decimal_float(param, description):
if type(param).__name__ not in ["float", "int"] or param < 0 or param > 1: if type(param).__name__ not in ["float", "int"] or param < 0 or param > 1:
raise ValueError( raise ValueError(description + " {} not supported, should be a float number in range [0, 1]".format(param))
descr
+ " {} not supported, should be a float number in range [0, 1]".format(
param
)
)
@staticmethod @staticmethod
def check_boolean(param, descr): def check_boolean(param, description):
if type(param).__name__ != "bool": if type(param).__name__ != "bool":
raise ValueError( raise ValueError(description + " {} not supported, should be bool type".format(param))
descr + " {} not supported, should be bool type".format(param)
)
@staticmethod @staticmethod
def check_open_unit_interval(param, descr): def check_open_unit_interval(param, description):
if type(param).__name__ not in ["float"] or param <= 0 or param >= 1: if type(param).__name__ not in ["float"] or param <= 0 or param >= 1:
raise ValueError( raise ValueError(description + " should be a numeric number between 0 and 1 exclusively")
descr + " should be a numeric number between 0 and 1 exclusively"
)
@staticmethod @staticmethod
def check_valid_value(param, descr, valid_values): def check_valid_value(param, description, valid_values):
if param not in valid_values: if param not in valid_values:
raise ValueError( raise ValueError(description + " {} is not supported, it should be in {}".format(param, valid_values))
descr
+ " {} is not supported, it should be in {}".format(param, valid_values)
)
@staticmethod @staticmethod
def check_defined_type(param, descr, types): def check_defined_type(param, description, types):
if type(param).__name__ not in types: if type(param).__name__ not in types:
raise ValueError( raise ValueError(description + " {} not supported, should be one of {}".format(param, types))
descr + " {} not supported, should be one of {}".format(param, types)
)
@staticmethod @staticmethod
def check_and_change_lower(param, valid_list, descr=""): def check_and_change_lower(param, valid_list, description=""):
if type(param).__name__ != "str": if type(param).__name__ != "str":
raise ValueError( raise ValueError(description + " {} not supported, should be one of {}".format(param, valid_list))
descr
+ " {} not supported, should be one of {}".format(param, valid_list)
)
lower_param = param.lower() lower_param = param.lower()
if lower_param in valid_list: if lower_param in valid_list:
return lower_param return lower_param
else: else:
raise ValueError( raise ValueError(description + " {} not supported, should be one of {}".format(param, valid_list))
descr
+ " {} not supported, should be one of {}".format(param, valid_list)
)
@staticmethod @staticmethod
def _greater_equal_than(value, limit): def _greater_equal_than(value, limit):
@ -374,16 +342,16 @@ class ComponentParamBase(ABC):
def _not_in(value, wrong_value_list): def _not_in(value, wrong_value_list):
return value not in wrong_value_list return value not in wrong_value_list
def _warn_deprecated_param(self, param_name, descr): def _warn_deprecated_param(self, param_name, description):
if self._deprecated_params_set.get(param_name): if self._deprecated_params_set.get(param_name):
logging.warning( logging.warning(
f"{descr} {param_name} is deprecated and ignored in this version." f"{description} {param_name} is deprecated and ignored in this version."
) )
def _warn_to_deprecate_param(self, param_name, descr, new_param): def _warn_to_deprecate_param(self, param_name, description, new_param):
if self._deprecated_params_set.get(param_name): if self._deprecated_params_set.get(param_name):
logging.warning( logging.warning(
f"{descr} {param_name} will be deprecated in future release; " f"{description} {param_name} will be deprecated in future release; "
f"please use {new_param} instead." f"please use {new_param} instead."
) )
return True return True
@ -532,7 +500,7 @@ class ComponentBase(ABC):
res[exp] = { res[exp] = {
"name": (self._canvas.get_component_name(cpn_id) + f"@{var_nm}") if cpn_id else exp, "name": (self._canvas.get_component_name(cpn_id) + f"@{var_nm}") if cpn_id else exp,
"value": self._canvas.get_variable_value(exp), "value": self._canvas.get_variable_value(exp),
"_retrival": self._canvas.get_variable_value(f"{cpn_id}@_references") if cpn_id else None, "_retrieval": self._canvas.get_variable_value(f"{cpn_id}@_references") if cpn_id else None,
"_cpn_id": cpn_id "_cpn_id": cpn_id
} }
return res return res
@ -583,6 +551,7 @@ class ComponentBase(ABC):
for n, v in kv.items(): for n, v in kv.items():
def repl(_match, val=v): def repl(_match, val=v):
return str(val) if val is not None else "" return str(val) if val is not None else ""
content = re.sub( content = re.sub(
r"\{%s\}" % re.escape(n), r"\{%s\}" % re.escape(n),
repl, repl,

View File

@ -75,7 +75,7 @@
}, },
"history": [], "history": [],
"path": [], "path": [],
"retrival": {"chunks": [], "doc_aggs": []}, "retrieval": {"chunks": [], "doc_aggs": []},
"globals": { "globals": {
"sys.query": "", "sys.query": "",
"sys.user_id": "", "sys.user_id": "",

View File

@ -82,7 +82,7 @@
}, },
"history": [], "history": [],
"path": [], "path": [],
"retrival": {"chunks": [], "doc_aggs": []}, "retrieval": {"chunks": [], "doc_aggs": []},
"globals": { "globals": {
"sys.query": "", "sys.query": "",
"sys.user_id": "", "sys.user_id": "",

View File

@ -51,7 +51,7 @@
}, },
"history": [], "history": [],
"path": [], "path": [],
"retrival": {"chunks": [], "doc_aggs": []}, "retrieval": {"chunks": [], "doc_aggs": []},
"globals": { "globals": {
"sys.query": "", "sys.query": "",
"sys.user_id": "", "sys.user_id": "",

View File

@ -85,7 +85,7 @@
}, },
"history": [], "history": [],
"path": [], "path": [],
"retrival": {"chunks": [], "doc_aggs": []}, "retrieval": {"chunks": [], "doc_aggs": []},
"globals": { "globals": {
"sys.query": "", "sys.query": "",
"sys.user_id": "", "sys.user_id": "",

View File

@ -45,7 +45,7 @@
}, },
"history": [], "history": [],
"path": [], "path": [],
"retrival": {"chunks": [], "doc_aggs": []}, "retrieval": {"chunks": [], "doc_aggs": []},
"globals": { "globals": {
"sys.query": "", "sys.query": "",
"sys.user_id": "", "sys.user_id": "",

View File

@ -166,9 +166,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [s.split("@") for s, _ in sections] sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ] sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]
chunks = naive_merge( chunks = naive_merge(
sections, kwargs.get( sections,
"chunk_token_num", 256), kwargs.get( parser_config.get("chunk_token_num", 256),
"delimer", "\n。;!?")) parser_config.get("delimiter", "\n。;!?")
)
# is it English # is it English
# is_english(random_choices([t for t, _ in sections], k=218)) # is_english(random_choices([t for t, _ in sections], k=218))