From f52e56c2d65b536269822862c3ded90fd2b2e54b Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Fri, 31 Oct 2025 16:42:01 +0800 Subject: [PATCH] Remove 'get_lan_ip' and add common misc_utils.py (#10880) ### What problem does this PR solve? Move get_uuid, download_img and hash_str2int out of api/utils/__init__.py into the new common/misc_utils.py, update the callers' imports to use common.misc_utils, remove get_lan_ip from api/utils/__init__.py, and add unit tests for the moved helpers in test/unit_test/common/test_misc_utils.py ### Type of change - [x] Refactoring --------- Signed-off-by: Jin Hai --- admin/server/auth.py | 2 +- agent/canvas.py | 2 +- agent/tools/base.py | 2 +- api/apps/api_app.py | 2 +- api/apps/canvas_app.py | 2 +- api/apps/dialog_app.py | 2 +- api/apps/document_app.py | 2 +- api/apps/file2document_app.py | 2 +- api/apps/file_app.py | 2 +- api/apps/kb_app.py | 2 +- api/apps/mcp_server_app.py | 2 +- api/apps/sdk/agent.py | 2 +- api/apps/sdk/chat.py | 2 +- api/apps/sdk/dataset.py | 2 +- api/apps/sdk/files.py | 2 +- api/apps/sdk/session.py | 2 +- api/apps/search_app.py | 2 +- api/apps/tenant_app.py | 2 +- api/apps/user_app.py | 2 +- api/db/services/canvas_service.py | 2 +- api/db/services/common_service.py | 2 +- api/db/services/conversation_service.py | 2 +- api/db/services/document_service.py | 2 +- api/db/services/file_service.py | 2 +- .../pipeline_operation_log_service.py | 2 +- api/db/services/task_service.py | 2 +- api/db/services/user_service.py | 2 +- api/utils/__init__.py | 58 ---- common/misc_utils.py | 36 +++ graphrag/general/index.py | 2 +- graphrag/search.py | 2 +- graphrag/utils.py | 2 +- rag/benchmark.py | 2 +- .../hierarchical_merger.py | 2 +- rag/flow/parser/parser.py | 2 +- rag/flow/splitter/splitter.py | 2 +- rag/prompts/generator.py | 2 +- rag/utils/tavily_conn.py | 2 +- test/unit_test/common/test_misc_utils.py | 272 ++++++++++++++++++ 39 files changed, 344 insertions(+), 94 deletions(-) create mode 100644 common/misc_utils.py create mode 100644 test/unit_test/common/test_misc_utils.py diff --git a/admin/server/auth.py b/admin/server/auth.py index a6d022a5a..c3046f5a5 100644 --- a/admin/server/auth.py +++ b/admin/server/auth.py @@ -29,7 +29,7 @@ from 
api.db.init_data import encode_to_base64 from api.db.services import UserService from api.db import ActiveEnum, StatusEnum from api.utils.crypt import decrypt -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp, datetime_format, get_format_time from api.utils.api_utils import ( construct_response, diff --git a/agent/canvas.py b/agent/canvas.py index d1cbc48ae..a1c31c17a 100644 --- a/agent/canvas.py +++ b/agent/canvas.py @@ -26,7 +26,7 @@ from typing import Any, Union, Tuple from agent.component import component_class from agent.component.base import ComponentBase from api.db.services.file_service import FileService -from api.utils import get_uuid, hash_str2int +from common.misc_utils import get_uuid, hash_str2int from rag.prompts.generator import chunks_format from rag.utils.redis_conn import REDIS_CONN diff --git a/agent/tools/base.py b/agent/tools/base.py index e775615ac..93bde20aa 100644 --- a/agent/tools/base.py +++ b/agent/tools/base.py @@ -20,7 +20,7 @@ from copy import deepcopy from functools import partial from typing import TypedDict, List, Any from agent.component.base import ComponentParamBase, ComponentBase -from api.utils import hash_str2int +from common.misc_utils import hash_str2int from rag.llm.chat_model import ToolCallSession from rag.prompts.generator import kb_prompt from rag.utils.mcp_tool_call_conn import MCPToolCallSession diff --git a/api/apps/api_app.py b/api/apps/api_app.py index 206dc1581..094593c72 100644 --- a/api/apps/api_app.py +++ b/api/apps/api_app.py @@ -33,7 +33,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import queue_tasks, TaskService from api.db.services.user_service import UserTenantService from api import settings -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, 
validate_request, \ generate_confirmation_token diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py index 7bf3a452f..e0e767401 100644 --- a/api/apps/canvas_app.py +++ b/api/apps/canvas_app.py @@ -35,7 +35,7 @@ from api.db.services.task_service import queue_dataflow, CANVAS_DEBUG_DOC_ID, Ta from api.db.services.user_service import TenantService from api.db.services.user_canvas_version import UserCanvasVersionService from api.settings import RetCode -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result from agent.canvas import Canvas from peewee import MySQLDatabase, PostgresqlDatabase diff --git a/api/apps/dialog_app.py b/api/apps/dialog_app.py index e7f1e068a..a7cb8650f 100644 --- a/api/apps/dialog_app.py +++ b/api/apps/dialog_app.py @@ -24,7 +24,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.user_service import TenantService, UserTenantService from api import settings from api.utils.api_utils import server_error_response, get_data_error_result, validate_request -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_json_result diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 42856bf77..fb6f9fdd4 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -35,7 +35,7 @@ from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks, queue_dataflow from api.db.services.user_service import UserTenantService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import ( get_data_error_result, get_json_result, diff --git a/api/apps/file2document_app.py b/api/apps/file2document_app.py index 862b7e7e0..07bed457b 
100644 --- a/api/apps/file2document_app.py +++ b/api/apps/file2document_app.py @@ -23,7 +23,7 @@ from flask import request from flask_login import login_required, current_user from api.db.services.knowledgebase_service import KnowledgebaseService from api.utils.api_utils import server_error_response, get_data_error_result, validate_request -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.db import FileType from api.db.services.document_service import DocumentService from api import settings diff --git a/api/apps/file_app.py b/api/apps/file_app.py index 7828a82e6..252c57646 100644 --- a/api/apps/file_app.py +++ b/api/apps/file_app.py @@ -26,7 +26,7 @@ from api.common.check_team_permission import check_file_team_permission from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.utils.api_utils import server_error_response, get_data_error_result, validate_request -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.db import FileType, FileSource from api.db.services import duplicate_name from api.db.services.file_service import FileService diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py index 8f177ddb8..2fe46d386 100644 --- a/api/apps/kb_app.py +++ b/api/apps/kb_app.py @@ -31,7 +31,7 @@ from api.db.services.pipeline_operation_log_service import PipelineOperationLogS from api.db.services.task_service import TaskService, GRAPH_RAPTOR_FAKE_DOC_ID from api.db.services.user_service import TenantService, UserTenantService from api.utils.api_utils import get_error_data_result, server_error_response, get_data_error_result, validate_request, not_allowed_parameters -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.db import PipelineTaskType, StatusEnum, FileSource, VALID_FILE_TYPES, VALID_TASK_STATUS from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.db_models import 
File diff --git a/api/apps/mcp_server_app.py b/api/apps/mcp_server_app.py index f9922827b..985da0bb3 100644 --- a/api/apps/mcp_server_app.py +++ b/api/apps/mcp_server_app.py @@ -22,7 +22,7 @@ from api.db.services.mcp_server_service import MCPServerService from api.db.services.user_service import TenantService from api.settings import RetCode -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_data_error_result, get_json_result, server_error_response, validate_request, \ get_mcp_tools from api.utils.web_utils import get_float, safe_json_parse diff --git a/api/apps/sdk/agent.py b/api/apps/sdk/agent.py index b41328365..824a7f0fa 100644 --- a/api/apps/sdk/agent.py +++ b/api/apps/sdk/agent.py @@ -20,7 +20,7 @@ from typing import Any, cast from api.db.services.canvas_service import UserCanvasService from api.db.services.user_canvas_version import UserCanvasVersionService from api.settings import RetCode -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_data_error_result, get_error_data_result, get_json_result, token_required from api.utils.api_utils import get_result from flask import request diff --git a/api/apps/sdk/chat.py b/api/apps/sdk/chat.py index 44cf2f533..02424d6b9 100644 --- a/api/apps/sdk/chat.py +++ b/api/apps/sdk/chat.py @@ -23,7 +23,7 @@ from api.db.services.dialog_service import DialogService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.user_service import TenantService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import check_duplicate_ids, get_error_data_result, get_result, token_required diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index 5a43b6835..f5a04b510 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -28,7 +28,7 @@ from 
api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.user_service import TenantService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import ( deep_merge, get_error_argument_result, diff --git a/api/apps/sdk/files.py b/api/apps/sdk/files.py index d2a3de21f..0f71b3857 100644 --- a/api/apps/sdk/files.py +++ b/api/apps/sdk/files.py @@ -26,7 +26,7 @@ from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService from api.db.services.knowledgebase_service import KnowledgebaseService from api.utils.api_utils import server_error_response, token_required -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.db import FileType from api.db.services import duplicate_name from api.db.services.file_service import FileService diff --git a/api/apps/sdk/session.py b/api/apps/sdk/session.py index 684d00928..4f9aa2c95 100644 --- a/api/apps/sdk/session.py +++ b/api/apps/sdk/session.py @@ -35,7 +35,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api.db.services.search_service import SearchService from api.db.services.user_service import UserTenantService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import check_duplicate_ids, get_data_openai, get_error_data_result, get_json_result, \ get_result, server_error_response, token_required, validate_request from rag.app.tag import label_question diff --git a/api/apps/search_app.py b/api/apps/search_app.py index e0002f816..1aacda4d0 100644 --- a/api/apps/search_app.py +++ b/api/apps/search_app.py @@ -24,7 +24,7 @@ from api.db.db_models import DB from api.db.services import duplicate_name from 
api.db.services.search_service import SearchService from api.db.services.user_service import TenantService, UserTenantService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_data_error_result, get_json_result, not_allowed_parameters, server_error_response, validate_request diff --git a/api/apps/tenant_app.py b/api/apps/tenant_app.py index 6a9351a62..3d0d35779 100644 --- a/api/apps/tenant_app.py +++ b/api/apps/tenant_app.py @@ -23,7 +23,7 @@ from api.db import UserTenantRole, StatusEnum from api.db.db_models import UserTenant from api.db.services.user_service import UserTenantService, UserService -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import delta_seconds from api.utils.api_utils import get_json_result, validate_request, server_error_response, get_data_error_result from api.utils.web_utils import send_invite_email diff --git a/api/apps/user_app.py b/api/apps/user_app.py index 1f5495f42..54f8f47fd 100644 --- a/api/apps/user_app.py +++ b/api/apps/user_app.py @@ -35,7 +35,7 @@ from api.db.services.llm_service import get_init_tenant_llm from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.user_service import TenantService, UserService, UserTenantService from common.time_utils import current_timestamp, datetime_format, get_format_time -from api.utils import download_img, get_uuid +from common.misc_utils import download_img, get_uuid from api.utils.api_utils import ( construct_response, get_data_error_result, diff --git a/api/db/services/canvas_service.py b/api/db/services/canvas_service.py index 4c9da3ac3..6872cd5bd 100644 --- a/api/db/services/canvas_service.py +++ b/api/db/services/canvas_service.py @@ -22,7 +22,7 @@ from api.db import CanvasCategory, TenantPermission from api.db.db_models import DB, CanvasTemplate, User, UserCanvas, API4Conversation from api.db.services.api_service import API4ConversationService from 
api.db.services.common_service import CommonService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import get_data_openai import tiktoken from peewee import fn diff --git a/api/db/services/common_service.py b/api/db/services/common_service.py index 2c26b6f32..8c8a4191e 100644 --- a/api/db/services/common_service.py +++ b/api/db/services/common_service.py @@ -19,7 +19,7 @@ import peewee from peewee import InterfaceError, OperationalError from api.db.db_models import DB -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp, datetime_format def retry_db_operation(func): diff --git a/api/db/services/conversation_service.py b/api/db/services/conversation_service.py index 53913f442..bd7f1d4b8 100644 --- a/api/db/services/conversation_service.py +++ b/api/db/services/conversation_service.py @@ -20,7 +20,7 @@ from api.db.db_models import Conversation, DB from api.db.services.api_service import API4ConversationService from api.db.services.common_service import CommonService from api.db.services.dialog_service import DialogService, chat -from api.utils import get_uuid +from common.misc_utils import get_uuid import json from rag.prompts.generator import chunks_format diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 6814b0968..3ef640633 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -34,7 +34,7 @@ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTena from api.db.db_utils import bulk_insert_into_db from api.db.services.common_service import CommonService from api.db.services.knowledgebase_service import KnowledgebaseService -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp, get_format_time from rag.nlp import rag_tokenizer, search from rag.settings import get_svr_queue_name, 
SVR_CONSUMER_GROUP_NAME diff --git a/api/db/services/file_service.py b/api/db/services/file_service.py index c6b63564f..5723b14b1 100644 --- a/api/db/services/file_service.py +++ b/api/db/services/file_service.py @@ -27,7 +27,7 @@ from api.db.services import duplicate_name from api.db.services.common_service import CommonService from api.db.services.document_service import DocumentService from api.db.services.file2document_service import File2DocumentService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img from rag.llm.cv_model import GptV4 from rag.utils.storage_factory import STORAGE_IMPL diff --git a/api/db/services/pipeline_operation_log_service.py b/api/db/services/pipeline_operation_log_service.py index 3957171b9..01bc32dd1 100644 --- a/api/db/services/pipeline_operation_log_service.py +++ b/api/db/services/pipeline_operation_log_service.py @@ -27,7 +27,7 @@ from api.db.services.common_service import CommonService from api.db.services.document_service import DocumentService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp, datetime_format diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 44a8b7ab4..6ed0fd0e3 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -27,7 +27,7 @@ from api.db import StatusEnum, FileType, TaskStatus from api.db.db_models import Task, Document, Knowledgebase, Tenant from api.db.services.common_service import CommonService from api.db.services.document_service import DocumentService -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp from deepdoc.parser.excel_parser import 
RAGFlowExcelParser from rag.settings import get_svr_queue_name diff --git a/api/db/services/user_service.py b/api/db/services/user_service.py index ef9178aca..54fe4bf27 100644 --- a/api/db/services/user_service.py +++ b/api/db/services/user_service.py @@ -24,7 +24,7 @@ from api.db import UserTenantRole from api.db.db_models import DB, UserTenant from api.db.db_models import User, Tenant from api.db.services.common_service import CommonService -from api.utils import get_uuid +from common.misc_utils import get_uuid from common.time_utils import current_timestamp, datetime_format from api.db import StatusEnum from rag.settings import MINIO diff --git a/api/utils/__init__.py b/api/utils/__init__.py index 8747343cf..e7d561502 100644 --- a/api/utils/__init__.py +++ b/api/utils/__init__.py @@ -13,49 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import base64 -import hashlib -import os -import socket -import uuid -import requests - import importlib -from .common import string_to_bytes - -def get_lan_ip(): - if os.name != "nt": - import fcntl - import struct - - def get_interface_ip(ifname): - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - return socket.inet_ntoa( - fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', string_to_bytes(ifname[:15])))[20:24]) - - ip = socket.gethostbyname(socket.getfqdn()) - if ip.startswith("127.") and os.name != "nt": - interfaces = [ - "bond1", - "eth0", - "eth1", - "eth2", - "wlan0", - "wlan1", - "wifi0", - "ath0", - "ath1", - "ppp0", - ] - for ifname in interfaces: - try: - ip = get_interface_ip(ifname) - break - except IOError: - pass - return ip or '' - def from_dict_hook(in_dict: dict): if "type" in in_dict and "data" in in_dict: @@ -66,20 +25,3 @@ def from_dict_hook(in_dict: dict): in_dict["module"]), in_dict["type"])(**in_dict["data"]) else: return in_dict - - -def get_uuid(): - return uuid.uuid1().hex - - -def download_img(url): - if not url: - return "" - 
response = requests.get(url) - return "data:" + \ - response.headers.get('Content-Type', 'image/jpg') + ";" + \ - "base64," + base64.b64encode(response.content).decode("utf-8") - - -def hash_str2int(line: str, mod: int = 10 ** 8) -> int: - return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod diff --git a/common/misc_utils.py b/common/misc_utils.py new file mode 100644 index 000000000..07594c145 --- /dev/null +++ b/common/misc_utils.py @@ -0,0 +1,36 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import base64 +import hashlib +import uuid +import requests + +def get_uuid(): + return uuid.uuid1().hex + + +def download_img(url): + if not url: + return "" + response = requests.get(url) + return "data:" + \ + response.headers.get('Content-Type', 'image/jpg') + ";" + \ + "base64," + base64.b64encode(response.content).decode("utf-8") + + +def hash_str2int(line: str, mod: int = 10 ** 8) -> int: + return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod \ No newline at end of file diff --git a/graphrag/general/index.py b/graphrag/general/index.py index 7cb47de12..650b511de 100644 --- a/graphrag/general/index.py +++ b/graphrag/general/index.py @@ -22,7 +22,7 @@ import trio from api import settings from api.db.services.document_service import DocumentService -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import timeout from graphrag.entity_resolution import EntityResolution from graphrag.general.community_reports_extractor import CommunityReportsExtractor diff --git a/graphrag/search.py b/graphrag/search.py index 4ce29a675..a415c7610 100644 --- a/graphrag/search.py +++ b/graphrag/search.py @@ -21,7 +21,7 @@ import json_repair import pandas as pd import trio -from api.utils import get_uuid +from common.misc_utils import get_uuid from graphrag.query_analyze_prompt import PROMPTS from graphrag.utils import get_entity_type2samples, get_llm_cache, set_llm_cache, get_relation from rag.utils import num_tokens_from_string diff --git a/graphrag/utils.py b/graphrag/utils.py index 877380d6a..8250dea8c 100644 --- a/graphrag/utils.py +++ b/graphrag/utils.py @@ -24,7 +24,7 @@ import xxhash from networkx.readwrite import json_graph from api import settings -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.api_utils import timeout from rag.nlp import rag_tokenizer, search from rag.utils.doc_store_conn import OrderByExpr diff --git a/rag/benchmark.py b/rag/benchmark.py index 
b73830073..7a9ad1ae2 100644 --- a/rag/benchmark.py +++ b/rag/benchmark.py @@ -24,7 +24,7 @@ from api.db import LLMType from api.db.services.llm_service import LLMBundle from api.db.services.knowledgebase_service import KnowledgebaseService from api import settings -from api.utils import get_uuid +from common.misc_utils import get_uuid from rag.nlp import tokenize, search from ranx import evaluate from ranx import Qrels, Run diff --git a/rag/flow/hierarchical_merger/hierarchical_merger.py b/rag/flow/hierarchical_merger/hierarchical_merger.py index e7b8b9def..69d50c8f1 100644 --- a/rag/flow/hierarchical_merger/hierarchical_merger.py +++ b/rag/flow/hierarchical_merger/hierarchical_merger.py @@ -20,7 +20,7 @@ from functools import partial import trio -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.base64_image import id2image, image2id from deepdoc.parser.pdf_parser import RAGFlowPdfParser from rag.flow.base import ProcessBase, ProcessParamBase diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 836481f1e..6e55a629e 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -26,7 +26,7 @@ from api.db import LLMType from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.llm_service import LLMBundle -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.base64_image import image2id from deepdoc.parser import ExcelParser from deepdoc.parser.mineru_parser import MinerUParser diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py index 24f62b6f1..da0ce8b91 100644 --- a/rag/flow/splitter/splitter.py +++ b/rag/flow/splitter/splitter.py @@ -17,7 +17,7 @@ from functools import partial import trio -from api.utils import get_uuid +from common.misc_utils import get_uuid from api.utils.base64_image import id2image, image2id from deepdoc.parser.pdf_parser import 
RAGFlowPdfParser from rag.flow.base import ProcessBase, ProcessParamBase diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index c7fef07ff..f5680540b 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -22,7 +22,7 @@ from typing import Tuple import jinja2 import json_repair import trio -from api.utils import hash_str2int +from common.misc_utils import hash_str2int from rag.nlp import rag_tokenizer from rag.prompts.template import load_prompt from rag.settings import TAG_FLD diff --git a/rag/utils/tavily_conn.py b/rag/utils/tavily_conn.py index c8eaf4ae9..d57271716 100644 --- a/rag/utils/tavily_conn.py +++ b/rag/utils/tavily_conn.py @@ -15,7 +15,7 @@ # import logging from tavily import TavilyClient -from api.utils import get_uuid +from common.misc_utils import get_uuid from rag.nlp import rag_tokenizer diff --git a/test/unit_test/common/test_misc_utils.py b/test/unit_test/common/test_misc_utils.py new file mode 100644 index 000000000..61d1d8dbd --- /dev/null +++ b/test/unit_test/common/test_misc_utils.py @@ -0,0 +1,272 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import uuid +import hashlib +from common.misc_utils import get_uuid, download_img, hash_str2int + + +class TestGetUuid: + """Test cases for get_uuid function""" + + def test_returns_string(self): + """Test that function returns a string""" + result = get_uuid() + assert isinstance(result, str) + + def test_hex_format(self): + """Test that returned string is in hex format""" + result = get_uuid() + # UUID v1 hex should be 32 characters (without dashes) + assert len(result) == 32 + # Should only contain hexadecimal characters + assert all(c in '0123456789abcdef' for c in result) + + def test_no_dashes_in_result(self): + """Test that result contains no dashes""" + result = get_uuid() + assert '-' not in result + + def test_unique_results(self): + """Test that multiple calls return different UUIDs""" + results = [get_uuid() for _ in range(10)] + + # All results should be unique + assert len(results) == len(set(results)) + + # All should be valid hex strings of correct length + for result in results: + assert len(result) == 32 + assert all(c in '0123456789abcdef' for c in result) + + def test_valid_uuid_structure(self): + """Test that the hex string can be converted back to UUID""" + result = get_uuid() + + # Should be able to create UUID from the hex string + reconstructed_uuid = uuid.UUID(hex=result) + assert isinstance(reconstructed_uuid, uuid.UUID) + + # The hex representation should match the original + assert reconstructed_uuid.hex == result + + def test_uuid1_specific_characteristics(self): + """Test that UUID v1 characteristics are present""" + result = get_uuid() + uuid_obj = uuid.UUID(hex=result) + + # UUID v1 should have version 1 + assert uuid_obj.version == 1 + + # Variant should be RFC 4122 + assert uuid_obj.variant == 'specified in RFC 4122' + + def test_result_length_consistency(self): + """Test that all generated UUIDs have consistent length""" + for _ in range(100): + result = get_uuid() + assert len(result) == 32 + + def 
test_hex_characters_only(self): + """Test that only valid hex characters are used""" + for _ in range(100): + result = get_uuid() + # Should only contain lowercase hex characters (UUID hex is lowercase) + assert result.islower() + assert all(c in '0123456789abcdef' for c in result) + + +class TestDownloadImg: + """Test cases for download_img function""" + + def test_empty_url_returns_empty_string(self): + """Test that empty URL returns empty string""" + result = download_img("") + assert result == "" + + def test_none_url_returns_empty_string(self): + """Test that None URL returns empty string""" + result = download_img(None) + assert result == "" + + +class TestHashStr2Int: + """Test cases for hash_str2int function""" + + def test_basic_hashing(self): + """Test basic string hashing functionality""" + result = hash_str2int("hello") + assert isinstance(result, int) + assert 0 <= result < 10 ** 8 + + def test_default_mod_value(self): + """Test that default mod value is 10^8""" + result = hash_str2int("test") + assert 0 <= result < 10 ** 8 + + def test_custom_mod_value(self): + """Test with custom mod value""" + result = hash_str2int("test", mod=1000) + assert isinstance(result, int) + assert 0 <= result < 1000 + + def test_same_input_same_output(self): + """Test that same input produces same output""" + result1 = hash_str2int("consistent") + result2 = hash_str2int("consistent") + result3 = hash_str2int("consistent") + + assert result1 == result2 == result3 + + def test_different_input_different_output(self): + """Test that different inputs produce different outputs (usually)""" + result1 = hash_str2int("hello") + result2 = hash_str2int("world") + result3 = hash_str2int("hello world") + + # While hash collisions are possible, they're very unlikely for these inputs + results = [result1, result2, result3] + assert len(set(results)) == len(results) + + def test_empty_string(self): + """Test hashing empty string""" + result = hash_str2int("") + assert isinstance(result, 
int) + assert 0 <= result < 10 ** 8 + + def test_unicode_string(self): + """Test hashing unicode strings""" + test_strings = [ + "中文", + "🚀火箭", + "café", + "🎉", + "Hello 世界" + ] + + for test_str in test_strings: + result = hash_str2int(test_str) + assert isinstance(result, int) + assert 0 <= result < 10 ** 8 + + def test_special_characters(self): + """Test hashing strings with special characters""" + test_strings = [ + "hello@world.com", + "test#123", + "line\nwith\nnewlines", + "tab\tcharacter", + "space in string" + ] + + for test_str in test_strings: + result = hash_str2int(test_str) + assert isinstance(result, int) + assert 0 <= result < 10 ** 8 + + def test_large_string(self): + """Test hashing large string""" + large_string = "x" * 10000 + result = hash_str2int(large_string) + assert isinstance(result, int) + assert 0 <= result < 10 ** 8 + + def test_mod_value_1(self): + """Test with mod value 1 (should always return 0)""" + result = hash_str2int("any string", mod=1) + assert result == 0 + + def test_mod_value_2(self): + """Test with mod value 2 (should return 0 or 1)""" + result = hash_str2int("test", mod=2) + assert result in [0, 1] + + def test_very_large_mod(self): + """Test with very large mod value""" + result = hash_str2int("test", mod=10 ** 12) + assert isinstance(result, int) + assert 0 <= result < 10 ** 12 + + def test_hash_algorithm_sha1(self): + """Test that SHA1 algorithm is used""" + test_string = "hello" + expected_hash = hashlib.sha1(test_string.encode("utf-8")).hexdigest() + expected_int = int(expected_hash, 16) % (10 ** 8) + + result = hash_str2int(test_string) + assert result == expected_int + + def test_utf8_encoding(self): + """Test that UTF-8 encoding is used""" + # This should work without encoding errors + result = hash_str2int("café 🎉") + assert isinstance(result, int) + + def test_range_with_different_mods(self): + """Test that result is always in correct range for different mod values""" + test_cases = [ + ("test1", 100), + 
("test2", 1000), + ("test3", 10000), + ("test4", 999999), + ] + + for test_str, mod_val in test_cases: + result = hash_str2int(test_str, mod=mod_val) + assert 0 <= result < mod_val + + def test_hexdigest_conversion(self): + """Test the hexdigest to integer conversion""" + test_string = "hello" + hash_obj = hashlib.sha1(test_string.encode("utf-8")) + hex_digest = hash_obj.hexdigest() + expected_int = int(hex_digest, 16) % (10 ** 8) + + result = hash_str2int(test_string) + assert result == expected_int + + def test_consistent_with_direct_calculation(self): + """Test that function matches direct hashlib usage""" + test_strings = ["a", "b", "abc", "hello world", "12345"] + + for test_str in test_strings: + direct_result = int(hashlib.sha1(test_str.encode("utf-8")).hexdigest(), 16) % (10 ** 8) + function_result = hash_str2int(test_str) + assert function_result == direct_result + + def test_numeric_strings(self): + """Test hashing numeric strings""" + test_strings = ["123", "0", "999999", "3.14159", "-42"] + + for test_str in test_strings: + result = hash_str2int(test_str) + assert isinstance(result, int) + assert 0 <= result < 10 ** 8 + + def test_whitespace_strings(self): + """Test hashing strings with various whitespace""" + test_strings = [ + " leading", + "trailing ", + " both ", + "\ttab", + "new\nline", + "\r\nwindows" + ] + + for test_str in test_strings: + result = hash_str2int(test_str) + assert isinstance(result, int) + assert 0 <= result < 10 ** 8