Files
ragflow/rag/utils/opendal_conn.py
Yingfeng 6c9afd1ffb Potential fix for code scanning alert no. 60: Clear-text logging of sensitive information (#12068)
Potential fix for
[https://github.com/infiniflow/ragflow/security/code-scanning/60](https://github.com/infiniflow/ragflow/security/code-scanning/60)

In general, the correct fix is to ensure that no sensitive data
(passwords, API keys, full connection strings with embedded credentials,
etc.) is ever written to logs. This can be done by (1) whitelisting only
clearly non-sensitive fields for logging, and/or (2) explicitly
scrubbing or masking any value that might contain credentials before
logging, and (3) not relying on later deletion from the dictionary to
protect against logging, since the log call already happened.

For this function, the best minimal fix is:

- Keep the idea of a safe key whitelist, but strengthen it so we are
absolutely sure we never log `password` or `connection_string`, even
indirectly.
- Avoid building the logged dict from the same potentially-tainted
`kwargs` object before we have removed sensitive keys, or relying solely
on key names that might change.
- Construct a separate, small log context that is obviously safe:
scheme, host, port, database, table, and possibly a boolean like
`has_password` instead of the password itself.
- Optionally, add a small helper to derive this safe log context, but
given the scope we can keep it inline.

Concretely in `rag/utils/opendal_conn.py`:

- Replace the current `SAFE_LOG_KEYS` / `loggable_kwargs` /
`logging.info(...)` block so that:
- We do not pass through arbitrary `kwargs` values by key filtering
alone.
- We instead build a new dict with explicitly chosen, non-sensitive
fields, e.g.:

    ```python
    safe_log_info = {
        "scheme": kwargs.get("scheme"),
        "host": kwargs.get("host"),
        "port": kwargs.get("port"),
        "database": kwargs.get("database"),
        "table": kwargs.get("table"),
        "has_password": "password" in kwargs or "connection_string" in kwargs,
    }
    logging.info("Loaded OpenDAL configuration (non sensitive fields only): %s", safe_log_info)
    ```

- This makes sure that neither the password nor a connection string
containing it is ever logged, while still retaining useful diagnostic
information.
- Keep the existing deletion of `password` and `connection_string` from
`kwargs` after logging, as an additional safety measure for any later
use of `kwargs`.

No new imports or external libraries are required; we only modify lines
45–56 of the shown snippet.


_Suggested fixes powered by Copilot Autofix. Review carefully before
merging._

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 13:31:39 +08:00

137 lines
5.1 KiB
Python

import opendal
import logging
import pymysql
from urllib.parse import quote_plus
from common.config_utils import get_base_config
from common.decorator import singleton
# DDL template for the key/value storage table. The `{}` placeholder is filled
# with the table name via str.format() before execution.
CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS `{}` (
`key` VARCHAR(255) PRIMARY KEY,
`value` LONGBLOB,
`created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
`updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
);
"""
# Statement template that sets the server's global max_allowed_packet. The `{}`
# placeholder is filled with the configured packet size via str.format().
SET_MAX_ALLOWED_PACKET_SQL = """
SET GLOBAL max_allowed_packet={}
"""
def get_opendal_config():
    """Build the keyword arguments used to create the OpenDAL operator.

    The ``opendal`` section of the base configuration selects the scheme.
    When the scheme is ``mysql`` (also the default when none is set), the
    connection details are taken from the ``mysql`` section and a
    ``connection_string`` is derived from them. For any other scheme, the
    ``config`` mapping of the ``opendal`` section is merged with the scheme.

    Only non-sensitive fields are written to the log. The returned mapping
    itself still contains the credentials, because the caller needs them to
    open connections.

    Returns:
        dict: Options for the storage backend, including ``scheme``.

    Raises:
        Exception: Any error raised while reading or assembling the
            configuration is logged and re-raised unchanged.
    """
    try:
        opendal_config = get_base_config('opendal', {})
        if opendal_config.get("scheme", "mysql") == 'mysql':
            mysql_config = get_base_config('mysql', {})
            max_packet = mysql_config.get("max_allowed_packet", 134217728)
            kwargs = {
                "scheme": "mysql",
                "host": mysql_config.get("host", "127.0.0.1"),
                "port": str(mysql_config.get("port", 3306)),
                "user": mysql_config.get("user", "root"),
                "password": mysql_config.get("password", ""),
                "database": mysql_config.get("name", "test_open_dal"),
                "table": opendal_config.get("config", {}).get("oss_table", "opendal_storage"),
                "max_allowed_packet": str(max_packet)
            }
            # The password is URL-encoded so special characters cannot break the URL.
            kwargs["connection_string"] = f"mysql://{kwargs['user']}:{quote_plus(kwargs['password'])}@{kwargs['host']}:{kwargs['port']}/{kwargs['database']}?max_allowed_packet={max_packet}"
        else:
            scheme = opendal_config.get("scheme")
            config_data = opendal_config.get("config", {})
            kwargs = {"scheme": scheme, **config_data}

        # Log an explicitly constructed, non-sensitive view of the settings.
        # Never add 'password' or any value with embedded credentials
        # (such as 'connection_string') to this dict.
        safe_log_info = {
            "scheme": kwargs.get("scheme"),
            "host": kwargs.get("host"),
            "port": kwargs.get("port"),
            "database": kwargs.get("database"),
            "table": kwargs.get("table"),
            # indicate presence of credentials without logging them
            "has_credentials": any(k in kwargs for k in ("password", "connection_string")),
        }
        logging.info("Loaded OpenDAL configuration (non sensitive fields only): %s", safe_log_info)

        # Do NOT strip 'password' / 'connection_string' from the returned
        # mapping: OpenDALStorage reads kwargs['password'] when it opens its
        # pymysql connections and forwards the whole mapping to
        # opendal.Operator. Removing them here makes the mysql path fail with
        # KeyError. Log safety is already guaranteed by safe_log_info above.
        return kwargs
    except Exception as e:
        logging.error("Failed to load OpenDAL configuration from yaml: %s", str(e))
        raise
@singleton
class OpenDALStorage:
    """Object storage wrapper around an ``opendal.Operator``.

    Objects are addressed by the path ``"{bucket}/{fnm}"``. The ``tenant_id``
    parameter accepted by the data methods is not used by this class.
    """

    def __init__(self):
        """Load the configuration, prepare MySQL if selected, and create the operator."""
        self._kwargs = get_opendal_config()
        self._scheme = self._kwargs.get('scheme', 'mysql')
        if self._scheme == 'mysql':
            # The MySQL backend needs the packet limit raised and the
            # storage table present before the operator is created.
            self.init_db_config()
            self.init_opendal_mysql_table()
        self._operator = opendal.Operator(**self._kwargs)
        logging.info("OpenDALStorage initialized successfully")

    def health(self):
        """Write a small fixed probe object and return the operator's write result."""
        bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
        return self._operator.write(f"{bucket}/{fnm}", binary)

    def put(self, bucket, fnm, binary, tenant_id=None):
        """Write ``binary`` to ``"{bucket}/{fnm}"``."""
        self._operator.write(f"{bucket}/{fnm}", binary)

    def get(self, bucket, fnm, tenant_id=None):
        """Return the content read from ``"{bucket}/{fnm}"``."""
        return self._operator.read(f"{bucket}/{fnm}")

    def rm(self, bucket, fnm, tenant_id=None):
        """Delete ``"{bucket}/{fnm}"``."""
        self._operator.delete(f"{bucket}/{fnm}")
        # NOTE(review): calling __init__() with no arguments on the existing
        # operator is preserved from the original code; its purpose is not
        # evident from this file - confirm whether it is needed.
        self._operator.__init__()

    def scan(self, bucket, fnm, tenant_id=None):
        """Return the operator's scan result for ``"{bucket}/{fnm}"``."""
        return self._operator.scan(f"{bucket}/{fnm}")

    def obj_exist(self, bucket, fnm, tenant_id=None):
        """Return the operator's ``exists`` result for ``"{bucket}/{fnm}"``."""
        return self._operator.exists(f"{bucket}/{fnm}")

    def _connect_mysql(self):
        """Open a new pymysql connection from the stored configuration."""
        return pymysql.connect(
            host=self._kwargs['host'],
            port=int(self._kwargs['port']),
            user=self._kwargs['user'],
            password=self._kwargs['password'],
            database=self._kwargs['database']
        )

    def init_db_config(self):
        """Set the server's global ``max_allowed_packet`` from the configuration.

        Raises:
            Exception: Any connection or SQL error is logged and re-raised.
        """
        try:
            max_packet = self._kwargs.get('max_allowed_packet', 4194304)  # Default to 4MB if not specified
            conn = self._connect_mysql()
            try:
                with conn.cursor() as cursor:
                    cursor.execute(SET_MAX_ALLOWED_PACKET_SQL.format(max_packet))
                conn.commit()
            finally:
                # Always release the connection, even when the statement fails.
                conn.close()
            logging.info(f"Database configuration initialized with max_allowed_packet={max_packet}")
        except Exception as e:
            logging.error(f"Failed to initialize database configuration: {str(e)}")
            raise

    def init_opendal_mysql_table(self):
        """Create the configured storage table if it does not already exist."""
        conn = self._connect_mysql()
        try:
            with conn.cursor() as cursor:
                cursor.execute(CREATE_TABLE_SQL.format(self._kwargs['table']))
            conn.commit()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
        logging.info(f"Table `{self._kwargs['table']}` initialized.")