From 6587acef88f02375c2094b13c4a46efc54552437 Mon Sep 17 00:00:00 2001 From: Jonah Hartmann Date: Fri, 5 Dec 2025 03:10:26 +0100 Subject: [PATCH] Feat: use filepath for files with the same name (#11752) ### What problem does this PR solve? When there are multiple files with the same name the file would just duplicate, making it hard to distinguish between the different files. Now if there are multiple files with the same name, they will be named after their folder path in the webdav storage unit. The same could be done for the other connectors, too, since most of them will have similars issues, when iterating through the folder paths. ### Type of change - [x] New Feature (non-breaking change which adds functionality) Contribution by RAGcon GmbH, visit us [here](https://www.ragcon.ai/) --- common/data_source/webdav_connector.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/common/data_source/webdav_connector.py b/common/data_source/webdav_connector.py index 6f96b500d..f8e615789 100644 --- a/common/data_source/webdav_connector.py +++ b/common/data_source/webdav_connector.py @@ -190,6 +190,11 @@ class WebDAVConnector(LoadConnector, PollConnector): files = self._list_files_recursive(self.remote_path, start, end) logging.info(f"Found {len(files)} files matching time criteria") + filename_counts: dict[str, int] = {} + for file_path, _ in files: + file_name = os.path.basename(file_path) + filename_counts[file_name] = filename_counts.get(file_name, 0) + 1 + batch: list[Document] = [] for file_path, file_info in files: file_name = os.path.basename(file_path) @@ -237,12 +242,22 @@ class WebDAVConnector(LoadConnector, PollConnector): else: modified = datetime.now(timezone.utc) + if filename_counts.get(file_name, 0) > 1: + relative_path = file_path + if file_path.startswith(self.remote_path): + relative_path = file_path[len(self.remote_path):] + if relative_path.startswith('/'): + relative_path = relative_path[1:] + semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name + else: + semantic_id = file_name + batch.append( Document( id=f"webdav:{self.base_url}:{file_path}", blob=blob, source=DocumentSource.WEBDAV, - semantic_identifier=file_name, + semantic_identifier=semantic_id, extension=get_file_ext(file_name), doc_updated_at=modified, size_bytes=size_bytes if size_bytes else 0