From 6587acef88f02375c2094b13c4a46efc54552437 Mon Sep 17 00:00:00 2001
From: Jonah Hartmann <jonah.hartmann@uni-konstanz.de>
Date: Fri, 5 Dec 2025 03:10:26 +0100
Subject: [PATCH] Feat: use filepath for files with the same name (#11752)

### What problem does this PR solve?

When multiple files share the same name, they were all listed under that
identical name, making it hard to distinguish between them. Now, if
several files share a name, each one is named after its folder path in
the WebDAV storage unit (relative to the configured remote path).

The same could be done for the other connectors, too, since most of them
will have similar issues when iterating through the folder paths.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Contribution by RAGcon GmbH, visit us [here](https://www.ragcon.ai/)
---
 common/data_source/webdav_connector.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/common/data_source/webdav_connector.py b/common/data_source/webdav_connector.py
index 6f96b500d..f8e615789 100644
--- a/common/data_source/webdav_connector.py
+++ b/common/data_source/webdav_connector.py
@@ -190,6 +190,11 @@ class WebDAVConnector(LoadConnector, PollConnector):
         files = self._list_files_recursive(self.remote_path, start, end)
         logging.info(f"Found {len(files)} files matching time criteria")
         
+        filename_counts: dict[str, int] = {}
+        for file_path, _ in files:
+            file_name = os.path.basename(file_path)
+            filename_counts[file_name] = filename_counts.get(file_name, 0) + 1
+        
         batch: list[Document] = []
         for file_path, file_info in files:
             file_name = os.path.basename(file_path)
@@ -237,12 +242,22 @@ class WebDAVConnector(LoadConnector, PollConnector):
                 else:
                     modified = datetime.now(timezone.utc)
 
+                if filename_counts.get(file_name, 0) > 1:
+                    relative_path = file_path
+                    if file_path.startswith(self.remote_path):
+                        relative_path = file_path[len(self.remote_path):]
+                    if relative_path.startswith('/'):
+                        relative_path = relative_path[1:]
+                    semantic_id = relative_path.replace('/', ' / ') if relative_path else file_name
+                else:
+                    semantic_id = file_name
+
                 batch.append(
                     Document(
                         id=f"webdav:{self.base_url}:{file_path}",
                         blob=blob,
                         source=DocumentSource.WEBDAV,
-                        semantic_identifier=file_name,
+                        semantic_identifier=semantic_id,
                         extension=get_file_ext(file_name),
                         doc_updated_at=modified,
                         size_bytes=size_bytes if size_bytes else 0