Files
ragflow/common/data_source/google_drive/model.py
Yongteng Lei df16a80f25 Feat: add initial Google Drive connector support (#11147)
### What problem does this PR solve?

This feature is primarily ported from the
[Onyx](https://github.com/onyx-dot-app/onyx) project with necessary
modifications. Thanks for such a brilliant project.

Minor: consistently use `google_drive` rather than `google_driver`.

<img width="566" height="731" alt="image"
src="https://github.com/user-attachments/assets/6f64e70e-881e-42c7-b45f-809d3e0024a4"
/>

<img width="904" height="830" alt="image"
src="https://github.com/user-attachments/assets/dfa7d1ef-819a-4a82-8c52-0999f48ed4a6"
/>

<img width="911" height="869" alt="image"
src="https://github.com/user-attachments/assets/39e792fb-9fbe-4f3d-9b3c-b2265186bc22"
/>

<img width="947" height="323" alt="image"
src="https://github.com/user-attachments/assets/27d70e96-d9c0-42d9-8c89-276919b6d61d"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-10 19:15:02 +08:00

145 lines
5.2 KiB
Python

from enum import Enum
from typing import Any
from pydantic import BaseModel, ConfigDict, field_serializer, field_validator
from common.data_source.google_util.util_threadpool_concurrency import ThreadSafeDict
from common.data_source.models import ConnectorCheckpoint, SecondsSinceUnixEpoch
GoogleDriveFileType = dict[str, Any]
class GDriveMimeType(str, Enum):
DOC = "application/vnd.google-apps.document"
SPREADSHEET = "application/vnd.google-apps.spreadsheet"
SPREADSHEET_OPEN_FORMAT = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
SPREADSHEET_MS_EXCEL = "application/vnd.ms-excel"
PDF = "application/pdf"
WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
PPT = "application/vnd.google-apps.presentation"
POWERPOINT = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
PLAIN_TEXT = "text/plain"
MARKDOWN = "text/markdown"
# These correspond to The major stages of retrieval for google drive.
# The stages for the oauth flow are:
# get_all_files_for_oauth(),
# get_all_drive_ids(),
# get_files_in_shared_drive(),
# crawl_folders_for_files()
#
# The stages for the service account flow are roughly:
# get_all_user_emails(),
# get_all_drive_ids(),
# get_files_in_shared_drive(),
# Then for each user:
# get_files_in_my_drive()
# get_files_in_shared_drive()
# crawl_folders_for_files()
class DriveRetrievalStage(str, Enum):
START = "start"
DONE = "done"
# OAuth specific stages
OAUTH_FILES = "oauth_files"
# Service account specific stages
USER_EMAILS = "user_emails"
MY_DRIVE_FILES = "my_drive_files"
# Used for both oauth and service account flows
DRIVE_IDS = "drive_ids"
SHARED_DRIVE_FILES = "shared_drive_files"
FOLDER_FILES = "folder_files"
class StageCompletion(BaseModel):
"""
Describes the point in the retrieval+indexing process that the
connector is at. completed_until is the timestamp of the latest
file that has been retrieved or error that has been yielded.
Optional fields are used for retrieval stages that need more information
for resuming than just the timestamp of the latest file.
"""
stage: DriveRetrievalStage
completed_until: SecondsSinceUnixEpoch
current_folder_or_drive_id: str | None = None
next_page_token: str | None = None
# only used for shared drives
processed_drive_ids: set[str] = set()
def update(
self,
stage: DriveRetrievalStage,
completed_until: SecondsSinceUnixEpoch,
current_folder_or_drive_id: str | None = None,
) -> None:
self.stage = stage
self.completed_until = completed_until
self.current_folder_or_drive_id = current_folder_or_drive_id
class GoogleDriveCheckpoint(ConnectorCheckpoint):
# Checkpoint version of _retrieved_ids
retrieved_folder_and_drive_ids: set[str]
# Describes the point in the retrieval+indexing process that the
# checkpoint is at. when this is set to a given stage, the connector
# has finished yielding all values from the previous stage.
completion_stage: DriveRetrievalStage
# The latest timestamp of a file that has been retrieved per user email.
# StageCompletion is used to track the completion of each stage, but the
# timestamp part is not used for folder crawling.
completion_map: ThreadSafeDict[str, StageCompletion]
# all file ids that have been retrieved
all_retrieved_file_ids: set[str] = set()
# cached version of the drive and folder ids to retrieve
drive_ids_to_retrieve: list[str] | None = None
folder_ids_to_retrieve: list[str] | None = None
# cached user emails
user_emails: list[str] | None = None
@field_serializer("completion_map")
def serialize_completion_map(self, completion_map: ThreadSafeDict[str, StageCompletion], _info: Any) -> dict[str, StageCompletion]:
return completion_map._dict
@field_validator("completion_map", mode="before")
def validate_completion_map(cls, v: Any) -> ThreadSafeDict[str, StageCompletion]:
assert isinstance(v, dict) or isinstance(v, ThreadSafeDict)
return ThreadSafeDict({k: StageCompletion.model_validate(val) for k, val in v.items()})
class RetrievedDriveFile(BaseModel):
"""
Describes a file that has been retrieved from google drive.
user_email is the email of the user that the file was retrieved
by impersonating. If an error worthy of being reported is encountered,
error should be set and later propagated as a ConnectorFailure.
"""
# The stage at which this file was retrieved
completion_stage: DriveRetrievalStage
# The file that was retrieved
drive_file: GoogleDriveFileType
# The email of the user that the file was retrieved by impersonating
user_email: str
# The id of the parent folder or drive of the file
parent_id: str | None = None
# Any unexpected error that occurred while retrieving the file.
# In particular, this is not used for 403/404 errors, which are expected
# in the context of impersonating all the users to try to retrieve all
# files from all their Drives and Folders.
error: Exception | None = None
model_config = ConfigDict(arbitrary_types_allowed=True)