From f099bc1236fc9f64c28c70786a1527f37a63b937 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Mon, 29 Dec 2025 16:57:20 +0800 Subject: [PATCH] Feat: github connector (#12292) ### What problem does this PR solve? Feat: github connector ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- common/data_source/config.py | 2 + common/data_source/connector_runner.py | 217 ++++ common/data_source/github/__init__.py | 0 common/data_source/github/connector.py | 954 ++++++++++++++++++ common/data_source/github/models.py | 17 + common/data_source/github/rate_limit_utils.py | 24 + common/data_source/github/utils.py | 46 + common/data_source/interfaces.py | 15 +- pyproject.toml | 1 + uv.lock | 55 + 10 files changed, 1322 insertions(+), 9 deletions(-) create mode 100644 common/data_source/connector_runner.py create mode 100644 common/data_source/github/__init__.py create mode 100644 common/data_source/github/connector.py create mode 100644 common/data_source/github/models.py create mode 100644 common/data_source/github/rate_limit_utils.py create mode 100644 common/data_source/github/utils.py diff --git a/common/data_source/config.py b/common/data_source/config.py index e36ee404b..676696d65 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -232,6 +232,8 @@ _REPLACEMENT_EXPANSIONS = "body.view.value" BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback") +GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None + class HtmlBasedConnectorTransformLinksStrategy(str, Enum): # remove links entirely STRIP = "strip" diff --git a/common/data_source/connector_runner.py b/common/data_source/connector_runner.py new file mode 100644 index 000000000..d47d65128 --- /dev/null +++ b/common/data_source/connector_runner.py @@ -0,0 +1,217 @@ +import sys +import time +import logging +from collections.abc import Generator +from datetime import datetime +from typing import Generic +from typing import TypeVar +from common.data_source.interfaces import ( + BaseConnector, + CheckpointedConnector, + CheckpointedConnectorWithPermSync, + CheckpointOutput, + LoadConnector, + PollConnector, +) +from common.data_source.models import ConnectorCheckpoint, ConnectorFailure, Document + + +TimeRange = tuple[datetime, datetime] + +CT = TypeVar("CT", bound=ConnectorCheckpoint) + + +def batched_doc_ids( + checkpoint_connector_generator: CheckpointOutput[CT], + batch_size: int, +) -> Generator[set[str], None, None]: + batch: set[str] = set() + for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( + checkpoint_connector_generator + ): + if document is not None: + batch.add(document.id) + elif ( + failure and failure.failed_document and failure.failed_document.document_id + ): + batch.add(failure.failed_document.document_id) + + if len(batch) >= batch_size: + yield batch + batch = set() + if len(batch) > 0: + yield batch + + +class CheckpointOutputWrapper(Generic[CT]): + """ + Wraps a CheckpointOutput generator to give things back in a more digestible format, + specifically for Document outputs. + The connector format is easier for the connector implementor (e.g. it enforces exactly + one new checkpoint is returned AND that the checkpoint is at the end), thus the different + formats. 
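+    Each yielded tuple is (document, failure, next_checkpoint): exactly one of the
+    three is non-None, and the final tuple carries only the checkpoint.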
+ """ + + def __init__(self) -> None: + self.next_checkpoint: CT | None = None + + def __call__( + self, + checkpoint_connector_generator: CheckpointOutput[CT], + ) -> Generator[ + tuple[Document | None, ConnectorFailure | None, CT | None], + None, + None, + ]: + # grabs the final return value and stores it in the `next_checkpoint` variable + def _inner_wrapper( + checkpoint_connector_generator: CheckpointOutput[CT], + ) -> CheckpointOutput[CT]: + self.next_checkpoint = yield from checkpoint_connector_generator + return self.next_checkpoint # not used + + for document_or_failure in _inner_wrapper(checkpoint_connector_generator): + if isinstance(document_or_failure, Document): + yield document_or_failure, None, None + elif isinstance(document_or_failure, ConnectorFailure): + yield None, document_or_failure, None + else: + raise ValueError( + f"Invalid document_or_failure type: {type(document_or_failure)}" + ) + + if self.next_checkpoint is None: + raise RuntimeError( + "Checkpoint is None. This should never happen - the connector should always return a checkpoint." + ) + + yield None, None, self.next_checkpoint + + +class ConnectorRunner(Generic[CT]): + """ + Handles: + - Batching + - Additional exception logging + - Combining different connector types to a single interface + """ + + def __init__( + self, + connector: BaseConnector, + batch_size: int, + # cannot be True for non-checkpointed connectors + include_permissions: bool, + time_range: TimeRange | None = None, + ): + if not isinstance(connector, CheckpointedConnector) and include_permissions: + raise ValueError( + "include_permissions cannot be True for non-checkpointed connectors" + ) + + self.connector = connector + self.time_range = time_range + self.batch_size = batch_size + self.include_permissions = include_permissions + + self.doc_batch: list[Document] = [] + + def run(self, checkpoint: CT) -> Generator[ + tuple[list[Document] | None, ConnectorFailure | None, CT | None], + None, + None, + ]: + """Adds additional exception logging to the connector.""" + try: + if isinstance(self.connector, CheckpointedConnector): + if self.time_range is None: + raise ValueError("time_range is required for CheckpointedConnector") + + start = time.monotonic() + if self.include_permissions: + if not isinstance( + self.connector, CheckpointedConnectorWithPermSync + ): + raise ValueError( + "Connector does not support permission syncing" + ) + load_from_checkpoint = ( + self.connector.load_from_checkpoint_with_perm_sync + ) + else: + load_from_checkpoint = self.connector.load_from_checkpoint + checkpoint_connector_generator = load_from_checkpoint( + start=self.time_range[0].timestamp(), + end=self.time_range[1].timestamp(), + checkpoint=checkpoint, + ) + next_checkpoint: CT | None = None + # this is guaranteed to always run at least once with next_checkpoint being non-None + for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( + checkpoint_connector_generator + ): + if document is not None and isinstance(document, Document): + self.doc_batch.append(document) + + if failure is not None: + yield None, failure, None + + if len(self.doc_batch) >= self.batch_size: + yield self.doc_batch, None, None + self.doc_batch = [] + + # yield remaining documents + if len(self.doc_batch) > 0: + yield self.doc_batch, None, None + self.doc_batch = [] + + yield None, None, next_checkpoint + + logging.debug( + f"Connector took {time.monotonic() - start} seconds to get to the next checkpoint." 
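+                    # time.monotonic() is used here so the reported duration is unaffected by wall-clock changes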
+ ) + + else: + finished_checkpoint = self.connector.build_dummy_checkpoint() + finished_checkpoint.has_more = False + + if isinstance(self.connector, PollConnector): + if self.time_range is None: + raise ValueError("time_range is required for PollConnector") + + for document_batch in self.connector.poll_source( + start=self.time_range[0].timestamp(), + end=self.time_range[1].timestamp(), + ): + yield document_batch, None, None + + yield None, None, finished_checkpoint + elif isinstance(self.connector, LoadConnector): + for document_batch in self.connector.load_from_state(): + yield document_batch, None, None + + yield None, None, finished_checkpoint + else: + raise ValueError(f"Invalid connector. type: {type(self.connector)}") + except Exception: + exc_type, _, exc_traceback = sys.exc_info() + + # Traverse the traceback to find the last frame where the exception was raised + tb = exc_traceback + if tb is None: + logging.error("No traceback found for exception") + raise + + while tb.tb_next: + tb = tb.tb_next # Move to the next frame in the traceback + + # Get the local variables from the frame where the exception occurred + local_vars = tb.tb_frame.f_locals + local_vars_str = "\n".join( + f"{key}: {value}" for key, value in local_vars.items() + ) + logging.error( + f"Error in connector. type: {exc_type};\n" + f"local_vars below -> \n{local_vars_str[:1024]}" + ) + raise \ No newline at end of file diff --git a/common/data_source/github/__init__.py b/common/data_source/github/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/data_source/github/connector.py b/common/data_source/github/connector.py new file mode 100644 index 000000000..a6ee3a59a --- /dev/null +++ b/common/data_source/github/connector.py @@ -0,0 +1,954 @@ +import copy +import logging +from collections.abc import Callable +from collections.abc import Generator +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from enum import Enum +from typing import Any +from typing import cast + +from github import Github +from github import RateLimitExceededException +from github import Repository +from github.GithubException import GithubException +from github.Issue import Issue +from github.NamedUser import NamedUser +from github.PaginatedList import PaginatedList +from github.PullRequest import PullRequest +from pydantic import BaseModel +from typing_extensions import override + +from common.data_source.config import DocumentSource, GITHUB_CONNECTOR_BASE_URL +from common.data_source.exceptions import ( + ConnectorMissingCredentialError, + ConnectorValidationError, + CredentialExpiredError, + InsufficientPermissionsError, + UnexpectedValidationError, +) +from common.data_source.interfaces import CheckpointedConnectorWithPermSync, CheckpointOutput +from common.data_source.models import ( + ConnectorCheckpoint, + ConnectorFailure, + Document, + DocumentFailure, + ExternalAccess, + SecondsSinceUnixEpoch, + TextSection, +) +from common.data_source.connector_runner import ConnectorRunner +from .models import SerializedRepository +from .rate_limit_utils import sleep_after_rate_limit_exception +from .utils import deserialize_repository +from .utils import get_external_access_permission + +ITEMS_PER_PAGE = 100 +CURSOR_LOG_FREQUENCY = 50 + +_MAX_NUM_RATE_LIMIT_RETRIES = 5 + +ONE_DAY = timedelta(days=1) +SLIM_BATCH_SIZE = 100 +# Cases +# X (from start) standard run, no fallback to cursor-based pagination +# X (from start) standard run errors, fallback to cursor-based 
pagination +# X error in the middle of a page +# X no errors: run to completion +# X (from checkpoint) standard run, no fallback to cursor-based pagination +# X (from checkpoint) continue from cursor-based pagination +# - retrying +# - no retrying + +# things to check: +# checkpoint state on return +# checkpoint progress (no infinite loop) + + +class DocMetadata(BaseModel): + repo: str + + +def get_nextUrl_key(pag_list: PaginatedList[PullRequest | Issue]) -> str: + if "_PaginatedList__nextUrl" in pag_list.__dict__: + return "_PaginatedList__nextUrl" + for key in pag_list.__dict__: + if "__nextUrl" in key: + return key + for key in pag_list.__dict__: + if "nextUrl" in key: + return key + return "" + + +def get_nextUrl( + pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str +) -> str | None: + return getattr(pag_list, nextUrl_key) if nextUrl_key else None + + +def set_nextUrl( + pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str, nextUrl: str +) -> None: + if nextUrl_key: + setattr(pag_list, nextUrl_key, nextUrl) + elif nextUrl: + raise ValueError("Next URL key not found: " + str(pag_list.__dict__)) + + +def _paginate_until_error( + git_objs: Callable[[], PaginatedList[PullRequest | Issue]], + cursor_url: str | None, + prev_num_objs: int, + cursor_url_callback: Callable[[str | None, int], None], + retrying: bool = False, +) -> Generator[PullRequest | Issue, None, None]: + num_objs = prev_num_objs + pag_list = git_objs() + nextUrl_key = get_nextUrl_key(pag_list) + if cursor_url: + set_nextUrl(pag_list, nextUrl_key, cursor_url) + elif retrying: + # if we are retrying, we want to skip the objects retrieved + # over previous calls. Unfortunately, this WILL retrieve all + # pages before the one we are resuming from, so we really + # don't want this case to be hit often + logging.warning( + "Retrying from a previous cursor-based pagination call. " + "This will retrieve all pages before the one we are resuming from, " + "which may take a while and consume many API calls." + ) + pag_list = cast(PaginatedList[PullRequest | Issue], pag_list[prev_num_objs:]) + num_objs = 0 + + try: + # this for loop handles cursor-based pagination + for issue_or_pr in pag_list: + num_objs += 1 + yield issue_or_pr + # used to store the current cursor url in the checkpoint. This value + # is updated during iteration over pag_list. + cursor_url_callback(get_nextUrl(pag_list, nextUrl_key), num_objs) + + if num_objs % CURSOR_LOG_FREQUENCY == 0: + logging.info( + f"Retrieved {num_objs} objects with current cursor url: {get_nextUrl(pag_list, nextUrl_key)}" + ) + + except Exception as e: + logging.exception(f"Error during cursor-based pagination: {e}") + if num_objs - prev_num_objs > 0: + raise + + if get_nextUrl(pag_list, nextUrl_key) is not None and not retrying: + logging.info( + "Assuming that this error is due to cursor " + "expiration because no objects were retrieved. " + "Retrying from the first page." + ) + yield from _paginate_until_error( + git_objs, None, prev_num_objs, cursor_url_callback, retrying=True + ) + return + + # for no cursor url or if we reach this point after a retry, raise the error + raise + + +def _get_batch_rate_limited( + # We pass in a callable because we want git_objs to produce a fresh + # PaginatedList each time it's called to avoid using the same object for cursor-based pagination + # from a partial offset-based pagination call. 
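+    # page_num drives the normal offset-based pagination (PaginatedList.get_page);
+    # cursor_url/prev_num_objs resume the cursor-based fallback from a checkpoint, and
+    # cursor_url_callback persists the current cursor and object count back into it.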
+ git_objs: Callable[[], PaginatedList], + page_num: int, + cursor_url: str | None, + prev_num_objs: int, + cursor_url_callback: Callable[[str | None, int], None], + github_client: Github, + attempt_num: int = 0, +) -> Generator[PullRequest | Issue, None, None]: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching batch too many times. Something is going wrong with fetching objects from Github" + ) + try: + if cursor_url: + # when this is set, we are resuming from an earlier + # cursor-based pagination call. + yield from _paginate_until_error( + git_objs, cursor_url, prev_num_objs, cursor_url_callback + ) + return + objs = list(git_objs().get_page(page_num)) + # fetch all data here to disable lazy loading later + # this is needed to capture the rate limit exception here (if one occurs) + for obj in objs: + if hasattr(obj, "raw_data"): + getattr(obj, "raw_data") + yield from objs + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + yield from _get_batch_rate_limited( + git_objs, + page_num, + cursor_url, + prev_num_objs, + cursor_url_callback, + github_client, + attempt_num + 1, + ) + except GithubException as e: + if not ( + e.status == 422 + and ( + "cursor" in (e.message or "") + or "cursor" in (e.data or {}).get("message", "") + ) + ): + raise + # Fallback to a cursor-based pagination strategy + # This can happen for "large datasets," but there's no documentation + # On the error on the web as far as we can tell. + # Error message: + # "Pagination with the page parameter is not supported for large datasets, + # please use cursor based pagination (after/before)" + yield from _paginate_until_error( + git_objs, cursor_url, prev_num_objs, cursor_url_callback + ) + + +def _get_userinfo(user: NamedUser) -> dict[str, str]: + def _safe_get(attr_name: str) -> str | None: + try: + return cast(str | None, getattr(user, attr_name)) + except GithubException: + logging.debug(f"Error getting {attr_name} for user") + return None + + return { + k: v + for k, v in { + "login": _safe_get("login"), + "name": _safe_get("name"), + "email": _safe_get("email"), + }.items() + if v is not None + } + + +def _convert_pr_to_document( + pull_request: PullRequest, repo_external_access: ExternalAccess | None +) -> Document: + repo_name = pull_request.base.repo.full_name if pull_request.base else "" + doc_metadata = DocMetadata(repo=repo_name) + return Document( + id=pull_request.html_url, + sections=[ + TextSection(link=pull_request.html_url, text=pull_request.body or "") + ], + external_access=repo_external_access, + source=DocumentSource.GITHUB, + semantic_identifier=f"{pull_request.number}: {pull_request.title}", + # updated_at is UTC time but is timezone unaware, explicitly add UTC + # as there is logic in indexing to prevent wrong timestamped docs + # due to local time discrepancies with UTC + doc_updated_at=( + pull_request.updated_at.replace(tzinfo=timezone.utc) + if pull_request.updated_at + else None + ), + # this metadata is used in perm sync + doc_metadata=doc_metadata.model_dump(), + metadata={ + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) + for k, v in { + "object_type": "PullRequest", + "id": pull_request.number, + "merged": pull_request.merged, + "state": pull_request.state, + "user": _get_userinfo(pull_request.user) if pull_request.user else None, + "assignees": [ + _get_userinfo(assignee) for assignee in pull_request.assignees + ], + "repo": ( + pull_request.base.repo.full_name if pull_request.base else None + ), 
+ "num_commits": str(pull_request.commits), + "num_files_changed": str(pull_request.changed_files), + "labels": [label.name for label in pull_request.labels], + "created_at": ( + pull_request.created_at.replace(tzinfo=timezone.utc) + if pull_request.created_at + else None + ), + "updated_at": ( + pull_request.updated_at.replace(tzinfo=timezone.utc) + if pull_request.updated_at + else None + ), + "closed_at": ( + pull_request.closed_at.replace(tzinfo=timezone.utc) + if pull_request.closed_at + else None + ), + "merged_at": ( + pull_request.merged_at.replace(tzinfo=timezone.utc) + if pull_request.merged_at + else None + ), + "merged_by": ( + _get_userinfo(pull_request.merged_by) + if pull_request.merged_by + else None + ), + }.items() + if v is not None + }, + ) + + +def _fetch_issue_comments(issue: Issue) -> str: + comments = issue.get_comments() + return "\nComment: ".join(comment.body for comment in comments) + + +def _convert_issue_to_document( + issue: Issue, repo_external_access: ExternalAccess | None +) -> Document: + repo_name = issue.repository.full_name if issue.repository else "" + doc_metadata = DocMetadata(repo=repo_name) + return Document( + id=issue.html_url, + sections=[TextSection(link=issue.html_url, text=issue.body or "")], + source=DocumentSource.GITHUB, + external_access=repo_external_access, + semantic_identifier=f"{issue.number}: {issue.title}", + # updated_at is UTC time but is timezone unaware + doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), + # this metadata is used in perm sync + doc_metadata=doc_metadata.model_dump(), + metadata={ + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) + for k, v in { + "object_type": "Issue", + "id": issue.number, + "state": issue.state, + "user": _get_userinfo(issue.user) if issue.user else None, + "assignees": [_get_userinfo(assignee) for assignee in issue.assignees], + "repo": issue.repository.full_name if issue.repository else None, + "labels": [label.name for label in issue.labels], + "created_at": ( + issue.created_at.replace(tzinfo=timezone.utc) + if issue.created_at + else None + ), + "updated_at": ( + issue.updated_at.replace(tzinfo=timezone.utc) + if issue.updated_at + else None + ), + "closed_at": ( + issue.closed_at.replace(tzinfo=timezone.utc) + if issue.closed_at + else None + ), + "closed_by": ( + _get_userinfo(issue.closed_by) if issue.closed_by else None + ), + }.items() + if v is not None + }, + ) + + +class GithubConnectorStage(Enum): + START = "start" + PRS = "prs" + ISSUES = "issues" + + +class GithubConnectorCheckpoint(ConnectorCheckpoint): + stage: GithubConnectorStage + curr_page: int + + cached_repo_ids: list[int] | None = None + cached_repo: SerializedRepository | None = None + + # Used for the fallback cursor-based pagination strategy + num_retrieved: int + cursor_url: str | None = None + + def reset(self) -> None: + """ + Resets curr_page, num_retrieved, and cursor_url to their initial values (0, 0, None) + """ + self.curr_page = 0 + self.num_retrieved = 0 + self.cursor_url = None + + +def make_cursor_url_callback( + checkpoint: GithubConnectorCheckpoint, +) -> Callable[[str | None, int], None]: + def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None: + # we want to maintain the old cursor url so code after retrieval + # can determine that we are using the fallback cursor-based pagination strategy + if cursor_url: + checkpoint.cursor_url = cursor_url + checkpoint.num_retrieved = num_objs + + return cursor_url_callback + + +class 
GithubConnector(CheckpointedConnectorWithPermSync[GithubConnectorCheckpoint]): + def __init__( + self, + repo_owner: str, + repositories: str | None = None, + state_filter: str = "all", + include_prs: bool = True, + include_issues: bool = False, + ) -> None: + self.repo_owner = repo_owner + self.repositories = repositories + self.state_filter = state_filter + self.include_prs = include_prs + self.include_issues = include_issues + self.github_client: Github | None = None + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + # defaults to 30 items per page, can be set to as high as 100 + self.github_client = ( + Github( + credentials["github_access_token"], + base_url=GITHUB_CONNECTOR_BASE_URL, + per_page=ITEMS_PER_PAGE, + ) + if GITHUB_CONNECTOR_BASE_URL + else Github(credentials["github_access_token"], per_page=ITEMS_PER_PAGE) + ) + return None + + def get_github_repo( + self, github_client: Github, attempt_num: int = 0 + ) -> Repository.Repository: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repo too many times. Something is going wrong with fetching objects from Github" + ) + + try: + return github_client.get_repo(f"{self.repo_owner}/{self.repositories}") + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_github_repo(github_client, attempt_num + 1) + + def get_github_repos( + self, github_client: Github, attempt_num: int = 0 + ) -> list[Repository.Repository]: + """Get specific repositories based on comma-separated repo_name string.""" + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github" + ) + + try: + repos = [] + # Split repo_name by comma and strip whitespace + repo_names = [ + name.strip() for name in (cast(str, self.repositories)).split(",") + ] + + for repo_name in repo_names: + if repo_name: # Skip empty strings + try: + repo = github_client.get_repo(f"{self.repo_owner}/{repo_name}") + repos.append(repo) + except GithubException as e: + logging.warning( + f"Could not fetch repo {self.repo_owner}/{repo_name}: {e}" + ) + + return repos + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_github_repos(github_client, attempt_num + 1) + + def get_all_repos( + self, github_client: Github, attempt_num: int = 0 + ) -> list[Repository.Repository]: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repos too many times. 
Something is going wrong with fetching objects from Github" + ) + + try: + # Try to get organization first + try: + org = github_client.get_organization(self.repo_owner) + return list(org.get_repos()) + + except GithubException: + # If not an org, try as a user + user = github_client.get_user(self.repo_owner) + return list(user.get_repos()) + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_all_repos(github_client, attempt_num + 1) + + def _pull_requests_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[PullRequest]]: + return lambda: repo.get_pulls( + state=self.state_filter, sort="updated", direction="desc" + ) + + def _issues_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[Issue]]: + return lambda: repo.get_issues( + state=self.state_filter, sort="updated", direction="desc" + ) + + def _fetch_from_github( + self, + checkpoint: GithubConnectorCheckpoint, + start: datetime | None = None, + end: datetime | None = None, + include_permissions: bool = False, + ) -> Generator[Document | ConnectorFailure, None, GithubConnectorCheckpoint]: + if self.github_client is None: + raise ConnectorMissingCredentialError("GitHub") + + checkpoint = copy.deepcopy(checkpoint) + + # First run of the connector, fetch all repos and store in checkpoint + if checkpoint.cached_repo_ids is None: + repos = [] + if self.repositories: + if "," in self.repositories: + # Multiple repositories specified + repos = self.get_github_repos(self.github_client) + else: + # Single repository (backward compatibility) + repos = [self.get_github_repo(self.github_client)] + else: + # All repositories + repos = self.get_all_repos(self.github_client) + if not repos: + checkpoint.has_more = False + return checkpoint + + curr_repo = repos.pop() + checkpoint.cached_repo_ids = [repo.id for repo in repos] + checkpoint.cached_repo = SerializedRepository( + id=curr_repo.id, + headers=curr_repo.raw_headers, + raw_data=curr_repo.raw_data, + ) + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.curr_page = 0 + # save checkpoint with repo ids retrieved + return checkpoint + + if checkpoint.cached_repo is None: + raise ValueError("No repo saved in checkpoint") + + # Deserialize the repository from the checkpoint + repo = deserialize_repository(checkpoint.cached_repo, self.github_client) + + cursor_url_callback = make_cursor_url_callback(checkpoint) + repo_external_access: ExternalAccess | None = None + if include_permissions: + repo_external_access = get_external_access_permission( + repo, self.github_client + ) + if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS: + logging.info(f"Fetching PRs for repo: {repo.name}") + + pr_batch = _get_batch_rate_limited( + self._pull_requests_func(repo), + checkpoint.curr_page, + checkpoint.cursor_url, + checkpoint.num_retrieved, + cursor_url_callback, + self.github_client, + ) + checkpoint.curr_page += 1 # NOTE: not used for cursor-based fallback + done_with_prs = False + num_prs = 0 + pr = None + for pr in pr_batch: + num_prs += 1 + + # we iterate backwards in time, so at this point we stop processing prs + if ( + start is not None + and pr.updated_at + and pr.updated_at.replace(tzinfo=timezone.utc) < start + ): + done_with_prs = True + break + # Skip PRs updated after the end date + if ( + end is not None + and pr.updated_at + and pr.updated_at.replace(tzinfo=timezone.utc) > end + ): + continue + try: + yield _convert_pr_to_document( + cast(PullRequest, pr), 
repo_external_access + ) + except Exception as e: + error_msg = f"Error converting PR to document: {e}" + logging.exception(error_msg) + yield ConnectorFailure( + failed_document=DocumentFailure( + document_id=str(pr.id), document_link=pr.html_url + ), + failure_message=error_msg, + exception=e, + ) + continue + + # If we reach this point with a cursor url in the checkpoint, we were using + # the fallback cursor-based pagination strategy. That strategy tries to get all + # PRs, so having curosr_url set means we are done with prs. However, we need to + # return AFTER the checkpoint reset to avoid infinite loops. + + # if we found any PRs on the page and there are more PRs to get, return the checkpoint. + # In offset mode, while indexing without time constraints, the pr batch + # will be empty when we're done. + used_cursor = checkpoint.cursor_url is not None + logging.info(f"Fetched {num_prs} PRs for repo: {repo.name}") + if num_prs > 0 and not done_with_prs and not used_cursor: + return checkpoint + + # if we went past the start date during the loop or there are no more + # prs to get, we move on to issues + checkpoint.stage = GithubConnectorStage.ISSUES + checkpoint.reset() + + if used_cursor: + # save the checkpoint after changing stage; next run will continue from issues + return checkpoint + + checkpoint.stage = GithubConnectorStage.ISSUES + + if self.include_issues and checkpoint.stage == GithubConnectorStage.ISSUES: + logging.info(f"Fetching issues for repo: {repo.name}") + + issue_batch = list( + _get_batch_rate_limited( + self._issues_func(repo), + checkpoint.curr_page, + checkpoint.cursor_url, + checkpoint.num_retrieved, + cursor_url_callback, + self.github_client, + ) + ) + logging.info(f"Fetched {len(issue_batch)} issues for repo: {repo.name}") + checkpoint.curr_page += 1 + done_with_issues = False + num_issues = 0 + for issue in issue_batch: + num_issues += 1 + issue = cast(Issue, issue) + # we iterate backwards in time, so at this point we stop processing prs + if ( + start is not None + and issue.updated_at.replace(tzinfo=timezone.utc) < start + ): + done_with_issues = True + break + # Skip PRs updated after the end date + if ( + end is not None + and issue.updated_at.replace(tzinfo=timezone.utc) > end + ): + continue + + if issue.pull_request is not None: + # PRs are handled separately + continue + + try: + yield _convert_issue_to_document(issue, repo_external_access) + except Exception as e: + error_msg = f"Error converting issue to document: {e}" + logging.exception(error_msg) + yield ConnectorFailure( + failed_document=DocumentFailure( + document_id=str(issue.id), + document_link=issue.html_url, + ), + failure_message=error_msg, + exception=e, + ) + continue + + logging.info(f"Fetched {num_issues} issues for repo: {repo.name}") + # if we found any issues on the page, and we're not done, return the checkpoint. 
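+            # (in offset mode, an empty page means there are no more issues to fetch)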
+ # don't return if we're using cursor-based pagination to avoid infinite loops + if num_issues > 0 and not done_with_issues and not checkpoint.cursor_url: + return checkpoint + + # if we went past the start date during the loop or there are no more + # issues to get, we move on to the next repo + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.reset() + + checkpoint.has_more = len(checkpoint.cached_repo_ids) > 0 + if checkpoint.cached_repo_ids: + next_id = checkpoint.cached_repo_ids.pop() + next_repo = self.github_client.get_repo(next_id) + checkpoint.cached_repo = SerializedRepository( + id=next_id, + headers=next_repo.raw_headers, + raw_data=next_repo.raw_data, + ) + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.reset() + + if checkpoint.cached_repo_ids: + logging.info( + f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})" + ) + else: + logging.info("No more repos remaining") + + return checkpoint + + def _load_from_checkpoint( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + include_permissions: bool = False, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) + # add a day for timezone safety + end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) + ONE_DAY + + # Move start time back by 3 hours, since some Issues/PRs are getting dropped + # Could be due to delayed processing on GitHub side + # The non-updated issues since last poll will be shortcut-ed and not embedded + adjusted_start_datetime = start_datetime - timedelta(hours=3) + + epoch = datetime.fromtimestamp(0, tz=timezone.utc) + if adjusted_start_datetime < epoch: + adjusted_start_datetime = epoch + + return self._fetch_from_github( + checkpoint, + start=adjusted_start_datetime, + end=end_datetime, + include_permissions=include_permissions, + ) + + @override + def load_from_checkpoint( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + return self._load_from_checkpoint( + start, end, checkpoint, include_permissions=False + ) + + @override + def load_from_checkpoint_with_perm_sync( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + return self._load_from_checkpoint( + start, end, checkpoint, include_permissions=True + ) + + def validate_connector_settings(self) -> None: + if self.github_client is None: + raise ConnectorMissingCredentialError("GitHub credentials not loaded.") + + if not self.repo_owner: + raise ConnectorValidationError( + "Invalid connector settings: 'repo_owner' must be provided." + ) + + try: + if self.repositories: + if "," in self.repositories: + # Multiple repositories specified + repo_names = [name.strip() for name in self.repositories.split(",")] + if not repo_names: + raise ConnectorValidationError( + "Invalid connector settings: No valid repository names provided." 
+ ) + + # Validate at least one repository exists and is accessible + valid_repos = False + validation_errors = [] + + for repo_name in repo_names: + if not repo_name: + continue + + try: + test_repo = self.github_client.get_repo( + f"{self.repo_owner}/{repo_name}" + ) + logging.info( + f"Successfully accessed repository: {self.repo_owner}/{repo_name}" + ) + test_repo.get_contents("") + valid_repos = True + # If at least one repo is valid, we can proceed + break + except GithubException as e: + validation_errors.append( + f"Repository '{repo_name}': {e.data.get('message', str(e))}" + ) + + if not valid_repos: + error_msg = ( + "None of the specified repositories could be accessed: " + ) + error_msg += ", ".join(validation_errors) + raise ConnectorValidationError(error_msg) + else: + # Single repository (backward compatibility) + test_repo = self.github_client.get_repo( + f"{self.repo_owner}/{self.repositories}" + ) + test_repo.get_contents("") + else: + # Try to get organization first + try: + org = self.github_client.get_organization(self.repo_owner) + total_count = org.get_repos().totalCount + if total_count == 0: + raise ConnectorValidationError( + f"Found no repos for organization: {self.repo_owner}. " + "Does the credential have the right scopes?" + ) + except GithubException as e: + # Check for missing SSO + MISSING_SSO_ERROR_MESSAGE = "You must grant your Personal Access token access to this organization".lower() + if MISSING_SSO_ERROR_MESSAGE in str(e).lower(): + SSO_GUIDE_LINK = ( + "https://docs.github.com/en/enterprise-cloud@latest/authentication/" + "authenticating-with-saml-single-sign-on/" + "authorizing-a-personal-access-token-for-use-with-saml-single-sign-on" + ) + raise ConnectorValidationError( + f"Your GitHub token is missing authorization to access the " + f"`{self.repo_owner}` organization. Please follow the guide to " + f"authorize your token: {SSO_GUIDE_LINK}" + ) + # If not an org, try as a user + user = self.github_client.get_user(self.repo_owner) + + # Check if we can access any repos + total_count = user.get_repos().totalCount + if total_count == 0: + raise ConnectorValidationError( + f"Found no repos for user: {self.repo_owner}. " + "Does the credential have the right scopes?" + ) + + except RateLimitExceededException: + raise UnexpectedValidationError( + "Validation failed due to GitHub rate-limits being exceeded. Please try again later." + ) + + except GithubException as e: + if e.status == 401: + raise CredentialExpiredError( + "GitHub credential appears to be invalid or expired (HTTP 401)." + ) + elif e.status == 403: + raise InsufficientPermissionsError( + "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)." 
+ ) + elif e.status == 404: + if self.repositories: + if "," in self.repositories: + raise ConnectorValidationError( + f"None of the specified GitHub repositories could be found for owner: {self.repo_owner}" + ) + else: + raise ConnectorValidationError( + f"GitHub repository not found with name: {self.repo_owner}/{self.repositories}" + ) + else: + raise ConnectorValidationError( + f"GitHub user or organization not found: {self.repo_owner}" + ) + else: + raise ConnectorValidationError( + f"Unexpected GitHub error (status={e.status}): {e.data}" + ) + + except Exception as exc: + raise Exception( + f"Unexpected error during GitHub settings validation: {exc}" + ) + + def validate_checkpoint_json( + self, checkpoint_json: str + ) -> GithubConnectorCheckpoint: + return GithubConnectorCheckpoint.model_validate_json(checkpoint_json) + + def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint: + return GithubConnectorCheckpoint( + stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0 + ) + + +if __name__ == "__main__": + import os + + # Initialize the connector + connector = GithubConnector( + repo_owner=os.environ["REPO_OWNER"], + repositories=os.environ.get("REPOSITORIES"), + ) + connector.load_credentials( + {"github_access_token": os.environ["ACCESS_TOKEN_GITHUB"]} + ) + + if connector.github_client: + get_external_access_permission( + connector.get_github_repos(connector.github_client).pop(), + connector.github_client, + ) + + # Create a time range from epoch to now + end_time = datetime.now(timezone.utc) + start_time = datetime.fromtimestamp(0, tz=timezone.utc) + time_range = (start_time, end_time) + + # Initialize the runner with a batch size of 10 + runner: ConnectorRunner[GithubConnectorCheckpoint] = ConnectorRunner( + connector, batch_size=10, include_permissions=False, time_range=time_range + ) + + # Get initial checkpoint + checkpoint = connector.build_dummy_checkpoint() + + # Run the connector + while checkpoint.has_more: + for doc_batch, failure, next_checkpoint in runner.run(checkpoint): + if doc_batch: + print(f"Retrieved batch of {len(doc_batch)} documents") + for doc in doc_batch: + print(f"Document: {doc.semantic_identifier}") + if failure: + print(f"Failure: {failure.failure_message}") + if next_checkpoint: + checkpoint = next_checkpoint \ No newline at end of file diff --git a/common/data_source/github/models.py b/common/data_source/github/models.py new file mode 100644 index 000000000..9754bfa8d --- /dev/null +++ b/common/data_source/github/models.py @@ -0,0 +1,17 @@ +from typing import Any + +from github import Repository +from github.Requester import Requester +from pydantic import BaseModel + + +class SerializedRepository(BaseModel): + # id is part of the raw_data as well, just pulled out for convenience + id: int + headers: dict[str, str | int] + raw_data: dict[str, Any] + + def to_Repository(self, requester: Requester) -> Repository.Repository: + return Repository.Repository( + requester, self.headers, self.raw_data, completed=True + ) \ No newline at end of file diff --git a/common/data_source/github/rate_limit_utils.py b/common/data_source/github/rate_limit_utils.py new file mode 100644 index 000000000..d683bad08 --- /dev/null +++ b/common/data_source/github/rate_limit_utils.py @@ -0,0 +1,24 @@ +import time +import logging +from datetime import datetime +from datetime import timedelta +from datetime import timezone + +from github import Github + + +def sleep_after_rate_limit_exception(github_client: Github) -> None: + """ + Sleep until the 
GitHub rate limit resets. + + Args: + github_client: The GitHub client that hit the rate limit + """ + sleep_time = github_client.get_rate_limit().core.reset.replace( + tzinfo=timezone.utc + ) - datetime.now(tz=timezone.utc) + sleep_time += timedelta(minutes=1) # add an extra minute just to be safe + logging.info( + "Ran into Github rate-limit. Sleeping %s seconds.", sleep_time.seconds + ) + time.sleep(sleep_time.total_seconds()) \ No newline at end of file diff --git a/common/data_source/github/utils.py b/common/data_source/github/utils.py new file mode 100644 index 000000000..13b25ac51 --- /dev/null +++ b/common/data_source/github/utils.py @@ -0,0 +1,46 @@ +import logging +from collections.abc import Callable +from typing import cast + +from github import Github +from github.Repository import Repository + +from common.data_source.models import ExternalAccess + +from .models import SerializedRepository + + +def get_external_access_permission( + repo: Repository, github_client: Github +) -> ExternalAccess: + """ + Get the external access permission for a repository. + This functionality requires Enterprise Edition. + """ + # RAGFlow doesn't implement the Onyx EE external-permissions system. + # Default to private/unknown permissions. + return ExternalAccess.empty() + + +def deserialize_repository( + cached_repo: SerializedRepository, github_client: Github +) -> Repository: + """ + Deserialize a SerializedRepository back into a Repository object. + """ + # Try to access the requester - different PyGithub versions may use different attribute names + try: + # Try to get the requester using getattr to avoid linter errors + requester = getattr(github_client, "_requester", None) + if requester is None: + requester = getattr(github_client, "_Github__requester", None) + if requester is None: + # If we can't find the requester attribute, we need to fall back to recreating the repo + raise AttributeError("Could not find requester attribute") + + return cached_repo.to_Repository(requester) + except Exception as e: + # If all else fails, re-fetch the repo directly + logging.warning("Failed to deserialize repository: %s. Attempting to re-fetch.", e) + repo_id = cached_repo.id + return github_client.get_repo(repo_id) \ No newline at end of file diff --git a/common/data_source/interfaces.py b/common/data_source/interfaces.py index c5c665aa2..cd180967f 100644 --- a/common/data_source/interfaces.py +++ b/common/data_source/interfaces.py @@ -236,16 +236,13 @@ class BaseConnector(abc.ABC, Generic[CT]): def validate_perm_sync(self) -> None: """ - Don't override this; add a function to perm_sync_valid.py in the ee package - to do permission sync validation + Permission-sync validation hook. + + RAGFlow doesn't ship the Onyx EE permission-sync validation package. + Connectors that support permission sync should override + `validate_connector_settings()` as needed. 
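+        The base implementation is a no-op and simply returns None.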
""" - """ - validate_connector_settings_fn = fetch_ee_implementation_or_noop( - "onyx.connectors.perm_sync_valid", - "validate_perm_sync", - noop_return_value=None, - ) - validate_connector_settings_fn(self)""" + return None def set_allow_images(self, value: bool) -> None: """Implement if the underlying connector wants to skip/allow image downloading diff --git a/pyproject.toml b/pyproject.toml index c8a8755ad..e5b5efaa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,6 +149,7 @@ dependencies = [ # "cryptography==46.0.3", # "jinja2>=3.1.0", "pyairtable>=3.3.0", + "pygithub>=2.8.1", "asana>=5.2.2", ] diff --git a/uv.lock b/uv.lock index cfaaa401f..173246531 100644 --- a/uv.lock +++ b/uv.lock @@ -5509,6 +5509,22 @@ dependencies = [ ] sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/8e/aedef81641c8dca6fd0fb7294de5bed9c45f3397d67fddf755c1042c2642/PyExecJS-1.5.1.tar.gz", hash = "sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c", size = 13344, upload-time = "2018-01-18T04:33:55.126Z" } +[[package]] +name = "pygithub" +version = "2.8.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pyjwt", extra = ["crypto"] }, + { name = "pynacl" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "urllib3" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/74/e560bdeffea72ecb26cff27f0fad548bbff5ecc51d6a155311ea7f9e4c4c/pygithub-2.8.1.tar.gz", hash = "sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9", size = 2246994, upload-time = "2025-09-02T17:41:54.674Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/ba/7049ce39f653f6140aac4beb53a5aaf08b4407b6a3019aae394c1c5244ff/pygithub-2.8.1-py3-none-any.whl", hash = "sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0", size = 432709, upload-time = "2025-09-02T17:41:52.947Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -5541,6 +5557,43 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" }, ] +[[package]] +name = "pynacl" +version = "1.6.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" }, +] + [[package]] name = "pynndescent" version = "0.5.13" @@ -6171,6 +6224,7 @@ dependencies = [ { name = "pyairtable" }, { name = "pyclipper" }, { name = "pycryptodomex" }, + { name = "pygithub" }, { name = "pyobvector" }, { name = "pyodbc" }, { name = "pypandoc" }, @@ -6301,6 +6355,7 @@ requires-dist = [ { name = "pyairtable", specifier = ">=3.3.0" }, { name = "pyclipper", specifier = ">=1.4.0,<2.0.0" }, { name = "pycryptodomex", specifier = "==3.20.0" }, + { name = "pygithub", specifier = ">=2.8.1" }, { name = "pyobvector", specifier = "==0.2.18" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pypandoc", specifier = ">=1.16" },