From c2c079886f0725408af9654668ba0d7a0591a3db Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 29 Dec 2025 17:06:40 +0800 Subject: [PATCH] Revert "Feat: github connector" (#12296) Reverts infiniflow/ragflow#12292 --- common/data_source/config.py | 2 - common/data_source/connector_runner.py | 217 ---- common/data_source/github/__init__.py | 0 common/data_source/github/connector.py | 954 ------------------ common/data_source/github/models.py | 17 - common/data_source/github/rate_limit_utils.py | 24 - common/data_source/github/utils.py | 46 - common/data_source/interfaces.py | 15 +- pyproject.toml | 1 - uv.lock | 55 - 10 files changed, 9 insertions(+), 1322 deletions(-) delete mode 100644 common/data_source/connector_runner.py delete mode 100644 common/data_source/github/__init__.py delete mode 100644 common/data_source/github/connector.py delete mode 100644 common/data_source/github/models.py delete mode 100644 common/data_source/github/rate_limit_utils.py delete mode 100644 common/data_source/github/utils.py diff --git a/common/data_source/config.py b/common/data_source/config.py index 8f9553365..fc14b524c 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -234,8 +234,6 @@ _REPLACEMENT_EXPANSIONS = "body.view.value" BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback") -GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None - class HtmlBasedConnectorTransformLinksStrategy(str, Enum): # remove links entirely STRIP = "strip" diff --git a/common/data_source/connector_runner.py b/common/data_source/connector_runner.py deleted file mode 100644 index d47d65128..000000000 --- a/common/data_source/connector_runner.py +++ /dev/null @@ -1,217 +0,0 @@ -import sys -import time -import logging -from collections.abc import Generator -from datetime import datetime -from typing import Generic -from typing import TypeVar -from common.data_source.interfaces import ( - BaseConnector, - CheckpointedConnector, - CheckpointedConnectorWithPermSync, - CheckpointOutput, - LoadConnector, - PollConnector, -) -from common.data_source.models import ConnectorCheckpoint, ConnectorFailure, Document - - -TimeRange = tuple[datetime, datetime] - -CT = TypeVar("CT", bound=ConnectorCheckpoint) - - -def batched_doc_ids( - checkpoint_connector_generator: CheckpointOutput[CT], - batch_size: int, -) -> Generator[set[str], None, None]: - batch: set[str] = set() - for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( - checkpoint_connector_generator - ): - if document is not None: - batch.add(document.id) - elif ( - failure and failure.failed_document and failure.failed_document.document_id - ): - batch.add(failure.failed_document.document_id) - - if len(batch) >= batch_size: - yield batch - batch = set() - if len(batch) > 0: - yield batch - - -class CheckpointOutputWrapper(Generic[CT]): - """ - Wraps a CheckpointOutput generator to give things back in a more digestible format, - specifically for Document outputs. - The connector format is easier for the connector implementor (e.g. it enforces exactly - one new checkpoint is returned AND that the checkpoint is at the end), thus the different - formats. - """ - - def __init__(self) -> None: - self.next_checkpoint: CT | None = None - - def __call__( - self, - checkpoint_connector_generator: CheckpointOutput[CT], - ) -> Generator[ - tuple[Document | None, ConnectorFailure | None, CT | None], - None, - None, - ]: - # grabs the final return value and stores it in the `next_checkpoint` variable - def _inner_wrapper( - checkpoint_connector_generator: CheckpointOutput[CT], - ) -> CheckpointOutput[CT]: - self.next_checkpoint = yield from checkpoint_connector_generator - return self.next_checkpoint # not used - - for document_or_failure in _inner_wrapper(checkpoint_connector_generator): - if isinstance(document_or_failure, Document): - yield document_or_failure, None, None - elif isinstance(document_or_failure, ConnectorFailure): - yield None, document_or_failure, None - else: - raise ValueError( - f"Invalid document_or_failure type: {type(document_or_failure)}" - ) - - if self.next_checkpoint is None: - raise RuntimeError( - "Checkpoint is None. This should never happen - the connector should always return a checkpoint." - ) - - yield None, None, self.next_checkpoint - - -class ConnectorRunner(Generic[CT]): - """ - Handles: - - Batching - - Additional exception logging - - Combining different connector types to a single interface - """ - - def __init__( - self, - connector: BaseConnector, - batch_size: int, - # cannot be True for non-checkpointed connectors - include_permissions: bool, - time_range: TimeRange | None = None, - ): - if not isinstance(connector, CheckpointedConnector) and include_permissions: - raise ValueError( - "include_permissions cannot be True for non-checkpointed connectors" - ) - - self.connector = connector - self.time_range = time_range - self.batch_size = batch_size - self.include_permissions = include_permissions - - self.doc_batch: list[Document] = [] - - def run(self, checkpoint: CT) -> Generator[ - tuple[list[Document] | None, ConnectorFailure | None, CT | None], - None, - None, - ]: - """Adds additional exception logging to the connector.""" - try: - if isinstance(self.connector, CheckpointedConnector): - if self.time_range is None: - raise ValueError("time_range is required for CheckpointedConnector") - - start = time.monotonic() - if self.include_permissions: - if not isinstance( - self.connector, CheckpointedConnectorWithPermSync - ): - raise ValueError( - "Connector does not support permission syncing" - ) - load_from_checkpoint = ( - self.connector.load_from_checkpoint_with_perm_sync - ) - else: - load_from_checkpoint = self.connector.load_from_checkpoint - checkpoint_connector_generator = load_from_checkpoint( - start=self.time_range[0].timestamp(), - end=self.time_range[1].timestamp(), - checkpoint=checkpoint, - ) - next_checkpoint: CT | None = None - # this is guaranteed to always run at least once with next_checkpoint being non-None - for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( - checkpoint_connector_generator - ): - if document is not None and isinstance(document, Document): - self.doc_batch.append(document) - - if failure is not None: - yield None, failure, None - - if len(self.doc_batch) >= self.batch_size: - yield self.doc_batch, None, None - self.doc_batch = [] - - # yield remaining documents - if len(self.doc_batch) > 0: - yield self.doc_batch, None, None - self.doc_batch = [] - - yield None, None, next_checkpoint - - logging.debug( - f"Connector took {time.monotonic() - start} seconds to get to the next checkpoint." - ) - - else: - finished_checkpoint = self.connector.build_dummy_checkpoint() - finished_checkpoint.has_more = False - - if isinstance(self.connector, PollConnector): - if self.time_range is None: - raise ValueError("time_range is required for PollConnector") - - for document_batch in self.connector.poll_source( - start=self.time_range[0].timestamp(), - end=self.time_range[1].timestamp(), - ): - yield document_batch, None, None - - yield None, None, finished_checkpoint - elif isinstance(self.connector, LoadConnector): - for document_batch in self.connector.load_from_state(): - yield document_batch, None, None - - yield None, None, finished_checkpoint - else: - raise ValueError(f"Invalid connector. type: {type(self.connector)}") - except Exception: - exc_type, _, exc_traceback = sys.exc_info() - - # Traverse the traceback to find the last frame where the exception was raised - tb = exc_traceback - if tb is None: - logging.error("No traceback found for exception") - raise - - while tb.tb_next: - tb = tb.tb_next # Move to the next frame in the traceback - - # Get the local variables from the frame where the exception occurred - local_vars = tb.tb_frame.f_locals - local_vars_str = "\n".join( - f"{key}: {value}" for key, value in local_vars.items() - ) - logging.error( - f"Error in connector. type: {exc_type};\n" - f"local_vars below -> \n{local_vars_str[:1024]}" - ) - raise \ No newline at end of file diff --git a/common/data_source/github/__init__.py b/common/data_source/github/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/common/data_source/github/connector.py b/common/data_source/github/connector.py deleted file mode 100644 index a6ee3a59a..000000000 --- a/common/data_source/github/connector.py +++ /dev/null @@ -1,954 +0,0 @@ -import copy -import logging -from collections.abc import Callable -from collections.abc import Generator -from datetime import datetime -from datetime import timedelta -from datetime import timezone -from enum import Enum -from typing import Any -from typing import cast - -from github import Github -from github import RateLimitExceededException -from github import Repository -from github.GithubException import GithubException -from github.Issue import Issue -from github.NamedUser import NamedUser -from github.PaginatedList import PaginatedList -from github.PullRequest import PullRequest -from pydantic import BaseModel -from typing_extensions import override - -from common.data_source.config import DocumentSource, GITHUB_CONNECTOR_BASE_URL -from common.data_source.exceptions import ( - ConnectorMissingCredentialError, - ConnectorValidationError, - CredentialExpiredError, - InsufficientPermissionsError, - UnexpectedValidationError, -) -from common.data_source.interfaces import CheckpointedConnectorWithPermSync, CheckpointOutput -from common.data_source.models import ( - ConnectorCheckpoint, - ConnectorFailure, - Document, - DocumentFailure, - ExternalAccess, - SecondsSinceUnixEpoch, - TextSection, -) -from common.data_source.connector_runner import ConnectorRunner -from .models import SerializedRepository -from .rate_limit_utils import sleep_after_rate_limit_exception -from .utils import deserialize_repository -from .utils import get_external_access_permission - -ITEMS_PER_PAGE = 100 -CURSOR_LOG_FREQUENCY = 50 - -_MAX_NUM_RATE_LIMIT_RETRIES = 5 - -ONE_DAY = timedelta(days=1) -SLIM_BATCH_SIZE = 100 -# Cases -# X (from start) standard run, no fallback to cursor-based pagination -# X (from start) standard run errors, fallback to cursor-based pagination -# X error in the middle of a page -# X no errors: run to completion -# X (from checkpoint) standard run, no fallback to cursor-based pagination -# X (from checkpoint) continue from cursor-based pagination -# - retrying -# - no retrying - -# things to check: -# checkpoint state on return -# checkpoint progress (no infinite loop) - - -class DocMetadata(BaseModel): - repo: str - - -def get_nextUrl_key(pag_list: PaginatedList[PullRequest | Issue]) -> str: - if "_PaginatedList__nextUrl" in pag_list.__dict__: - return "_PaginatedList__nextUrl" - for key in pag_list.__dict__: - if "__nextUrl" in key: - return key - for key in pag_list.__dict__: - if "nextUrl" in key: - return key - return "" - - -def get_nextUrl( - pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str -) -> str | None: - return getattr(pag_list, nextUrl_key) if nextUrl_key else None - - -def set_nextUrl( - pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str, nextUrl: str -) -> None: - if nextUrl_key: - setattr(pag_list, nextUrl_key, nextUrl) - elif nextUrl: - raise ValueError("Next URL key not found: " + str(pag_list.__dict__)) - - -def _paginate_until_error( - git_objs: Callable[[], PaginatedList[PullRequest | Issue]], - cursor_url: str | None, - prev_num_objs: int, - cursor_url_callback: Callable[[str | None, int], None], - retrying: bool = False, -) -> Generator[PullRequest | Issue, None, None]: - num_objs = prev_num_objs - pag_list = git_objs() - nextUrl_key = get_nextUrl_key(pag_list) - if cursor_url: - set_nextUrl(pag_list, nextUrl_key, cursor_url) - elif retrying: - # if we are retrying, we want to skip the objects retrieved - # over previous calls. Unfortunately, this WILL retrieve all - # pages before the one we are resuming from, so we really - # don't want this case to be hit often - logging.warning( - "Retrying from a previous cursor-based pagination call. " - "This will retrieve all pages before the one we are resuming from, " - "which may take a while and consume many API calls." - ) - pag_list = cast(PaginatedList[PullRequest | Issue], pag_list[prev_num_objs:]) - num_objs = 0 - - try: - # this for loop handles cursor-based pagination - for issue_or_pr in pag_list: - num_objs += 1 - yield issue_or_pr - # used to store the current cursor url in the checkpoint. This value - # is updated during iteration over pag_list. - cursor_url_callback(get_nextUrl(pag_list, nextUrl_key), num_objs) - - if num_objs % CURSOR_LOG_FREQUENCY == 0: - logging.info( - f"Retrieved {num_objs} objects with current cursor url: {get_nextUrl(pag_list, nextUrl_key)}" - ) - - except Exception as e: - logging.exception(f"Error during cursor-based pagination: {e}") - if num_objs - prev_num_objs > 0: - raise - - if get_nextUrl(pag_list, nextUrl_key) is not None and not retrying: - logging.info( - "Assuming that this error is due to cursor " - "expiration because no objects were retrieved. " - "Retrying from the first page." - ) - yield from _paginate_until_error( - git_objs, None, prev_num_objs, cursor_url_callback, retrying=True - ) - return - - # for no cursor url or if we reach this point after a retry, raise the error - raise - - -def _get_batch_rate_limited( - # We pass in a callable because we want git_objs to produce a fresh - # PaginatedList each time it's called to avoid using the same object for cursor-based pagination - # from a partial offset-based pagination call. - git_objs: Callable[[], PaginatedList], - page_num: int, - cursor_url: str | None, - prev_num_objs: int, - cursor_url_callback: Callable[[str | None, int], None], - github_client: Github, - attempt_num: int = 0, -) -> Generator[PullRequest | Issue, None, None]: - if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: - raise RuntimeError( - "Re-tried fetching batch too many times. Something is going wrong with fetching objects from Github" - ) - try: - if cursor_url: - # when this is set, we are resuming from an earlier - # cursor-based pagination call. - yield from _paginate_until_error( - git_objs, cursor_url, prev_num_objs, cursor_url_callback - ) - return - objs = list(git_objs().get_page(page_num)) - # fetch all data here to disable lazy loading later - # this is needed to capture the rate limit exception here (if one occurs) - for obj in objs: - if hasattr(obj, "raw_data"): - getattr(obj, "raw_data") - yield from objs - except RateLimitExceededException: - sleep_after_rate_limit_exception(github_client) - yield from _get_batch_rate_limited( - git_objs, - page_num, - cursor_url, - prev_num_objs, - cursor_url_callback, - github_client, - attempt_num + 1, - ) - except GithubException as e: - if not ( - e.status == 422 - and ( - "cursor" in (e.message or "") - or "cursor" in (e.data or {}).get("message", "") - ) - ): - raise - # Fallback to a cursor-based pagination strategy - # This can happen for "large datasets," but there's no documentation - # On the error on the web as far as we can tell. - # Error message: - # "Pagination with the page parameter is not supported for large datasets, - # please use cursor based pagination (after/before)" - yield from _paginate_until_error( - git_objs, cursor_url, prev_num_objs, cursor_url_callback - ) - - -def _get_userinfo(user: NamedUser) -> dict[str, str]: - def _safe_get(attr_name: str) -> str | None: - try: - return cast(str | None, getattr(user, attr_name)) - except GithubException: - logging.debug(f"Error getting {attr_name} for user") - return None - - return { - k: v - for k, v in { - "login": _safe_get("login"), - "name": _safe_get("name"), - "email": _safe_get("email"), - }.items() - if v is not None - } - - -def _convert_pr_to_document( - pull_request: PullRequest, repo_external_access: ExternalAccess | None -) -> Document: - repo_name = pull_request.base.repo.full_name if pull_request.base else "" - doc_metadata = DocMetadata(repo=repo_name) - return Document( - id=pull_request.html_url, - sections=[ - TextSection(link=pull_request.html_url, text=pull_request.body or "") - ], - external_access=repo_external_access, - source=DocumentSource.GITHUB, - semantic_identifier=f"{pull_request.number}: {pull_request.title}", - # updated_at is UTC time but is timezone unaware, explicitly add UTC - # as there is logic in indexing to prevent wrong timestamped docs - # due to local time discrepancies with UTC - doc_updated_at=( - pull_request.updated_at.replace(tzinfo=timezone.utc) - if pull_request.updated_at - else None - ), - # this metadata is used in perm sync - doc_metadata=doc_metadata.model_dump(), - metadata={ - k: [str(vi) for vi in v] if isinstance(v, list) else str(v) - for k, v in { - "object_type": "PullRequest", - "id": pull_request.number, - "merged": pull_request.merged, - "state": pull_request.state, - "user": _get_userinfo(pull_request.user) if pull_request.user else None, - "assignees": [ - _get_userinfo(assignee) for assignee in pull_request.assignees - ], - "repo": ( - pull_request.base.repo.full_name if pull_request.base else None - ), - "num_commits": str(pull_request.commits), - "num_files_changed": str(pull_request.changed_files), - "labels": [label.name for label in pull_request.labels], - "created_at": ( - pull_request.created_at.replace(tzinfo=timezone.utc) - if pull_request.created_at - else None - ), - "updated_at": ( - pull_request.updated_at.replace(tzinfo=timezone.utc) - if pull_request.updated_at - else None - ), - "closed_at": ( - pull_request.closed_at.replace(tzinfo=timezone.utc) - if pull_request.closed_at - else None - ), - "merged_at": ( - pull_request.merged_at.replace(tzinfo=timezone.utc) - if pull_request.merged_at - else None - ), - "merged_by": ( - _get_userinfo(pull_request.merged_by) - if pull_request.merged_by - else None - ), - }.items() - if v is not None - }, - ) - - -def _fetch_issue_comments(issue: Issue) -> str: - comments = issue.get_comments() - return "\nComment: ".join(comment.body for comment in comments) - - -def _convert_issue_to_document( - issue: Issue, repo_external_access: ExternalAccess | None -) -> Document: - repo_name = issue.repository.full_name if issue.repository else "" - doc_metadata = DocMetadata(repo=repo_name) - return Document( - id=issue.html_url, - sections=[TextSection(link=issue.html_url, text=issue.body or "")], - source=DocumentSource.GITHUB, - external_access=repo_external_access, - semantic_identifier=f"{issue.number}: {issue.title}", - # updated_at is UTC time but is timezone unaware - doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), - # this metadata is used in perm sync - doc_metadata=doc_metadata.model_dump(), - metadata={ - k: [str(vi) for vi in v] if isinstance(v, list) else str(v) - for k, v in { - "object_type": "Issue", - "id": issue.number, - "state": issue.state, - "user": _get_userinfo(issue.user) if issue.user else None, - "assignees": [_get_userinfo(assignee) for assignee in issue.assignees], - "repo": issue.repository.full_name if issue.repository else None, - "labels": [label.name for label in issue.labels], - "created_at": ( - issue.created_at.replace(tzinfo=timezone.utc) - if issue.created_at - else None - ), - "updated_at": ( - issue.updated_at.replace(tzinfo=timezone.utc) - if issue.updated_at - else None - ), - "closed_at": ( - issue.closed_at.replace(tzinfo=timezone.utc) - if issue.closed_at - else None - ), - "closed_by": ( - _get_userinfo(issue.closed_by) if issue.closed_by else None - ), - }.items() - if v is not None - }, - ) - - -class GithubConnectorStage(Enum): - START = "start" - PRS = "prs" - ISSUES = "issues" - - -class GithubConnectorCheckpoint(ConnectorCheckpoint): - stage: GithubConnectorStage - curr_page: int - - cached_repo_ids: list[int] | None = None - cached_repo: SerializedRepository | None = None - - # Used for the fallback cursor-based pagination strategy - num_retrieved: int - cursor_url: str | None = None - - def reset(self) -> None: - """ - Resets curr_page, num_retrieved, and cursor_url to their initial values (0, 0, None) - """ - self.curr_page = 0 - self.num_retrieved = 0 - self.cursor_url = None - - -def make_cursor_url_callback( - checkpoint: GithubConnectorCheckpoint, -) -> Callable[[str | None, int], None]: - def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None: - # we want to maintain the old cursor url so code after retrieval - # can determine that we are using the fallback cursor-based pagination strategy - if cursor_url: - checkpoint.cursor_url = cursor_url - checkpoint.num_retrieved = num_objs - - return cursor_url_callback - - -class GithubConnector(CheckpointedConnectorWithPermSync[GithubConnectorCheckpoint]): - def __init__( - self, - repo_owner: str, - repositories: str | None = None, - state_filter: str = "all", - include_prs: bool = True, - include_issues: bool = False, - ) -> None: - self.repo_owner = repo_owner - self.repositories = repositories - self.state_filter = state_filter - self.include_prs = include_prs - self.include_issues = include_issues - self.github_client: Github | None = None - - def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - # defaults to 30 items per page, can be set to as high as 100 - self.github_client = ( - Github( - credentials["github_access_token"], - base_url=GITHUB_CONNECTOR_BASE_URL, - per_page=ITEMS_PER_PAGE, - ) - if GITHUB_CONNECTOR_BASE_URL - else Github(credentials["github_access_token"], per_page=ITEMS_PER_PAGE) - ) - return None - - def get_github_repo( - self, github_client: Github, attempt_num: int = 0 - ) -> Repository.Repository: - if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: - raise RuntimeError( - "Re-tried fetching repo too many times. Something is going wrong with fetching objects from Github" - ) - - try: - return github_client.get_repo(f"{self.repo_owner}/{self.repositories}") - except RateLimitExceededException: - sleep_after_rate_limit_exception(github_client) - return self.get_github_repo(github_client, attempt_num + 1) - - def get_github_repos( - self, github_client: Github, attempt_num: int = 0 - ) -> list[Repository.Repository]: - """Get specific repositories based on comma-separated repo_name string.""" - if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: - raise RuntimeError( - "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github" - ) - - try: - repos = [] - # Split repo_name by comma and strip whitespace - repo_names = [ - name.strip() for name in (cast(str, self.repositories)).split(",") - ] - - for repo_name in repo_names: - if repo_name: # Skip empty strings - try: - repo = github_client.get_repo(f"{self.repo_owner}/{repo_name}") - repos.append(repo) - except GithubException as e: - logging.warning( - f"Could not fetch repo {self.repo_owner}/{repo_name}: {e}" - ) - - return repos - except RateLimitExceededException: - sleep_after_rate_limit_exception(github_client) - return self.get_github_repos(github_client, attempt_num + 1) - - def get_all_repos( - self, github_client: Github, attempt_num: int = 0 - ) -> list[Repository.Repository]: - if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: - raise RuntimeError( - "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github" - ) - - try: - # Try to get organization first - try: - org = github_client.get_organization(self.repo_owner) - return list(org.get_repos()) - - except GithubException: - # If not an org, try as a user - user = github_client.get_user(self.repo_owner) - return list(user.get_repos()) - except RateLimitExceededException: - sleep_after_rate_limit_exception(github_client) - return self.get_all_repos(github_client, attempt_num + 1) - - def _pull_requests_func( - self, repo: Repository.Repository - ) -> Callable[[], PaginatedList[PullRequest]]: - return lambda: repo.get_pulls( - state=self.state_filter, sort="updated", direction="desc" - ) - - def _issues_func( - self, repo: Repository.Repository - ) -> Callable[[], PaginatedList[Issue]]: - return lambda: repo.get_issues( - state=self.state_filter, sort="updated", direction="desc" - ) - - def _fetch_from_github( - self, - checkpoint: GithubConnectorCheckpoint, - start: datetime | None = None, - end: datetime | None = None, - include_permissions: bool = False, - ) -> Generator[Document | ConnectorFailure, None, GithubConnectorCheckpoint]: - if self.github_client is None: - raise ConnectorMissingCredentialError("GitHub") - - checkpoint = copy.deepcopy(checkpoint) - - # First run of the connector, fetch all repos and store in checkpoint - if checkpoint.cached_repo_ids is None: - repos = [] - if self.repositories: - if "," in self.repositories: - # Multiple repositories specified - repos = self.get_github_repos(self.github_client) - else: - # Single repository (backward compatibility) - repos = [self.get_github_repo(self.github_client)] - else: - # All repositories - repos = self.get_all_repos(self.github_client) - if not repos: - checkpoint.has_more = False - return checkpoint - - curr_repo = repos.pop() - checkpoint.cached_repo_ids = [repo.id for repo in repos] - checkpoint.cached_repo = SerializedRepository( - id=curr_repo.id, - headers=curr_repo.raw_headers, - raw_data=curr_repo.raw_data, - ) - checkpoint.stage = GithubConnectorStage.PRS - checkpoint.curr_page = 0 - # save checkpoint with repo ids retrieved - return checkpoint - - if checkpoint.cached_repo is None: - raise ValueError("No repo saved in checkpoint") - - # Deserialize the repository from the checkpoint - repo = deserialize_repository(checkpoint.cached_repo, self.github_client) - - cursor_url_callback = make_cursor_url_callback(checkpoint) - repo_external_access: ExternalAccess | None = None - if include_permissions: - repo_external_access = get_external_access_permission( - repo, self.github_client - ) - if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS: - logging.info(f"Fetching PRs for repo: {repo.name}") - - pr_batch = _get_batch_rate_limited( - self._pull_requests_func(repo), - checkpoint.curr_page, - checkpoint.cursor_url, - checkpoint.num_retrieved, - cursor_url_callback, - self.github_client, - ) - checkpoint.curr_page += 1 # NOTE: not used for cursor-based fallback - done_with_prs = False - num_prs = 0 - pr = None - for pr in pr_batch: - num_prs += 1 - - # we iterate backwards in time, so at this point we stop processing prs - if ( - start is not None - and pr.updated_at - and pr.updated_at.replace(tzinfo=timezone.utc) < start - ): - done_with_prs = True - break - # Skip PRs updated after the end date - if ( - end is not None - and pr.updated_at - and pr.updated_at.replace(tzinfo=timezone.utc) > end - ): - continue - try: - yield _convert_pr_to_document( - cast(PullRequest, pr), repo_external_access - ) - except Exception as e: - error_msg = f"Error converting PR to document: {e}" - logging.exception(error_msg) - yield ConnectorFailure( - failed_document=DocumentFailure( - document_id=str(pr.id), document_link=pr.html_url - ), - failure_message=error_msg, - exception=e, - ) - continue - - # If we reach this point with a cursor url in the checkpoint, we were using - # the fallback cursor-based pagination strategy. That strategy tries to get all - # PRs, so having curosr_url set means we are done with prs. However, we need to - # return AFTER the checkpoint reset to avoid infinite loops. - - # if we found any PRs on the page and there are more PRs to get, return the checkpoint. - # In offset mode, while indexing without time constraints, the pr batch - # will be empty when we're done. - used_cursor = checkpoint.cursor_url is not None - logging.info(f"Fetched {num_prs} PRs for repo: {repo.name}") - if num_prs > 0 and not done_with_prs and not used_cursor: - return checkpoint - - # if we went past the start date during the loop or there are no more - # prs to get, we move on to issues - checkpoint.stage = GithubConnectorStage.ISSUES - checkpoint.reset() - - if used_cursor: - # save the checkpoint after changing stage; next run will continue from issues - return checkpoint - - checkpoint.stage = GithubConnectorStage.ISSUES - - if self.include_issues and checkpoint.stage == GithubConnectorStage.ISSUES: - logging.info(f"Fetching issues for repo: {repo.name}") - - issue_batch = list( - _get_batch_rate_limited( - self._issues_func(repo), - checkpoint.curr_page, - checkpoint.cursor_url, - checkpoint.num_retrieved, - cursor_url_callback, - self.github_client, - ) - ) - logging.info(f"Fetched {len(issue_batch)} issues for repo: {repo.name}") - checkpoint.curr_page += 1 - done_with_issues = False - num_issues = 0 - for issue in issue_batch: - num_issues += 1 - issue = cast(Issue, issue) - # we iterate backwards in time, so at this point we stop processing prs - if ( - start is not None - and issue.updated_at.replace(tzinfo=timezone.utc) < start - ): - done_with_issues = True - break - # Skip PRs updated after the end date - if ( - end is not None - and issue.updated_at.replace(tzinfo=timezone.utc) > end - ): - continue - - if issue.pull_request is not None: - # PRs are handled separately - continue - - try: - yield _convert_issue_to_document(issue, repo_external_access) - except Exception as e: - error_msg = f"Error converting issue to document: {e}" - logging.exception(error_msg) - yield ConnectorFailure( - failed_document=DocumentFailure( - document_id=str(issue.id), - document_link=issue.html_url, - ), - failure_message=error_msg, - exception=e, - ) - continue - - logging.info(f"Fetched {num_issues} issues for repo: {repo.name}") - # if we found any issues on the page, and we're not done, return the checkpoint. - # don't return if we're using cursor-based pagination to avoid infinite loops - if num_issues > 0 and not done_with_issues and not checkpoint.cursor_url: - return checkpoint - - # if we went past the start date during the loop or there are no more - # issues to get, we move on to the next repo - checkpoint.stage = GithubConnectorStage.PRS - checkpoint.reset() - - checkpoint.has_more = len(checkpoint.cached_repo_ids) > 0 - if checkpoint.cached_repo_ids: - next_id = checkpoint.cached_repo_ids.pop() - next_repo = self.github_client.get_repo(next_id) - checkpoint.cached_repo = SerializedRepository( - id=next_id, - headers=next_repo.raw_headers, - raw_data=next_repo.raw_data, - ) - checkpoint.stage = GithubConnectorStage.PRS - checkpoint.reset() - - if checkpoint.cached_repo_ids: - logging.info( - f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})" - ) - else: - logging.info("No more repos remaining") - - return checkpoint - - def _load_from_checkpoint( - self, - start: SecondsSinceUnixEpoch, - end: SecondsSinceUnixEpoch, - checkpoint: GithubConnectorCheckpoint, - include_permissions: bool = False, - ) -> CheckpointOutput[GithubConnectorCheckpoint]: - start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) - # add a day for timezone safety - end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) + ONE_DAY - - # Move start time back by 3 hours, since some Issues/PRs are getting dropped - # Could be due to delayed processing on GitHub side - # The non-updated issues since last poll will be shortcut-ed and not embedded - adjusted_start_datetime = start_datetime - timedelta(hours=3) - - epoch = datetime.fromtimestamp(0, tz=timezone.utc) - if adjusted_start_datetime < epoch: - adjusted_start_datetime = epoch - - return self._fetch_from_github( - checkpoint, - start=adjusted_start_datetime, - end=end_datetime, - include_permissions=include_permissions, - ) - - @override - def load_from_checkpoint( - self, - start: SecondsSinceUnixEpoch, - end: SecondsSinceUnixEpoch, - checkpoint: GithubConnectorCheckpoint, - ) -> CheckpointOutput[GithubConnectorCheckpoint]: - return self._load_from_checkpoint( - start, end, checkpoint, include_permissions=False - ) - - @override - def load_from_checkpoint_with_perm_sync( - self, - start: SecondsSinceUnixEpoch, - end: SecondsSinceUnixEpoch, - checkpoint: GithubConnectorCheckpoint, - ) -> CheckpointOutput[GithubConnectorCheckpoint]: - return self._load_from_checkpoint( - start, end, checkpoint, include_permissions=True - ) - - def validate_connector_settings(self) -> None: - if self.github_client is None: - raise ConnectorMissingCredentialError("GitHub credentials not loaded.") - - if not self.repo_owner: - raise ConnectorValidationError( - "Invalid connector settings: 'repo_owner' must be provided." - ) - - try: - if self.repositories: - if "," in self.repositories: - # Multiple repositories specified - repo_names = [name.strip() for name in self.repositories.split(",")] - if not repo_names: - raise ConnectorValidationError( - "Invalid connector settings: No valid repository names provided." - ) - - # Validate at least one repository exists and is accessible - valid_repos = False - validation_errors = [] - - for repo_name in repo_names: - if not repo_name: - continue - - try: - test_repo = self.github_client.get_repo( - f"{self.repo_owner}/{repo_name}" - ) - logging.info( - f"Successfully accessed repository: {self.repo_owner}/{repo_name}" - ) - test_repo.get_contents("") - valid_repos = True - # If at least one repo is valid, we can proceed - break - except GithubException as e: - validation_errors.append( - f"Repository '{repo_name}': {e.data.get('message', str(e))}" - ) - - if not valid_repos: - error_msg = ( - "None of the specified repositories could be accessed: " - ) - error_msg += ", ".join(validation_errors) - raise ConnectorValidationError(error_msg) - else: - # Single repository (backward compatibility) - test_repo = self.github_client.get_repo( - f"{self.repo_owner}/{self.repositories}" - ) - test_repo.get_contents("") - else: - # Try to get organization first - try: - org = self.github_client.get_organization(self.repo_owner) - total_count = org.get_repos().totalCount - if total_count == 0: - raise ConnectorValidationError( - f"Found no repos for organization: {self.repo_owner}. " - "Does the credential have the right scopes?" - ) - except GithubException as e: - # Check for missing SSO - MISSING_SSO_ERROR_MESSAGE = "You must grant your Personal Access token access to this organization".lower() - if MISSING_SSO_ERROR_MESSAGE in str(e).lower(): - SSO_GUIDE_LINK = ( - "https://docs.github.com/en/enterprise-cloud@latest/authentication/" - "authenticating-with-saml-single-sign-on/" - "authorizing-a-personal-access-token-for-use-with-saml-single-sign-on" - ) - raise ConnectorValidationError( - f"Your GitHub token is missing authorization to access the " - f"`{self.repo_owner}` organization. Please follow the guide to " - f"authorize your token: {SSO_GUIDE_LINK}" - ) - # If not an org, try as a user - user = self.github_client.get_user(self.repo_owner) - - # Check if we can access any repos - total_count = user.get_repos().totalCount - if total_count == 0: - raise ConnectorValidationError( - f"Found no repos for user: {self.repo_owner}. " - "Does the credential have the right scopes?" - ) - - except RateLimitExceededException: - raise UnexpectedValidationError( - "Validation failed due to GitHub rate-limits being exceeded. Please try again later." - ) - - except GithubException as e: - if e.status == 401: - raise CredentialExpiredError( - "GitHub credential appears to be invalid or expired (HTTP 401)." - ) - elif e.status == 403: - raise InsufficientPermissionsError( - "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)." - ) - elif e.status == 404: - if self.repositories: - if "," in self.repositories: - raise ConnectorValidationError( - f"None of the specified GitHub repositories could be found for owner: {self.repo_owner}" - ) - else: - raise ConnectorValidationError( - f"GitHub repository not found with name: {self.repo_owner}/{self.repositories}" - ) - else: - raise ConnectorValidationError( - f"GitHub user or organization not found: {self.repo_owner}" - ) - else: - raise ConnectorValidationError( - f"Unexpected GitHub error (status={e.status}): {e.data}" - ) - - except Exception as exc: - raise Exception( - f"Unexpected error during GitHub settings validation: {exc}" - ) - - def validate_checkpoint_json( - self, checkpoint_json: str - ) -> GithubConnectorCheckpoint: - return GithubConnectorCheckpoint.model_validate_json(checkpoint_json) - - def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint: - return GithubConnectorCheckpoint( - stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0 - ) - - -if __name__ == "__main__": - import os - - # Initialize the connector - connector = GithubConnector( - repo_owner=os.environ["REPO_OWNER"], - repositories=os.environ.get("REPOSITORIES"), - ) - connector.load_credentials( - {"github_access_token": os.environ["ACCESS_TOKEN_GITHUB"]} - ) - - if connector.github_client: - get_external_access_permission( - connector.get_github_repos(connector.github_client).pop(), - connector.github_client, - ) - - # Create a time range from epoch to now - end_time = datetime.now(timezone.utc) - start_time = datetime.fromtimestamp(0, tz=timezone.utc) - time_range = (start_time, end_time) - - # Initialize the runner with a batch size of 10 - runner: ConnectorRunner[GithubConnectorCheckpoint] = ConnectorRunner( - connector, batch_size=10, include_permissions=False, time_range=time_range - ) - - # Get initial checkpoint - checkpoint = connector.build_dummy_checkpoint() - - # Run the connector - while checkpoint.has_more: - for doc_batch, failure, next_checkpoint in runner.run(checkpoint): - if doc_batch: - print(f"Retrieved batch of {len(doc_batch)} documents") - for doc in doc_batch: - print(f"Document: {doc.semantic_identifier}") - if failure: - print(f"Failure: {failure.failure_message}") - if next_checkpoint: - checkpoint = next_checkpoint \ No newline at end of file diff --git a/common/data_source/github/models.py b/common/data_source/github/models.py deleted file mode 100644 index 9754bfa8d..000000000 --- a/common/data_source/github/models.py +++ /dev/null @@ -1,17 +0,0 @@ -from typing import Any - -from github import Repository -from github.Requester import Requester -from pydantic import BaseModel - - -class SerializedRepository(BaseModel): - # id is part of the raw_data as well, just pulled out for convenience - id: int - headers: dict[str, str | int] - raw_data: dict[str, Any] - - def to_Repository(self, requester: Requester) -> Repository.Repository: - return Repository.Repository( - requester, self.headers, self.raw_data, completed=True - ) \ No newline at end of file diff --git a/common/data_source/github/rate_limit_utils.py b/common/data_source/github/rate_limit_utils.py deleted file mode 100644 index d683bad08..000000000 --- a/common/data_source/github/rate_limit_utils.py +++ /dev/null @@ -1,24 +0,0 @@ -import time -import logging -from datetime import datetime -from datetime import timedelta -from datetime import timezone - -from github import Github - - -def sleep_after_rate_limit_exception(github_client: Github) -> None: - """ - Sleep until the GitHub rate limit resets. - - Args: - github_client: The GitHub client that hit the rate limit - """ - sleep_time = github_client.get_rate_limit().core.reset.replace( - tzinfo=timezone.utc - ) - datetime.now(tz=timezone.utc) - sleep_time += timedelta(minutes=1) # add an extra minute just to be safe - logging.info( - "Ran into Github rate-limit. Sleeping %s seconds.", sleep_time.seconds - ) - time.sleep(sleep_time.total_seconds()) \ No newline at end of file diff --git a/common/data_source/github/utils.py b/common/data_source/github/utils.py deleted file mode 100644 index 13b25ac51..000000000 --- a/common/data_source/github/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -from collections.abc import Callable -from typing import cast - -from github import Github -from github.Repository import Repository - -from common.data_source.models import ExternalAccess - -from .models import SerializedRepository - - -def get_external_access_permission( - repo: Repository, github_client: Github -) -> ExternalAccess: - """ - Get the external access permission for a repository. - This functionality requires Enterprise Edition. - """ - # RAGFlow doesn't implement the Onyx EE external-permissions system. - # Default to private/unknown permissions. - return ExternalAccess.empty() - - -def deserialize_repository( - cached_repo: SerializedRepository, github_client: Github -) -> Repository: - """ - Deserialize a SerializedRepository back into a Repository object. - """ - # Try to access the requester - different PyGithub versions may use different attribute names - try: - # Try to get the requester using getattr to avoid linter errors - requester = getattr(github_client, "_requester", None) - if requester is None: - requester = getattr(github_client, "_Github__requester", None) - if requester is None: - # If we can't find the requester attribute, we need to fall back to recreating the repo - raise AttributeError("Could not find requester attribute") - - return cached_repo.to_Repository(requester) - except Exception as e: - # If all else fails, re-fetch the repo directly - logging.warning("Failed to deserialize repository: %s. Attempting to re-fetch.", e) - repo_id = cached_repo.id - return github_client.get_repo(repo_id) \ No newline at end of file diff --git a/common/data_source/interfaces.py b/common/data_source/interfaces.py index 679414dc6..5232f62de 100644 --- a/common/data_source/interfaces.py +++ b/common/data_source/interfaces.py @@ -237,13 +237,16 @@ class BaseConnector(abc.ABC, Generic[CT]): def validate_perm_sync(self) -> None: """ - Permission-sync validation hook. - - RAGFlow doesn't ship the Onyx EE permission-sync validation package. - Connectors that support permission sync should override - `validate_connector_settings()` as needed. + Don't override this; add a function to perm_sync_valid.py in the ee package + to do permission sync validation """ - return None + """ + validate_connector_settings_fn = fetch_ee_implementation_or_noop( + "onyx.connectors.perm_sync_valid", + "validate_perm_sync", + noop_return_value=None, + ) + validate_connector_settings_fn(self)""" def set_allow_images(self, value: bool) -> None: """Implement if the underlying connector wants to skip/allow image downloading diff --git a/pyproject.toml b/pyproject.toml index 08d6b1de9..716ec3a06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,7 +149,6 @@ dependencies = [ # "cryptography==46.0.3", # "jinja2>=3.1.0", "pyairtable>=3.3.0", - "pygithub>=2.8.1", "asana>=5.2.2", "python-gitlab>=7.0.0", ] diff --git a/uv.lock b/uv.lock index 9472ee3d5..cdb621019 100644 --- a/uv.lock +++ b/uv.lock @@ -5509,22 +5509,6 @@ dependencies = [ ] sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/8e/aedef81641c8dca6fd0fb7294de5bed9c45f3397d67fddf755c1042c2642/PyExecJS-1.5.1.tar.gz", hash = "sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c", size = 13344, upload-time = "2018-01-18T04:33:55.126Z" } -[[package]] -name = "pygithub" -version = "2.8.1" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -dependencies = [ - { name = "pyjwt", extra = ["crypto"] }, - { name = "pynacl" }, - { name = "requests" }, - { name = "typing-extensions" }, - { name = "urllib3" }, -] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/74/e560bdeffea72ecb26cff27f0fad548bbff5ecc51d6a155311ea7f9e4c4c/pygithub-2.8.1.tar.gz", hash = "sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9", size = 2246994, upload-time = "2025-09-02T17:41:54.674Z" } -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/ba/7049ce39f653f6140aac4beb53a5aaf08b4407b6a3019aae394c1c5244ff/pygithub-2.8.1-py3-none-any.whl", hash = "sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0", size = 432709, upload-time = "2025-09-02T17:41:52.947Z" }, -] - [[package]] name = "pygments" version = "2.19.2" @@ -5557,43 +5541,6 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" }, ] -[[package]] -name = "pynacl" -version = "1.6.1" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -dependencies = [ - { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, -] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" } -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" }, -] - [[package]] name = "pynndescent" version = "0.5.13" @@ -6237,7 +6184,6 @@ dependencies = [ { name = "pyairtable" }, { name = "pyclipper" }, { name = "pycryptodomex" }, - { name = "pygithub" }, { name = "pyobvector" }, { name = "pyodbc" }, { name = "pypandoc" }, @@ -6369,7 +6315,6 @@ requires-dist = [ { name = "pyairtable", specifier = ">=3.3.0" }, { name = "pyclipper", specifier = ">=1.4.0,<2.0.0" }, { name = "pycryptodomex", specifier = "==3.20.0" }, - { name = "pygithub", specifier = ">=2.8.1" }, { name = "pyobvector", specifier = "==0.2.18" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pypandoc", specifier = ">=1.16" },