Feat: github connector (#12292)

### What problem does this PR solve?

Feat: github connector

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-12-30 00:32:30 +08:00 · 2025-12-29 16:57:20 +08:00
parent 0b5d1ebefa
commit f099bc1236
10 changed files with 1322 additions and 9 deletions
--- a/common/data_source/config.py
+++ b/common/data_source/config.py
@ -232,6 +232,8 @@ _REPLACEMENT_EXPANSIONS = "body.view.value"
 # Redirect URI for the Box web OAuth flow; overridable through the environment.
 BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback")
 # Optional base URL for the GitHub connector. An unset or empty environment
 # variable is normalized to None.
 GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None
 class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
    # remove links entirely
    STRIP = "strip"
--- a/common/data_source/connector_runner.py
+++ b/common/data_source/connector_runner.py
@ -0,0 +1,217 @@
 import sys
 import time
 import logging
 from collections.abc import Generator
 from datetime import datetime
 from typing import Generic
 from typing import TypeVar
 from common.data_source.interfaces import (
    BaseConnector,
    CheckpointedConnector,
    CheckpointedConnectorWithPermSync,
    CheckpointOutput,
    LoadConnector,
    PollConnector,
 )
 from common.data_source.models import ConnectorCheckpoint, ConnectorFailure, Document
 # (start, end) pair of datetimes; ConnectorRunner converts both to POSIX
 # timestamps before handing them to the connector.
 TimeRange = tuple[datetime, datetime]
 # Type variable for checkpoint classes, bounded by ConnectorCheckpoint.
 CT = TypeVar("CT", bound=ConnectorCheckpoint)
 def batched_doc_ids(
    checkpoint_connector_generator: CheckpointOutput[CT],
    batch_size: int,
 ) -> Generator[set[str], None, None]:
    """
    Collect document ids from a checkpointed connector run and yield them in sets.

    Ids come from successfully produced documents and, for failures, from the
    failed document's `document_id` when one is present. A set is yielded as soon
    as it holds at least `batch_size` ids; any remainder is yielded at the end.
    """
    pending_ids: set[str] = set()
    wrapped_output = CheckpointOutputWrapper[CT]()(checkpoint_connector_generator)
    for doc, fail, _checkpoint in wrapped_output:
        if doc is not None:
            pending_ids.add(doc.id)
        else:
            failed_doc = fail.failed_document if fail else None
            if failed_doc and failed_doc.document_id:
                pending_ids.add(failed_doc.document_id)
        if len(pending_ids) >= batch_size:
            yield pending_ids
            pending_ids = set()
    if pending_ids:
        yield pending_ids
 class CheckpointOutputWrapper(Generic[CT]):
    """
    Adapts a CheckpointOutput generator into a flat stream of 3-tuples
    `(document, failure, next_checkpoint)` in which exactly one slot is set.

    Connectors yield documents/failures and *return* their checkpoint, which is
    convenient for implementors (exactly one checkpoint, always last). Consumers
    get the checkpoint as the final yielded tuple instead.
    """
    def __init__(self) -> None:
        self.next_checkpoint: CT | None = None
    def __call__(
        self,
        checkpoint_connector_generator: CheckpointOutput[CT],
    ) -> Generator[
        tuple[Document | None, ConnectorFailure | None, CT | None],
        None,
        None,
    ]:
        def _capture_return(
            source: CheckpointOutput[CT],
        ) -> CheckpointOutput[CT]:
            # `yield from` evaluates to the generator's return value, which is
            # the connector's final checkpoint; stash it on the wrapper.
            self.next_checkpoint = yield from source
            return self.next_checkpoint  # not used
        for item in _capture_return(checkpoint_connector_generator):
            if isinstance(item, Document):
                yield item, None, None
                continue
            if isinstance(item, ConnectorFailure):
                yield None, item, None
                continue
            raise ValueError(
                f"Invalid document_or_failure type: {type(item)}"
            )
        # the source generator is exhausted here, so its return value is known
        if self.next_checkpoint is None:
            raise RuntimeError(
                "Checkpoint is None. This should never happen - the connector should always return a checkpoint."
            )
        yield None, None, self.next_checkpoint
 class ConnectorRunner(Generic[CT]):
    """
    Handles:
        - Batching
        - Additional exception logging
        - Combining different connector types to a single interface

    `run` yields `(document_batch, failure, checkpoint)` tuples in which one slot
    is populated per tuple, regardless of whether the wrapped connector is a
    checkpointed, poll, or load connector.
    """
    def __init__(
        self,
        connector: BaseConnector,
        batch_size: int,
        # cannot be True for non-checkpointed connectors
        include_permissions: bool,
        time_range: TimeRange | None = None,
    ):
        """
        Store the run configuration.

        Raises:
            ValueError: if `include_permissions` is True but `connector` is not
                a CheckpointedConnector.
        """
        if not isinstance(connector, CheckpointedConnector) and include_permissions:
            raise ValueError(
                "include_permissions cannot be True for non-checkpointed connectors"
            )
        self.connector = connector
        self.time_range = time_range
        self.batch_size = batch_size
        self.include_permissions = include_permissions
        # documents buffered between yields on the checkpointed path
        self.doc_batch: list[Document] = []
    def run(self, checkpoint: CT) -> Generator[
        tuple[list[Document] | None, ConnectorFailure | None, CT | None],
        None,
        None,
    ]:
        """
        Run the connector once and adapt its output, adding exception logging.

        Checkpointed connectors: documents are buffered and yielded in lists of
        at least `batch_size`; failures are yielded as soon as they are seen
        (possibly ahead of documents still sitting in the buffer); the final
        tuple carries the next checkpoint.

        Poll/Load connectors: each batch produced by the connector is forwarded
        unchanged, followed by a dummy checkpoint whose `has_more` is False.

        Raises:
            ValueError: if `time_range` is missing for a checkpointed or poll
                connector, if permission sync is requested on a connector that
                does not support it, or if the connector type is unsupported.
            Exception: anything raised by the connector is logged and re-raised.
        """
        try:
            if isinstance(self.connector, CheckpointedConnector):
                if self.time_range is None:
                    raise ValueError("time_range is required for CheckpointedConnector")
                start = time.monotonic()
                # pick the loader variant depending on whether permissions are requested
                if self.include_permissions:
                    if not isinstance(
                        self.connector, CheckpointedConnectorWithPermSync
                    ):
                        raise ValueError(
                            "Connector does not support permission syncing"
                        )
                    load_from_checkpoint = (
                        self.connector.load_from_checkpoint_with_perm_sync
                    )
                else:
                    load_from_checkpoint = self.connector.load_from_checkpoint
                # the connector API takes POSIX timestamps, not datetimes
                checkpoint_connector_generator = load_from_checkpoint(
                    start=self.time_range[0].timestamp(),
                    end=self.time_range[1].timestamp(),
                    checkpoint=checkpoint,
                )
                next_checkpoint: CT | None = None
                # this is guaranteed to always run at least once with next_checkpoint being non-None
                for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()(
                    checkpoint_connector_generator
                ):
                    if document is not None and isinstance(document, Document):
                        self.doc_batch.append(document)
                    if failure is not None:
                        yield None, failure, None
                    if len(self.doc_batch) >= self.batch_size:
                        yield self.doc_batch, None, None
                        # rebind instead of clearing so the yielded list is not mutated
                        self.doc_batch = []
                # yield remaining documents
                if len(self.doc_batch) > 0:
                    yield self.doc_batch, None, None
                    self.doc_batch = []
                yield None, None, next_checkpoint
                logging.debug(
                    f"Connector took {time.monotonic() - start} seconds to get to the next checkpoint."
                )
            else:
                # non-checkpointed connectors finish in a single run, so report
                # a checkpoint that says there is nothing more to fetch
                finished_checkpoint = self.connector.build_dummy_checkpoint()
                finished_checkpoint.has_more = False
                if isinstance(self.connector, PollConnector):
                    if self.time_range is None:
                        raise ValueError("time_range is required for PollConnector")
                    for document_batch in self.connector.poll_source(
                        start=self.time_range[0].timestamp(),
                        end=self.time_range[1].timestamp(),
                    ):
                        yield document_batch, None, None
                    yield None, None, finished_checkpoint
                elif isinstance(self.connector, LoadConnector):
                    for document_batch in self.connector.load_from_state():
                        yield document_batch, None, None
                    yield None, None, finished_checkpoint
                else:
                    raise ValueError(f"Invalid connector. type: {type(self.connector)}")
        except Exception:
            exc_type, _, exc_traceback = sys.exc_info()
            # Traverse the traceback to find the last frame where the exception was raised
            tb = exc_traceback
            if tb is None:
                logging.error("No traceback found for exception")
                raise
            while tb.tb_next:
                tb = tb.tb_next  # Move to the next frame in the traceback
            # Get the local variables from the frame where the exception occurred
            local_vars = tb.tb_frame.f_locals
            local_vars_str = "\n".join(
                f"{key}: {value}" for key, value in local_vars.items()
            )
            # only the first 1024 characters of the locals dump are logged
            # NOTE(review): frame locals may include sensitive values - confirm
            # this is acceptable for the log destination
            logging.error(
                f"Error in connector. type: {exc_type};\n"
                f"local_vars below -> \n{local_vars_str[:1024]}"
            )
            raise
--- a/common/data_source/github/init.py
+++ b/common/data_source/github/init.py
--- a/common/data_source/github/connector.py
+++ b/common/data_source/github/connector.py
@ -0,0 +1,954 @@
 import copy
 import logging
 from collections.abc import Callable
 from collections.abc import Generator
 from datetime import datetime
 from datetime import timedelta
 from datetime import timezone
 from enum import Enum
 from typing import Any
 from typing import cast
 from github import Github
 from github import RateLimitExceededException
 from github import Repository
 from github.GithubException import GithubException
 from github.Issue import Issue
 from github.NamedUser import NamedUser
 from github.PaginatedList import PaginatedList
 from github.PullRequest import PullRequest
 from pydantic import BaseModel
 from typing_extensions import override
 from common.data_source.config import DocumentSource, GITHUB_CONNECTOR_BASE_URL
 from common.data_source.exceptions import (
    ConnectorMissingCredentialError,
    ConnectorValidationError,
    CredentialExpiredError,
    InsufficientPermissionsError,
    UnexpectedValidationError,
 )
 from common.data_source.interfaces import CheckpointedConnectorWithPermSync, CheckpointOutput
 from common.data_source.models import (
    ConnectorCheckpoint,
    ConnectorFailure,
    Document,
    DocumentFailure,
    ExternalAccess,
    SecondsSinceUnixEpoch,
    TextSection,
 )
 from common.data_source.connector_runner import ConnectorRunner
 from .models import SerializedRepository
 from .rate_limit_utils import sleep_after_rate_limit_exception
 from .utils import deserialize_repository
 from .utils import get_external_access_permission
 # Page size passed to the Github client as `per_page`.
 ITEMS_PER_PAGE = 100
 # During cursor-based pagination, log progress every this many objects.
 CURSOR_LOG_FREQUENCY = 50
 # Retry helpers raise RuntimeError once their attempt counter exceeds this value.
 _MAX_NUM_RATE_LIMIT_RETRIES = 5
 # NOTE(review): ONE_DAY and SLIM_BATCH_SIZE are not referenced in the visible
 # part of this module - presumably used further down; confirm.
 ONE_DAY = timedelta(days=1)
 SLIM_BATCH_SIZE = 100
 # Cases
 # X (from start) standard run, no fallback to cursor-based pagination
 # X (from start) standard run errors, fallback to cursor-based pagination
 #  X error in the middle of a page
 #  X no errors: run to completion
 # X (from checkpoint) standard run, no fallback to cursor-based pagination
 # X (from checkpoint) continue from cursor-based pagination
 #  - retrying
 #  - no retrying
 # things to check:
 # checkpoint state on return
 # checkpoint progress (no infinite loop)
 # Per-document metadata attached as `doc_metadata` on generated Documents.
 class DocMetadata(BaseModel):
    # full name of the repository the PR/issue belongs to ("" when unavailable)
    repo: str
 def get_nextUrl_key(pag_list: PaginatedList[PullRequest | Issue]) -> str:
    """
    Find the instance attribute of `pag_list` that stores the next-page URL.

    Lookup order: the exact name `_PaginatedList__nextUrl`, then the first
    attribute whose name contains `__nextUrl`, then the first whose name
    contains `nextUrl`. Returns "" when no attribute matches.
    """
    attr_names = pag_list.__dict__
    mangled_name = "_PaginatedList__nextUrl"
    if mangled_name in attr_names:
        return mangled_name
    # try the more specific fragment over all attributes before the looser one
    for fragment in ("__nextUrl", "nextUrl"):
        found = next((name for name in attr_names if fragment in name), None)
        if found is not None:
            return found
    return ""
 def get_nextUrl(
    pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str
 ) -> str | None:
    """Return the value stored under `nextUrl_key` on `pag_list`, or None when the key is empty."""
    if not nextUrl_key:
        return None
    return getattr(pag_list, nextUrl_key)
 def set_nextUrl(
    pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str, nextUrl: str
 ) -> None:
    """
    Store `nextUrl` under the attribute `nextUrl_key` of `pag_list`.

    With an empty key nothing can be stored: a non-empty `nextUrl` then raises
    ValueError, while an empty one is silently ignored.
    """
    if not nextUrl_key:
        if nextUrl:
            raise ValueError("Next URL key not found: " + str(pag_list.__dict__))
        return
    setattr(pag_list, nextUrl_key, nextUrl)
 def _paginate_until_error(
    git_objs: Callable[[], PaginatedList[PullRequest | Issue]],
    cursor_url: str | None,
    prev_num_objs: int,
    cursor_url_callback: Callable[[str | None, int], None],
    retrying: bool = False,
 ) -> Generator[PullRequest | Issue, None, None]:
    """
    Iterate a paginated list via its next-page URL, reporting progress as it goes.

    Args:
        git_objs: factory producing a fresh PaginatedList.
        cursor_url: next-page URL to resume from; when set it is written into
            the fresh list before iterating.
        prev_num_objs: number of objects already retrieved by earlier calls.
        cursor_url_callback: called after each yielded object with the list's
            current next-page URL and the running object count.
        retrying: True when this call is the restart-from-first-page retry; the
            first `prev_num_objs` objects are then skipped by slicing.

    Yields:
        Each pull request / issue from the list.

    Raises:
        Exception: any iteration error is logged and re-raised, except that a
            failure before this call has made net progress, with a next-page URL
            present and `retrying` False, triggers one retry from the first page.
    """
    num_objs = prev_num_objs
    pag_list = git_objs()
    nextUrl_key = get_nextUrl_key(pag_list)
    if cursor_url:
        set_nextUrl(pag_list, nextUrl_key, cursor_url)
    elif retrying:
        # if we are retrying, we want to skip the objects retrieved
        # over previous calls. Unfortunately, this WILL retrieve all
        # pages before the one we are resuming from, so we really
        # don't want this case to be hit often
        logging.warning(
            "Retrying from a previous cursor-based pagination call. "
            "This will retrieve all pages before the one we are resuming from, "
            "which may take a while and consume many API calls."
        )
        pag_list = cast(PaginatedList[PullRequest | Issue], pag_list[prev_num_objs:])
        # NOTE(review): the count restarts at 0 here, so the callback reports
        # counts below prev_num_objs on this path - confirm this is intended
        num_objs = 0
    try:
        # this for loop handles cursor-based pagination
        for issue_or_pr in pag_list:
            num_objs += 1
            yield issue_or_pr
            # used to store the current cursor url in the checkpoint. This value
            # is updated during iteration over pag_list.
            cursor_url_callback(get_nextUrl(pag_list, nextUrl_key), num_objs)
            if num_objs % CURSOR_LOG_FREQUENCY == 0:
                logging.info(
                    f"Retrieved {num_objs} objects with current cursor url: {get_nextUrl(pag_list, nextUrl_key)}"
                )
    except Exception as e:
        logging.exception(f"Error during cursor-based pagination: {e}")
        # progress was made in this call, so the failure is not treated as a
        # stale cursor; surface the error to the caller
        if num_objs - prev_num_objs > 0:
            raise
        if get_nextUrl(pag_list, nextUrl_key) is not None and not retrying:
            logging.info(
                "Assuming that this error is due to cursor "
                "expiration because no objects were retrieved. "
                "Retrying from the first page."
            )
            yield from _paginate_until_error(
                git_objs, None, prev_num_objs, cursor_url_callback, retrying=True
            )
            return
        # for no cursor url or if we reach this point after a retry, raise the error
        raise
 def _get_batch_rate_limited(
    # We pass in a callable because we want git_objs to produce a fresh
    # PaginatedList each time it's called to avoid using the same object for cursor-based pagination
    # from a partial offset-based pagination call.
    git_objs: Callable[[], PaginatedList],
    page_num: int,
    cursor_url: str | None,
    prev_num_objs: int,
    cursor_url_callback: Callable[[str | None, int], None],
    github_client: Github,
    attempt_num: int = 0,
 ) -> Generator[PullRequest | Issue, None, None]:
    """
    Yield one batch of PRs/issues, retrying on rate limits and falling back to
    cursor-based pagination when offset pagination is rejected.

    Args:
        git_objs: factory producing a fresh PaginatedList.
        page_num: page index used for offset-based retrieval.
        cursor_url: when set, resume cursor-based pagination from this URL
            instead of fetching `page_num`.
        prev_num_objs: objects already retrieved (cursor-based path only).
        cursor_url_callback: progress callback forwarded to the cursor-based path.
        github_client: client passed to the rate-limit sleep helper.
        attempt_num: current retry count.

    Raises:
        RuntimeError: once `attempt_num` exceeds `_MAX_NUM_RATE_LIMIT_RETRIES`.
        GithubException: for any GitHub error other than a rate limit or a
            status-422 error whose message mentions "cursor".
    """
    if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
        raise RuntimeError(
            "Re-tried fetching batch too many times. Something is going wrong with fetching objects from Github"
        )
    try:
        if cursor_url:
            # when this is set, we are resuming from an earlier
            # cursor-based pagination call.
            yield from _paginate_until_error(
                git_objs, cursor_url, prev_num_objs, cursor_url_callback
            )
            return
        objs = list(git_objs().get_page(page_num))
        # fetch all data here to disable lazy loading later
        # this is needed to capture the rate limit exception here (if one occurs)
        for obj in objs:
            if hasattr(obj, "raw_data"):
                getattr(obj, "raw_data")
        yield from objs
    # this handler must stay ahead of the GithubException handler below so
    # that rate limits are retried rather than inspected as generic errors
    except RateLimitExceededException:
        sleep_after_rate_limit_exception(github_client)
        yield from _get_batch_rate_limited(
            git_objs,
            page_num,
            cursor_url,
            prev_num_objs,
            cursor_url_callback,
            github_client,
            attempt_num + 1,
        )
    except GithubException as e:
        # anything other than a 422 mentioning "cursor" is a real error
        if not (
            e.status == 422
            and (
                "cursor" in (e.message or "")
                or "cursor" in (e.data or {}).get("message", "")
            )
        ):
            raise
        # Fallback to a cursor-based pagination strategy
        # This can happen for "large datasets," but there's no documentation
        # On the error on the web as far as we can tell.
        # Error message:
        # "Pagination with the page parameter is not supported for large datasets,
        # please use cursor based pagination (after/before)"
        yield from _paginate_until_error(
            git_objs, cursor_url, prev_num_objs, cursor_url_callback
        )
 def _get_userinfo(user: NamedUser) -> dict[str, str]:
    def _safe_get(attr_name: str) -> str | None:
        try:
            return cast(str | None, getattr(user, attr_name))
        except GithubException:
            logging.debug(f"Error getting {attr_name} for user")
            return None
    return {
        k: v
        for k, v in {
            "login": _safe_get("login"),
            "name": _safe_get("name"),
            "email": _safe_get("email"),
        }.items()
        if v is not None
    }
 def _convert_pr_to_document(
    pull_request: PullRequest, repo_external_access: ExternalAccess | None
 ) -> Document:
    """
    Build a GitHub `Document` from a pull request.

    The PR body becomes the single text section, the PR URL is used as both
    document id and section link, and PR attributes are flattened into string
    (or list-of-string) metadata. Metadata entries whose value is None are
    omitted.
    """
    def _as_utc(moment: datetime | None) -> datetime | None:
        # timestamps are UTC but timezone-unaware; tag them explicitly so
        # indexing logic is not confused by local-time discrepancies
        return moment.replace(tzinfo=timezone.utc) if moment else None
    def _stringify(raw: dict[str, Any]) -> dict[str, str | list[str]]:
        # drop None values; lists become lists of strings, the rest plain strings
        rendered: dict[str, str | list[str]] = {}
        for key, value in raw.items():
            if value is None:
                continue
            if isinstance(value, list):
                rendered[key] = [str(item) for item in value]
            else:
                rendered[key] = str(value)
        return rendered
    repo_name = pull_request.base.repo.full_name if pull_request.base else ""
    doc_metadata = DocMetadata(repo=repo_name)
    return Document(
        id=pull_request.html_url,
        sections=[
            TextSection(link=pull_request.html_url, text=pull_request.body or "")
        ],
        external_access=repo_external_access,
        source=DocumentSource.GITHUB,
        semantic_identifier=f"{pull_request.number}: {pull_request.title}",
        doc_updated_at=_as_utc(pull_request.updated_at),
        # this metadata is used in perm sync
        doc_metadata=doc_metadata.model_dump(),
        metadata=_stringify(
            {
                "object_type": "PullRequest",
                "id": pull_request.number,
                "merged": pull_request.merged,
                "state": pull_request.state,
                "user": _get_userinfo(pull_request.user) if pull_request.user else None,
                "assignees": [
                    _get_userinfo(assignee) for assignee in pull_request.assignees
                ],
                "repo": (
                    pull_request.base.repo.full_name if pull_request.base else None
                ),
                "num_commits": str(pull_request.commits),
                "num_files_changed": str(pull_request.changed_files),
                "labels": [label.name for label in pull_request.labels],
                "created_at": _as_utc(pull_request.created_at),
                "updated_at": _as_utc(pull_request.updated_at),
                "closed_at": _as_utc(pull_request.closed_at),
                "merged_at": _as_utc(pull_request.merged_at),
                "merged_by": (
                    _get_userinfo(pull_request.merged_by)
                    if pull_request.merged_by
                    else None
                ),
            }
        ),
    )
 def _fetch_issue_comments(issue: Issue) -> str:
    """Return the bodies of all comments on `issue`, joined with "\\nComment: "."""
    bodies = [comment.body for comment in issue.get_comments()]
    return "\nComment: ".join(bodies)
 def _convert_issue_to_document(
    issue: Issue, repo_external_access: ExternalAccess | None
 ) -> Document:
    """
    Build a GitHub `Document` from an issue.

    The issue body becomes the single text section, the issue URL is used as
    both document id and section link, and issue attributes are flattened into
    string (or list-of-string) metadata. Metadata entries whose value is None
    are omitted.
    """
    repo_name = issue.repository.full_name if issue.repository else ""
    doc_metadata = DocMetadata(repo=repo_name)
    return Document(
        id=issue.html_url,
        sections=[TextSection(link=issue.html_url, text=issue.body or "")],
        source=DocumentSource.GITHUB,
        external_access=repo_external_access,
        semantic_identifier=f"{issue.number}: {issue.title}",
        # updated_at is UTC time but is timezone unaware; guard against a
        # missing value the same way the metadata below and the PR converter do
        doc_updated_at=(
            issue.updated_at.replace(tzinfo=timezone.utc)
            if issue.updated_at
            else None
        ),
        # this metadata is used in perm sync
        doc_metadata=doc_metadata.model_dump(),
        metadata={
            # lists are rendered element-wise, everything else via str()
            k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
            for k, v in {
                "object_type": "Issue",
                "id": issue.number,
                "state": issue.state,
                "user": _get_userinfo(issue.user) if issue.user else None,
                "assignees": [_get_userinfo(assignee) for assignee in issue.assignees],
                "repo": issue.repository.full_name if issue.repository else None,
                "labels": [label.name for label in issue.labels],
                "created_at": (
                    issue.created_at.replace(tzinfo=timezone.utc)
                    if issue.created_at
                    else None
                ),
                "updated_at": (
                    issue.updated_at.replace(tzinfo=timezone.utc)
                    if issue.updated_at
                    else None
                ),
                "closed_at": (
                    issue.closed_at.replace(tzinfo=timezone.utc)
                    if issue.closed_at
                    else None
                ),
                "closed_by": (
                    _get_userinfo(issue.closed_by) if issue.closed_by else None
                ),
            }.items()
            if v is not None
        },
    )
 # Phase of the per-repository fetch that a checkpoint is currently in.
 class GithubConnectorStage(Enum):
    # NOTE(review): presumably the initial stage before any repository has
    # been selected - not set anywhere in the visible code; confirm
    START = "start"
    # pull requests of the cached repository are being fetched
    PRS = "prs"
    # issues of the cached repository are being fetched
    ISSUES = "issues"
 # Resumable state of a GitHub connector run.
 class GithubConnectorCheckpoint(ConnectorCheckpoint):
    # which kind of object is currently being fetched
    stage: GithubConnectorStage
    # page index for offset-based pagination
    curr_page: int
    # ids of repositories still waiting to be processed; None until the
    # repository list has been fetched
    cached_repo_ids: list[int] | None = None
    # serialized form of the repository currently being processed
    cached_repo: SerializedRepository | None = None
    # Used for the fallback cursor-based pagination strategy
    num_retrieved: int
    cursor_url: str | None = None
    def reset(self) -> None:
        """
        Resets curr_page, num_retrieved, and cursor_url to their initial values (0, 0, None)
        """
        self.curr_page = 0
        self.num_retrieved = 0
        self.cursor_url = None
 def make_cursor_url_callback(
    checkpoint: GithubConnectorCheckpoint,
 ) -> Callable[[str | None, int], None]:
    """
    Build a progress callback that records pagination state on `checkpoint`.

    The callback always stores the object count in `num_retrieved`, but only
    overwrites `cursor_url` when given a non-empty URL.
    """
    def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None:
        checkpoint.num_retrieved = num_objs
        if not cursor_url:
            # keep the previous cursor url so code after retrieval can still
            # tell that the fallback cursor-based pagination strategy was used
            return
        checkpoint.cursor_url = cursor_url
    return cursor_url_callback
 class GithubConnector(CheckpointedConnectorWithPermSync[GithubConnectorCheckpoint]):
    def __init__(
        self,
        repo_owner: str,
        repositories: str | None = None,
        state_filter: str = "all",
        include_prs: bool = True,
        include_issues: bool = False,
    ) -> None:
        """
        Store the connector configuration.

        Args:
            repo_owner: owner (organization or user) of the repositories.
            repositories: a repository name, a comma-separated list of names,
                or None/empty to cover all of the owner's repositories.
            state_filter: `state` value passed to the PR/issue listing calls.
            include_prs: whether pull requests are fetched.
            include_issues: whether issues are fetched.
        """
        # no client until load_credentials() has been called
        self.github_client: Github | None = None
        # which repositories to read
        self.repo_owner = repo_owner
        self.repositories = repositories
        # what to read from them
        self.include_prs = include_prs
        self.include_issues = include_issues
        self.state_filter = state_filter
    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """
        Create the Github client from `credentials["github_access_token"]`.

        The client targets `GITHUB_CONNECTOR_BASE_URL` when that is configured.
        Always returns None.
        """
        token = credentials["github_access_token"]
        # defaults to 30 items per page, can be set to as high as 100
        client_kwargs: dict[str, Any] = {"per_page": ITEMS_PER_PAGE}
        if GITHUB_CONNECTOR_BASE_URL:
            client_kwargs["base_url"] = GITHUB_CONNECTOR_BASE_URL
        self.github_client = Github(token, **client_kwargs)
        return None
    def get_github_repo(
        self, github_client: Github, attempt_num: int = 0
    ) -> Repository.Repository:
        """
        Fetch the single repository `<repo_owner>/<repositories>`.

        On a rate-limit error the call sleeps via the rate-limit helper and
        retries; RuntimeError is raised once `attempt_num` exceeds
        `_MAX_NUM_RATE_LIMIT_RETRIES`.
        """
        if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
            raise RuntimeError(
                "Re-tried fetching repo too many times. Something is going wrong with fetching objects from Github"
            )
        full_name = f"{self.repo_owner}/{self.repositories}"
        try:
            repo = github_client.get_repo(full_name)
        except RateLimitExceededException:
            sleep_after_rate_limit_exception(github_client)
            repo = self.get_github_repo(github_client, attempt_num + 1)
        return repo
    def get_github_repos(
        self, github_client: Github, attempt_num: int = 0
    ) -> list[Repository.Repository]:
        """
        Get specific repositories based on comma-separated repo_name string.

        Names are split on commas and stripped; empty entries are skipped.
        Repositories that cannot be fetched because of a non-rate-limit GitHub
        error are logged and left out of the result. A rate-limit error makes
        the whole lookup sleep and restart.

        Raises:
            RuntimeError: once `attempt_num` exceeds `_MAX_NUM_RATE_LIMIT_RETRIES`.
        """
        if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
            raise RuntimeError(
                "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github"
            )
        try:
            repos = []
            # Split repo_name by comma and strip whitespace
            repo_names = [
                name.strip() for name in (cast(str, self.repositories)).split(",")
            ]
            for repo_name in repo_names:
                if not repo_name:  # Skip empty strings
                    continue
                try:
                    repo = github_client.get_repo(f"{self.repo_owner}/{repo_name}")
                    repos.append(repo)
                except RateLimitExceededException:
                    # RateLimitExceededException subclasses GithubException, so
                    # without this clause the handler below would swallow it
                    # and the repo would be silently dropped instead of retried
                    raise
                except GithubException as e:
                    logging.warning(
                        f"Could not fetch repo {self.repo_owner}/{repo_name}: {e}"
                    )
            return repos
        except RateLimitExceededException:
            sleep_after_rate_limit_exception(github_client)
            return self.get_github_repos(github_client, attempt_num + 1)
    def get_all_repos(
        self, github_client: Github, attempt_num: int = 0
    ) -> list[Repository.Repository]:
        """
        List every repository of `repo_owner`, treating it first as an
        organization and, if that lookup fails, as a user.

        A rate-limit error makes the lookup sleep and restart.

        Raises:
            RuntimeError: once `attempt_num` exceeds `_MAX_NUM_RATE_LIMIT_RETRIES`.
            GithubException: if the user lookup fails for a reason other than
                rate limiting.
        """
        if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES:
            raise RuntimeError(
                "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github"
            )
        try:
            # Try to get organization first
            try:
                org = github_client.get_organization(self.repo_owner)
                return list(org.get_repos())
            except RateLimitExceededException:
                # a rate limit says nothing about whether the owner is an
                # organization; hand it to the retry logic instead of issuing
                # a user lookup while still rate limited
                raise
            except GithubException:
                # If not an org, try as a user
                user = github_client.get_user(self.repo_owner)
                return list(user.get_repos())
        except RateLimitExceededException:
            sleep_after_rate_limit_exception(github_client)
            return self.get_all_repos(github_client, attempt_num + 1)
    def _pull_requests_func(
        self, repo: Repository.Repository
    ) -> Callable[[], PaginatedList[PullRequest]]:
        """
        Return a zero-argument factory that lists `repo`'s pull requests,
        filtered by `self.state_filter` and sorted by update time, newest first.
        Each call of the factory produces a fresh paginated list.
        """
        def _list_pull_requests():
            return repo.get_pulls(
                state=self.state_filter, sort="updated", direction="desc"
            )
        return _list_pull_requests
    def _issues_func(
        self, repo: Repository.Repository
    ) -> Callable[[], PaginatedList[Issue]]:
        """
        Return a zero-argument factory that lists `repo`'s issues, filtered by
        `self.state_filter` and sorted by update time, newest first.
        Each call of the factory produces a fresh paginated list.
        """
        def _list_issues():
            return repo.get_issues(
                state=self.state_filter, sort="updated", direction="desc"
            )
        return _list_issues
    def _fetch_from_github(
        self,
        checkpoint: GithubConnectorCheckpoint,
        start: datetime | None = None,
        end: datetime | None = None,
        include_permissions: bool = False,
    ) -> Generator[Document | ConnectorFailure, None, GithubConnectorCheckpoint]:
        if self.github_client is None:
            raise ConnectorMissingCredentialError("GitHub")
        checkpoint = copy.deepcopy(checkpoint)
        # First run of the connector, fetch all repos and store in checkpoint
        if checkpoint.cached_repo_ids is None:
            repos = []
            if self.repositories:
                if "," in self.repositories:
                    # Multiple repositories specified
                    repos = self.get_github_repos(self.github_client)
                else:
                    # Single repository (backward compatibility)
                    repos = [self.get_github_repo(self.github_client)]
            else:
                # All repositories
                repos = self.get_all_repos(self.github_client)
            if not repos:
                checkpoint.has_more = False
                return checkpoint
            curr_repo = repos.pop()
            checkpoint.cached_repo_ids = [repo.id for repo in repos]
            checkpoint.cached_repo = SerializedRepository(
                id=curr_repo.id,
                headers=curr_repo.raw_headers,
                raw_data=curr_repo.raw_data,
            )
            checkpoint.stage = GithubConnectorStage.PRS
            checkpoint.curr_page = 0
            # save checkpoint with repo ids retrieved
            return checkpoint
        if checkpoint.cached_repo is None:
            raise ValueError("No repo saved in checkpoint")
        # Deserialize the repository from the checkpoint
        repo = deserialize_repository(checkpoint.cached_repo, self.github_client)
        cursor_url_callback = make_cursor_url_callback(checkpoint)
        repo_external_access: ExternalAccess | None = None
        if include_permissions:
            repo_external_access = get_external_access_permission(
                repo, self.github_client
            )
        if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS:
            logging.info(f"Fetching PRs for repo: {repo.name}")
            pr_batch = _get_batch_rate_limited(
                self._pull_requests_func(repo),
                checkpoint.curr_page,
                checkpoint.cursor_url,
                checkpoint.num_retrieved,
                cursor_url_callback,
                self.github_client,
            )
            checkpoint.curr_page += 1  # NOTE: not used for cursor-based fallback
            done_with_prs = False
            num_prs = 0
            pr = None
            for pr in pr_batch:
                num_prs += 1
                # we iterate backwards in time, so at this point we stop processing prs
                if (
                    start is not None
                    and pr.updated_at
                    and pr.updated_at.replace(tzinfo=timezone.utc) < start
                ):
                    done_with_prs = True
                    break
                # Skip PRs updated after the end date
                if (
                    end is not None
                    and pr.updated_at
                    and pr.updated_at.replace(tzinfo=timezone.utc) > end
                ):
                    continue
                try:
                    yield _convert_pr_to_document(
                        cast(PullRequest, pr), repo_external_access
                    )
                except Exception as e:
                    error_msg = f"Error converting PR to document: {e}"
                    logging.exception(error_msg)
                    yield ConnectorFailure(
                        failed_document=DocumentFailure(
                            document_id=str(pr.id), document_link=pr.html_url
                        ),
                        failure_message=error_msg,
                        exception=e,
                    )
                    continue
            # If we reach this point with a cursor url in the checkpoint, we were using
            # the fallback cursor-based pagination strategy. That strategy tries to get all
            # PRs, so having cursor_url set means we are done with prs. However, we need to
            # return AFTER the checkpoint reset to avoid infinite loops.
            # if we found any PRs on the page and there are more PRs to get, return the checkpoint.
            # In offset mode, while indexing without time constraints, the pr batch
            # will be empty when we're done.
            used_cursor = checkpoint.cursor_url is not None
            logging.info(f"Fetched {num_prs} PRs for repo: {repo.name}")
            if num_prs > 0 and not done_with_prs and not used_cursor:
                return checkpoint
            # if we went past the start date during the loop or there are no more
            # prs to get, we move on to issues
            checkpoint.stage = GithubConnectorStage.ISSUES
            checkpoint.reset()
            if used_cursor:
                # save the checkpoint after changing stage; next run will continue from issues
                return checkpoint
        checkpoint.stage = GithubConnectorStage.ISSUES
        if self.include_issues and checkpoint.stage == GithubConnectorStage.ISSUES:
            logging.info(f"Fetching issues for repo: {repo.name}")
            issue_batch = list(
                _get_batch_rate_limited(
                    self._issues_func(repo),
                    checkpoint.curr_page,
                    checkpoint.cursor_url,
                    checkpoint.num_retrieved,
                    cursor_url_callback,
                    self.github_client,
                )
            )
            logging.info(f"Fetched {len(issue_batch)} issues for repo: {repo.name}")
            checkpoint.curr_page += 1
            done_with_issues = False
            num_issues = 0
            for issue in issue_batch:
                num_issues += 1
                issue = cast(Issue, issue)
                # we iterate backwards in time, so at this point we stop processing issues
                if (
                    start is not None
                    and issue.updated_at.replace(tzinfo=timezone.utc) < start
                ):
                    done_with_issues = True
                    break
                # Skip issues updated after the end date
                if (
                    end is not None
                    and issue.updated_at.replace(tzinfo=timezone.utc) > end
                ):
                    continue
                if issue.pull_request is not None:
                    # PRs are handled separately
                    continue
                try:
                    yield _convert_issue_to_document(issue, repo_external_access)
                except Exception as e:
                    error_msg = f"Error converting issue to document: {e}"
                    logging.exception(error_msg)
                    yield ConnectorFailure(
                        failed_document=DocumentFailure(
                            document_id=str(issue.id),
                            document_link=issue.html_url,
                        ),
                        failure_message=error_msg,
                        exception=e,
                    )
                    continue
            logging.info(f"Fetched {num_issues} issues for repo: {repo.name}")
            # if we found any issues on the page, and we're not done, return the checkpoint.
            # don't return if we're using cursor-based pagination to avoid infinite loops
            if num_issues > 0 and not done_with_issues and not checkpoint.cursor_url:
                return checkpoint
            # if we went past the start date during the loop or there are no more
            # issues to get, we move on to the next repo
            checkpoint.stage = GithubConnectorStage.PRS
            checkpoint.reset()
        checkpoint.has_more = len(checkpoint.cached_repo_ids) > 0
        if checkpoint.cached_repo_ids:
            next_id = checkpoint.cached_repo_ids.pop()
            next_repo = self.github_client.get_repo(next_id)
            checkpoint.cached_repo = SerializedRepository(
                id=next_id,
                headers=next_repo.raw_headers,
                raw_data=next_repo.raw_data,
            )
            checkpoint.stage = GithubConnectorStage.PRS
            checkpoint.reset()
        if checkpoint.cached_repo_ids:
            logging.info(
                f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})"
            )
        else:
            logging.info("No more repos remaining")
        return checkpoint
    def _load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: GithubConnectorCheckpoint,
        include_permissions: bool = False,
    ) -> CheckpointOutput[GithubConnectorCheckpoint]:
        """Convert an epoch-seconds window to UTC datetimes and fetch from GitHub.

        Args:
            start: Lower bound of the window, in seconds since the Unix epoch.
            end: Upper bound of the window, in seconds since the Unix epoch.
            checkpoint: Checkpoint handed through to ``_fetch_from_github``.
            include_permissions: Forwarded unchanged to ``_fetch_from_github``.

        Returns:
            Whatever ``_fetch_from_github`` returns for the widened window.
        """
        utc = timezone.utc
        requested_start = datetime.fromtimestamp(start, tz=utc)
        # Pad the upper bound by a day for timezone safety.
        window_end = datetime.fromtimestamp(end, tz=utc) + ONE_DAY

        # Look back an extra 3 hours: some Issues/PRs were getting dropped,
        # possibly due to delayed processing on GitHub's side. Items that did
        # not change since the last poll are short-circuited and not re-embedded.
        # The widened lower bound is never allowed to fall before the epoch.
        unix_epoch = datetime.fromtimestamp(0, tz=utc)
        window_start = max(requested_start - timedelta(hours=3), unix_epoch)

        return self._fetch_from_github(
            checkpoint,
            start=window_start,
            end=window_end,
            include_permissions=include_permissions,
        )
    @override
    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: GithubConnectorCheckpoint,
    ) -> CheckpointOutput[GithubConnectorCheckpoint]:
        """Load documents for the window with ``include_permissions=False``."""
        return self._load_from_checkpoint(
            start,
            end,
            checkpoint,
            include_permissions=False,
        )
    @override
    def load_from_checkpoint_with_perm_sync(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: GithubConnectorCheckpoint,
    ) -> CheckpointOutput[GithubConnectorCheckpoint]:
        """Load documents for the window with ``include_permissions=True``."""
        return self._load_from_checkpoint(
            start,
            end,
            checkpoint,
            include_permissions=True,
        )
    def validate_connector_settings(self) -> None:
        """Verify that the configured owner and repositories are reachable.

        With a comma-separated ``repositories`` value, validation passes as soon
        as one listed repository can be opened and its root contents listed.
        With a single repository, that repository must be accessible. With no
        repositories configured, ``repo_owner`` is looked up as an organization
        first and then as a user, and must expose at least one repository.

        Raises:
            ConnectorMissingCredentialError: No GitHub client has been loaded.
            ConnectorValidationError: The settings are invalid, or the target
                repository/owner cannot be found or accessed.
            UnexpectedValidationError: GitHub rate limits were exceeded.
            CredentialExpiredError: GitHub answered with HTTP 401.
            InsufficientPermissionsError: GitHub answered with HTTP 403.
            Exception: Any other unexpected failure during validation.
        """
        if self.github_client is None:
            raise ConnectorMissingCredentialError("GitHub credentials not loaded.")
        if not self.repo_owner:
            raise ConnectorValidationError(
                "Invalid connector settings: 'repo_owner' must be provided."
            )
        try:
            if self.repositories:
                if "," in self.repositories:
                    # Multiple repositories specified; drop blank entries up
                    # front so inputs like "," are reported as invalid settings.
                    repo_names = [
                        stripped
                        for stripped in (
                            name.strip() for name in self.repositories.split(",")
                        )
                        if stripped
                    ]
                    if not repo_names:
                        raise ConnectorValidationError(
                            "Invalid connector settings: No valid repository names provided."
                        )
                    # Validate at least one repository exists and is accessible
                    validation_errors = []
                    for repo_name in repo_names:
                        try:
                            test_repo = self.github_client.get_repo(
                                f"{self.repo_owner}/{repo_name}"
                            )
                            logging.info(
                                f"Successfully accessed repository: {self.repo_owner}/{repo_name}"
                            )
                            test_repo.get_contents("")
                            # If at least one repo is valid, we can proceed
                            break
                        except GithubException as e:
                            # The error payload is not guaranteed to be a dict;
                            # fall back to the exception text when it is not.
                            payload = e.data
                            if isinstance(payload, dict):
                                detail = payload.get("message", str(e))
                            else:
                                detail = str(e)
                            validation_errors.append(
                                f"Repository '{repo_name}': {detail}"
                            )
                    else:
                        # Loop finished without a successful access.
                        error_msg = (
                            "None of the specified repositories could be accessed: "
                        )
                        error_msg += ", ".join(validation_errors)
                        raise ConnectorValidationError(error_msg)
                else:
                    # Single repository (backward compatibility)
                    test_repo = self.github_client.get_repo(
                        f"{self.repo_owner}/{self.repositories}"
                    )
                    test_repo.get_contents("")
            else:
                # Try to get organization first
                try:
                    org = self.github_client.get_organization(self.repo_owner)
                    total_count = org.get_repos().totalCount
                    if total_count == 0:
                        raise ConnectorValidationError(
                            f"Found no repos for organization: {self.repo_owner}. "
                            "Does the credential have the right scopes?"
                        )
                except GithubException as e:
                    # Check for missing SSO
                    missing_sso_marker = "You must grant your Personal Access token access to this organization".lower()
                    if missing_sso_marker in str(e).lower():
                        sso_guide_link = (
                            "https://docs.github.com/en/enterprise-cloud@latest/authentication/"
                            "authenticating-with-saml-single-sign-on/"
                            "authorizing-a-personal-access-token-for-use-with-saml-single-sign-on"
                        )
                        raise ConnectorValidationError(
                            f"Your GitHub token is missing authorization to access the "
                            f"`{self.repo_owner}` organization. Please follow the guide to "
                            f"authorize your token: {sso_guide_link}"
                        ) from e
                    # If not an org, try as a user
                    user = self.github_client.get_user(self.repo_owner)
                    # Check if we can access any repos
                    total_count = user.get_repos().totalCount
                    if total_count == 0:
                        raise ConnectorValidationError(
                            f"Found no repos for user: {self.repo_owner}. "
                            "Does the credential have the right scopes?"
                        )
        except ConnectorValidationError:
            # Already a precise validation error raised above. Without this
            # clause the generic handler below would re-wrap it as a bare
            # Exception and callers could no longer catch it by type.
            raise
        except RateLimitExceededException as e:
            raise UnexpectedValidationError(
                "Validation failed due to GitHub rate-limits being exceeded. Please try again later."
            ) from e
        except GithubException as e:
            if e.status == 401:
                raise CredentialExpiredError(
                    "GitHub credential appears to be invalid or expired (HTTP 401)."
                ) from e
            elif e.status == 403:
                raise InsufficientPermissionsError(
                    "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)."
                ) from e
            elif e.status == 404:
                if self.repositories:
                    if "," in self.repositories:
                        raise ConnectorValidationError(
                            f"None of the specified GitHub repositories could be found for owner: {self.repo_owner}"
                        ) from e
                    else:
                        raise ConnectorValidationError(
                            f"GitHub repository not found with name: {self.repo_owner}/{self.repositories}"
                        ) from e
                else:
                    raise ConnectorValidationError(
                        f"GitHub user or organization not found: {self.repo_owner}"
                    ) from e
            else:
                raise ConnectorValidationError(
                    f"Unexpected GitHub error (status={e.status}): {e.data}"
                ) from e
        except Exception as exc:
            raise Exception(
                f"Unexpected error during GitHub settings validation: {exc}"
            ) from exc
    def validate_checkpoint_json(
        self, checkpoint_json: str
    ) -> GithubConnectorCheckpoint:
        """Parse and validate a JSON string as a ``GithubConnectorCheckpoint``."""
        parsed_checkpoint = GithubConnectorCheckpoint.model_validate_json(
            checkpoint_json
        )
        return parsed_checkpoint
    def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
        """Return a fresh checkpoint: PR stage, page 0, nothing retrieved, more to fetch."""
        initial_state = GithubConnectorCheckpoint(
            stage=GithubConnectorStage.PRS,
            curr_page=0,
            has_more=True,
            num_retrieved=0,
        )
        return initial_state
 if __name__ == "__main__":
    # Manual driver: runs the connector end-to-end against live GitHub using
    # settings read from the environment (REPO_OWNER and ACCESS_TOKEN_GITHUB
    # are required; REPOSITORIES is optional).
    import os
    # Initialize the connector
    connector = GithubConnector(
        repo_owner=os.environ["REPO_OWNER"],
        repositories=os.environ.get("REPOSITORIES"),
    )
    connector.load_credentials(
        {"github_access_token": os.environ["ACCESS_TOKEN_GITHUB"]}
    )
    # Exercise the permission lookup on one of the repos the connector returns.
    # NOTE(review): .pop() presumably raises IndexError when no repos come back
    # — confirm get_github_repos returns a list.
    if connector.github_client:
        get_external_access_permission(
            connector.get_github_repos(connector.github_client).pop(),
            connector.github_client,
        )
    # Create a time range from epoch to now
    end_time = datetime.now(timezone.utc)
    start_time = datetime.fromtimestamp(0, tz=timezone.utc)
    time_range = (start_time, end_time)
    # Initialize the runner with a batch size of 10
    runner: ConnectorRunner[GithubConnectorCheckpoint] = ConnectorRunner(
        connector, batch_size=10, include_permissions=False, time_range=time_range
    )
    # Get initial checkpoint
    checkpoint = connector.build_dummy_checkpoint()
    # Run the connector
    # Keep calling the runner until the checkpoint reports nothing more to fetch.
    while checkpoint.has_more:
        for doc_batch, failure, next_checkpoint in runner.run(checkpoint):
            if doc_batch:
                print(f"Retrieved batch of {len(doc_batch)} documents")
                for doc in doc_batch:
                    print(f"Document: {doc.semantic_identifier}")
            if failure:
                print(f"Failure: {failure.failure_message}")
            # Carry the newest checkpoint into the next runner.run() call.
            if next_checkpoint:
                checkpoint = next_checkpoint
--- a/common/data_source/github/models.py
+++ b/common/data_source/github/models.py
@ -0,0 +1,17 @@
 from typing import Any
 from github import Repository
 from github.Requester import Requester
 from pydantic import BaseModel
 class SerializedRepository(BaseModel):
    # Serializable snapshot of a PyGithub repository: the response headers and
    # raw attribute payload needed to rebuild the object via to_Repository().

    # Also present inside raw_data; duplicated here for convenient access.
    id: int
    headers: dict[str, str | int]
    raw_data: dict[str, Any]

    def to_Repository(self, requester: Requester) -> Repository.Repository:
        """Rebuild a completed ``Repository`` from the stored headers and raw data.

        Args:
            requester: PyGithub requester the rebuilt repository is bound to.
        """
        rebuilt = Repository.Repository(
            requester,
            self.headers,
            self.raw_data,
            completed=True,
        )
        return rebuilt
--- a/common/data_source/github/rate_limit_utils.py
+++ b/common/data_source/github/rate_limit_utils.py
@ -0,0 +1,24 @@
 import time
 import logging
 from datetime import datetime
 from datetime import timedelta
 from datetime import timezone
 from github import Github
 def sleep_after_rate_limit_exception(github_client: Github) -> None:
    """
    Sleep until the GitHub rate limit resets, plus a one-minute safety margin.

    If the reported reset time is already in the past far enough that the
    computed wait would be negative, no sleep happens instead of raising.

    Args:
        github_client: The GitHub client that hit the rate limit
    """
    rate_limit = github_client.get_rate_limit()
    # Some PyGithub versions return an overview object that nests the per-resource
    # limits under `.resources`; others expose `.core` directly. Accept both.
    resources = getattr(rate_limit, "resources", rate_limit)
    reset_at = resources.core.reset.replace(tzinfo=timezone.utc)

    wait = reset_at - datetime.now(tz=timezone.utc)
    wait += timedelta(minutes=1)  # add an extra minute just to be safe

    # time.sleep() rejects negative values, and timedelta.seconds is only the
    # seconds *component* (wrong for negative or multi-day deltas), so work
    # from the clamped total instead.
    sleep_seconds = max(wait.total_seconds(), 0.0)
    logging.info(
        "Ran into Github rate-limit. Sleeping %s seconds.", round(sleep_seconds)
    )
    time.sleep(sleep_seconds)
--- a/common/data_source/github/utils.py
+++ b/common/data_source/github/utils.py
@ -0,0 +1,46 @@
 import logging
 from collections.abc import Callable
 from typing import cast
 from github import Github
 from github.Repository import Repository
 from common.data_source.models import ExternalAccess
 from .models import SerializedRepository
 def get_external_access_permission(repo: Repository, github_client: Github) -> ExternalAccess:
    """
    Return the external access permission for a repository.

    Both arguments are currently unused: the result is always the empty
    ``ExternalAccess``. Real permission lookup requires Enterprise Edition.
    """
    # RAGFlow doesn't implement the Onyx EE external-permissions system,
    # so fall back to private/unknown permissions.
    default_access = ExternalAccess.empty()
    return default_access
 def deserialize_repository(cached_repo: SerializedRepository, github_client: Github) -> Repository:
    """
    Turn a SerializedRepository back into a Repository object.

    The client's requester is looked up under the attribute names used by
    different PyGithub versions. If it cannot be found, or rebuilding fails
    for any reason, the repository is re-fetched by id instead.
    """
    try:
        # getattr with a default keeps linters quiet about private attributes.
        requester = None
        for attr_name in ("_requester", "_Github__requester"):
            requester = getattr(github_client, attr_name, None)
            if requester is not None:
                break
        if requester is None:
            # No known requester attribute: route to the re-fetch fallback below.
            raise AttributeError("Could not find requester attribute")
        return cached_repo.to_Repository(requester)
    except Exception as e:
        # If all else fails, re-fetch the repo directly
        logging.warning("Failed to deserialize repository: %s. Attempting to re-fetch.", e)
        return github_client.get_repo(cached_repo.id)
--- a/common/data_source/interfaces.py
+++ b/common/data_source/interfaces.py
@ -236,16 +236,13 @@ class BaseConnector(abc.ABC, Generic[CT]):
    def validate_perm_sync(self) -> None:
        """
-        Don't override this; add a function to perm_sync_valid.py in the ee package
+        Permission-sync validation hook.
-        to do permission sync validation
+
        RAGFlow doesn't ship the Onyx EE permission-sync validation package.
        Connectors that support permission sync should override
        `validate_connector_settings()` as needed.
        """
-        """
+        return None
        validate_connector_settings_fn = fetch_ee_implementation_or_noop(
            "onyx.connectors.perm_sync_valid",
            "validate_perm_sync",
            noop_return_value=None,
        )
        validate_connector_settings_fn(self)"""
    def set_allow_images(self, value: bool) -> None:
        """Implement if the underlying connector wants to skip/allow image downloading
--- a/pyproject.toml
+++ b/pyproject.toml
@ -149,6 +149,7 @@ dependencies = [
    #    "cryptography==46.0.3",
    #    "jinja2>=3.1.0",
    "pyairtable>=3.3.0",
    "pygithub>=2.8.1",
    "asana>=5.2.2",
 ]
--- a/uv.lock
+++ b/uv.lock
@ -5509,6 +5509,22 @@ dependencies = [
 ]
 sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/8e/aedef81641c8dca6fd0fb7294de5bed9c45f3397d67fddf755c1042c2642/PyExecJS-1.5.1.tar.gz", hash = "sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c", size = 13344, upload-time = "2018-01-18T04:33:55.126Z" }
 [[package]]
 name = "pygithub"
 version = "2.8.1"
 source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 dependencies = [
    { name = "pyjwt", extra = ["crypto"] },
    { name = "pynacl" },
    { name = "requests" },
    { name = "typing-extensions" },
    { name = "urllib3" },
 ]
 sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/74/e560bdeffea72ecb26cff27f0fad548bbff5ecc51d6a155311ea7f9e4c4c/pygithub-2.8.1.tar.gz", hash = "sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9", size = 2246994, upload-time = "2025-09-02T17:41:54.674Z" }
 wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/ba/7049ce39f653f6140aac4beb53a5aaf08b4407b6a3019aae394c1c5244ff/pygithub-2.8.1-py3-none-any.whl", hash = "sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0", size = 432709, upload-time = "2025-09-02T17:41:52.947Z" },
 ]
 [[package]]
 name = "pygments"
 version = "2.19.2"
@ -5541,6 +5557,43 @@ wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" },
 ]
 [[package]]
 name = "pynacl"
 version = "1.6.1"
 source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 dependencies = [
    { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
 ]
 sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" }
 wheels = [
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" },
    { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" },
 ]
 [[package]]
 name = "pynndescent"
 version = "0.5.13"
@ -6171,6 +6224,7 @@ dependencies = [
    { name = "pyairtable" },
    { name = "pyclipper" },
    { name = "pycryptodomex" },
    { name = "pygithub" },
    { name = "pyobvector" },
    { name = "pyodbc" },
    { name = "pypandoc" },
@ -6301,6 +6355,7 @@ requires-dist = [
    { name = "pyairtable", specifier = ">=3.3.0" },
    { name = "pyclipper", specifier = ">=1.4.0,<2.0.0" },
    { name = "pycryptodomex", specifier = "==3.20.0" },
    { name = "pygithub", specifier = ">=2.8.1" },
    { name = "pyobvector", specifier = "==0.2.18" },
    { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
    { name = "pypandoc", specifier = ">=1.16" },