From f099bc1236fc9f64c28c70786a1527f37a63b937 Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Mon, 29 Dec 2025 16:57:20 +0800 Subject: [PATCH] Feat: github connector (#12292) ### What problem does this PR solve? Feat: github connector ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- common/data_source/config.py | 2 + common/data_source/connector_runner.py | 217 ++++ common/data_source/github/__init__.py | 0 common/data_source/github/connector.py | 954 ++++++++++++++++++ common/data_source/github/models.py | 17 + common/data_source/github/rate_limit_utils.py | 24 + common/data_source/github/utils.py | 46 + common/data_source/interfaces.py | 15 +- pyproject.toml | 1 + uv.lock | 55 + 10 files changed, 1322 insertions(+), 9 deletions(-) create mode 100644 common/data_source/connector_runner.py create mode 100644 common/data_source/github/__init__.py create mode 100644 common/data_source/github/connector.py create mode 100644 common/data_source/github/models.py create mode 100644 common/data_source/github/rate_limit_utils.py create mode 100644 common/data_source/github/utils.py diff --git a/common/data_source/config.py b/common/data_source/config.py index e36ee404b..676696d65 100644 --- a/common/data_source/config.py +++ b/common/data_source/config.py @@ -232,6 +232,8 @@ _REPLACEMENT_EXPANSIONS = "body.view.value" BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback") +GITHUB_CONNECTOR_BASE_URL = os.environ.get("GITHUB_CONNECTOR_BASE_URL") or None + class HtmlBasedConnectorTransformLinksStrategy(str, Enum): # remove links entirely STRIP = "strip" diff --git a/common/data_source/connector_runner.py b/common/data_source/connector_runner.py new file mode 100644 index 000000000..d47d65128 --- /dev/null +++ b/common/data_source/connector_runner.py @@ -0,0 +1,217 @@ +import sys +import time +import logging +from collections.abc import Generator +from datetime import datetime +from typing import Generic +from typing import TypeVar +from common.data_source.interfaces import ( + BaseConnector, + CheckpointedConnector, + CheckpointedConnectorWithPermSync, + CheckpointOutput, + LoadConnector, + PollConnector, +) +from common.data_source.models import ConnectorCheckpoint, ConnectorFailure, Document + + +TimeRange = tuple[datetime, datetime] + +CT = TypeVar("CT", bound=ConnectorCheckpoint) + + +def batched_doc_ids( + checkpoint_connector_generator: CheckpointOutput[CT], + batch_size: int, +) -> Generator[set[str], None, None]: + batch: set[str] = set() + for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( + checkpoint_connector_generator + ): + if document is not None: + batch.add(document.id) + elif ( + failure and failure.failed_document and failure.failed_document.document_id + ): + batch.add(failure.failed_document.document_id) + + if len(batch) >= batch_size: + yield batch + batch = set() + if len(batch) > 0: + yield batch + + +class CheckpointOutputWrapper(Generic[CT]): + """ + Wraps a CheckpointOutput generator to give things back in a more digestible format, + specifically for Document outputs. + The connector format is easier for the connector implementor (e.g. it enforces exactly + one new checkpoint is returned AND that the checkpoint is at the end), thus the different + formats. 
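+    Each yielded tuple is (document, failure, next_checkpoint): exactly one of the
+    three is non-None, and the final tuple carries only the checkpoint.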
+ """ + + def __init__(self) -> None: + self.next_checkpoint: CT | None = None + + def __call__( + self, + checkpoint_connector_generator: CheckpointOutput[CT], + ) -> Generator[ + tuple[Document | None, ConnectorFailure | None, CT | None], + None, + None, + ]: + # grabs the final return value and stores it in the `next_checkpoint` variable + def _inner_wrapper( + checkpoint_connector_generator: CheckpointOutput[CT], + ) -> CheckpointOutput[CT]: + self.next_checkpoint = yield from checkpoint_connector_generator + return self.next_checkpoint # not used + + for document_or_failure in _inner_wrapper(checkpoint_connector_generator): + if isinstance(document_or_failure, Document): + yield document_or_failure, None, None + elif isinstance(document_or_failure, ConnectorFailure): + yield None, document_or_failure, None + else: + raise ValueError( + f"Invalid document_or_failure type: {type(document_or_failure)}" + ) + + if self.next_checkpoint is None: + raise RuntimeError( + "Checkpoint is None. This should never happen - the connector should always return a checkpoint." + ) + + yield None, None, self.next_checkpoint + + +class ConnectorRunner(Generic[CT]): + """ + Handles: + - Batching + - Additional exception logging + - Combining different connector types to a single interface + """ + + def __init__( + self, + connector: BaseConnector, + batch_size: int, + # cannot be True for non-checkpointed connectors + include_permissions: bool, + time_range: TimeRange | None = None, + ): + if not isinstance(connector, CheckpointedConnector) and include_permissions: + raise ValueError( + "include_permissions cannot be True for non-checkpointed connectors" + ) + + self.connector = connector + self.time_range = time_range + self.batch_size = batch_size + self.include_permissions = include_permissions + + self.doc_batch: list[Document] = [] + + def run(self, checkpoint: CT) -> Generator[ + tuple[list[Document] | None, ConnectorFailure | None, CT | None], + None, + None, + ]: + """Adds additional exception logging to the connector.""" + try: + if isinstance(self.connector, CheckpointedConnector): + if self.time_range is None: + raise ValueError("time_range is required for CheckpointedConnector") + + start = time.monotonic() + if self.include_permissions: + if not isinstance( + self.connector, CheckpointedConnectorWithPermSync + ): + raise ValueError( + "Connector does not support permission syncing" + ) + load_from_checkpoint = ( + self.connector.load_from_checkpoint_with_perm_sync + ) + else: + load_from_checkpoint = self.connector.load_from_checkpoint + checkpoint_connector_generator = load_from_checkpoint( + start=self.time_range[0].timestamp(), + end=self.time_range[1].timestamp(), + checkpoint=checkpoint, + ) + next_checkpoint: CT | None = None + # this is guaranteed to always run at least once with next_checkpoint being non-None + for document, failure, next_checkpoint in CheckpointOutputWrapper[CT]()( + checkpoint_connector_generator + ): + if document is not None and isinstance(document, Document): + self.doc_batch.append(document) + + if failure is not None: + yield None, failure, None + + if len(self.doc_batch) >= self.batch_size: + yield self.doc_batch, None, None + self.doc_batch = [] + + # yield remaining documents + if len(self.doc_batch) > 0: + yield self.doc_batch, None, None + self.doc_batch = [] + + yield None, None, next_checkpoint + + logging.debug( + f"Connector took {time.monotonic() - start} seconds to get to the next checkpoint." 
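+                    # time.monotonic() is used here so the reported duration is unaffected by wall-clock changes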
+ ) + + else: + finished_checkpoint = self.connector.build_dummy_checkpoint() + finished_checkpoint.has_more = False + + if isinstance(self.connector, PollConnector): + if self.time_range is None: + raise ValueError("time_range is required for PollConnector") + + for document_batch in self.connector.poll_source( + start=self.time_range[0].timestamp(), + end=self.time_range[1].timestamp(), + ): + yield document_batch, None, None + + yield None, None, finished_checkpoint + elif isinstance(self.connector, LoadConnector): + for document_batch in self.connector.load_from_state(): + yield document_batch, None, None + + yield None, None, finished_checkpoint + else: + raise ValueError(f"Invalid connector. type: {type(self.connector)}") + except Exception: + exc_type, _, exc_traceback = sys.exc_info() + + # Traverse the traceback to find the last frame where the exception was raised + tb = exc_traceback + if tb is None: + logging.error("No traceback found for exception") + raise + + while tb.tb_next: + tb = tb.tb_next # Move to the next frame in the traceback + + # Get the local variables from the frame where the exception occurred + local_vars = tb.tb_frame.f_locals + local_vars_str = "\n".join( + f"{key}: {value}" for key, value in local_vars.items() + ) + logging.error( + f"Error in connector. type: {exc_type};\n" + f"local_vars below -> \n{local_vars_str[:1024]}" + ) + raise \ No newline at end of file diff --git a/common/data_source/github/__init__.py b/common/data_source/github/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/common/data_source/github/connector.py b/common/data_source/github/connector.py new file mode 100644 index 000000000..a6ee3a59a --- /dev/null +++ b/common/data_source/github/connector.py @@ -0,0 +1,954 @@ +import copy +import logging +from collections.abc import Callable +from collections.abc import Generator +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from enum import Enum +from typing import Any +from typing import cast + +from github import Github +from github import RateLimitExceededException +from github import Repository +from github.GithubException import GithubException +from github.Issue import Issue +from github.NamedUser import NamedUser +from github.PaginatedList import PaginatedList +from github.PullRequest import PullRequest +from pydantic import BaseModel +from typing_extensions import override + +from common.data_source.config import DocumentSource, GITHUB_CONNECTOR_BASE_URL +from common.data_source.exceptions import ( + ConnectorMissingCredentialError, + ConnectorValidationError, + CredentialExpiredError, + InsufficientPermissionsError, + UnexpectedValidationError, +) +from common.data_source.interfaces import CheckpointedConnectorWithPermSync, CheckpointOutput +from common.data_source.models import ( + ConnectorCheckpoint, + ConnectorFailure, + Document, + DocumentFailure, + ExternalAccess, + SecondsSinceUnixEpoch, + TextSection, +) +from common.data_source.connector_runner import ConnectorRunner +from .models import SerializedRepository +from .rate_limit_utils import sleep_after_rate_limit_exception +from .utils import deserialize_repository +from .utils import get_external_access_permission + +ITEMS_PER_PAGE = 100 +CURSOR_LOG_FREQUENCY = 50 + +_MAX_NUM_RATE_LIMIT_RETRIES = 5 + +ONE_DAY = timedelta(days=1) +SLIM_BATCH_SIZE = 100 +# Cases +# X (from start) standard run, no fallback to cursor-based pagination +# X (from start) standard run errors, fallback to cursor-based 
pagination +# X error in the middle of a page +# X no errors: run to completion +# X (from checkpoint) standard run, no fallback to cursor-based pagination +# X (from checkpoint) continue from cursor-based pagination +# - retrying +# - no retrying + +# things to check: +# checkpoint state on return +# checkpoint progress (no infinite loop) + + +class DocMetadata(BaseModel): + repo: str + + +def get_nextUrl_key(pag_list: PaginatedList[PullRequest | Issue]) -> str: + if "_PaginatedList__nextUrl" in pag_list.__dict__: + return "_PaginatedList__nextUrl" + for key in pag_list.__dict__: + if "__nextUrl" in key: + return key + for key in pag_list.__dict__: + if "nextUrl" in key: + return key + return "" + + +def get_nextUrl( + pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str +) -> str | None: + return getattr(pag_list, nextUrl_key) if nextUrl_key else None + + +def set_nextUrl( + pag_list: PaginatedList[PullRequest | Issue], nextUrl_key: str, nextUrl: str +) -> None: + if nextUrl_key: + setattr(pag_list, nextUrl_key, nextUrl) + elif nextUrl: + raise ValueError("Next URL key not found: " + str(pag_list.__dict__)) + + +def _paginate_until_error( + git_objs: Callable[[], PaginatedList[PullRequest | Issue]], + cursor_url: str | None, + prev_num_objs: int, + cursor_url_callback: Callable[[str | None, int], None], + retrying: bool = False, +) -> Generator[PullRequest | Issue, None, None]: + num_objs = prev_num_objs + pag_list = git_objs() + nextUrl_key = get_nextUrl_key(pag_list) + if cursor_url: + set_nextUrl(pag_list, nextUrl_key, cursor_url) + elif retrying: + # if we are retrying, we want to skip the objects retrieved + # over previous calls. Unfortunately, this WILL retrieve all + # pages before the one we are resuming from, so we really + # don't want this case to be hit often + logging.warning( + "Retrying from a previous cursor-based pagination call. " + "This will retrieve all pages before the one we are resuming from, " + "which may take a while and consume many API calls." + ) + pag_list = cast(PaginatedList[PullRequest | Issue], pag_list[prev_num_objs:]) + num_objs = 0 + + try: + # this for loop handles cursor-based pagination + for issue_or_pr in pag_list: + num_objs += 1 + yield issue_or_pr + # used to store the current cursor url in the checkpoint. This value + # is updated during iteration over pag_list. + cursor_url_callback(get_nextUrl(pag_list, nextUrl_key), num_objs) + + if num_objs % CURSOR_LOG_FREQUENCY == 0: + logging.info( + f"Retrieved {num_objs} objects with current cursor url: {get_nextUrl(pag_list, nextUrl_key)}" + ) + + except Exception as e: + logging.exception(f"Error during cursor-based pagination: {e}") + if num_objs - prev_num_objs > 0: + raise + + if get_nextUrl(pag_list, nextUrl_key) is not None and not retrying: + logging.info( + "Assuming that this error is due to cursor " + "expiration because no objects were retrieved. " + "Retrying from the first page." + ) + yield from _paginate_until_error( + git_objs, None, prev_num_objs, cursor_url_callback, retrying=True + ) + return + + # for no cursor url or if we reach this point after a retry, raise the error + raise + + +def _get_batch_rate_limited( + # We pass in a callable because we want git_objs to produce a fresh + # PaginatedList each time it's called to avoid using the same object for cursor-based pagination + # from a partial offset-based pagination call. 
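+    # page_num drives the normal offset-based pagination (PaginatedList.get_page);
+    # cursor_url/prev_num_objs resume the cursor-based fallback from a checkpoint, and
+    # cursor_url_callback persists the current cursor and object count back into it.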
+ git_objs: Callable[[], PaginatedList], + page_num: int, + cursor_url: str | None, + prev_num_objs: int, + cursor_url_callback: Callable[[str | None, int], None], + github_client: Github, + attempt_num: int = 0, +) -> Generator[PullRequest | Issue, None, None]: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching batch too many times. Something is going wrong with fetching objects from Github" + ) + try: + if cursor_url: + # when this is set, we are resuming from an earlier + # cursor-based pagination call. + yield from _paginate_until_error( + git_objs, cursor_url, prev_num_objs, cursor_url_callback + ) + return + objs = list(git_objs().get_page(page_num)) + # fetch all data here to disable lazy loading later + # this is needed to capture the rate limit exception here (if one occurs) + for obj in objs: + if hasattr(obj, "raw_data"): + getattr(obj, "raw_data") + yield from objs + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + yield from _get_batch_rate_limited( + git_objs, + page_num, + cursor_url, + prev_num_objs, + cursor_url_callback, + github_client, + attempt_num + 1, + ) + except GithubException as e: + if not ( + e.status == 422 + and ( + "cursor" in (e.message or "") + or "cursor" in (e.data or {}).get("message", "") + ) + ): + raise + # Fallback to a cursor-based pagination strategy + # This can happen for "large datasets," but there's no documentation + # On the error on the web as far as we can tell. + # Error message: + # "Pagination with the page parameter is not supported for large datasets, + # please use cursor based pagination (after/before)" + yield from _paginate_until_error( + git_objs, cursor_url, prev_num_objs, cursor_url_callback + ) + + +def _get_userinfo(user: NamedUser) -> dict[str, str]: + def _safe_get(attr_name: str) -> str | None: + try: + return cast(str | None, getattr(user, attr_name)) + except GithubException: + logging.debug(f"Error getting {attr_name} for user") + return None + + return { + k: v + for k, v in { + "login": _safe_get("login"), + "name": _safe_get("name"), + "email": _safe_get("email"), + }.items() + if v is not None + } + + +def _convert_pr_to_document( + pull_request: PullRequest, repo_external_access: ExternalAccess | None +) -> Document: + repo_name = pull_request.base.repo.full_name if pull_request.base else "" + doc_metadata = DocMetadata(repo=repo_name) + return Document( + id=pull_request.html_url, + sections=[ + TextSection(link=pull_request.html_url, text=pull_request.body or "") + ], + external_access=repo_external_access, + source=DocumentSource.GITHUB, + semantic_identifier=f"{pull_request.number}: {pull_request.title}", + # updated_at is UTC time but is timezone unaware, explicitly add UTC + # as there is logic in indexing to prevent wrong timestamped docs + # due to local time discrepancies with UTC + doc_updated_at=( + pull_request.updated_at.replace(tzinfo=timezone.utc) + if pull_request.updated_at + else None + ), + # this metadata is used in perm sync + doc_metadata=doc_metadata.model_dump(), + metadata={ + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) + for k, v in { + "object_type": "PullRequest", + "id": pull_request.number, + "merged": pull_request.merged, + "state": pull_request.state, + "user": _get_userinfo(pull_request.user) if pull_request.user else None, + "assignees": [ + _get_userinfo(assignee) for assignee in pull_request.assignees + ], + "repo": ( + pull_request.base.repo.full_name if pull_request.base else None + ), 
+ "num_commits": str(pull_request.commits), + "num_files_changed": str(pull_request.changed_files), + "labels": [label.name for label in pull_request.labels], + "created_at": ( + pull_request.created_at.replace(tzinfo=timezone.utc) + if pull_request.created_at + else None + ), + "updated_at": ( + pull_request.updated_at.replace(tzinfo=timezone.utc) + if pull_request.updated_at + else None + ), + "closed_at": ( + pull_request.closed_at.replace(tzinfo=timezone.utc) + if pull_request.closed_at + else None + ), + "merged_at": ( + pull_request.merged_at.replace(tzinfo=timezone.utc) + if pull_request.merged_at + else None + ), + "merged_by": ( + _get_userinfo(pull_request.merged_by) + if pull_request.merged_by + else None + ), + }.items() + if v is not None + }, + ) + + +def _fetch_issue_comments(issue: Issue) -> str: + comments = issue.get_comments() + return "\nComment: ".join(comment.body for comment in comments) + + +def _convert_issue_to_document( + issue: Issue, repo_external_access: ExternalAccess | None +) -> Document: + repo_name = issue.repository.full_name if issue.repository else "" + doc_metadata = DocMetadata(repo=repo_name) + return Document( + id=issue.html_url, + sections=[TextSection(link=issue.html_url, text=issue.body or "")], + source=DocumentSource.GITHUB, + external_access=repo_external_access, + semantic_identifier=f"{issue.number}: {issue.title}", + # updated_at is UTC time but is timezone unaware + doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc), + # this metadata is used in perm sync + doc_metadata=doc_metadata.model_dump(), + metadata={ + k: [str(vi) for vi in v] if isinstance(v, list) else str(v) + for k, v in { + "object_type": "Issue", + "id": issue.number, + "state": issue.state, + "user": _get_userinfo(issue.user) if issue.user else None, + "assignees": [_get_userinfo(assignee) for assignee in issue.assignees], + "repo": issue.repository.full_name if issue.repository else None, + "labels": [label.name for label in issue.labels], + "created_at": ( + issue.created_at.replace(tzinfo=timezone.utc) + if issue.created_at + else None + ), + "updated_at": ( + issue.updated_at.replace(tzinfo=timezone.utc) + if issue.updated_at + else None + ), + "closed_at": ( + issue.closed_at.replace(tzinfo=timezone.utc) + if issue.closed_at + else None + ), + "closed_by": ( + _get_userinfo(issue.closed_by) if issue.closed_by else None + ), + }.items() + if v is not None + }, + ) + + +class GithubConnectorStage(Enum): + START = "start" + PRS = "prs" + ISSUES = "issues" + + +class GithubConnectorCheckpoint(ConnectorCheckpoint): + stage: GithubConnectorStage + curr_page: int + + cached_repo_ids: list[int] | None = None + cached_repo: SerializedRepository | None = None + + # Used for the fallback cursor-based pagination strategy + num_retrieved: int + cursor_url: str | None = None + + def reset(self) -> None: + """ + Resets curr_page, num_retrieved, and cursor_url to their initial values (0, 0, None) + """ + self.curr_page = 0 + self.num_retrieved = 0 + self.cursor_url = None + + +def make_cursor_url_callback( + checkpoint: GithubConnectorCheckpoint, +) -> Callable[[str | None, int], None]: + def cursor_url_callback(cursor_url: str | None, num_objs: int) -> None: + # we want to maintain the old cursor url so code after retrieval + # can determine that we are using the fallback cursor-based pagination strategy + if cursor_url: + checkpoint.cursor_url = cursor_url + checkpoint.num_retrieved = num_objs + + return cursor_url_callback + + +class 
GithubConnector(CheckpointedConnectorWithPermSync[GithubConnectorCheckpoint]): + def __init__( + self, + repo_owner: str, + repositories: str | None = None, + state_filter: str = "all", + include_prs: bool = True, + include_issues: bool = False, + ) -> None: + self.repo_owner = repo_owner + self.repositories = repositories + self.state_filter = state_filter + self.include_prs = include_prs + self.include_issues = include_issues + self.github_client: Github | None = None + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + # defaults to 30 items per page, can be set to as high as 100 + self.github_client = ( + Github( + credentials["github_access_token"], + base_url=GITHUB_CONNECTOR_BASE_URL, + per_page=ITEMS_PER_PAGE, + ) + if GITHUB_CONNECTOR_BASE_URL + else Github(credentials["github_access_token"], per_page=ITEMS_PER_PAGE) + ) + return None + + def get_github_repo( + self, github_client: Github, attempt_num: int = 0 + ) -> Repository.Repository: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repo too many times. Something is going wrong with fetching objects from Github" + ) + + try: + return github_client.get_repo(f"{self.repo_owner}/{self.repositories}") + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_github_repo(github_client, attempt_num + 1) + + def get_github_repos( + self, github_client: Github, attempt_num: int = 0 + ) -> list[Repository.Repository]: + """Get specific repositories based on comma-separated repo_name string.""" + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repos too many times. Something is going wrong with fetching objects from Github" + ) + + try: + repos = [] + # Split repo_name by comma and strip whitespace + repo_names = [ + name.strip() for name in (cast(str, self.repositories)).split(",") + ] + + for repo_name in repo_names: + if repo_name: # Skip empty strings + try: + repo = github_client.get_repo(f"{self.repo_owner}/{repo_name}") + repos.append(repo) + except GithubException as e: + logging.warning( + f"Could not fetch repo {self.repo_owner}/{repo_name}: {e}" + ) + + return repos + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_github_repos(github_client, attempt_num + 1) + + def get_all_repos( + self, github_client: Github, attempt_num: int = 0 + ) -> list[Repository.Repository]: + if attempt_num > _MAX_NUM_RATE_LIMIT_RETRIES: + raise RuntimeError( + "Re-tried fetching repos too many times. 
Something is going wrong with fetching objects from Github" + ) + + try: + # Try to get organization first + try: + org = github_client.get_organization(self.repo_owner) + return list(org.get_repos()) + + except GithubException: + # If not an org, try as a user + user = github_client.get_user(self.repo_owner) + return list(user.get_repos()) + except RateLimitExceededException: + sleep_after_rate_limit_exception(github_client) + return self.get_all_repos(github_client, attempt_num + 1) + + def _pull_requests_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[PullRequest]]: + return lambda: repo.get_pulls( + state=self.state_filter, sort="updated", direction="desc" + ) + + def _issues_func( + self, repo: Repository.Repository + ) -> Callable[[], PaginatedList[Issue]]: + return lambda: repo.get_issues( + state=self.state_filter, sort="updated", direction="desc" + ) + + def _fetch_from_github( + self, + checkpoint: GithubConnectorCheckpoint, + start: datetime | None = None, + end: datetime | None = None, + include_permissions: bool = False, + ) -> Generator[Document | ConnectorFailure, None, GithubConnectorCheckpoint]: + if self.github_client is None: + raise ConnectorMissingCredentialError("GitHub") + + checkpoint = copy.deepcopy(checkpoint) + + # First run of the connector, fetch all repos and store in checkpoint + if checkpoint.cached_repo_ids is None: + repos = [] + if self.repositories: + if "," in self.repositories: + # Multiple repositories specified + repos = self.get_github_repos(self.github_client) + else: + # Single repository (backward compatibility) + repos = [self.get_github_repo(self.github_client)] + else: + # All repositories + repos = self.get_all_repos(self.github_client) + if not repos: + checkpoint.has_more = False + return checkpoint + + curr_repo = repos.pop() + checkpoint.cached_repo_ids = [repo.id for repo in repos] + checkpoint.cached_repo = SerializedRepository( + id=curr_repo.id, + headers=curr_repo.raw_headers, + raw_data=curr_repo.raw_data, + ) + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.curr_page = 0 + # save checkpoint with repo ids retrieved + return checkpoint + + if checkpoint.cached_repo is None: + raise ValueError("No repo saved in checkpoint") + + # Deserialize the repository from the checkpoint + repo = deserialize_repository(checkpoint.cached_repo, self.github_client) + + cursor_url_callback = make_cursor_url_callback(checkpoint) + repo_external_access: ExternalAccess | None = None + if include_permissions: + repo_external_access = get_external_access_permission( + repo, self.github_client + ) + if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS: + logging.info(f"Fetching PRs for repo: {repo.name}") + + pr_batch = _get_batch_rate_limited( + self._pull_requests_func(repo), + checkpoint.curr_page, + checkpoint.cursor_url, + checkpoint.num_retrieved, + cursor_url_callback, + self.github_client, + ) + checkpoint.curr_page += 1 # NOTE: not used for cursor-based fallback + done_with_prs = False + num_prs = 0 + pr = None + for pr in pr_batch: + num_prs += 1 + + # we iterate backwards in time, so at this point we stop processing prs + if ( + start is not None + and pr.updated_at + and pr.updated_at.replace(tzinfo=timezone.utc) < start + ): + done_with_prs = True + break + # Skip PRs updated after the end date + if ( + end is not None + and pr.updated_at + and pr.updated_at.replace(tzinfo=timezone.utc) > end + ): + continue + try: + yield _convert_pr_to_document( + cast(PullRequest, pr), 
repo_external_access + ) + except Exception as e: + error_msg = f"Error converting PR to document: {e}" + logging.exception(error_msg) + yield ConnectorFailure( + failed_document=DocumentFailure( + document_id=str(pr.id), document_link=pr.html_url + ), + failure_message=error_msg, + exception=e, + ) + continue + + # If we reach this point with a cursor url in the checkpoint, we were using + # the fallback cursor-based pagination strategy. That strategy tries to get all + # PRs, so having curosr_url set means we are done with prs. However, we need to + # return AFTER the checkpoint reset to avoid infinite loops. + + # if we found any PRs on the page and there are more PRs to get, return the checkpoint. + # In offset mode, while indexing without time constraints, the pr batch + # will be empty when we're done. + used_cursor = checkpoint.cursor_url is not None + logging.info(f"Fetched {num_prs} PRs for repo: {repo.name}") + if num_prs > 0 and not done_with_prs and not used_cursor: + return checkpoint + + # if we went past the start date during the loop or there are no more + # prs to get, we move on to issues + checkpoint.stage = GithubConnectorStage.ISSUES + checkpoint.reset() + + if used_cursor: + # save the checkpoint after changing stage; next run will continue from issues + return checkpoint + + checkpoint.stage = GithubConnectorStage.ISSUES + + if self.include_issues and checkpoint.stage == GithubConnectorStage.ISSUES: + logging.info(f"Fetching issues for repo: {repo.name}") + + issue_batch = list( + _get_batch_rate_limited( + self._issues_func(repo), + checkpoint.curr_page, + checkpoint.cursor_url, + checkpoint.num_retrieved, + cursor_url_callback, + self.github_client, + ) + ) + logging.info(f"Fetched {len(issue_batch)} issues for repo: {repo.name}") + checkpoint.curr_page += 1 + done_with_issues = False + num_issues = 0 + for issue in issue_batch: + num_issues += 1 + issue = cast(Issue, issue) + # we iterate backwards in time, so at this point we stop processing prs + if ( + start is not None + and issue.updated_at.replace(tzinfo=timezone.utc) < start + ): + done_with_issues = True + break + # Skip PRs updated after the end date + if ( + end is not None + and issue.updated_at.replace(tzinfo=timezone.utc) > end + ): + continue + + if issue.pull_request is not None: + # PRs are handled separately + continue + + try: + yield _convert_issue_to_document(issue, repo_external_access) + except Exception as e: + error_msg = f"Error converting issue to document: {e}" + logging.exception(error_msg) + yield ConnectorFailure( + failed_document=DocumentFailure( + document_id=str(issue.id), + document_link=issue.html_url, + ), + failure_message=error_msg, + exception=e, + ) + continue + + logging.info(f"Fetched {num_issues} issues for repo: {repo.name}") + # if we found any issues on the page, and we're not done, return the checkpoint. 
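+            # (in offset mode, an empty page means there are no more issues to fetch)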
+ # don't return if we're using cursor-based pagination to avoid infinite loops + if num_issues > 0 and not done_with_issues and not checkpoint.cursor_url: + return checkpoint + + # if we went past the start date during the loop or there are no more + # issues to get, we move on to the next repo + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.reset() + + checkpoint.has_more = len(checkpoint.cached_repo_ids) > 0 + if checkpoint.cached_repo_ids: + next_id = checkpoint.cached_repo_ids.pop() + next_repo = self.github_client.get_repo(next_id) + checkpoint.cached_repo = SerializedRepository( + id=next_id, + headers=next_repo.raw_headers, + raw_data=next_repo.raw_data, + ) + checkpoint.stage = GithubConnectorStage.PRS + checkpoint.reset() + + if checkpoint.cached_repo_ids: + logging.info( + f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})" + ) + else: + logging.info("No more repos remaining") + + return checkpoint + + def _load_from_checkpoint( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + include_permissions: bool = False, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) + # add a day for timezone safety + end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) + ONE_DAY + + # Move start time back by 3 hours, since some Issues/PRs are getting dropped + # Could be due to delayed processing on GitHub side + # The non-updated issues since last poll will be shortcut-ed and not embedded + adjusted_start_datetime = start_datetime - timedelta(hours=3) + + epoch = datetime.fromtimestamp(0, tz=timezone.utc) + if adjusted_start_datetime < epoch: + adjusted_start_datetime = epoch + + return self._fetch_from_github( + checkpoint, + start=adjusted_start_datetime, + end=end_datetime, + include_permissions=include_permissions, + ) + + @override + def load_from_checkpoint( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + return self._load_from_checkpoint( + start, end, checkpoint, include_permissions=False + ) + + @override + def load_from_checkpoint_with_perm_sync( + self, + start: SecondsSinceUnixEpoch, + end: SecondsSinceUnixEpoch, + checkpoint: GithubConnectorCheckpoint, + ) -> CheckpointOutput[GithubConnectorCheckpoint]: + return self._load_from_checkpoint( + start, end, checkpoint, include_permissions=True + ) + + def validate_connector_settings(self) -> None: + if self.github_client is None: + raise ConnectorMissingCredentialError("GitHub credentials not loaded.") + + if not self.repo_owner: + raise ConnectorValidationError( + "Invalid connector settings: 'repo_owner' must be provided." + ) + + try: + if self.repositories: + if "," in self.repositories: + # Multiple repositories specified + repo_names = [name.strip() for name in self.repositories.split(",")] + if not repo_names: + raise ConnectorValidationError( + "Invalid connector settings: No valid repository names provided." 
+ ) + + # Validate at least one repository exists and is accessible + valid_repos = False + validation_errors = [] + + for repo_name in repo_names: + if not repo_name: + continue + + try: + test_repo = self.github_client.get_repo( + f"{self.repo_owner}/{repo_name}" + ) + logging.info( + f"Successfully accessed repository: {self.repo_owner}/{repo_name}" + ) + test_repo.get_contents("") + valid_repos = True + # If at least one repo is valid, we can proceed + break + except GithubException as e: + validation_errors.append( + f"Repository '{repo_name}': {e.data.get('message', str(e))}" + ) + + if not valid_repos: + error_msg = ( + "None of the specified repositories could be accessed: " + ) + error_msg += ", ".join(validation_errors) + raise ConnectorValidationError(error_msg) + else: + # Single repository (backward compatibility) + test_repo = self.github_client.get_repo( + f"{self.repo_owner}/{self.repositories}" + ) + test_repo.get_contents("") + else: + # Try to get organization first + try: + org = self.github_client.get_organization(self.repo_owner) + total_count = org.get_repos().totalCount + if total_count == 0: + raise ConnectorValidationError( + f"Found no repos for organization: {self.repo_owner}. " + "Does the credential have the right scopes?" + ) + except GithubException as e: + # Check for missing SSO + MISSING_SSO_ERROR_MESSAGE = "You must grant your Personal Access token access to this organization".lower() + if MISSING_SSO_ERROR_MESSAGE in str(e).lower(): + SSO_GUIDE_LINK = ( + "https://docs.github.com/en/enterprise-cloud@latest/authentication/" + "authenticating-with-saml-single-sign-on/" + "authorizing-a-personal-access-token-for-use-with-saml-single-sign-on" + ) + raise ConnectorValidationError( + f"Your GitHub token is missing authorization to access the " + f"`{self.repo_owner}` organization. Please follow the guide to " + f"authorize your token: {SSO_GUIDE_LINK}" + ) + # If not an org, try as a user + user = self.github_client.get_user(self.repo_owner) + + # Check if we can access any repos + total_count = user.get_repos().totalCount + if total_count == 0: + raise ConnectorValidationError( + f"Found no repos for user: {self.repo_owner}. " + "Does the credential have the right scopes?" + ) + + except RateLimitExceededException: + raise UnexpectedValidationError( + "Validation failed due to GitHub rate-limits being exceeded. Please try again later." + ) + + except GithubException as e: + if e.status == 401: + raise CredentialExpiredError( + "GitHub credential appears to be invalid or expired (HTTP 401)." + ) + elif e.status == 403: + raise InsufficientPermissionsError( + "Your GitHub token does not have sufficient permissions for this repository (HTTP 403)." 
+ ) + elif e.status == 404: + if self.repositories: + if "," in self.repositories: + raise ConnectorValidationError( + f"None of the specified GitHub repositories could be found for owner: {self.repo_owner}" + ) + else: + raise ConnectorValidationError( + f"GitHub repository not found with name: {self.repo_owner}/{self.repositories}" + ) + else: + raise ConnectorValidationError( + f"GitHub user or organization not found: {self.repo_owner}" + ) + else: + raise ConnectorValidationError( + f"Unexpected GitHub error (status={e.status}): {e.data}" + ) + + except Exception as exc: + raise Exception( + f"Unexpected error during GitHub settings validation: {exc}" + ) + + def validate_checkpoint_json( + self, checkpoint_json: str + ) -> GithubConnectorCheckpoint: + return GithubConnectorCheckpoint.model_validate_json(checkpoint_json) + + def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint: + return GithubConnectorCheckpoint( + stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0 + ) + + +if __name__ == "__main__": + import os + + # Initialize the connector + connector = GithubConnector( + repo_owner=os.environ["REPO_OWNER"], + repositories=os.environ.get("REPOSITORIES"), + ) + connector.load_credentials( + {"github_access_token": os.environ["ACCESS_TOKEN_GITHUB"]} + ) + + if connector.github_client: + get_external_access_permission( + connector.get_github_repos(connector.github_client).pop(), + connector.github_client, + ) + + # Create a time range from epoch to now + end_time = datetime.now(timezone.utc) + start_time = datetime.fromtimestamp(0, tz=timezone.utc) + time_range = (start_time, end_time) + + # Initialize the runner with a batch size of 10 + runner: ConnectorRunner[GithubConnectorCheckpoint] = ConnectorRunner( + connector, batch_size=10, include_permissions=False, time_range=time_range + ) + + # Get initial checkpoint + checkpoint = connector.build_dummy_checkpoint() + + # Run the connector + while checkpoint.has_more: + for doc_batch, failure, next_checkpoint in runner.run(checkpoint): + if doc_batch: + print(f"Retrieved batch of {len(doc_batch)} documents") + for doc in doc_batch: + print(f"Document: {doc.semantic_identifier}") + if failure: + print(f"Failure: {failure.failure_message}") + if next_checkpoint: + checkpoint = next_checkpoint \ No newline at end of file diff --git a/common/data_source/github/models.py b/common/data_source/github/models.py new file mode 100644 index 000000000..9754bfa8d --- /dev/null +++ b/common/data_source/github/models.py @@ -0,0 +1,17 @@ +from typing import Any + +from github import Repository +from github.Requester import Requester +from pydantic import BaseModel + + +class SerializedRepository(BaseModel): + # id is part of the raw_data as well, just pulled out for convenience + id: int + headers: dict[str, str | int] + raw_data: dict[str, Any] + + def to_Repository(self, requester: Requester) -> Repository.Repository: + return Repository.Repository( + requester, self.headers, self.raw_data, completed=True + ) \ No newline at end of file diff --git a/common/data_source/github/rate_limit_utils.py b/common/data_source/github/rate_limit_utils.py new file mode 100644 index 000000000..d683bad08 --- /dev/null +++ b/common/data_source/github/rate_limit_utils.py @@ -0,0 +1,24 @@ +import time +import logging +from datetime import datetime +from datetime import timedelta +from datetime import timezone + +from github import Github + + +def sleep_after_rate_limit_exception(github_client: Github) -> None: + """ + Sleep until the 
GitHub rate limit resets. + + Args: + github_client: The GitHub client that hit the rate limit + """ + sleep_time = github_client.get_rate_limit().core.reset.replace( + tzinfo=timezone.utc + ) - datetime.now(tz=timezone.utc) + sleep_time += timedelta(minutes=1) # add an extra minute just to be safe + logging.info( + "Ran into Github rate-limit. Sleeping %s seconds.", sleep_time.seconds + ) + time.sleep(sleep_time.total_seconds()) \ No newline at end of file diff --git a/common/data_source/github/utils.py b/common/data_source/github/utils.py new file mode 100644 index 000000000..13b25ac51 --- /dev/null +++ b/common/data_source/github/utils.py @@ -0,0 +1,46 @@ +import logging +from collections.abc import Callable +from typing import cast + +from github import Github +from github.Repository import Repository + +from common.data_source.models import ExternalAccess + +from .models import SerializedRepository + + +def get_external_access_permission( + repo: Repository, github_client: Github +) -> ExternalAccess: + """ + Get the external access permission for a repository. + This functionality requires Enterprise Edition. + """ + # RAGFlow doesn't implement the Onyx EE external-permissions system. + # Default to private/unknown permissions. + return ExternalAccess.empty() + + +def deserialize_repository( + cached_repo: SerializedRepository, github_client: Github +) -> Repository: + """ + Deserialize a SerializedRepository back into a Repository object. + """ + # Try to access the requester - different PyGithub versions may use different attribute names + try: + # Try to get the requester using getattr to avoid linter errors + requester = getattr(github_client, "_requester", None) + if requester is None: + requester = getattr(github_client, "_Github__requester", None) + if requester is None: + # If we can't find the requester attribute, we need to fall back to recreating the repo + raise AttributeError("Could not find requester attribute") + + return cached_repo.to_Repository(requester) + except Exception as e: + # If all else fails, re-fetch the repo directly + logging.warning("Failed to deserialize repository: %s. Attempting to re-fetch.", e) + repo_id = cached_repo.id + return github_client.get_repo(repo_id) \ No newline at end of file diff --git a/common/data_source/interfaces.py b/common/data_source/interfaces.py index c5c665aa2..cd180967f 100644 --- a/common/data_source/interfaces.py +++ b/common/data_source/interfaces.py @@ -236,16 +236,13 @@ class BaseConnector(abc.ABC, Generic[CT]): def validate_perm_sync(self) -> None: """ - Don't override this; add a function to perm_sync_valid.py in the ee package - to do permission sync validation + Permission-sync validation hook. + + RAGFlow doesn't ship the Onyx EE permission-sync validation package. + Connectors that support permission sync should override + `validate_connector_settings()` as needed. 
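+        The base implementation is a no-op and simply returns None.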
""" - """ - validate_connector_settings_fn = fetch_ee_implementation_or_noop( - "onyx.connectors.perm_sync_valid", - "validate_perm_sync", - noop_return_value=None, - ) - validate_connector_settings_fn(self)""" + return None def set_allow_images(self, value: bool) -> None: """Implement if the underlying connector wants to skip/allow image downloading diff --git a/pyproject.toml b/pyproject.toml index c8a8755ad..e5b5efaa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -149,6 +149,7 @@ dependencies = [ # "cryptography==46.0.3", # "jinja2>=3.1.0", "pyairtable>=3.3.0", + "pygithub>=2.8.1", "asana>=5.2.2", ] diff --git a/uv.lock b/uv.lock index cfaaa401f..173246531 100644 --- a/uv.lock +++ b/uv.lock @@ -5509,6 +5509,22 @@ dependencies = [ ] sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ba/8e/aedef81641c8dca6fd0fb7294de5bed9c45f3397d67fddf755c1042c2642/PyExecJS-1.5.1.tar.gz", hash = "sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c", size = 13344, upload-time = "2018-01-18T04:33:55.126Z" } +[[package]] +name = "pygithub" +version = "2.8.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pyjwt", extra = ["crypto"] }, + { name = "pynacl" }, + { name = "requests" }, + { name = "typing-extensions" }, + { name = "urllib3" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/74/e560bdeffea72ecb26cff27f0fad548bbff5ecc51d6a155311ea7f9e4c4c/pygithub-2.8.1.tar.gz", hash = "sha256:341b7c78521cb07324ff670afd1baa2bf5c286f8d9fd302c1798ba594a5400c9", size = 2246994, upload-time = "2025-09-02T17:41:54.674Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/ba/7049ce39f653f6140aac4beb53a5aaf08b4407b6a3019aae394c1c5244ff/pygithub-2.8.1-py3-none-any.whl", hash = "sha256:23a0a5bca93baef082e03411bf0ce27204c32be8bfa7abc92fe4a3e132936df0", size = 432709, upload-time = "2025-09-02T17:41:52.947Z" }, +] + [[package]] name = "pygments" version = "2.19.2" @@ -5541,6 +5557,43 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" }, ] +[[package]] +name = "pynacl" +version = "1.6.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" }, +] + [[package]] name = "pynndescent" version = "0.5.13" @@ -6171,6 +6224,7 @@ dependencies = [ { name = "pyairtable" }, { name = "pyclipper" }, { name = "pycryptodomex" }, + { name = "pygithub" }, { name = "pyobvector" }, { name = "pyodbc" }, { name = "pypandoc" }, @@ -6301,6 +6355,7 @@ requires-dist = [ { name = "pyairtable", specifier = ">=3.3.0" }, { name = "pyclipper", specifier = ">=1.4.0,<2.0.0" }, { name = "pycryptodomex", specifier = "==3.20.0" }, + { name = "pygithub", specifier = ">=2.8.1" }, { name = "pyobvector", specifier = "==0.2.18" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pypandoc", specifier = ">=1.16" },