Feat: Bitbucket connector (#12332)
### What problem does this PR solve?

Feat: Bitbucket connector

NOT READY TO MERGE

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
common/data_source/bitbucket/connector.py (new file, 388 lines)
@@ -0,0 +1,388 @@
from __future__ import annotations

import copy
from collections.abc import Callable, Iterator
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any

from typing_extensions import override

from common.data_source.bitbucket.utils import (
    PR_LIST_RESPONSE_FIELDS,
    SLIM_PR_LIST_RESPONSE_FIELDS,
    build_auth_client,
    list_repositories,
    map_pr_to_document,
    paginate,
)
from common.data_source.config import (
    DocumentSource,
    INDEX_BATCH_SIZE,
    REQUEST_TIMEOUT_SECONDS,
)
from common.data_source.exceptions import (
    ConnectorMissingCredentialError,
    CredentialExpiredError,
    InsufficientPermissionsError,
    UnexpectedValidationError,
)
from common.data_source.interfaces import (
    CheckpointedConnector,
    CheckpointOutput,
    IndexingHeartbeatInterface,
    SecondsSinceUnixEpoch,
    SlimConnectorWithPermSync,
)
from common.data_source.models import (
    ConnectorCheckpoint,
    ConnectorFailure,
    DocumentFailure,
    SlimDocument,
)

if TYPE_CHECKING:
    import httpx

class BitbucketConnectorCheckpoint(ConnectorCheckpoint):
    """Checkpoint state for resumable Bitbucket PR indexing.

    Fields:
        repos_queue: Materialized list of repository slugs to process.
        current_repo_index: Index of the repository currently being processed.
        next_url: Bitbucket "next" URL for continuing pagination within the current repo.
    """

    repos_queue: list[str] = []
    current_repo_index: int = 0
    next_url: str | None = None
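
# Illustrative only (hypothetical values): a checkpoint mid-run, after the first
# page of the second repository has been fetched, would look like:
#   BitbucketConnectorCheckpoint(
#       has_more=True,
#       repos_queue=["repo-a", "repo-b"],
#       current_repo_index=1,
#       next_url="https://api.bitbucket.org/2.0/repositories/ws/repo-b/pullrequests?page=2",
#   )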


class BitbucketConnector(
    CheckpointedConnector[BitbucketConnectorCheckpoint],
    SlimConnectorWithPermSync,
):
    """Connector for indexing Bitbucket Cloud pull requests.

    Args:
        workspace: Bitbucket workspace ID.
        repositories: Comma-separated list of repository slugs to index.
        projects: Comma-separated list of project keys to index all repositories within.
        batch_size: Max number of documents to yield per batch.
    """

    def __init__(
        self,
        workspace: str,
        repositories: str | None = None,
        projects: str | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.workspace = workspace
        self._repositories = (
            [s.strip() for s in repositories.split(",") if s.strip()]
            if repositories
            else None
        )
        self._projects: list[str] | None = (
            [s.strip() for s in projects.split(",") if s.strip()] if projects else None
        )
        self.batch_size = batch_size
        self.email: str | None = None
        self.api_token: str | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Load API token-based credentials.

        Expects a dict with keys: `bitbucket_email`, `bitbucket_api_token`.
        """
        self.email = credentials.get("bitbucket_email")
        self.api_token = credentials.get("bitbucket_api_token")
        if not self.email or not self.api_token:
            raise ConnectorMissingCredentialError("Bitbucket")
        return None
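
    # A minimal credentials payload (placeholder values, not real secrets):
    #   connector.load_credentials({
    #       "bitbucket_email": "user@example.com",
    #       "bitbucket_api_token": "<API_TOKEN>",
    #   })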

    def _client(self) -> httpx.Client:
        """Build an authenticated HTTP client or raise if credentials are missing."""
        if not self.email or not self.api_token:
            raise ConnectorMissingCredentialError("Bitbucket")
        return build_auth_client(self.email, self.api_token)

    def _iter_pull_requests_for_repo(
        self,
        client: httpx.Client,
        repo_slug: str,
        params: dict[str, Any] | None = None,
        start_url: str | None = None,
        on_page: Callable[[str | None], None] | None = None,
    ) -> Iterator[dict[str, Any]]:
        base = f"https://api.bitbucket.org/2.0/repositories/{self.workspace}/{repo_slug}/pullrequests"
        yield from paginate(
            client,
            base,
            params,
            start_url=start_url,
            on_page=on_page,
        )
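
    # Bitbucket Cloud v2 list endpoints respond with pages shaped roughly like
    # {"values": [...], "next": "<url of the following page>"}; `paginate` is
    # assumed to walk those "next" links and report each one via `on_page`.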

    def _build_params(
        self,
        fields: str = PR_LIST_RESPONSE_FIELDS,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
    ) -> dict[str, Any]:
        """Build Bitbucket fetch params.

        Always include OPEN, MERGED, and DECLINED PRs. If both ``start`` and
        ``end`` are provided, apply a single ``updated_on`` time window.
        """

        def _iso(ts: SecondsSinceUnixEpoch) -> str:
            return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()

        def _time_clause(
            lower_epoch: SecondsSinceUnixEpoch | None,
            upper_epoch: SecondsSinceUnixEpoch | None,
        ) -> str | None:
            if lower_epoch is not None and upper_epoch is not None:
                lower_iso = _iso(lower_epoch)
                upper_iso = _iso(upper_epoch)
                return f'(updated_on > "{lower_iso}" AND updated_on <= "{upper_iso}")'
            return None

        params: dict[str, Any] = {"fields": fields, "pagelen": 50}
        time_clause = _time_clause(start, end)
        q = '(state = "OPEN" OR state = "MERGED" OR state = "DECLINED")'
        if time_clause:
            q = f"{q} AND {time_clause}"
        params["q"] = q
        return params
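
    # For example, _build_params(start=0, end=3600) produces a `q` filter of:
    #   (state = "OPEN" OR state = "MERGED" OR state = "DECLINED") AND
    #   (updated_on > "1970-01-01T00:00:00+00:00" AND updated_on <= "1970-01-01T01:00:00+00:00")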

    def _iter_target_repositories(self, client: httpx.Client) -> Iterator[str]:
        """Yield repository slugs based on configuration.

        Priority:
            - repositories list
            - projects list (list repos by project key)
            - workspace (all repos)
        """
        if self._repositories:
            yield from self._repositories
            return
        if self._projects:
            for project_key in self._projects:
                for repo in list_repositories(client, self.workspace, project_key):
                    slug_val = repo.get("slug")
                    if isinstance(slug_val, str) and slug_val:
                        yield slug_val
            return
        for repo in list_repositories(client, self.workspace, None):
            slug_val = repo.get("slug")
            if isinstance(slug_val, str) and slug_val:
                yield slug_val
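
    # Example: repositories="repo-a, repo-b" yields exactly those two slugs and
    # ignores `projects`; projects="PROJ" yields every repo under project PROJ;
    # with neither set, every repository in the workspace is yielded.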

    @override
    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: BitbucketConnectorCheckpoint,
    ) -> CheckpointOutput[BitbucketConnectorCheckpoint]:
        """Resumable PR ingestion across repos and pages within a time window.

        Yields Documents (or ConnectorFailure for per-PR mapping failures) and
        returns an updated checkpoint recording the repo position and next page URL.
        """
        new_checkpoint = copy.deepcopy(checkpoint)

        with self._client() as client:
            # Materialize the target repositories once per run
            if not new_checkpoint.repos_queue:
                # Deduplicate and sort so the queue order is deterministic across runs
                repos_list = list(self._iter_target_repositories(client))
                new_checkpoint.repos_queue = sorted(set(repos_list))
                new_checkpoint.current_repo_index = 0
                new_checkpoint.next_url = None

            repos = new_checkpoint.repos_queue
            if not repos or new_checkpoint.current_repo_index >= len(repos):
                new_checkpoint.has_more = False
                return new_checkpoint

            repo_slug = repos[new_checkpoint.current_repo_index]

            first_page_params = self._build_params(
                fields=PR_LIST_RESPONSE_FIELDS,
                start=start,
                end=end,
            )

            def _on_page(next_url: str | None) -> None:
                new_checkpoint.next_url = next_url

            for pr in self._iter_pull_requests_for_repo(
                client,
                repo_slug,
                params=first_page_params,
                start_url=new_checkpoint.next_url,
                on_page=_on_page,
            ):
                try:
                    document = map_pr_to_document(pr, self.workspace, repo_slug)
                    yield document
                except Exception as e:
                    pr_id = pr.get("id")
                    pr_link = (
                        f"https://bitbucket.org/{self.workspace}/{repo_slug}/pull-requests/{pr_id}"
                        if pr_id is not None
                        else None
                    )
                    yield ConnectorFailure(
                        failed_document=DocumentFailure(
                            document_id=(
                                f"{DocumentSource.BITBUCKET.value}:{self.workspace}:{repo_slug}:pr:{pr_id}"
                                if pr_id is not None
                                else f"{DocumentSource.BITBUCKET.value}:{self.workspace}:{repo_slug}:pr:unknown"
                            ),
                            document_link=pr_link,
                        ),
                        failure_message=f"Failed to process Bitbucket PR: {e}",
                        exception=e,
                    )

            # Advance to the next repository (if any) and set has_more accordingly
            new_checkpoint.current_repo_index += 1
            new_checkpoint.next_url = None
            new_checkpoint.has_more = new_checkpoint.current_repo_index < len(repos)

        return new_checkpoint
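
    # Usage note: load_from_checkpoint is a generator, so the updated checkpoint
    # is its *return* value; drivers retrieve it from StopIteration.value, as the
    # __main__ example at the bottom of this file does.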

    @override
    def build_dummy_checkpoint(self) -> BitbucketConnectorCheckpoint:
        """Create an initial checkpoint with work remaining."""
        return BitbucketConnectorCheckpoint(has_more=True)

    @override
    def validate_checkpoint_json(
        self, checkpoint_json: str
    ) -> BitbucketConnectorCheckpoint:
        """Validate and deserialize a checkpoint instance from JSON."""
        return BitbucketConnectorCheckpoint.model_validate_json(checkpoint_json)

    def retrieve_all_slim_docs_perm_sync(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
        callback: IndexingHeartbeatInterface | None = None,
    ) -> Iterator[list[SlimDocument]]:
        """Yield batches holding only document IDs for all existing pull requests."""
        batch: list[SlimDocument] = []
        params = self._build_params(
            fields=SLIM_PR_LIST_RESPONSE_FIELDS,
            start=start,
            end=end,
        )
        with self._client() as client:
            for slug in self._iter_target_repositories(client):
                for pr in self._iter_pull_requests_for_repo(client, slug, params=params):
                    pr_id = pr["id"]
                    doc_id = f"{DocumentSource.BITBUCKET.value}:{self.workspace}:{slug}:pr:{pr_id}"
                    batch.append(SlimDocument(id=doc_id))
                    if len(batch) >= self.batch_size:
                        yield batch
                        batch = []
                    if callback:
                        if callback.should_stop():
                            # Note: this is not actually used for permission sync yet, just pruning
                            raise RuntimeError("bitbucket_pr_sync: Stop signal detected")
                        callback.progress("bitbucket_pr_sync", len(batch))
        if batch:
            yield batch
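
    # Slim documents carry only the ID; assuming DocumentSource.BITBUCKET.value
    # is "bitbucket", an ID would look like: bitbucket:my-workspace:my-repo:pr:42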

    def validate_connector_settings(self) -> None:
        """Validate Bitbucket credentials and workspace access by probing a lightweight endpoint.

        Raises:
            CredentialExpiredError: on HTTP 401
            InsufficientPermissionsError: on HTTP 403
            UnexpectedValidationError: on any other failure
        """
        try:
            with self._client() as client:
                url = f"https://api.bitbucket.org/2.0/repositories/{self.workspace}"
                resp = client.get(
                    url,
                    params={"pagelen": 1, "fields": "pagelen"},
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
                if resp.status_code == 401:
                    raise CredentialExpiredError(
                        "Invalid or expired Bitbucket credentials (HTTP 401)."
                    )
                if resp.status_code == 403:
                    raise InsufficientPermissionsError(
                        "Insufficient permissions to access Bitbucket workspace (HTTP 403)."
                    )
                if not (200 <= resp.status_code < 300):
                    raise UnexpectedValidationError(
                        f"Unexpected Bitbucket error (status={resp.status_code})."
                    )
        except (
            CredentialExpiredError,
            InsufficientPermissionsError,
            UnexpectedValidationError,
            ConnectorMissingCredentialError,
        ):
            raise
        except Exception as e:
            # Network or other unexpected errors
            raise UnexpectedValidationError(
                f"Unexpected error while validating Bitbucket settings: {e}"
            )


if __name__ == "__main__":
    bitbucket = BitbucketConnector(workspace="<YOUR_WORKSPACE>")

    bitbucket.load_credentials(
        {
            "bitbucket_email": "<YOUR_EMAIL>",
            "bitbucket_api_token": "<YOUR_API_TOKEN>",
        }
    )

    bitbucket.validate_connector_settings()
    print("Credentials validated successfully.")

    start_time = datetime.fromtimestamp(0, tz=timezone.utc)
    end_time = datetime.now(timezone.utc)

    for doc_batch in bitbucket.retrieve_all_slim_docs_perm_sync(
        start=start_time.timestamp(),
        end=end_time.timestamp(),
    ):
        for doc in doc_batch:
            print(doc)

    bitbucket_checkpoint = bitbucket.build_dummy_checkpoint()

    while bitbucket_checkpoint.has_more:
        gen = bitbucket.load_from_checkpoint(
            start=start_time.timestamp(),
            end=end_time.timestamp(),
            checkpoint=bitbucket_checkpoint,
        )

        while True:
            try:
                doc = next(gen)
                print(doc)
            except StopIteration as e:
                bitbucket_checkpoint = e.value
                break