Feat: Gitlab connector (#12248)

### What problem does this PR solve?

Feat: Gitlab connector
Fix: submit button in darkmode

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Magicbook1108
2025-12-29 17:05:20 +08:00
committed by GitHub
parent f099bc1236
commit c3ae1aaecd
14 changed files with 498 additions and 4 deletions

View File

@ -130,7 +130,7 @@ class FileSource(StrEnum):
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
AIRTABLE = "airtable"
ASANA = "asana"
GITLAB = "gitlab"
class PipelineTaskType(StrEnum):
PARSE = "Parse"

View File

@ -55,6 +55,8 @@ class DocumentSource(str, Enum):
BOX = "box"
AIRTABLE = "airtable"
ASANA = "asana"
GITHUB = "github"
GITLAB = "gitlab"
class FileOrigin(str, Enum):
"""File origins"""

View File

@ -0,0 +1,340 @@
import fnmatch
import itertools
from collections import deque
from collections.abc import Iterable
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import TypeVar
import gitlab
from gitlab.v4.objects import Project
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE
from common.data_source.exceptions import ConnectorMissingCredentialError
from common.data_source.exceptions import ConnectorValidationError
from common.data_source.exceptions import CredentialExpiredError
from common.data_source.exceptions import InsufficientPermissionsError
from common.data_source.exceptions import UnexpectedValidationError
from common.data_source.interfaces import GenerateDocumentsOutput
from common.data_source.interfaces import LoadConnector
from common.data_source.interfaces import PollConnector
from common.data_source.interfaces import SecondsSinceUnixEpoch
from common.data_source.models import BasicExpertInfo
from common.data_source.models import Document
from common.data_source.utils import get_file_ext
T = TypeVar("T")
# List of directories/Files to exclude
exclude_patterns = [
"logs",
".github/",
".gitlab/",
".pre-commit-config.yaml",
]
def _batch_gitlab_objects(git_objs: Iterable[T], batch_size: int) -> Iterator[list[T]]:
it = iter(git_objs)
while True:
batch = list(itertools.islice(it, batch_size))
if not batch:
break
yield batch
def get_author(author: Any) -> BasicExpertInfo:
return BasicExpertInfo(
display_name=author.get("name"),
)
def _convert_merge_request_to_document(mr: Any) -> Document:
mr_text = mr.description or ""
doc = Document(
id=mr.web_url,
blob=mr_text,
source=DocumentSource.GITLAB,
semantic_identifier=mr.title,
extension=".md",
# updated_at is UTC time but is timezone unaware, explicitly add UTC
# as there is logic in indexing to prevent wrong timestamped docs
# due to local time discrepancies with UTC
doc_updated_at=mr.updated_at.replace(tzinfo=timezone.utc),
size_bytes=len(mr_text.encode("utf-8")),
primary_owners=[get_author(mr.author)],
metadata={"state": mr.state, "type": "MergeRequest", "web_url": mr.web_url},
)
return doc
def _convert_issue_to_document(issue: Any) -> Document:
issue_text = issue.description or ""
doc = Document(
id=issue.web_url,
blob=issue_text,
source=DocumentSource.GITLAB,
semantic_identifier=issue.title,
extension=".md",
# updated_at is UTC time but is timezone unaware, explicitly add UTC
# as there is logic in indexing to prevent wrong timestamped docs
# due to local time discrepancies with UTC
doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
size_bytes=len(issue_text.encode("utf-8")),
primary_owners=[get_author(issue.author)],
metadata={
"state": issue.state,
"type": issue.type if issue.type else "Issue",
"web_url": issue.web_url,
},
)
return doc
def _convert_code_to_document(
project: Project, file: Any, url: str, projectName: str, projectOwner: str
) -> Document:
# Dynamically get the default branch from the project object
default_branch = project.default_branch
# Fetch the file content using the correct branch
file_content_obj = project.files.get(
file_path=file["path"], ref=default_branch # Use the default branch
)
# BoxConnector uses raw bytes for blob. Keep the same here.
file_content_bytes = file_content_obj.decode()
file_url = f"{url}/{projectOwner}/{projectName}/-/blob/{default_branch}/{file['path']}"
# Try to use the last commit timestamp for incremental sync.
# Falls back to "now" if the commit lookup fails.
last_commit_at = None
try:
# Query commit history for this file on the default branch.
commits = project.commits.list(
ref_name=default_branch,
path=file["path"],
per_page=1,
)
if commits:
# committed_date is ISO string like "2024-01-01T00:00:00.000+00:00"
committed_date = commits[0].committed_date
if isinstance(committed_date, str):
last_commit_at = datetime.strptime(
committed_date, "%Y-%m-%dT%H:%M:%S.%f%z"
).astimezone(timezone.utc)
elif isinstance(committed_date, datetime):
last_commit_at = committed_date.astimezone(timezone.utc)
except Exception:
last_commit_at = None
# Create and return a Document object
doc = Document(
# Use a stable ID so reruns don't create duplicates.
id=file_url,
blob=file_content_bytes,
source=DocumentSource.GITLAB,
semantic_identifier=file.get("name"),
extension=get_file_ext(file.get("name")),
doc_updated_at=last_commit_at or datetime.now(tz=timezone.utc),
size_bytes=len(file_content_bytes) if file_content_bytes is not None else 0,
primary_owners=[], # Add owners if needed
metadata={
"type": "CodeFile",
"path": file.get("path"),
"ref": default_branch,
"project": f"{projectOwner}/{projectName}",
"web_url": file_url,
},
)
return doc
def _should_exclude(path: str) -> bool:
"""Check if a path matches any of the exclude patterns."""
return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)
class GitlabConnector(LoadConnector, PollConnector):
def __init__(
self,
project_owner: str,
project_name: str,
batch_size: int = INDEX_BATCH_SIZE,
state_filter: str = "all",
include_mrs: bool = True,
include_issues: bool = True,
include_code_files: bool = False,
) -> None:
self.project_owner = project_owner
self.project_name = project_name
self.batch_size = batch_size
self.state_filter = state_filter
self.include_mrs = include_mrs
self.include_issues = include_issues
self.include_code_files = include_code_files
self.gitlab_client: gitlab.Gitlab | None = None
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.gitlab_client = gitlab.Gitlab(
credentials["gitlab_url"], private_token=credentials["gitlab_access_token"]
)
return None
def validate_connector_settings(self) -> None:
if self.gitlab_client is None:
raise ConnectorMissingCredentialError("GitLab")
try:
self.gitlab_client.auth()
self.gitlab_client.projects.get(
f"{self.project_owner}/{self.project_name}",
lazy=True,
)
except gitlab.exceptions.GitlabAuthenticationError as e:
raise CredentialExpiredError(
"Invalid or expired GitLab credentials."
) from e
except gitlab.exceptions.GitlabAuthorizationError as e:
raise InsufficientPermissionsError(
"Insufficient permissions to access GitLab resources."
) from e
except gitlab.exceptions.GitlabGetError as e:
raise ConnectorValidationError(
"GitLab project not found or not accessible."
) from e
except Exception as e:
raise UnexpectedValidationError(
f"Unexpected error while validating GitLab settings: {e}"
) from e
def _fetch_from_gitlab(
self, start: datetime | None = None, end: datetime | None = None
) -> GenerateDocumentsOutput:
if self.gitlab_client is None:
raise ConnectorMissingCredentialError("Gitlab")
project: Project = self.gitlab_client.projects.get(
f"{self.project_owner}/{self.project_name}"
)
start_utc = start.astimezone(timezone.utc) if start else None
end_utc = end.astimezone(timezone.utc) if end else None
# Fetch code files
if self.include_code_files:
# Fetching using BFS as project.report_tree with recursion causing slow load
queue = deque([""]) # Start with the root directory
while queue:
current_path = queue.popleft()
files = project.repository_tree(path=current_path, all=True)
for file_batch in _batch_gitlab_objects(files, self.batch_size):
code_doc_batch: list[Document] = []
for file in file_batch:
if _should_exclude(file["path"]):
continue
if file["type"] == "blob":
doc = _convert_code_to_document(
project,
file,
self.gitlab_client.url,
self.project_name,
self.project_owner,
)
# Apply incremental window filtering for code files too.
if start_utc is not None and doc.doc_updated_at <= start_utc:
continue
if end_utc is not None and doc.doc_updated_at > end_utc:
continue
code_doc_batch.append(doc)
elif file["type"] == "tree":
queue.append(file["path"])
if code_doc_batch:
yield code_doc_batch
if self.include_mrs:
merge_requests = project.mergerequests.list(
state=self.state_filter,
order_by="updated_at",
sort="desc",
iterator=True,
)
for mr_batch in _batch_gitlab_objects(merge_requests, self.batch_size):
mr_doc_batch: list[Document] = []
for mr in mr_batch:
mr.updated_at = datetime.strptime(
mr.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
)
if start_utc is not None and mr.updated_at <= start_utc:
yield mr_doc_batch
return
if end_utc is not None and mr.updated_at > end_utc:
continue
mr_doc_batch.append(_convert_merge_request_to_document(mr))
yield mr_doc_batch
if self.include_issues:
issues = project.issues.list(state=self.state_filter, iterator=True)
for issue_batch in _batch_gitlab_objects(issues, self.batch_size):
issue_doc_batch: list[Document] = []
for issue in issue_batch:
issue.updated_at = datetime.strptime(
issue.updated_at, "%Y-%m-%dT%H:%M:%S.%f%z"
)
# Avoid re-syncing the last-seen item.
if start_utc is not None and issue.updated_at <= start_utc:
yield issue_doc_batch
return
if end_utc is not None and issue.updated_at > end_utc:
continue
issue_doc_batch.append(_convert_issue_to_document(issue))
yield issue_doc_batch
def load_from_state(self) -> GenerateDocumentsOutput:
return self._fetch_from_gitlab()
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
return self._fetch_from_gitlab(start_datetime, end_datetime)
if __name__ == "__main__":
import os
connector = GitlabConnector(
# gitlab_url="https://gitlab.com/api/v4",
project_owner=os.environ["PROJECT_OWNER"],
project_name=os.environ["PROJECT_NAME"],
batch_size=INDEX_BATCH_SIZE,
state_filter="all",
include_mrs=True,
include_issues=True,
include_code_files=True,
)
connector.load_credentials(
{
"gitlab_access_token": os.environ["GITLAB_ACCESS_TOKEN"],
"gitlab_url": os.environ["GITLAB_URL"],
}
)
document_batches = connector.load_from_state()
for f in document_batches:
print("Batch:", f)
print("Finished loading from state.")

View File

@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from enum import IntFlag, auto
from types import TracebackType
from typing import Any, Dict, Generator, TypeVar, Generic, Callable, TypeAlias
from collections.abc import Iterator
from anthropic import BaseModel
from common.data_source.models import (
@ -16,6 +16,7 @@ from common.data_source.models import (
SecondsSinceUnixEpoch, GenerateSlimDocumentOutput
)
GenerateDocumentsOutput = Iterator[list[Document]]
class LoadConnector(ABC):
"""Load connector interface"""

View File

@ -151,6 +151,7 @@ dependencies = [
"pyairtable>=3.3.0",
"pygithub>=2.8.1",
"asana>=5.2.2",
"python-gitlab>=7.0.0",
]
[dependency-groups]

View File

@ -38,12 +38,24 @@ from api.db.services.connector_service import ConnectorService, SyncLogsService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common import settings
from common.config_utils import show_configs
from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector, DropboxConnector, WebDAVConnector, AirtableConnector, AsanaConnector
from common.data_source import (
BlobStorageConnector,
NotionConnector,
DiscordConnector,
GoogleDriveConnector,
MoodleConnector,
JiraConnector,
DropboxConnector,
WebDAVConnector,
AirtableConnector,
AsanaConnector,
)
from common.constants import FileSource, TaskStatus
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.confluence_connector import ConfluenceConnector
from common.data_source.gmail_connector import GmailConnector
from common.data_source.box_connector import BoxConnector
from common.data_source.gitlab_connector import GitlabConnector
from common.data_source.interfaces import CheckpointOutputWrapper
from common.log_utils import init_root_logger
from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
@ -843,6 +855,47 @@ class Asana(SyncBase):
return document_generator
class Gitlab(SyncBase):
SOURCE_NAME: str = FileSource.GITLAB
async def _generate(self, task: dict):
"""
Sync files from GitLab attachments.
"""
self.connector = GitlabConnector(
project_owner= self.conf.get("project_owner"),
project_name= self.conf.get("project_name"),
include_mrs = self.conf.get("include_mrs", False),
include_issues = self.conf.get("include_issues", False),
include_code_files= self.conf.get("include_code_files", False),
)
self.connector.load_credentials(
{
"gitlab_access_token": self.conf.get("credentials", {}).get("gitlab_access_token"),
"gitlab_url": self.conf.get("credentials", {}).get("gitlab_url"),
}
)
if task["reindex"] == "1" or not task["poll_range_start"]:
document_generator = self.connector.load_from_state()
begin_info = "totally"
else:
poll_start = task["poll_range_start"]
if poll_start is None:
document_generator = self.connector.load_from_state()
begin_info = "totally"
else:
document_generator = self.connector.poll_source(
poll_start.timestamp(),
datetime.now(timezone.utc).timestamp()
)
begin_info = "from {}".format(poll_start)
logging.info("Connect to Gitlab: ({}) {}".format(self.conf["project_name"], begin_info))
return document_generator
func_factory = {
FileSource.S3: S3,
FileSource.R2: R2,
@ -862,7 +915,8 @@ func_factory = {
FileSource.WEBDAV: WebDAV,
FileSource.BOX: BOX,
FileSource.AIRTABLE: Airtable,
FileSource.ASANA: Asana
FileSource.GITLAB: Gitlab,
FileSource.ASANA: Asana,
}

15
uv.lock generated
View File

@ -5924,6 +5924,19 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863, upload-time = "2024-01-23T06:32:58.246Z" },
]
[[package]]
name = "python-gitlab"
version = "7.0.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "requests" },
{ name = "requests-toolbelt" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5e/c4/0b613303b4f0fcda69b3d2e03d0a1fb1b6b079a7c7832e03a8d92461e9fe/python_gitlab-7.0.0.tar.gz", hash = "sha256:e4d934430f64efc09e6208b782c61cc0a3389527765e03ffbef17f4323dce441", size = 400568, upload-time = "2025-10-29T15:06:02.069Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/4f/9e/811edc46a15f8deb828cba7ef8aab3451dc11ca72d033f3df72a5af865d9/python_gitlab-7.0.0-py3-none-any.whl", hash = "sha256:712a6c8c5e79e7e66f6dabb25d8fe7831a6b238d4a5132f8231df6b3b890ceff", size = 144415, upload-time = "2025-10-29T15:06:00.232Z" },
]
[[package]]
name = "python-multipart"
version = "0.0.20"
@ -6232,6 +6245,7 @@ dependencies = [
{ name = "pypdf2" },
{ name = "python-calamine" },
{ name = "python-docx" },
{ name = "python-gitlab" },
{ name = "python-pptx" },
{ name = "pywencai" },
{ name = "qianfan" },
@ -6363,6 +6377,7 @@ requires-dist = [
{ name = "pypdf2", specifier = ">=3.0.1,<4.0.0" },
{ name = "python-calamine", specifier = ">=0.4.0" },
{ name = "python-docx", specifier = ">=1.1.2,<2.0.0" },
{ name = "python-gitlab", specifier = ">=7.0.0" },
{ name = "python-pptx", specifier = ">=1.0.2,<2.0.0" },
{ name = "pywencai", specifier = ">=0.13.1,<1.0.0" },
{ name = "qianfan", specifier = "==0.4.6" },

View File

@ -0,0 +1,2 @@
<?xml version="1.0" encoding="utf-8"?><!-- Uploaded to: SVG Repo, www.svgrepo.com, Generator: SVG Repo Mixer Tools -->
<svg width="800px" height="800px" viewBox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><title>file_type_gitlab</title><polygon points="16 28.896 16 28.896 21.156 13.029 10.844 13.029 16 28.896" style="fill:#e24329"/><polygon points="16 28.896 10.844 13.029 3.619 13.029 16 28.896" style="fill:#fc6d26"/><path d="M3.619,13.029h0L2.052,17.851a1.067,1.067,0,0,0,.388,1.193L16,28.9,3.619,13.029Z" style="fill:#fca326"/><path d="M3.619,13.029h7.225L7.739,3.473a.534.534,0,0,0-1.015,0L3.619,13.029Z" style="fill:#e24329"/><polygon points="16 28.896 21.156 13.029 28.381 13.029 16 28.896" style="fill:#fc6d26"/><path d="M28.381,13.029h0l1.567,4.822a1.067,1.067,0,0,1-.388,1.193L16,28.9,28.381,13.029Z" style="fill:#fca326"/><path d="M28.381,13.029H21.156l3.105-9.557a.534.534,0,0,1,1.015,0l3.105,9.557Z" style="fill:#e24329"/></svg>

After

Width:  |  Height:  |  Size: 946 B

View File

@ -929,6 +929,8 @@ Beispiel: Virtual Hosted Style`,
'Verbinden Sie Ihr Gmail über OAuth, um E-Mails zu synchronisieren.',
webdavDescription:
'Verbinden Sie sich mit WebDAV-Servern, um Dateien zu synchronisieren.',
gitlabDescription:
'Verbinden Sie GitLab, um Repositories, Issues, Merge Requests und zugehörige Dokumentation zu synchronisieren.',
webdavRemotePathTip:
'Optional: Geben Sie einen Ordnerpfad auf dem WebDAV-Server an (z.B. /Dokumente). Lassen Sie das Feld leer, um vom Stammverzeichnis aus zu synchronisieren.',
google_driveTokenTip:

View File

@ -933,6 +933,8 @@ Example: Virtual Hosted Style`,
boxDescription: 'Connect your Box drive to sync files and folders.',
airtableDescription:
'Connect to Airtable and synchronize files from a specified table within a designated workspace.',
gitlabDescription:
'Connect GitLab to sync repositories, issues, merge requests, and related documentation.',
asanaDescription:
'Connect to Asana and synchronize files from a specified workspace.',
dropboxAccessTokenTip:

View File

@ -749,6 +749,8 @@ export default {
'Подключите ваш диск Box для синхронизации файлов и папок.',
airtableDescription:
'Подключите Airtable и синхронизируйте файлы из указанной таблицы в заданном рабочем пространстве.',
gitlabDescription:
'Подключите GitLab для синхронизации репозиториев, задач, merge requests и связанной документации.',
asanaDescription:
'Подключите Asana и синхронизируйте файлы из рабочего пространства.',
google_driveDescription:

View File

@ -547,6 +547,8 @@ export default {
avatar: '头像',
avatarTip: '這會在你的個人主頁展示',
profileDescription: '在此更新您的照片和個人詳細信息。',
gitlabDescription:
'連接 GitLab同步儲存庫、Issue、合併請求MR及相關文件內容。',
bedrockCredentialsHint:
'提示Access Key / Secret Key 可留空,以啟用 AWS IAM 自動驗證。',
awsAuthModeAccessKeySecret: 'Access Key 和 Secret',

View File

@ -862,6 +862,8 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
dropboxDescription: '连接 Dropbox同步指定账号下的文件与文件夹。',
boxDescription: '连接你的 Box 云盘以同步文件和文件夹。',
airtableDescription: '连接 Airtable同步指定工作区下指定表格中的文件。',
gitlabDescription:
'连接 GitLab同步仓库、Issue、合并请求MR及相关文档内容。',
asanaDescription: '连接 Asana同步工作区中的文件。',
r2Description: '连接你的 Cloudflare R2 存储桶以导入和同步文件。',
dropboxAccessTokenTip:

View File

@ -25,6 +25,7 @@ export enum DataSourceKey {
OCI_STORAGE = 'oci_storage',
GOOGLE_CLOUD_STORAGE = 'google_cloud_storage',
AIRTABLE = 'airtable',
GITLAB = 'gitlab',
ASANA = 'asana',
// SHAREPOINT = 'sharepoint',
// SLACK = 'slack',
@ -110,6 +111,11 @@ export const generateDataSourceInfo = (t: TFunction) => {
description: t(`setting.${DataSourceKey.AIRTABLE}Description`),
icon: <SvgIcon name={'data-source/airtable'} width={38} />,
},
[DataSourceKey.GITLAB]: {
name: 'GitLab',
description: t(`setting.${DataSourceKey.GITLAB}Description`),
icon: <SvgIcon name={'data-source/gitlab'} width={38} />,
},
[DataSourceKey.ASANA]: {
name: 'Asana',
description: t(`setting.${DataSourceKey.ASANA}Description`),
@ -658,6 +664,54 @@ export const DataSourceFormFields = {
required: true,
},
],
[DataSourceKey.GITLAB]: [
{
label: 'Project Owner',
name: 'config.project_owner',
type: FormFieldType.Text,
required: true,
},
{
label: 'Project Name',
name: 'config.project_name',
type: FormFieldType.Text,
required: true,
},
{
label: 'GitLab Personal Access Token',
name: 'config.credentials.gitlab_access_token',
type: FormFieldType.Password,
required: true,
},
{
label: 'GitLab URL',
name: 'config.gitlab_url',
type: FormFieldType.Text,
required: true,
placeholder: 'https://gitlab.com',
},
{
label: 'include Merge Requests',
name: 'config.include_mrs',
type: FormFieldType.Checkbox,
required: false,
defaultValue: true,
},
{
label: 'include Issues',
name: 'config.include_issues',
type: FormFieldType.Checkbox,
required: false,
defaultValue: true,
},
{
label: 'include Code Files',
name: 'config.include_code_files',
type: FormFieldType.Checkbox,
required: false,
defaultValue: true,
},
],
[DataSourceKey.ASANA]: [
{
label: 'API Token',
@ -883,6 +937,21 @@ export const DataSourceFormDefaultValues = {
},
},
},
[DataSourceKey.GITLAB]: {
name: '',
source: DataSourceKey.GITLAB,
config: {
project_owner: '',
project_name: '',
gitlab_url: 'https://gitlab.com',
include_mrs: true,
include_issues: true,
include_code_files: true,
credentials: {
gitlab_access_token: '',
},
},
},
[DataSourceKey.ASANA]: {
name: '',
source: DataSourceKey.ASANA,