Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 12:32:30 +08:00)
Feat: add Jira connector (#11285)
### What problem does this PR solve?

Add Jira connector.

<img width="978" height="925" alt="image" src="https://github.com/user-attachments/assets/78bb5c77-2710-4569-a76e-9087ca23b227" />

---

<img width="1903" height="489" alt="image" src="https://github.com/user-attachments/assets/193bc5c5-f751-4bd5-883a-2173282c2b96" />

---

<img width="1035" height="925" alt="image" src="https://github.com/user-attachments/assets/1a0aec19-30eb-4ada-9283-61d1c915f59d" />

---

<img width="1905" height="601" alt="image" src="https://github.com/user-attachments/assets/3dde1062-3f27-4717-8e09-fd5fd5e64171" />

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
@@ -11,7 +11,7 @@ from .confluence_connector import ConfluenceConnector
 from .discord_connector import DiscordConnector
 from .dropbox_connector import DropboxConnector
 from .google_drive.connector import GoogleDriveConnector
-from .jira_connector import JiraConnector
+from .jira.connector import JiraConnector
 from .sharepoint_connector import SharePointConnector
 from .teams_connector import TeamsConnector
 from .config import BlobType, DocumentSource
@@ -13,6 +13,7 @@ def get_current_tz_offset() -> int:
     return round(time_diff.total_seconds() / 3600)


+ONE_MINUTE = 60
 ONE_HOUR = 3600
 ONE_DAY = ONE_HOUR * 24

@@ -42,6 +43,7 @@ class DocumentSource(str, Enum):
     OCI_STORAGE = "oci_storage"
     SLACK = "slack"
     CONFLUENCE = "confluence"
+    JIRA = "jira"
     GOOGLE_DRIVE = "google_drive"
     GMAIL = "gmail"
     DISCORD = "discord"

@@ -178,6 +180,21 @@ GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD = int(
     os.environ.get("GOOGLE_DRIVE_CONNECTOR_SIZE_THRESHOLD", 10 * 1024 * 1024)
 )

+JIRA_CONNECTOR_LABELS_TO_SKIP = [
+    ignored_tag
+    for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
+    if ignored_tag
+]
+JIRA_CONNECTOR_MAX_TICKET_SIZE = int(
+    os.environ.get("JIRA_CONNECTOR_MAX_TICKET_SIZE", 100 * 1024)
+)
+JIRA_SYNC_TIME_BUFFER_SECONDS = int(
+    os.environ.get("JIRA_SYNC_TIME_BUFFER_SECONDS", ONE_MINUTE)
+)
+JIRA_TIMEZONE_OFFSET = float(
+    os.environ.get("JIRA_TIMEZONE_OFFSET", get_current_tz_offset())
+)
+
 OAUTH_SLACK_CLIENT_ID = os.environ.get("OAUTH_SLACK_CLIENT_ID", "")
 OAUTH_SLACK_CLIENT_SECRET = os.environ.get("OAUTH_SLACK_CLIENT_SECRET", "")
 OAUTH_CONFLUENCE_CLOUD_CLIENT_ID = os.environ.get(
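All of the new Jira settings are read from the environment with safe defaults. A minimal sketch of how the label-skip list is derived (the variable value below is purely illustrative):

```python
import os

# Hypothetical value; the real list is built once at import time in config.py.
os.environ["JIRA_CONNECTOR_LABELS_TO_SKIP"] = "internal,secret"

labels_to_skip = [
    ignored_tag
    for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
    if ignored_tag
]
assert labels_to_skip == ["internal", "secret"]

# An unset or empty variable yields an empty list, so nothing is skipped by default.
assert [tag for tag in "".split(",") if tag] == []
```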
@@ -1788,6 +1788,7 @@ class ConfluenceConnector(
         cql_url = self.confluence_client.build_cql_url(
             page_query, expand=",".join(_PAGE_EXPANSION_FIELDS)
         )
+        logging.info(f"[Confluence Connector] Building CQL URL {cql_url}")
         return update_param_in_path(cql_url, "limit", str(limit))

     @override
new file: common/data_source/jira/__init__.py (empty)

new file: common/data_source/jira/connector.py (973 lines)
@@ -0,0 +1,973 @@
"""Checkpointed Jira connector that emits markdown blobs for each issue."""

from __future__ import annotations

import argparse
import copy
import logging
import os
import re
from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
from datetime import datetime, timedelta, timezone
from typing import Any
from zoneinfo import ZoneInfo, ZoneInfoNotFoundError

from jira import JIRA
from jira.resources import Issue
from pydantic import Field

from common.data_source.config import (
    INDEX_BATCH_SIZE,
    JIRA_CONNECTOR_LABELS_TO_SKIP,
    JIRA_CONNECTOR_MAX_TICKET_SIZE,
    JIRA_TIMEZONE_OFFSET,
    ONE_HOUR,
    DocumentSource,
)
from common.data_source.exceptions import (
    ConnectorMissingCredentialError,
    ConnectorValidationError,
    InsufficientPermissionsError,
    UnexpectedValidationError,
)
from common.data_source.interfaces import (
    CheckpointedConnectorWithPermSync,
    CheckpointOutputWrapper,
    SecondsSinceUnixEpoch,
    SlimConnectorWithPermSync,
)
from common.data_source.jira.utils import (
    JIRA_CLOUD_API_VERSION,
    JIRA_SERVER_API_VERSION,
    build_issue_url,
    extract_body_text,
    extract_named_value,
    extract_user,
    format_attachments,
    format_comments,
    parse_jira_datetime,
    should_skip_issue,
)
from common.data_source.models import (
    ConnectorCheckpoint,
    ConnectorFailure,
    Document,
    DocumentFailure,
    SlimDocument,
)
from common.data_source.utils import is_atlassian_cloud_url, is_atlassian_date_error, scoped_url

logger = logging.getLogger(__name__)

_DEFAULT_FIELDS = "summary,description,updated,created,status,priority,assignee,reporter,labels,issuetype,project,comment,attachment"
_SLIM_FIELDS = "key,project"
_MAX_RESULTS_FETCH_IDS = 5000
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50
_DEFAULT_ATTACHMENT_SIZE_LIMIT = 10 * 1024 * 1024  # 10MB


class JiraCheckpoint(ConnectorCheckpoint):
    """Checkpoint that tracks which slice of the current JQL result set was emitted."""

    start_at: int = 0
    cursor: str | None = None
    ids_done: bool = False
    all_issue_ids: list[list[str]] = Field(default_factory=list)


_TZ_OFFSET_PATTERN = re.compile(r"([+-])(\d{2})(:?)(\d{2})$")

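A quick illustration of the checkpoint round-trip, assuming `ConnectorCheckpoint` is a pydantic v2 model (the connector's `validate_checkpoint_json` below relies on `model_validate_json`; `model_dump_json` is its standard counterpart):

```python
# Sketch only: field values are illustrative.
checkpoint = JiraCheckpoint(has_more=True, start_at=0)
checkpoint.all_issue_ids.append(["10001", "10002"])
checkpoint.cursor = "opaque-next-page-token"  # hypothetical Cloud paging token

serialized = checkpoint.model_dump_json()
restored = JiraCheckpoint.model_validate_json(serialized)
assert restored.all_issue_ids == [["10001", "10002"]]
assert restored.ids_done is False
```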
class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
    """Retrieve Jira issues and emit them as markdown documents."""

    def __init__(
        self,
        jira_base_url: str,
        project_key: str | None = None,
        jql_query: str | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        include_comments: bool = True,
        include_attachments: bool = False,
        labels_to_skip: Sequence[str] | None = None,
        comment_email_blacklist: Sequence[str] | None = None,
        scoped_token: bool = False,
        attachment_size_limit: int | None = None,
        timezone_offset: float | None = None,
    ) -> None:
        if not jira_base_url:
            raise ConnectorValidationError("Jira base URL must be provided.")

        self.jira_base_url = jira_base_url.rstrip("/")
        self.project_key = project_key
        self.jql_query = jql_query
        self.batch_size = batch_size
        self.include_comments = include_comments
        self.include_attachments = include_attachments
        configured_labels = labels_to_skip or JIRA_CONNECTOR_LABELS_TO_SKIP
        self.labels_to_skip = {label.lower() for label in configured_labels}
        self.comment_email_blacklist = {email.lower() for email in comment_email_blacklist or []}
        self.scoped_token = scoped_token
        self.jira_client: JIRA | None = None

        self.max_ticket_size = JIRA_CONNECTOR_MAX_TICKET_SIZE
        self.attachment_size_limit = attachment_size_limit if attachment_size_limit and attachment_size_limit > 0 else _DEFAULT_ATTACHMENT_SIZE_LIMIT
        self._fields_param = _DEFAULT_FIELDS
        self._slim_fields = _SLIM_FIELDS

        tz_offset_value = float(timezone_offset) if timezone_offset is not None else float(JIRA_TIMEZONE_OFFSET)
        self.timezone_offset = tz_offset_value
        self.timezone = timezone(offset=timedelta(hours=tz_offset_value))
        self._timezone_overridden = timezone_offset is not None

    # -------------------------------------------------------------------------
    # Connector lifecycle helpers
    # -------------------------------------------------------------------------

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Instantiate the Jira client using either an API token or username/password."""
        jira_url_for_client = self.jira_base_url
        if self.scoped_token:
            if is_atlassian_cloud_url(self.jira_base_url):
                try:
                    jira_url_for_client = scoped_url(self.jira_base_url, "jira")
                except ValueError as exc:
                    raise ConnectorValidationError(str(exc)) from exc
            else:
                logger.warning(f"[Jira] Scoped token requested but Jira base URL {self.jira_base_url} does not appear to be an Atlassian Cloud domain; scoped token ignored.")

        user_email = credentials.get("jira_user_email") or credentials.get("username")
        api_token = credentials.get("jira_api_token") or credentials.get("token") or credentials.get("api_token")
        password = credentials.get("jira_password") or credentials.get("password")
        rest_api_version = credentials.get("rest_api_version")

        if not rest_api_version:
            rest_api_version = JIRA_CLOUD_API_VERSION if api_token else JIRA_SERVER_API_VERSION
        options: dict[str, Any] = {"rest_api_version": rest_api_version}

        try:
            if user_email and api_token:
                self.jira_client = JIRA(
                    server=jira_url_for_client,
                    basic_auth=(user_email, api_token),
                    options=options,
                )
            elif api_token:
                self.jira_client = JIRA(
                    server=jira_url_for_client,
                    token_auth=api_token,
                    options=options,
                )
            elif user_email and password:
                self.jira_client = JIRA(
                    server=jira_url_for_client,
                    basic_auth=(user_email, password),
                    options=options,
                )
            else:
                raise ConnectorMissingCredentialError("Jira credentials must include either an API token or username/password.")
        except Exception as exc:  # pragma: no cover - jira lib raises many types
            raise ConnectorMissingCredentialError(f"Jira: {exc}") from exc
        self._sync_timezone_from_server()
        return None

    def validate_connector_settings(self) -> None:
        """Validate connectivity by fetching basic Jira info."""
        if not self.jira_client:
            raise ConnectorMissingCredentialError("Jira")

        try:
            if self.jql_query:
                dummy_checkpoint = self.build_dummy_checkpoint()
                checkpoint_callback = self._make_checkpoint_callback(dummy_checkpoint)
                iterator = self._perform_jql_search(
                    jql=self.jql_query,
                    start=0,
                    max_results=1,
                    fields="key",
                    all_issue_ids=dummy_checkpoint.all_issue_ids,
                    checkpoint_callback=checkpoint_callback,
                    next_page_token=dummy_checkpoint.cursor,
                    ids_done=dummy_checkpoint.ids_done,
                )
                next(iter(iterator), None)
            elif self.project_key:
                self.jira_client.project(self.project_key)
            else:
                self.jira_client.projects()
        except Exception as exc:  # pragma: no cover - dependent on Jira responses
            self._handle_validation_error(exc)

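Putting the lifecycle pieces above together, a caller would construct and validate the connector roughly as follows; the URL, project key, and credential values are placeholders, and the credential keys mirror the ones `load_credentials` looks up:

```python
connector = JiraConnector(
    jira_base_url="https://example.atlassian.net",  # placeholder
    project_key="ENG",                              # placeholder
    include_comments=True,
    include_attachments=False,
)
connector.load_credentials(
    {
        "jira_user_email": "bot@example.com",  # placeholder
        "jira_api_token": "<api-token>",       # placeholder
    }
)
connector.validate_connector_settings()
```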
    # -------------------------------------------------------------------------
    # Checkpointed connector implementation
    # -------------------------------------------------------------------------

    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: JiraCheckpoint,
    ) -> Generator[Document | ConnectorFailure, None, JiraCheckpoint]:
        """Load Jira issues, emitting a Document per issue."""
        try:
            return (yield from self._load_with_retry(start, end, checkpoint))
        except Exception as exc:
            logger.exception(f"[Jira] Jira query ultimately failed: {exc}")
            yield ConnectorFailure(
                failure_message=f"Failed to query Jira: {exc}",
                exception=exc,
            )
        return JiraCheckpoint(has_more=False, start_at=checkpoint.start_at)

    def load_from_checkpoint_with_perm_sync(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: JiraCheckpoint,
    ) -> Generator[Document | ConnectorFailure, None, JiraCheckpoint]:
        """Permissions are not synced separately, so reuse the standard loader."""
        return (yield from self.load_from_checkpoint(start=start, end=end, checkpoint=checkpoint))

    def _load_with_retry(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: JiraCheckpoint,
    ) -> Generator[Document | ConnectorFailure, None, JiraCheckpoint]:
        if not self.jira_client:
            raise ConnectorMissingCredentialError("Jira")

        attempt_start = start
        retried_with_buffer = False
        attempt = 0

        while True:
            attempt += 1
            jql = self._build_jql(attempt_start, end)
            logger.info(f"[Jira] Executing Jira JQL attempt {attempt} (start={attempt_start}, end={end}, buffered_retry={retried_with_buffer}): {jql}")
            try:
                return (yield from self._load_from_checkpoint_internal(jql, checkpoint, start_filter=start))
            except Exception as exc:
                if attempt_start is not None and not retried_with_buffer and is_atlassian_date_error(exc):
                    attempt_start = attempt_start - ONE_HOUR
                    retried_with_buffer = True
                    logger.info(f"[Jira] Atlassian date error detected; retrying with start={attempt_start}.")
                    continue
                raise

    def _handle_validation_error(self, exc: Exception) -> None:
        status_code = getattr(exc, "status_code", None)
        if status_code == 401:
            raise InsufficientPermissionsError("Jira credential appears to be invalid or expired (HTTP 401).") from exc
        if status_code == 403:
            raise InsufficientPermissionsError("Jira token does not have permission to access the requested resources (HTTP 403).") from exc
        if status_code == 404:
            raise ConnectorValidationError("Jira resource not found (HTTP 404).") from exc
        if status_code == 429:
            raise ConnectorValidationError("Jira rate limit exceeded during validation (HTTP 429).") from exc

        message = getattr(exc, "text", str(exc))
        if not message:
            raise UnexpectedValidationError("Unexpected Jira validation error.") from exc

        raise ConnectorValidationError(f"Jira validation failed: {message}") from exc

    def _load_from_checkpoint_internal(
        self,
        jql: str,
        checkpoint: JiraCheckpoint,
        start_filter: SecondsSinceUnixEpoch | None = None,
    ) -> Generator[Document | ConnectorFailure, None, JiraCheckpoint]:
        assert self.jira_client, "load_credentials must be called before loading issues."

        page_size = self._full_page_size()
        new_checkpoint = copy.deepcopy(checkpoint)
        starting_offset = new_checkpoint.start_at or 0
        current_offset = starting_offset
        checkpoint_callback = self._make_checkpoint_callback(new_checkpoint)

        issue_iter = self._perform_jql_search(
            jql=jql,
            start=current_offset,
            max_results=page_size,
            fields=self._fields_param,
            all_issue_ids=new_checkpoint.all_issue_ids,
            checkpoint_callback=checkpoint_callback,
            next_page_token=new_checkpoint.cursor,
            ids_done=new_checkpoint.ids_done,
        )

        start_cutoff = float(start_filter) if start_filter is not None else None

        for issue in issue_iter:
            current_offset += 1
            issue_key = getattr(issue, "key", "unknown")
            if should_skip_issue(issue, self.labels_to_skip):
                continue

            issue_updated = parse_jira_datetime(issue.raw.get("fields", {}).get("updated"))
            if start_cutoff is not None and issue_updated is not None and issue_updated.timestamp() <= start_cutoff:
                # Jira JQL only supports minute precision, so we discard already-processed
                # issues here based on the original second-level cutoff.
                continue

            try:
                document = self._issue_to_document(issue)
            except Exception as exc:  # pragma: no cover - defensive
                logger.exception(f"[Jira] Failed to convert Jira issue {issue_key}: {exc}")
                yield ConnectorFailure(
                    failure_message=f"Failed to convert Jira issue {issue_key}: {exc}",
                    failed_document=DocumentFailure(
                        document_id=issue_key,
                        document_link=build_issue_url(self.jira_base_url, issue_key),
                    ),
                    exception=exc,
                )
                continue

            if document is not None:
                yield document
            if self.include_attachments:
                for attachment_document in self._attachment_documents(issue):
                    if attachment_document is not None:
                        yield attachment_document

        self._update_checkpoint_for_next_run(
            checkpoint=new_checkpoint,
            current_offset=current_offset,
            starting_offset=starting_offset,
            page_size=page_size,
        )
        new_checkpoint.start_at = current_offset
        return new_checkpoint

    def build_dummy_checkpoint(self) -> JiraCheckpoint:
        """Create an empty checkpoint used to kick off ingestion."""
        return JiraCheckpoint(has_more=True, start_at=0)

    def validate_checkpoint_json(self, checkpoint_json: str) -> JiraCheckpoint:
        """Validate a serialized checkpoint."""
        return JiraCheckpoint.model_validate_json(checkpoint_json)

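To resume ingestion across runs, the returned checkpoint can be serialized and fed back in through `validate_checkpoint_json`. A sketch of that loop, following the same `CheckpointOutputWrapper` pattern used by `iterate_jira_documents` later in this module (serialization again assumes the pydantic v2 `model_dump_json`):

```python
# Sketch only: document/failure handling is elided.
def run_once(connector: JiraConnector, serialized: str | None, start: float, end: float) -> str:
    checkpoint = (
        connector.validate_checkpoint_json(serialized)
        if serialized
        else connector.build_dummy_checkpoint()
    )
    wrapper = CheckpointOutputWrapper[JiraCheckpoint]()
    for document, failure, next_checkpoint in wrapper(
        connector.load_from_checkpoint(start=start, end=end, checkpoint=checkpoint)
    ):
        if next_checkpoint is not None:
            checkpoint = next_checkpoint  # keep the most recent checkpoint
    return checkpoint.model_dump_json()  # persist for the next run
```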
    # -------------------------------------------------------------------------
    # Slim connector implementation
    # -------------------------------------------------------------------------

    def retrieve_all_slim_docs_perm_sync(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
        callback: Any = None,  # noqa: ARG002 - maintained for interface compatibility
    ) -> Generator[list[SlimDocument], None, None]:
        """Return lightweight references to Jira issues (used for permission syncing)."""
        if not self.jira_client:
            raise ConnectorMissingCredentialError("Jira")

        start_ts = start if start is not None else 0
        end_ts = end if end is not None else datetime.now(timezone.utc).timestamp()
        jql = self._build_jql(start_ts, end_ts)

        checkpoint = self.build_dummy_checkpoint()
        checkpoint_callback = self._make_checkpoint_callback(checkpoint)
        prev_offset = 0
        current_offset = 0
        slim_batch: list[SlimDocument] = []

        while checkpoint.has_more:
            for issue in self._perform_jql_search(
                jql=jql,
                start=current_offset,
                max_results=_JIRA_SLIM_PAGE_SIZE,
                fields=self._slim_fields,
                all_issue_ids=checkpoint.all_issue_ids,
                checkpoint_callback=checkpoint_callback,
                next_page_token=checkpoint.cursor,
                ids_done=checkpoint.ids_done,
            ):
                current_offset += 1
                if should_skip_issue(issue, self.labels_to_skip):
                    continue

                doc_id = build_issue_url(self.jira_base_url, issue.key)
                slim_batch.append(SlimDocument(id=doc_id))

                if len(slim_batch) >= _JIRA_SLIM_PAGE_SIZE:
                    yield slim_batch
                    slim_batch = []

            self._update_checkpoint_for_next_run(
                checkpoint=checkpoint,
                current_offset=current_offset,
                starting_offset=prev_offset,
                page_size=_JIRA_SLIM_PAGE_SIZE,
            )
            prev_offset = current_offset

        if slim_batch:
            yield slim_batch

    # -------------------------------------------------------------------------
    # Internal helpers
    # -------------------------------------------------------------------------

    def _build_jql(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> str:
        clauses: list[str] = []
        if self.jql_query:
            clauses.append(f"({self.jql_query})")
        elif self.project_key:
            clauses.append(f'project = "{self.project_key}"')
        else:
            raise ConnectorValidationError("Either project_key or jql_query must be provided for Jira connector.")

        if self.labels_to_skip:
            labels = ", ".join(f'"{label}"' for label in self.labels_to_skip)
            clauses.append(f"labels NOT IN ({labels})")

        if start is not None:
            clauses.append(f'updated >= "{self._format_jql_time(start)}"')
        if end is not None:
            clauses.append(f'updated <= "{self._format_jql_time(end)}"')

        if not clauses:
            raise ConnectorValidationError("Unable to build Jira JQL query.")

        jql = " AND ".join(clauses)
        if "order by" not in jql.lower():
            jql = f"{jql} ORDER BY updated ASC"
        return jql

    def _format_jql_time(self, timestamp: SecondsSinceUnixEpoch) -> str:
        dt_utc = datetime.fromtimestamp(float(timestamp), tz=timezone.utc)
        dt_local = dt_utc.astimezone(self.timezone)
        # Jira only accepts minute-precision timestamps in JQL, so we format accordingly
        # and rely on a post-query second-level filter to avoid duplicates.
        return dt_local.strftime("%Y-%m-%d %H:%M")

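For a connector configured with a project key (here `ENG`, purely illustrative) and a single skip label, `_build_jql` produces a minute-precision query along these lines:

```python
# Illustrative only: project key, label, and timestamps are placeholders.
expected_jql = (
    'project = "ENG" AND labels NOT IN ("internal") '
    'AND updated >= "2024-05-01 09:00" AND updated <= "2024-05-01 10:00" '
    "ORDER BY updated ASC"
)
```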
    def _issue_to_document(self, issue: Issue) -> Document | None:
        fields = issue.raw.get("fields", {})
        summary = fields.get("summary") or ""
        description_text = extract_body_text(fields.get("description"))
        comments_text = (
            format_comments(
                fields.get("comment"),
                blacklist=self.comment_email_blacklist,
            )
            if self.include_comments
            else ""
        )
        attachments_text = format_attachments(fields.get("attachment"))

        reporter_name, reporter_email = extract_user(fields.get("reporter"))
        assignee_name, assignee_email = extract_user(fields.get("assignee"))
        status = extract_named_value(fields.get("status"))
        priority = extract_named_value(fields.get("priority"))
        issue_type = extract_named_value(fields.get("issuetype"))
        project = fields.get("project") or {}

        issue_url = build_issue_url(self.jira_base_url, issue.key)

        metadata_lines = [
            f"key: {issue.key}",
            f"url: {issue_url}",
            f"summary: {summary}",
            f"status: {status or 'Unknown'}",
            f"priority: {priority or 'Unspecified'}",
            f"issue_type: {issue_type or 'Unknown'}",
            f"project: {project.get('name') or ''}",
            f"project_key: {project.get('key') or self.project_key or ''}",
        ]

        if reporter_name:
            metadata_lines.append(f"reporter: {reporter_name}")
        if reporter_email:
            metadata_lines.append(f"reporter_email: {reporter_email}")
        if assignee_name:
            metadata_lines.append(f"assignee: {assignee_name}")
        if assignee_email:
            metadata_lines.append(f"assignee_email: {assignee_email}")
        if fields.get("labels"):
            metadata_lines.append(f"labels: {', '.join(fields.get('labels'))}")

        created_dt = parse_jira_datetime(fields.get("created"))
        updated_dt = parse_jira_datetime(fields.get("updated")) or created_dt or datetime.now(timezone.utc)
        metadata_lines.append(f"created: {created_dt.isoformat() if created_dt else ''}")
        metadata_lines.append(f"updated: {updated_dt.isoformat() if updated_dt else ''}")

        sections: list[str] = [
            "---",
            "\n".join(filter(None, metadata_lines)),
            "---",
            "",
            "## Description",
            description_text or "No description provided.",
        ]

        if comments_text:
            sections.extend(["", "## Comments", comments_text])
        if attachments_text:
            sections.extend(["", "## Attachments", attachments_text])

        blob_text = "\n".join(sections).strip() + "\n"
        blob = blob_text.encode("utf-8")

        if len(blob) > self.max_ticket_size:
            logger.info(f"[Jira] Skipping {issue.key} because it exceeds the maximum size of {self.max_ticket_size} bytes.")
            return None

        semantic_identifier = f"{issue.key}: {summary}" if summary else issue.key

        return Document(
            id=issue_url,
            source=DocumentSource.JIRA,
            semantic_identifier=semantic_identifier,
            extension=".md",
            blob=blob,
            doc_updated_at=updated_dt,
            size_bytes=len(blob),
        )

    def _attachment_documents(self, issue: Issue) -> Iterable[Document]:
        attachments = issue.raw.get("fields", {}).get("attachment") or []
        for attachment in attachments:
            try:
                document = self._attachment_to_document(issue, attachment)
                if document is not None:
                    yield document
            except Exception as exc:  # pragma: no cover - defensive
                failed_id = attachment.get("id") or attachment.get("filename")
                issue_key = getattr(issue, "key", "unknown")
                logger.warning(f"[Jira] Failed to process attachment {failed_id} for issue {issue_key}: {exc}")

    def _attachment_to_document(self, issue: Issue, attachment: dict[str, Any]) -> Document | None:
        if not self.include_attachments:
            return None

        filename = attachment.get("filename")
        content_url = attachment.get("content")
        if not filename or not content_url:
            return None

        try:
            attachment_size = int(attachment.get("size", 0))
        except (TypeError, ValueError):
            attachment_size = 0
        if attachment_size and attachment_size > self.attachment_size_limit:
            logger.info(f"[Jira] Skipping attachment {filename} on {issue.key} because reported size exceeds limit ({self.attachment_size_limit} bytes).")
            return None

        blob = self._download_attachment(content_url)
        if blob is None:
            return None

        if len(blob) > self.attachment_size_limit:
            logger.info(f"[Jira] Skipping attachment {filename} on {issue.key} because it exceeds the size limit ({self.attachment_size_limit} bytes).")
            return None

        attachment_time = parse_jira_datetime(attachment.get("created")) or parse_jira_datetime(attachment.get("updated"))
        updated_dt = attachment_time or parse_jira_datetime(issue.raw.get("fields", {}).get("updated")) or datetime.now(timezone.utc)

        extension = os.path.splitext(filename)[1] or ""
        document_id = f"{issue.key}::attachment::{attachment.get('id') or filename}"
        semantic_identifier = f"{issue.key} attachment: {filename}"

        return Document(
            id=document_id,
            source=DocumentSource.JIRA,
            semantic_identifier=semantic_identifier,
            extension=extension,
            blob=blob,
            doc_updated_at=updated_dt,
            size_bytes=len(blob),
        )

    def _download_attachment(self, url: str) -> bytes | None:
        if not self.jira_client:
            raise ConnectorMissingCredentialError("Jira")
        response = self.jira_client._session.get(url)
        response.raise_for_status()
        return response.content

    def _sync_timezone_from_server(self) -> None:
        if self._timezone_overridden or not self.jira_client:
            return
        try:
            server_info = self.jira_client.server_info()
        except Exception as exc:  # pragma: no cover - defensive
            logger.info(f"[Jira] Unable to determine timezone from server info; continuing with offset {self.timezone_offset}. Error: {exc}")
            return

        detected_offset = self._extract_timezone_offset(server_info)
        if detected_offset is None or detected_offset == self.timezone_offset:
            return

        self.timezone_offset = detected_offset
        self.timezone = timezone(offset=timedelta(hours=detected_offset))
        logger.info(f"[Jira] Timezone offset adjusted to {detected_offset} hours using Jira server info.")

    def _extract_timezone_offset(self, server_info: dict[str, Any]) -> float | None:
        server_time_raw = server_info.get("serverTime")
        if isinstance(server_time_raw, str):
            offset = self._parse_offset_from_datetime_string(server_time_raw)
            if offset is not None:
                return offset

        tz_name = server_info.get("timeZone")
        if isinstance(tz_name, str):
            offset = self._offset_from_zone_name(tz_name)
            if offset is not None:
                return offset
        return None

    @staticmethod
    def _parse_offset_from_datetime_string(value: str) -> float | None:
        normalized = JiraConnector._normalize_datetime_string(value)
        try:
            dt = datetime.fromisoformat(normalized)
        except ValueError:
            return None

        if dt.tzinfo is None:
            return 0.0

        offset = dt.tzinfo.utcoffset(dt)
        if offset is None:
            return None
        return offset.total_seconds() / 3600.0

    @staticmethod
    def _normalize_datetime_string(value: str) -> str:
        trimmed = (value or "").strip()
        if trimmed.endswith("Z"):
            return f"{trimmed[:-1]}+00:00"

        match = _TZ_OFFSET_PATTERN.search(trimmed)
        if match and match.group(3) != ":":
            sign, hours, _, minutes = match.groups()
            trimmed = f"{trimmed[: match.start()]}{sign}{hours}:{minutes}"
        return trimmed

    @staticmethod
    def _offset_from_zone_name(name: str) -> float | None:
        try:
            tz = ZoneInfo(name)
        except (ZoneInfoNotFoundError, ValueError):
            return None
        reference = datetime.now(tz)
        offset = reference.utcoffset()
        if offset is None:
            return None
        return offset.total_seconds() / 3600.0

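The timezone helpers above normalize compact offsets such as `+0900` (which `datetime.fromisoformat` rejects on Python versions before 3.11) into `+09:00`. The same regex logic, checked standalone with an illustrative server timestamp:

```python
import re
from datetime import datetime

pattern = re.compile(r"([+-])(\d{2})(:?)(\d{2})$")
value = "2024-05-01T10:15:30.000+0900"  # illustrative serverTime value
match = pattern.search(value)
if match and match.group(3) != ":":
    sign, hours, _, minutes = match.groups()
    value = f"{value[: match.start()]}{sign}{hours}:{minutes}"
assert value.endswith("+09:00")
offset_hours = datetime.fromisoformat(value).utcoffset().total_seconds() / 3600
assert offset_hours == 9.0
```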
    def _is_cloud_client(self) -> bool:
        if not self.jira_client:
            return False
        rest_version = str(self.jira_client._options.get("rest_api_version", "")).strip()
        return rest_version == str(JIRA_CLOUD_API_VERSION)

    def _full_page_size(self) -> int:
        return max(1, min(self.batch_size, _JIRA_FULL_PAGE_SIZE))

    def _perform_jql_search(
        self,
        *,
        jql: str,
        start: int,
        max_results: int,
        fields: str | None = None,
        all_issue_ids: list[list[str]] | None = None,
        checkpoint_callback: Callable[[Iterable[list[str]], str | None], None] | None = None,
        next_page_token: str | None = None,
        ids_done: bool = False,
    ) -> Iterable[Issue]:
        assert self.jira_client, "Jira client not initialized."
        is_cloud = self._is_cloud_client()
        if is_cloud:
            if all_issue_ids is None:
                raise ValueError("all_issue_ids is required for Jira Cloud searches.")
            yield from self._perform_jql_search_v3(
                jql=jql,
                max_results=max_results,
                fields=fields,
                all_issue_ids=all_issue_ids,
                checkpoint_callback=checkpoint_callback,
                next_page_token=next_page_token,
                ids_done=ids_done,
            )
        else:
            yield from self._perform_jql_search_v2(
                jql=jql,
                start=start,
                max_results=max_results,
                fields=fields,
            )

    def _perform_jql_search_v3(
        self,
        *,
        jql: str,
        max_results: int,
        all_issue_ids: list[list[str]],
        fields: str | None = None,
        checkpoint_callback: Callable[[Iterable[list[str]], str | None], None] | None = None,
        next_page_token: str | None = None,
        ids_done: bool = False,
    ) -> Iterable[Issue]:
        assert self.jira_client, "Jira client not initialized."

        if not ids_done:
            new_ids, page_token = self._enhanced_search_ids(jql, next_page_token)
            if checkpoint_callback is not None and new_ids:
                checkpoint_callback(
                    self._chunk_issue_ids(new_ids, max_results),
                    page_token,
                )
            elif checkpoint_callback is not None:
                checkpoint_callback([], page_token)

        if all_issue_ids:
            issue_ids = all_issue_ids.pop()
            if issue_ids:
                yield from self._bulk_fetch_issues(issue_ids, fields)

    def _perform_jql_search_v2(
        self,
        *,
        jql: str,
        start: int,
        max_results: int,
        fields: str | None = None,
    ) -> Iterable[Issue]:
        assert self.jira_client, "Jira client not initialized."

        issues = self.jira_client.search_issues(
            jql_str=jql,
            startAt=start,
            maxResults=max_results,
            fields=fields or self._fields_param,
            expand="renderedFields",
        )
        for issue in issues:
            yield issue

    def _enhanced_search_ids(
        self,
        jql: str,
        next_page_token: str | None,
    ) -> tuple[list[str], str | None]:
        assert self.jira_client, "Jira client not initialized."
        enhanced_search_path = self.jira_client._get_url("search/jql")
        params: dict[str, str | int | None] = {
            "jql": jql,
            "maxResults": _MAX_RESULTS_FETCH_IDS,
            "nextPageToken": next_page_token,
            "fields": "id",
        }
        response = self.jira_client._session.get(enhanced_search_path, params=params)
        response.raise_for_status()
        data = response.json()
        return [str(issue["id"]) for issue in data.get("issues", [])], data.get("nextPageToken")

    def _bulk_fetch_issues(
        self,
        issue_ids: list[str],
        fields: str | None,
    ) -> Iterable[Issue]:
        assert self.jira_client, "Jira client not initialized."
        if not issue_ids:
            return []

        bulk_fetch_path = self.jira_client._get_url("issue/bulkfetch")
        payload: dict[str, Any] = {"issueIdsOrKeys": issue_ids}
        payload["fields"] = fields.split(",") if fields else ["*all"]

        response = self.jira_client._session.post(bulk_fetch_path, json=payload)
        response.raise_for_status()
        data = response.json()
        return [Issue(self.jira_client._options, self.jira_client._session, raw=issue) for issue in data.get("issues", [])]

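On Jira Cloud the search is therefore a two-step flow: `_enhanced_search_ids` pages issue ids through the `search/jql` endpoint using `nextPageToken`, and `_bulk_fetch_issues` hydrates each id batch via `issue/bulkfetch`, with the batches stored in the checkpoint so a run can stop between the two steps. The paging shape, sketched with a hypothetical `fetch_page` stand-in for the id search:

```python
# Sketch only: fetch_page(token) -> (ids, next_token) is a hypothetical stub,
# not part of the connector.
def collect_issue_id_batches(fetch_page, batch_size: int) -> list[list[str]]:
    batches: list[list[str]] = []
    token: str | None = None
    while True:
        ids, token = fetch_page(token)               # one page of issue ids
        for start in range(0, len(ids), batch_size):
            batches.append(ids[start : start + batch_size])
        if token is None:                            # mirrors checkpoint.ids_done = True
            return batches
```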
    @staticmethod
    def _chunk_issue_ids(issue_ids: list[str], chunk_size: int) -> Iterable[list[str]]:
        if chunk_size <= 0:
            chunk_size = _JIRA_FULL_PAGE_SIZE

        for idx in range(0, len(issue_ids), chunk_size):
            yield issue_ids[idx : idx + chunk_size]

    def _make_checkpoint_callback(self, checkpoint: JiraCheckpoint) -> Callable[[Iterable[list[str]], str | None], None]:
        def checkpoint_callback(
            issue_ids: Iterable[list[str]] | list[list[str]],
            page_token: str | None,
        ) -> None:
            for id_batch in issue_ids:
                checkpoint.all_issue_ids.append(list(id_batch))
            checkpoint.cursor = page_token
            checkpoint.ids_done = page_token is None

        return checkpoint_callback

    def _update_checkpoint_for_next_run(
        self,
        *,
        checkpoint: JiraCheckpoint,
        current_offset: int,
        starting_offset: int,
        page_size: int,
    ) -> None:
        if self._is_cloud_client():
            checkpoint.has_more = bool(checkpoint.all_issue_ids) or not checkpoint.ids_done
        else:
            checkpoint.has_more = current_offset - starting_offset == page_size
            checkpoint.cursor = None
            checkpoint.ids_done = True
            checkpoint.all_issue_ids = []


def iterate_jira_documents(
    connector: "JiraConnector",
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
    iteration_limit: int = 100_000,
) -> Iterator[Document]:
    """Yield documents without materializing the entire result set."""

    checkpoint = connector.build_dummy_checkpoint()
    iterations = 0

    while checkpoint.has_more:
        wrapper = CheckpointOutputWrapper[JiraCheckpoint]()
        generator = wrapper(connector.load_from_checkpoint(start=start, end=end, checkpoint=checkpoint))

        for document, failure, next_checkpoint in generator:
            if failure is not None:
                failure_message = getattr(failure, "failure_message", str(failure))
                raise RuntimeError(f"Failed to load Jira documents: {failure_message}")
            if document is not None:
                yield document
            if next_checkpoint is not None:
                checkpoint = next_checkpoint

        iterations += 1
        if iterations > iteration_limit:
            raise RuntimeError("Too many iterations while loading Jira documents.")


def test_jira(
    *,
    base_url: str,
    project_key: str | None = None,
    jql_query: str | None = None,
    credentials: dict[str, Any],
    batch_size: int = INDEX_BATCH_SIZE,
    start_ts: float | None = None,
    end_ts: float | None = None,
    connector_options: dict[str, Any] | None = None,
) -> list[Document]:
    """Programmatic entry point that mirrors the CLI workflow."""

    connector_kwargs = connector_options.copy() if connector_options else {}
    connector = JiraConnector(
        jira_base_url=base_url,
        project_key=project_key,
        jql_query=jql_query,
        batch_size=batch_size,
        **connector_kwargs,
    )
    connector.load_credentials(credentials)
    connector.validate_connector_settings()

    now_ts = datetime.now(timezone.utc).timestamp()
    start = start_ts if start_ts is not None else 0.0
    end = end_ts if end_ts is not None else now_ts

    documents = list(iterate_jira_documents(connector, start=start, end=end))
    logger.info(f"[Jira] Fetched {len(documents)} Jira documents.")
    for doc in documents[:5]:
        logger.info(f"[Jira] Document {doc.semantic_identifier} ({doc.id}) size={doc.size_bytes} bytes")
    return documents


def _build_arg_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Fetch Jira issues and print summary statistics.")
    parser.add_argument("--base-url", dest="base_url", default=os.environ.get("JIRA_BASE_URL"))
    parser.add_argument("--project", dest="project_key", default=os.environ.get("JIRA_PROJECT_KEY"))
    parser.add_argument("--jql", dest="jql_query", default=os.environ.get("JIRA_JQL"))
    parser.add_argument("--email", dest="user_email", default=os.environ.get("JIRA_USER_EMAIL"))
    parser.add_argument("--token", dest="api_token", default=os.environ.get("JIRA_API_TOKEN"))
    parser.add_argument("--password", dest="password", default=os.environ.get("JIRA_PASSWORD"))
    parser.add_argument("--batch-size", dest="batch_size", type=int, default=int(os.environ.get("JIRA_BATCH_SIZE", INDEX_BATCH_SIZE)))
    parser.add_argument("--include_comments", dest="include_comments", type=bool, default=True)
    parser.add_argument("--include_attachments", dest="include_attachments", type=bool, default=True)
    parser.add_argument("--attachment_size_limit", dest="attachment_size_limit", type=float, default=_DEFAULT_ATTACHMENT_SIZE_LIMIT)
    parser.add_argument("--start-ts", dest="start_ts", type=float, default=None, help="Epoch seconds inclusive lower bound for updated issues.")
    parser.add_argument("--end-ts", dest="end_ts", type=float, default=9999999999, help="Epoch seconds inclusive upper bound for updated issues.")
    return parser


def main(config: dict[str, Any] | None = None) -> None:
    if config is None:
        args = _build_arg_parser().parse_args()
        config = {
            "base_url": args.base_url,
            "project_key": args.project_key,
            "jql_query": args.jql_query,
            "batch_size": args.batch_size,
            "start_ts": args.start_ts,
            "end_ts": args.end_ts,
            "include_comments": args.include_comments,
            "include_attachments": args.include_attachments,
            "attachment_size_limit": args.attachment_size_limit,
            "credentials": {
                "jira_user_email": args.user_email,
                "jira_api_token": args.api_token,
                "jira_password": args.password,
            },
        }

    base_url = config.get("base_url")
    credentials = config.get("credentials", {})

    print(f"[Jira] {config=}", flush=True)
    print(f"[Jira] {credentials=}", flush=True)

    if not base_url:
        raise RuntimeError("Jira base URL must be provided via config or CLI arguments.")
    if not (credentials.get("jira_api_token") or (credentials.get("jira_user_email") and credentials.get("jira_password"))):
        raise RuntimeError("Provide either an API token or both email/password for Jira authentication.")

    connector_options = {
        key: value
        for key, value in (
            ("include_comments", config.get("include_comments")),
            ("include_attachments", config.get("include_attachments")),
            ("attachment_size_limit", config.get("attachment_size_limit")),
            ("labels_to_skip", config.get("labels_to_skip")),
            ("comment_email_blacklist", config.get("comment_email_blacklist")),
            ("scoped_token", config.get("scoped_token")),
            ("timezone_offset", config.get("timezone_offset")),
        )
        if value is not None
    }

    documents = test_jira(
        base_url=base_url,
        project_key=config.get("project_key"),
        jql_query=config.get("jql_query"),
        credentials=credentials,
        batch_size=config.get("batch_size", INDEX_BATCH_SIZE),
        start_ts=config.get("start_ts"),
        end_ts=config.get("end_ts"),
        connector_options=connector_options,
    )

    preview_count = min(len(documents), 5)
    for idx in range(preview_count):
        doc = documents[idx]
        print(f"[Jira] [Sample {idx + 1}] {doc.semantic_identifier} | id={doc.id} | size={doc.size_bytes} bytes")

    print(f"[Jira] Jira connector test completed. Documents fetched: {len(documents)}")


if __name__ == "__main__":  # pragma: no cover - manual execution path
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s %(message)s")
    main()
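The module doubles as a manual smoke test. Besides the CLI flags, `main` accepts a config dict directly; all values below are placeholders:

```python
from common.data_source.jira.connector import main

main(
    config={
        "base_url": "https://example.atlassian.net",  # placeholder
        "project_key": "ENG",                         # placeholder
        "credentials": {
            "jira_user_email": "bot@example.com",     # placeholder
            "jira_api_token": "<api-token>",          # placeholder
        },
        "include_comments": True,
        "include_attachments": False,
    }
)
```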
new file: common/data_source/jira/utils.py (149 lines)
@@ -0,0 +1,149 @@
"""Helper utilities for the Jira connector."""

from __future__ import annotations

import os
from collections.abc import Collection
from datetime import datetime, timezone
from typing import Any, Iterable

from jira.resources import Issue

from common.data_source.utils import datetime_from_string

JIRA_SERVER_API_VERSION = os.environ.get("JIRA_SERVER_API_VERSION", "2")
JIRA_CLOUD_API_VERSION = os.environ.get("JIRA_CLOUD_API_VERSION", "3")


def build_issue_url(base_url: str, issue_key: str) -> str:
    """Return the canonical UI URL for a Jira issue."""
    return f"{base_url.rstrip('/')}/browse/{issue_key}"


def parse_jira_datetime(value: Any) -> datetime | None:
    """Best-effort parse of Jira datetime values to aware UTC datetimes."""
    if value is None:
        return None
    if isinstance(value, datetime):
        return value.astimezone(timezone.utc) if value.tzinfo else value.replace(tzinfo=timezone.utc)
    if isinstance(value, str):
        return datetime_from_string(value)
    return None


def extract_named_value(value: Any) -> str | None:
    """Extract a readable string out of Jira's typed objects."""
    if value is None:
        return None
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        return value.get("name") or value.get("value")
    return getattr(value, "name", None)


def extract_user(value: Any) -> tuple[str | None, str | None]:
    """Return display name + email tuple for a Jira user blob."""
    if value is None:
        return None, None
    if isinstance(value, dict):
        return value.get("displayName"), value.get("emailAddress")

    display = getattr(value, "displayName", None)
    email = getattr(value, "emailAddress", None)
    return display, email


def extract_text_from_adf(adf: Any) -> str:
    """Flatten Atlassian Document Format (ADF) structures to text."""
    texts: list[str] = []

    def _walk(node: Any) -> None:
        if node is None:
            return
        if isinstance(node, dict):
            node_type = node.get("type")
            if node_type == "text":
                texts.append(node.get("text", ""))
            for child in node.get("content", []):
                _walk(child)
        elif isinstance(node, list):
            for child in node:
                _walk(child)

    _walk(adf)
    return "\n".join(part for part in texts if part)

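An example of the ADF payloads the walker above flattens, with illustrative text values:

```python
adf_description = {
    "type": "doc",
    "content": [
        {"type": "paragraph", "content": [{"type": "text", "text": "Steps to reproduce:"}]},
        {"type": "paragraph", "content": [{"type": "text", "text": "1. Open the dashboard"}]},
    ],
}
assert extract_text_from_adf(adf_description) == "Steps to reproduce:\n1. Open the dashboard"
```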
def extract_body_text(value: Any) -> str:
    """Normalize Jira description/comments (raw/adf/str) into plain text."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, dict):
        return extract_text_from_adf(value).strip()
    return str(value).strip()


def format_comments(
    comment_block: Any,
    *,
    blacklist: Collection[str],
) -> str:
    """Convert Jira comments into a markdown-ish bullet list."""
    if not isinstance(comment_block, dict):
        return ""

    comments = comment_block.get("comments") or []
    lines: list[str] = []
    normalized_blacklist = {email.lower() for email in blacklist if email}

    for comment in comments:
        author = comment.get("author") or {}
        author_email = (author.get("emailAddress") or "").lower()
        if author_email and author_email in normalized_blacklist:
            continue

        author_name = author.get("displayName") or author.get("name") or author_email or "Unknown"
        created = parse_jira_datetime(comment.get("created"))
        created_str = created.isoformat() if created else "Unknown time"
        body = extract_body_text(comment.get("body"))
        if not body:
            continue

        lines.append(f"- {author_name} ({created_str}):\n{body}")

    return "\n\n".join(lines)


def format_attachments(attachments: Any) -> str:
    """List Jira attachments as bullet points."""
    if not isinstance(attachments, list):
        return ""

    attachment_lines: list[str] = []
    for attachment in attachments:
        filename = attachment.get("filename")
        if not filename:
            continue
        size = attachment.get("size")
        size_text = f" ({size} bytes)" if isinstance(size, int) else ""
        content_url = attachment.get("content") or ""
        url_suffix = f" -> {content_url}" if content_url else ""
        attachment_lines.append(f"- {filename}{size_text}{url_suffix}")

    return "\n".join(attachment_lines)


def should_skip_issue(issue: Issue, labels_to_skip: set[str]) -> bool:
    """Return True if the issue contains any label from the skip list."""
    if not labels_to_skip:
        return False

    fields = getattr(issue, "raw", {}).get("fields", {})
    labels: Iterable[str] = fields.get("labels") or []
    for label in labels:
        if (label or "").lower() in labels_to_skip:
            return True
    return False
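The formatting helpers operate on the raw dicts from the issue payload, so they are easy to exercise in isolation; the attachment values here are illustrative:

```python
attachments = [
    {"filename": "trace.log", "size": 2048, "content": "https://example.atlassian.net/attachment/10001"},
    {"filename": "screenshot.png"},
    {"size": 512},  # no filename, so it is skipped
]
print(format_attachments(attachments))
# - trace.log (2048 bytes) -> https://example.atlassian.net/attachment/10001
# - screenshot.png
```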
@ -1,112 +0,0 @@
|
|||||||
"""Jira connector"""
|
|
||||||
|
|
from typing import Any

from jira import JIRA

from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.exceptions import (
    ConnectorValidationError,
    InsufficientPermissionsError,
    UnexpectedValidationError,
    ConnectorMissingCredentialError,
)
from common.data_source.interfaces import (
    CheckpointedConnectorWithPermSync,
    SecondsSinceUnixEpoch,
    SlimConnectorWithPermSync,
)
from common.data_source.models import (
    ConnectorCheckpoint,
)


class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync):
    """Jira connector for accessing Jira issues and projects"""

    def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
        self.batch_size = batch_size
        self.jira_client: JIRA | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Load Jira credentials"""
        try:
            url = credentials.get("url")
            username = credentials.get("username")
            password = credentials.get("password")
            token = credentials.get("token")

            if not url:
                raise ConnectorMissingCredentialError("Jira URL is required")

            if token:
                # API token authentication
                self.jira_client = JIRA(server=url, token_auth=token)
            elif username and password:
                # Basic authentication
                self.jira_client = JIRA(server=url, basic_auth=(username, password))
            else:
                raise ConnectorMissingCredentialError("Jira credentials are incomplete")

            return None
        except Exception as e:
            raise ConnectorMissingCredentialError(f"Jira: {e}")

    def validate_connector_settings(self) -> None:
        """Validate Jira connector settings"""
        if not self.jira_client:
            raise ConnectorMissingCredentialError("Jira")

        try:
            # Test connection by getting server info
            self.jira_client.server_info()
        except Exception as e:
            if "401" in str(e) or "403" in str(e):
                raise InsufficientPermissionsError("Invalid credentials or insufficient permissions")
            elif "404" in str(e):
                raise ConnectorValidationError("Jira instance not found")
            else:
                raise UnexpectedValidationError(f"Jira validation error: {e}")

    def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
        """Poll Jira for recent issues"""
        # Simplified implementation - in production this would handle actual polling
        return []

    def load_from_checkpoint(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: ConnectorCheckpoint,
    ) -> Any:
        """Load documents from checkpoint"""
        # Simplified implementation
        return []

    def load_from_checkpoint_with_perm_sync(
        self,
        start: SecondsSinceUnixEpoch,
        end: SecondsSinceUnixEpoch,
        checkpoint: ConnectorCheckpoint,
    ) -> Any:
        """Load documents from checkpoint with permission sync"""
        # Simplified implementation
        return []

    def build_dummy_checkpoint(self) -> ConnectorCheckpoint:
        """Build dummy checkpoint"""
        return ConnectorCheckpoint()

    def validate_checkpoint_json(self, checkpoint_json: str) -> ConnectorCheckpoint:
        """Validate checkpoint JSON"""
        # Simplified implementation
        return ConnectorCheckpoint()

    def retrieve_all_slim_docs_perm_sync(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
        callback: Any = None,
    ) -> Any:
        """Retrieve all simplified documents with permission sync"""
        # Simplified implementation
        return []
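A minimal usage sketch of this skeleton, assuming the class is the JiraConnector re-exported from common.data_source and that the credential keys match load_credentials above; the site URL and token are placeholders, not part of the PR:

# Illustrative sketch only; values are placeholders.
from common.data_source import JiraConnector

connector = JiraConnector(batch_size=100)
connector.load_credentials({
    "url": "https://your-domain.atlassian.net",  # placeholder Jira site
    "token": "<api-token>",                      # or username/password for Server/Data Center
})
connector.validate_connector_settings()          # raises on 401/403/404 responses
checkpoint = connector.build_dummy_checkpoint()
docs = connector.load_from_checkpoint(0, 1_700_000_000, checkpoint)  # stub currently returns []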
@ -48,17 +48,35 @@ from common.data_source.exceptions import RateLimitTriedTooManyTimesError
from common.data_source.interfaces import CT, CheckpointedConnector, CheckpointOutputWrapper, ConfluenceUser, LoadFunction, OnyxExtensionType, SecondsSinceUnixEpoch, TokenResponse
from common.data_source.models import BasicExpertInfo, Document

_TZ_SUFFIX_PATTERN = re.compile(r"([+-])([\d:]+)$")


def datetime_from_string(datetime_string: str) -> datetime:
    datetime_string = datetime_string.strip()

    match_jira_format = _TZ_SUFFIX_PATTERN.search(datetime_string)
    if match_jira_format:
        sign, tz_field = match_jira_format.groups()
        digits = tz_field.replace(":", "")

        if digits.isdigit() and 1 <= len(digits) <= 4:
            if len(digits) >= 3:
                hours = digits[:-2].rjust(2, "0")
                minutes = digits[-2:]
            else:
                hours = digits.rjust(2, "0")
                minutes = "00"

            normalized = f"{sign}{hours}:{minutes}"
            datetime_string = f"{datetime_string[: match_jira_format.start()]}{normalized}"

    # Handle the case where the datetime string ends with 'Z' (Zulu time)
    if datetime_string.endswith("Z"):
        datetime_string = datetime_string[:-1] + "+00:00"

    # Handle timezone format "+0000" -> "+00:00"
    if datetime_string.endswith("+0000"):
        datetime_string = datetime_string[:-5] + "+00:00"

    datetime_object = datetime.fromisoformat(datetime_string)
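The new _TZ_SUFFIX_PATTERN branch rewrites Jira-style offsets such as "+0800" or "+8" into the "+08:00" form that datetime.fromisoformat accepts. A standalone sketch of that normalization, with an invented timestamp, to show the intended effect:

# Standalone sketch of the offset normalization above; timestamp is invented.
import re
from datetime import datetime

_TZ_SUFFIX_PATTERN = re.compile(r"([+-])([\d:]+)$")

def normalize_offset(s: str) -> str:
    m = _TZ_SUFFIX_PATTERN.search(s)
    if m:
        sign, tz = m.groups()
        digits = tz.replace(":", "")
        if digits.isdigit() and 1 <= len(digits) <= 4:
            hours = digits[:-2].rjust(2, "0") if len(digits) >= 3 else digits.rjust(2, "0")
            minutes = digits[-2:] if len(digits) >= 3 else "00"
            s = s[: m.start()] + f"{sign}{hours}:{minutes}"
    return s

print(normalize_offset("2024-05-01T10:15:30.000+0800"))                      # 2024-05-01T10:15:30.000+08:00
print(datetime.fromisoformat(normalize_offset("2024-05-01T10:15:30.000+0800")))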
@ -480,7 +498,7 @@ def get_file_ext(file_name: str) -> str:


def is_accepted_file_ext(file_ext: str, extension_type: OnyxExtensionType) -> bool:
    image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}
    text_extensions = {".txt", ".md", ".mdx", ".conf", ".log", ".json", ".csv", ".tsv", ".xml", ".yml", ".yaml", ".sql"}
    document_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".eml", ".epub", ".html"}
@ -902,6 +920,18 @@ def load_all_docs_from_checkpoint_connector(
)


_ATLASSIAN_CLOUD_DOMAINS = (".atlassian.net", ".jira.com", ".jira-dev.com")


def is_atlassian_cloud_url(url: str) -> bool:
    try:
        host = urlparse(url).hostname or ""
    except ValueError:
        return False
    host = host.lower()
    return any(host.endswith(domain) for domain in _ATLASSIAN_CLOUD_DOMAINS)


def get_cloudId(base_url: str) -> str:
    tenant_info_url = urljoin(base_url, "/_edge/tenant_info")
    response = requests.get(tenant_info_url, timeout=10)
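A quick illustration of the host-suffix check above, used to tell Atlassian Cloud sites apart from self-hosted Jira; the hostnames are placeholders:

# Illustration only; hostnames are placeholders.
from urllib.parse import urlparse

for url in ("https://example.atlassian.net/browse/RAG-1", "https://jira.internal.example.com"):
    host = (urlparse(url).hostname or "").lower()
    is_cloud = any(host.endswith(d) for d in (".atlassian.net", ".jira.com", ".jira-dev.com"))
    print(url, "->", is_cloud)  # True for the Cloud host, False for the self-hosted one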
@ -80,4 +80,4 @@ def log_exception(e, *args):
        raise Exception(a.text)
    else:
        logging.error(str(a))
        raise e
@ -20,33 +20,40 @@
import copy
import faulthandler
import logging
import os
import signal
import sys
import threading
import time
import traceback
from datetime import datetime, timezone
from typing import Any

import trio

from api.db.services.connector_service import ConnectorService, SyncLogsService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common import settings
from common.config_utils import show_configs
from common.constants import FileSource, TaskStatus
from common.data_source import (
    BlobStorageConnector,
    DiscordConnector,
    GoogleDriveConnector,
    JiraConnector,
    NotionConnector,
)
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.confluence_connector import ConfluenceConnector
from common.data_source.interfaces import CheckpointOutputWrapper
from common.data_source.utils import load_all_docs_from_checkpoint_connector
from common.log_utils import init_root_logger
from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.versions import get_ragflow_version

MAX_CONCURRENT_TASKS = int(os.environ.get("MAX_CONCURRENT_TASKS", "5"))
task_limiter = trio.Semaphore(MAX_CONCURRENT_TASKS)
@ -72,31 +79,32 @@ class SyncBase:
                min_update = min([doc.doc_updated_at for doc in document_batch])
                max_update = max([doc.doc_updated_at for doc in document_batch])
                next_update = max([next_update, max_update])
                docs = [
                    {
                        "id": doc.id,
                        "connector_id": task["connector_id"],
                        "source": self.SOURCE_NAME,
                        "semantic_identifier": doc.semantic_identifier,
                        "extension": doc.extension,
                        "size_bytes": doc.size_bytes,
                        "doc_updated_at": doc.doc_updated_at,
                        "blob": doc.blob,
                    }
                    for doc in document_batch
                ]

                e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
                err, dids = SyncLogsService.duplicate_and_parse(kb, docs, task["tenant_id"], f"{self.SOURCE_NAME}/{task['connector_id']}", task["auto_parse"])
                SyncLogsService.increase_docs(task["id"], min_update, max_update, len(docs), "\n".join(err), len(err))
                doc_num += len(docs)

                prefix = "[Jira] " if self.SOURCE_NAME == FileSource.JIRA else ""
                logging.info(f"{prefix}{doc_num} docs synchronized till {next_update}")
            SyncLogsService.done(task["id"], task["connector_id"])
            task["poll_range_start"] = next_update

        except Exception as ex:
            msg = "\n".join(["".join(traceback.format_exception_only(None, ex)).strip(), "".join(traceback.format_exception(None, ex, ex.__traceback__)).strip()])
            SyncLogsService.update_by_id(task["id"], {"status": TaskStatus.FAIL, "full_exception_trace": msg, "error_msg": str(ex)})

        SyncLogsService.schedule(task["connector_id"], task["kb_id"], task["poll_range_start"])
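For orientation, one element of the docs list built above would look roughly like the following; every value is invented for illustration, and doc_updated_at is a datetime in the real code rather than a string:

# Shape of one entry in `docs`; all values are invented.
example_doc = {
    "id": "jira-issue-10001",
    "connector_id": "conn-42",
    "source": "jira",                                   # self.SOURCE_NAME for the Jira sync class
    "semantic_identifier": "[RAG-101] Fix retriever timeout",
    "extension": ".md",
    "size_bytes": 2048,
    "doc_updated_at": "2025-01-02T03:04:05+00:00",      # datetime object in practice
    "blob": b"# [RAG-101] Fix retriever timeout\n...",
}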
@ -109,21 +117,16 @@ class S3(SyncBase):
    SOURCE_NAME: str = FileSource.S3

    async def _generate(self, task: dict):
        self.connector = BlobStorageConnector(bucket_type=self.conf.get("bucket_type", "s3"), bucket_name=self.conf["bucket_name"], prefix=self.conf.get("prefix", ""))
        self.connector.load_credentials(self.conf["credentials"])
        document_batch_generator = (
            self.connector.load_from_state()
            if task["reindex"] == "1" or not task["poll_range_start"]
            else self.connector.poll_source(task["poll_range_start"].timestamp(), datetime.now(timezone.utc).timestamp())
        )

        begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format(task["poll_range_start"])
        logging.info("Connect to {}: {}(prefix/{}) {}".format(self.conf.get("bucket_type", "s3"), self.conf["bucket_name"], self.conf.get("prefix", ""), begin_info))
        return document_batch_generator
@ -131,8 +134,8 @@ class Confluence(SyncBase):
    SOURCE_NAME: str = FileSource.CONFLUENCE

    async def _generate(self, task: dict):
        from common.data_source.config import DocumentSource
        from common.data_source.interfaces import StaticCredentialsProvider

        self.connector = ConfluenceConnector(
            wiki_base=self.conf["wiki_base"],
@ -141,11 +144,7 @@ class Confluence(SyncBase):
            # page_id=self.conf.get("page_id", ""),
        )
        credentials_provider = StaticCredentialsProvider(tenant_id=task["tenant_id"], connector_name=DocumentSource.CONFLUENCE, credential_json=self.conf["credentials"])
        self.connector.set_credentials_provider(credentials_provider)

        # Determine the time range for synchronization based on reindex or poll_range_start
@ -174,10 +173,13 @@ class Notion(SyncBase):
    async def _generate(self, task: dict):
        self.connector = NotionConnector(root_page_id=self.conf["root_page_id"])
        self.connector.load_credentials(self.conf["credentials"])
        document_generator = (
            self.connector.load_from_state()
            if task["reindex"] == "1" or not task["poll_range_start"]
            else self.connector.poll_source(task["poll_range_start"].timestamp(), datetime.now(timezone.utc).timestamp())
        )

        begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format(task["poll_range_start"])
        logging.info("Connect to Notion: root({}) {}".format(self.conf["root_page_id"], begin_info))
        return document_generator
@ -194,13 +196,16 @@ class Discord(SyncBase):
            server_ids=server_ids.split(",") if server_ids else [],
            channel_names=channel_names.split(",") if channel_names else [],
            start_date=datetime(1970, 1, 1, tzinfo=timezone.utc).strftime("%Y-%m-%d"),
            batch_size=self.conf.get("batch_size", 1024),
        )
        self.connector.load_credentials(self.conf["credentials"])
        document_generator = (
            self.connector.load_from_state()
            if task["reindex"] == "1" or not task["poll_range_start"]
            else self.connector.poll_source(task["poll_range_start"].timestamp(), datetime.now(timezone.utc).timestamp())
        )

        begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format(task["poll_range_start"])
        logging.info("Connect to Discord: servers({}), channel({}) {}".format(server_ids, channel_names, begin_info))
        return document_generator
@ -285,7 +290,7 @@ class GoogleDrive(SyncBase):
            admin_email = self.connector.primary_admin_email
        except RuntimeError:
            admin_email = "unknown"
        logging.info(f"Connect to Google Drive as {admin_email} {begin_info}")
        return document_batches()

    def _persist_rotated_credentials(self, connector_id: str, credentials: dict[str, Any]) -> None:
@ -303,7 +308,93 @@ class Jira(SyncBase):
    SOURCE_NAME: str = FileSource.JIRA

    async def _generate(self, task: dict):
        connector_kwargs = {
            "jira_base_url": self.conf["base_url"],
            "project_key": self.conf.get("project_key"),
            "jql_query": self.conf.get("jql_query"),
            "batch_size": self.conf.get("batch_size", INDEX_BATCH_SIZE),
            "include_comments": self.conf.get("include_comments", True),
            "include_attachments": self.conf.get("include_attachments", False),
            "labels_to_skip": self._normalize_list(self.conf.get("labels_to_skip")),
            "comment_email_blacklist": self._normalize_list(self.conf.get("comment_email_blacklist")),
            "scoped_token": self.conf.get("scoped_token", False),
            "attachment_size_limit": self.conf.get("attachment_size_limit"),
            "timezone_offset": self.conf.get("timezone_offset"),
        }

        self.connector = JiraConnector(**connector_kwargs)

        credentials = self.conf.get("credentials")
        if not credentials:
            raise ValueError("Jira connector is missing credentials.")

        self.connector.load_credentials(credentials)
        self.connector.validate_connector_settings()

        if task["reindex"] == "1" or not task["poll_range_start"]:
            start_time = 0.0
            begin_info = "totally"
        else:
            start_time = task["poll_range_start"].timestamp()
            begin_info = f"from {task['poll_range_start']}"

        end_time = datetime.now(timezone.utc).timestamp()

        raw_batch_size = self.conf.get("sync_batch_size") or self.conf.get("batch_size") or INDEX_BATCH_SIZE
        try:
            batch_size = int(raw_batch_size)
        except (TypeError, ValueError):
            batch_size = INDEX_BATCH_SIZE
        if batch_size <= 0:
            batch_size = INDEX_BATCH_SIZE

        def document_batches():
            checkpoint = self.connector.build_dummy_checkpoint()
            pending_docs = []
            iterations = 0
            iteration_limit = 100_000

            while checkpoint.has_more:
                wrapper = CheckpointOutputWrapper()
                generator = wrapper(
                    self.connector.load_from_checkpoint(
                        start_time,
                        end_time,
                        checkpoint,
                    )
                )
                for document, failure, next_checkpoint in generator:
                    if failure is not None:
                        logging.warning(
                            f"[Jira] Jira connector failure: {getattr(failure, 'failure_message', failure)}"
                        )
                        continue
                    if document is not None:
                        pending_docs.append(document)
                        if len(pending_docs) >= batch_size:
                            yield pending_docs
                            pending_docs = []
                    if next_checkpoint is not None:
                        checkpoint = next_checkpoint

                iterations += 1
                if iterations > iteration_limit:
                    logging.error(f"[Jira] Task {task.get('id')} exceeded iteration limit ({iteration_limit}).")
                    raise RuntimeError("Too many iterations while loading Jira documents.")

            if pending_docs:
                yield pending_docs

        logging.info(f"[Jira] Connect to Jira {connector_kwargs['jira_base_url']} {begin_info}")
        return document_batches()

    @staticmethod
    def _normalize_list(values: Any) -> list[str] | None:
        if values is None:
            return None
        if isinstance(values, str):
            values = [item.strip() for item in values.split(",")]
        return [str(value).strip() for value in values if value is not None and str(value).strip()]


class SharePoint(SyncBase):
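The _normalize_list helper accepts either a comma-separated string (as typed in the form) or a list of tags, and drops empty entries. A standalone copy with a few asserts, purely to illustrate the expected behavior; the inputs are invented:

# Standalone copy of the helper for illustration; mirrors Jira._normalize_list above.
from typing import Any

def normalize_list(values: Any) -> list[str] | None:
    if values is None:
        return None
    if isinstance(values, str):
        values = [item.strip() for item in values.split(",")]
    return [str(v).strip() for v in values if v is not None and str(v).strip()]

assert normalize_list("bug, wontfix ,  ") == ["bug", "wontfix"]
assert normalize_list(["spam", "", None, " ok "]) == ["spam", "ok"]
assert normalize_list(None) is None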
@ -337,9 +428,10 @@ func_factory = {
    FileSource.JIRA: Jira,
    FileSource.SHAREPOINT: SharePoint,
    FileSource.SLACK: Slack,
    FileSource.TEAMS: Teams,
}


async def dispatch_tasks():
    async with trio.open_nursery() as nursery:
        while True:
@ -385,7 +477,7 @@ async def main():
        __/ |
       |___/
    """)
    logging.info(f"RAGFlow version: {get_ragflow_version()}")
    show_configs()
    settings.init_settings()
    if sys.platform != "win32":
web/src/assets/svg/data-source/jira.svg (new file, 16 lines)
@ -0,0 +1,16 @@
<svg xmlns="http://www.w3.org/2000/svg" width="128" height="128" viewBox="0 0 128 128">
  <defs>
    <linearGradient id="jira-a" x1="88.136" y1="39.092" x2="68.472" y2="59.368" gradientUnits="userSpaceOnUse">
      <stop offset="0.176" stop-color="#0052cc"/>
      <stop offset="1" stop-color="#2684ff"/>
    </linearGradient>
    <linearGradient id="jira-b" x1="66.564" y1="62.256" x2="43.828" y2="84.376" gradientUnits="userSpaceOnUse">
      <stop offset="0.176" stop-color="#0052cc"/>
      <stop offset="1" stop-color="#2684ff"/>
    </linearGradient>
  </defs>

  <path d="M108.023 16H61.805c0 11.52 9.324 20.848 20.847 20.848h8.5v8.226c0 11.52 9.328 20.848 20.848 20.848V19.977A3.98 3.98 0 00108.023 16z" fill="#2684ff"/>
  <path d="M85.121 39.04H38.902c0 11.519 9.325 20.847 20.844 20.847h8.504v8.226c0 11.52 9.328 20.848 20.848 20.848V43.016a3.983 3.983 0 00-3.977-3.977z" fill="url(#jira-a)"/>
  <path d="M62.219 62.078H16c0 11.524 9.324 20.848 20.848 20.848h8.5v8.23c0 11.52 9.328 20.844 20.847 20.844V66.059a3.984 3.984 0 00-3.976-3.98z" fill="url(#jira-b)"/>
</svg>
@ -732,6 +732,33 @@ Example: general/v2/`,
    'Comma-separated emails whose “My Drive” contents should be indexed (include the primary admin).',
  google_driveSharedFoldersTip:
    'Comma-separated Google Drive folder links to crawl.',
  jiraDescription:
    'Connect your Jira workspace to sync issues, comments, and attachments.',
  jiraBaseUrlTip:
    'Base URL of your Jira site (e.g., https://your-domain.atlassian.net).',
  jiraProjectKeyTip:
    'Optional: limit syncing to a single project key (e.g., ENG).',
  jiraJqlTip:
    'Optional JQL filter. Leave blank to rely on project/time filters.',
  jiraBatchSizeTip:
    'Maximum number of issues requested from Jira per batch.',
  jiraCommentsTip:
    'Include Jira comments in the generated markdown document.',
  jiraAttachmentsTip:
    'Download attachments as separate documents during sync.',
  jiraAttachmentSizeTip:
    'Attachments larger than this number of bytes will be skipped.',
  jiraLabelsTip:
    'Labels that should be skipped while indexing (comma separated).',
  jiraBlacklistTip:
    'Comments whose author email matches these entries will be ignored.',
  jiraScopedTokenTip:
    'Enable this when using scoped Atlassian tokens (api.atlassian.com).',
  jiraEmailTip: 'Email associated with the Jira account/API token.',
  jiraTokenTip:
    'API token generated from https://id.atlassian.com/manage-profile/security/api-tokens.',
  jiraPasswordTip:
    'Optional password for Jira Server/Data Center environments.',
  availableSourcesDescription: 'Select a data source to add',
  availableSources: 'Available sources',
  datasourceDescription: 'Manage your data source and connections',
@ -716,6 +716,23 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
    '需要索引其 “我的云端硬盘” 的邮箱,多个邮箱用逗号分隔(建议包含管理员)。',
  google_driveSharedFoldersTip:
    '需要同步的 Google Drive 文件夹链接,多个链接用逗号分隔。',
  jiraDescription: '接入 Jira 工作区,持续同步 Issues、评论与附件。',
  jiraBaseUrlTip:
    'Jira 的 Base URL,例如:https://your-domain.atlassian.net。',
  jiraProjectKeyTip: '可选:仅同步指定的项目(如 RAG)。',
  jiraJqlTip: '可选:自定义 JQL 过滤条件,留空则使用项目 / 时间范围。',
  jiraBatchSizeTip: '每次向 Jira 请求的 Issue 数量上限。',
  jiraCommentsTip: '同步评论。',
  jiraAttachmentsTip: '开启后会将附件下载为独立文档。',
  jiraAttachmentSizeTip: '超过该字节阈值的附件会被跳过。',
  jiraLabelsTip: '需要跳过的标签(逗号分隔)。',
  jiraBlacklistTip: '这些邮箱作者的评论会被忽略。',
  jiraScopedTokenTip:
    '仅当凭证为 Atlassian scoped token(api.atlassian.com)时生效。',
  jiraEmailTip: '与 API Token 对应的 Jira 账户邮箱。',
  jiraTokenTip:
    '在 https://id.atlassian.com/manage-profile/security/api-tokens 生成的 API Token。(Cloud only)',
  jiraPasswordTip: '可选:仅 Jira Server/Data Center 环境需要的密码字段。',
  availableSourcesDescription: '选择要添加的数据源',
  availableSources: '可用数据源',
  datasourceDescription: '管理您的数据源和连接',
@ -9,8 +9,8 @@ export enum DataSourceKey {
  NOTION = 'notion',
  DISCORD = 'discord',
  GOOGLE_DRIVE = 'google_drive',
  // GMAIL = 'gmail',
  JIRA = 'jira',
  // SHAREPOINT = 'sharepoint',
  // SLACK = 'slack',
  // TEAMS = 'teams',
@ -42,6 +42,11 @@ export const DataSourceInfo = {
    description: t(`setting.${DataSourceKey.GOOGLE_DRIVE}Description`),
    icon: <SvgIcon name={'data-source/google-drive'} width={38} />,
  },
  [DataSourceKey.JIRA]: {
    name: 'Jira',
    description: t(`setting.${DataSourceKey.JIRA}Description`),
    icon: <SvgIcon name={'data-source/jira'} width={38} />,
  },
};

export const DataSourceFormBaseFields = [
@ -270,6 +275,106 @@ export const DataSourceFormFields = {
      defaultValue: 'uploaded',
    },
  ],
  [DataSourceKey.JIRA]: [
    {
      label: 'Jira Base URL',
      name: 'config.base_url',
      type: FormFieldType.Text,
      required: true,
      placeholder: 'https://your-domain.atlassian.net',
      tooltip: t('setting.jiraBaseUrlTip'),
    },
    {
      label: 'Project Key',
      name: 'config.project_key',
      type: FormFieldType.Text,
      required: false,
      placeholder: 'RAGFlow',
      tooltip: t('setting.jiraProjectKeyTip'),
    },
    {
      label: 'Custom JQL',
      name: 'config.jql_query',
      type: FormFieldType.Textarea,
      required: false,
      placeholder: 'project = RAG AND updated >= -7d',
      tooltip: t('setting.jiraJqlTip'),
    },
    {
      label: 'Batch Size',
      name: 'config.batch_size',
      type: FormFieldType.Number,
      required: false,
      tooltip: t('setting.jiraBatchSizeTip'),
    },
    {
      label: 'Include Comments',
      name: 'config.include_comments',
      type: FormFieldType.Checkbox,
      required: false,
      defaultValue: true,
      tooltip: t('setting.jiraCommentsTip'),
    },
    {
      label: 'Include Attachments',
      name: 'config.include_attachments',
      type: FormFieldType.Checkbox,
      required: false,
      defaultValue: false,
      tooltip: t('setting.jiraAttachmentsTip'),
    },
    {
      label: 'Attachment Size Limit (bytes)',
      name: 'config.attachment_size_limit',
      type: FormFieldType.Number,
      required: false,
      defaultValue: 10 * 1024 * 1024,
      tooltip: t('setting.jiraAttachmentSizeTip'),
    },
    {
      label: 'Labels to Skip',
      name: 'config.labels_to_skip',
      type: FormFieldType.Tag,
      required: false,
      tooltip: t('setting.jiraLabelsTip'),
    },
    {
      label: 'Comment Email Blacklist',
      name: 'config.comment_email_blacklist',
      type: FormFieldType.Tag,
      required: false,
      tooltip: t('setting.jiraBlacklistTip'),
    },
    {
      label: 'Use Scoped Token (Cloud only)',
      name: 'config.scoped_token',
      type: FormFieldType.Checkbox,
      required: false,
      tooltip: t('setting.jiraScopedTokenTip'),
    },
    {
      label: 'Jira User Email (Cloud) or User Name (Server)',
      name: 'config.credentials.jira_user_email',
      type: FormFieldType.Text,
      required: true,
      placeholder: 'you@example.com',
      tooltip: t('setting.jiraEmailTip'),
    },
    {
      label: 'Jira API Token (Cloud only)',
      name: 'config.credentials.jira_api_token',
      type: FormFieldType.Password,
      required: false,
      tooltip: t('setting.jiraTokenTip'),
    },
    {
      label: 'Jira Password (Server only)',
      name: 'config.credentials.jira_password',
      type: FormFieldType.Password,
      required: false,
      tooltip: t('setting.jiraPasswordTip'),
    },
  ],
  // [DataSourceKey.GOOGLE_DRIVE]: [
  //   {
  //     label: 'Primary Admin Email',
@ -433,4 +538,25 @@ export const DataSourceFormDefaultValues = {
      },
    },
  },
  [DataSourceKey.JIRA]: {
    name: '',
    source: DataSourceKey.JIRA,
    config: {
      base_url: '',
      project_key: '',
      jql_query: '',
      batch_size: 2,
      include_comments: true,
      include_attachments: false,
      attachment_size_limit: 10 * 1024 * 1024,
      labels_to_skip: [],
      comment_email_blacklist: [],
      scoped_token: false,
      credentials: {
        jira_user_email: '',
        jira_api_token: '',
        jira_password: '',
      },
    },
  },
};
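Putting the form fields and defaults together, a filled-in Jira data source payload would look roughly like the sketch below; every value is a placeholder and the exact request envelope depends on the backend API, so treat this as orientation only:

# Placeholder payload assembled from the form fields above; values are invented.
jira_source = {
    "name": "Engineering Jira",
    "source": "jira",
    "config": {
        "base_url": "https://your-domain.atlassian.net",
        "project_key": "RAG",
        "jql_query": "project = RAG AND updated >= -7d",
        "batch_size": 50,
        "include_comments": True,
        "include_attachments": False,
        "attachment_size_limit": 10 * 1024 * 1024,
        "labels_to_skip": ["wontfix"],
        "comment_email_blacklist": ["bot@example.com"],
        "scoped_token": False,
        "credentials": {
            "jira_user_email": "you@example.com",
            "jira_api_token": "<api-token>",
            "jira_password": "",
        },
    },
}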
@ -44,6 +44,12 @@ const dataSourceTemplates = [
    description: DataSourceInfo[DataSourceKey.NOTION].description,
    icon: DataSourceInfo[DataSourceKey.NOTION].icon,
  },
  {
    id: DataSourceKey.JIRA,
    name: DataSourceInfo[DataSourceKey.JIRA].name,
    description: DataSourceInfo[DataSourceKey.JIRA].description,
    icon: DataSourceInfo[DataSourceKey.JIRA].icon,
  },
];
const DataSource = () => {
  const { t } = useTranslation();