Add Moodle data source integration (#11325)

### What problem does this PR solve?

This PR adds a native Moodle connector to sync content (courses,
resources, forums, assignments, pages, books) into RAGFlow.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Levi
2025-11-21 12:58:49 +01:00
committed by GitHub
parent 174a2578e8
commit f0a14f5fce
11 changed files with 3371 additions and 2886 deletions

View File

@ -118,6 +118,7 @@ class FileSource(StrEnum):
SHAREPOINT = "sharepoint"
SLACK = "slack"
TEAMS = "teams"
MOODLE = "moodle"
class PipelineTaskType(StrEnum):

View File

@ -14,6 +14,7 @@ from .google_drive.connector import GoogleDriveConnector
from .jira.connector import JiraConnector
from .sharepoint_connector import SharePointConnector
from .teams_connector import TeamsConnector
from .moodle_connector import MoodleConnector
from .config import BlobType, DocumentSource
from .models import Document, TextSection, ImageSection, BasicExpertInfo
from .exceptions import (
@ -36,6 +37,7 @@ __all__ = [
"JiraConnector",
"SharePointConnector",
"TeamsConnector",
"MoodleConnector",
"BlobType",
"DocumentSource",
"Document",

View File

@ -48,6 +48,7 @@ class DocumentSource(str, Enum):
GOOGLE_DRIVE = "google_drive"
GMAIL = "gmail"
DISCORD = "discord"
MOODLE = "moodle"
S3_COMPATIBLE = "s3_compatible"

View File

@ -0,0 +1,378 @@
from __future__ import annotations
import logging
import os
from collections.abc import Generator
from datetime import datetime, timezone
from retry import retry
from typing import Any, Optional
from markdownify import markdownify as md
from moodle import Moodle as MoodleClient, MoodleException
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
CredentialExpiredError,
InsufficientPermissionsError,
ConnectorValidationError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document
from common.data_source.utils import batch_generator, rl_requests
logger = logging.getLogger(__name__)
class MoodleConnector(LoadConnector, PollConnector):
    """Moodle LMS connector for accessing course content.

    Walks the sections and modules of every course returned by the Moodle
    web-service client and converts the supported module types (resource,
    forum, page, assign, quiz, book) into ``Document`` objects, either for a
    full load (``load_from_state``) or a time-window poll (``poll_source``).
    """

    def __init__(self, moodle_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
        """Store the connection settings.

        The API client itself is only created by ``load_credentials``.

        Args:
            moodle_url: Base URL of the Moodle site; trailing slashes are stripped.
            batch_size: Batch size handed to ``batch_generator`` when yielding
                documents.
        """
        self.moodle_url = moodle_url.rstrip("/")
        self.batch_size = batch_size
        self.moodle_client: Optional[MoodleClient] = None

    # -------------------------------------------------------------------------
    # Small helpers
    # -------------------------------------------------------------------------
    def _add_token_to_url(self, file_url: str) -> str:
        """Append the Moodle token to ``file_url`` if it is missing.

        The URL is returned unchanged when no client is loaded, when the
        client exposes no token, or when the URL already contains ``token=``
        (case-insensitive substring match).
        """
        # Local import: ``urllib.parse`` is not part of this module's imports.
        from urllib.parse import quote

        if not self.moodle_client:
            return file_url
        token = getattr(self.moodle_client, "token", "")
        # An empty token would only produce a useless "token=" parameter.
        if not token or "token=" in file_url.lower():
            return file_url
        delimiter = "&" if "?" in file_url else "?"
        # Percent-encode so an unexpected character cannot corrupt the query.
        return f"{file_url}{delimiter}token={quote(str(token), safe='')}"

    def _log_error(self, context: str, error: Exception, level: str = "warning") -> None:
        """Log ``"<context>: <error>"`` at error or warning level.

        Args:
            context: Short description of what was being attempted.
            error: The exception (or any object) to render into the message.
            level: ``"error"`` logs at error level; anything else logs a warning.
        """
        # Local import: ``re`` is not part of this module's imports.
        import re

        msg = f"{context}: {error}"
        # Download URLs carry the web-service token as a query parameter and
        # exception text may echo the failing URL, so mask the token value
        # before it reaches the logs.
        msg = re.sub(r"(token=)[^&#\s]+", r"\1***", msg, flags=re.IGNORECASE)
        if level == "error":
            logger.error(msg)
        else:
            logger.warning(msg)

    def _get_latest_timestamp(self, *timestamps: Optional[int]) -> int:
        """Return the largest positive timestamp, or 0 if there is none.

        Falsy values (``None``, ``0``) and negative values are ignored.
        """
        return max((t for t in timestamps if t and t > 0), default=0)

    def _yield_in_batches(
        self, generator: Generator[Document, None, None]
    ) -> Generator[list[Document], None, None]:
        """Re-emit ``generator`` through ``batch_generator`` using ``self.batch_size``."""
        yield from batch_generator(generator, self.batch_size)

    # -------------------------------------------------------------------------
    # Credentials & validation
    # -------------------------------------------------------------------------
    def load_credentials(self, credentials: dict[str, Any]) -> None:
        """Create the Moodle client from ``credentials["moodle_token"]``.

        The client is verified with a ``get_site_info`` call and is only
        stored on the instance once that call succeeds.

        Raises:
            ConnectorMissingCredentialError: No token was supplied, or the
                client could not be initialised.
            CredentialExpiredError: Moodle reported an invalid token.
        """
        token = credentials.get("moodle_token")
        if not token:
            raise ConnectorMissingCredentialError("Moodle API token is required")

        # Reset first so a failed (re)load can never leave a stale or
        # unverified client behind that would pass the "client initialised"
        # guards used by the other methods.
        self.moodle_client = None
        try:
            client = MoodleClient(self.moodle_url + "/webservice/rest/server.php", token)
            client.core.webservice.get_site_info()
        except MoodleException as e:
            if "invalidtoken" in str(e).lower():
                raise CredentialExpiredError("Moodle token is invalid or expired") from e
            raise ConnectorMissingCredentialError(
                f"Failed to initialize Moodle client: {e}"
            ) from e
        self.moodle_client = client

    def validate_connector_settings(self) -> None:
        """Check that the loaded client can talk to the Moodle site.

        Raises:
            ConnectorMissingCredentialError: ``load_credentials`` has not
                produced a client yet.
            CredentialExpiredError: Moodle reported an invalid token.
            InsufficientPermissionsError: Moodle reported an access exception,
                or the site info response carries no site name.
            ConnectorValidationError: Any other failure of the site info call.
        """
        if not self.moodle_client:
            raise ConnectorMissingCredentialError("Moodle client not initialized")
        try:
            site_info = self.moodle_client.core.webservice.get_site_info()
        except MoodleException as e:
            msg = str(e).lower()
            if "invalidtoken" in msg:
                raise CredentialExpiredError("Moodle token is invalid or expired") from e
            if "accessexception" in msg:
                raise InsufficientPermissionsError(
                    "Insufficient permissions. Ensure web services are enabled and permissions are correct."
                ) from e
            raise ConnectorValidationError(f"Moodle validation error: {e}") from e
        except Exception as e:
            raise ConnectorValidationError(f"Unexpected validation error: {e}") from e

        # Checked outside the try block: inside it, the generic handler above
        # would catch this error and re-wrap it as an "unexpected" failure.
        if not getattr(site_info, "sitename", None):
            raise InsufficientPermissionsError("Invalid Moodle API response")

    # -------------------------------------------------------------------------
    # Data loading & polling
    # -------------------------------------------------------------------------
    def load_from_state(self) -> Generator[list[Document], None, None]:
        """Yield batches of documents for every processable module of every course.

        Raises:
            ConnectorMissingCredentialError: No client has been loaded.
        """
        if not self.moodle_client:
            raise ConnectorMissingCredentialError("Moodle client not initialized")
        logger.info("Starting full load from Moodle workspace")
        courses = self._get_enrolled_courses()
        if not courses:
            logger.warning("No courses found to process")
            return
        yield from self._yield_in_batches(self._process_courses(courses))

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> Generator[list[Document], None, None]:
        """Yield batches of documents whose latest timestamp lies in ``(start, end]``.

        Raises:
            ConnectorMissingCredentialError: No client has been loaded.
        """
        if not self.moodle_client:
            raise ConnectorMissingCredentialError("Moodle client not initialized")
        # Render the window in UTC, matching the timezone used for
        # ``doc_updated_at`` elsewhere in this class.
        window_start = datetime.fromtimestamp(start, tz=timezone.utc)
        window_end = datetime.fromtimestamp(end, tz=timezone.utc)
        logger.info(f"Polling Moodle updates between {window_start} and {window_end}")
        courses = self._get_enrolled_courses()
        if not courses:
            logger.warning("No courses found to poll")
            return
        yield from self._yield_in_batches(self._get_updated_content(courses, start, end))

    @retry(tries=3, delay=1, backoff=2)
    def _get_enrolled_courses(self) -> list:
        """Return the result of the client's ``core.course.get_courses`` call.

        Any exception raised here, including the ``ConnectorValidationError``
        produced from a ``MoodleException``, is subject to the ``@retry``
        decorator.

        Raises:
            ConnectorMissingCredentialError: No client has been loaded.
            ConnectorValidationError: The Moodle API call failed.
        """
        if not self.moodle_client:
            raise ConnectorMissingCredentialError("Moodle client not initialized")
        try:
            return self.moodle_client.core.course.get_courses()
        except MoodleException as e:
            self._log_error("fetching courses", e, "error")
            raise ConnectorValidationError(f"Failed to fetch courses: {e}") from e

    @retry(tries=3, delay=1, backoff=2)
    def _get_course_contents(self, course_id: int):
        """Return the sections of ``course_id``, or ``[]`` if Moodle rejects the call.

        NOTE(review): ``MoodleException`` is swallowed here, so ``@retry``
        only fires for other exception types (presumably transport errors).
        Confirm whether Moodle API errors were meant to be retried as well.

        Raises:
            ConnectorMissingCredentialError: No client has been loaded.
        """
        if not self.moodle_client:
            raise ConnectorMissingCredentialError("Moodle client not initialized")
        try:
            return self.moodle_client.core.course.get_contents(courseid=course_id)
        except MoodleException as e:
            self._log_error(f"fetching course contents for {course_id}", e)
            return []

    def _iter_course_modules(self, course) -> Generator[tuple[Any, Any], None, None]:
        """Yield ``(section, module)`` pairs for every module of ``course``.

        Missing or ``None`` section lists and module lists are treated as empty.
        """
        for section in self._get_course_contents(course.id) or []:
            for module in getattr(section, "modules", None) or []:
                yield section, module

    def _process_courses(self, courses) -> Generator[Document, None, None]:
        """Yield a document for every processable module of ``courses``.

        A failure while walking one course is logged and the remaining
        courses are still processed.
        """
        for course in courses:
            try:
                for section, module in self._iter_course_modules(course):
                    doc = self._process_module(course, section, module)
                    if doc:
                        yield doc
            except Exception as e:
                self._log_error(f"processing course {getattr(course, 'fullname', '?')}", e)

    def _get_updated_content(
        self, courses, start: float, end: float
    ) -> Generator[Document, None, None]:
        """Yield documents for modules whose latest timestamp is in ``(start, end]``.

        The latest timestamp is taken over the module's ``timecreated`` and
        ``timemodified`` and the ``timemodified`` of each of its content
        entries. A failure while walking one course is logged and the
        remaining courses are still processed.
        """
        for course in courses:
            try:
                for section, module in self._iter_course_modules(course):
                    times = [
                        getattr(module, "timecreated", 0),
                        getattr(module, "timemodified", 0),
                    ]
                    # ``contents`` may be absent or None; iterating None would
                    # raise and abort every remaining module of this course.
                    times.extend(
                        getattr(c, "timemodified", 0)
                        for c in (getattr(module, "contents", None) or [])
                        if c
                    )
                    last_mod = self._get_latest_timestamp(*times)
                    if start < last_mod <= end:
                        doc = self._process_module(course, section, module)
                        if doc:
                            yield doc
            except Exception as e:
                self._log_error(f"polling course {getattr(course, 'fullname', '?')}", e)

    # -------------------------------------------------------------------------
    # Module processing
    # -------------------------------------------------------------------------
    def _process_module(self, course, section, module) -> Optional[Document]:
        """Dispatch ``module`` to the handler for its ``modname``.

        Returns:
            The handler's document, or ``None`` when the module type has no
            handler (this includes "label" and "url") or the handler failed.
        """
        handlers = {
            "resource": self._process_resource,
            "forum": self._process_forum,
            "page": self._process_page,
            "assign": self._process_activity,
            "quiz": self._process_activity,
            "book": self._process_book,
        }
        try:
            handler = handlers.get(module.modname)
            if handler is None:
                return None
            return handler(course, section, module)
        except Exception as e:
            self._log_error(f"processing module {getattr(module, 'name', '?')}", e)
            return None

    def _download_file_document(
        self,
        course,
        section,
        module,
        *,
        id_prefix: str,
        default_ext: str,
        title_from_module: bool,
        error_context: str,
    ) -> Optional[Document]:
        """Download the first content file of ``module`` and wrap it in a document.

        Args:
            id_prefix: Prefix of the document id (``"<id_prefix>_<module.id>"``).
            default_ext: Extension used when the file name has none.
            title_from_module: Use ``module.name`` rather than the file name as
                the last part of the semantic identifier.
            error_context: Prefix of the log message written on failure.

        Returns:
            The document, or ``None`` when the module has no downloadable
            first file or the download fails.
        """
        contents = getattr(module, "contents", None)
        if not contents:
            return None
        file_info = contents[0]
        if not getattr(file_info, "fileurl", None):
            return None

        file_name = os.path.basename(file_info.filename)
        ts = self._get_latest_timestamp(
            getattr(module, "timecreated", 0),
            getattr(module, "timemodified", 0),
            getattr(file_info, "timemodified", 0),
        )
        try:
            resp = rl_requests.get(self._add_token_to_url(file_info.fileurl), timeout=60)
            resp.raise_for_status()
            blob = resp.content
            ext = os.path.splitext(file_name)[1] or default_ext
            title = module.name if title_from_module else file_name
            semantic_id = f"{course.fullname} / {section.name} / {title}"
            return Document(
                id=f"{id_prefix}_{module.id}",
                source="moodle",
                semantic_identifier=semantic_id,
                extension=ext,
                blob=blob,
                doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc),
                size_bytes=len(blob),
            )
        except Exception as e:
            self._log_error(f"{error_context} {file_name}", e, "error")
            return None

    def _process_resource(self, course, section, module) -> Optional[Document]:
        """Build a document from the first file of a "resource" module."""
        return self._download_file_document(
            course,
            section,
            module,
            id_prefix="moodle_resource",
            default_ext=".bin",
            title_from_module=False,
            error_context="downloading resource",
        )

    def _process_forum(self, course, section, module) -> Optional[Document]:
        """Build one Markdown document from the discussions of a "forum" module.

        Returns:
            The document, or ``None`` when no client is loaded, the module has
            no ``instance``, the forum has no discussions, or processing fails.
        """
        if not self.moodle_client or not getattr(module, "instance", None):
            return None
        try:
            result = self.moodle_client.mod.forum.get_forum_discussions(forumid=module.instance)
            disc_list = getattr(result, "discussions", [])
            if not disc_list:
                return None

            markdown = [f"# {module.name}\n"]
            markdown.extend(
                f"## {d.name}\n\n{md(d.message or '')}\n\n---\n" for d in disc_list
            )
            # Use the shared helper so a discussion with a missing or None
            # ``timemodified`` cannot break the comparison.
            latest_ts = self._get_latest_timestamp(
                getattr(module, "timecreated", 0),
                getattr(module, "timemodified", 0),
                *(getattr(d, "timemodified", 0) for d in disc_list),
            )
            blob = "\n".join(markdown).encode("utf-8")
            semantic_id = f"{course.fullname} / {section.name} / {module.name}"
            return Document(
                id=f"moodle_forum_{module.id}",
                source="moodle",
                semantic_identifier=semantic_id,
                extension=".md",
                blob=blob,
                doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc),
                size_bytes=len(blob),
            )
        except Exception as e:
            self._log_error(f"processing forum {module.name}", e)
            return None

    def _process_page(self, course, section, module) -> Optional[Document]:
        """Build a document from the first file of a "page" module."""
        return self._download_file_document(
            course,
            section,
            module,
            id_prefix="moodle_page",
            default_ext=".html",
            title_from_module=True,
            error_context="processing page",
        )

    def _process_activity(self, course, section, module) -> Optional[Document]:
        """Build a Markdown document from the description of an assign/quiz module.

        Returns:
            The document, or ``None`` when the module has no description.
        """
        desc = getattr(module, "description", "")
        if not desc:
            return None

        mtype, mname = module.modname, module.name
        markdown = f"# {mname}\n\n**Type:** {mtype.capitalize()}\n\n{md(desc)}"
        ts = self._get_latest_timestamp(
            getattr(module, "timecreated", 0),
            getattr(module, "timemodified", 0),
            getattr(module, "added", 0),
        )
        semantic_id = f"{course.fullname} / {section.name} / {mname}"
        blob = markdown.encode("utf-8")
        return Document(
            id=f"moodle_{mtype}_{module.id}",
            source="moodle",
            semantic_identifier=semantic_id,
            extension=".md",
            blob=blob,
            doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc),
            size_bytes=len(blob),
        )

    def _process_book(self, course, section, module) -> Optional[Document]:
        """Build one Markdown document from the chapters of a "book" module.

        Chapters are the content entries that have a ``fileurl`` and whose
        file name is ``index.html``. A chapter that fails to download is
        logged and skipped.

        Returns:
            The document, or ``None`` when the module has no contents, no
            chapter entries, or no chapter could be downloaded.
        """
        contents = getattr(module, "contents", None)
        if not contents:
            return None
        chapters = [
            c
            for c in contents
            if getattr(c, "fileurl", None)
            and os.path.basename(getattr(c, "filename", "") or "") == "index.html"
        ]
        if not chapters:
            return None

        latest_ts = self._get_latest_timestamp(
            getattr(module, "timecreated", 0),
            getattr(module, "timemodified", 0),
            *[getattr(c, "timecreated", 0) for c in contents],
            *[getattr(c, "timemodified", 0) for c in contents],
        )
        markdown_parts = [f"# {module.name}\n"]
        fetched = 0
        for ch in chapters:
            try:
                resp = rl_requests.get(self._add_token_to_url(ch.fileurl), timeout=60)
                resp.raise_for_status()
                html = resp.content.decode("utf-8", errors="ignore")
                markdown_parts.append(md(html) + "\n\n---\n")
                fetched += 1
            except Exception as e:
                self._log_error(f"processing book chapter {ch.filename}", e)

        if not fetched:
            # Every download failed: emitting a title-only document would
            # index an empty book under a fresh timestamp.
            self._log_error(
                f"processing book {module.name}", "no chapter could be downloaded"
            )
            return None

        blob = "\n".join(markdown_parts).encode("utf-8")
        semantic_id = f"{course.fullname} / {section.name} / {module.name}"
        return Document(
            id=f"moodle_book_{module.id}",
            source="moodle",
            semantic_identifier=semantic_id,
            extension=".md",
            blob=blob,
            doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc),
            size_bytes=len(blob),
        )

View File

@ -133,7 +133,7 @@ dependencies = [
"pyicu>=2.15.3,<3.0.0",
"flasgger>=0.9.7.1,<0.10.0",
"xxhash>=3.5.0,<4.0.0",
"trio>=0.29.0",
"trio>=0.17.0,<0.29.0",
"langfuse>=2.60.0",
"debugpy>=1.8.13",
"mcp>=1.9.4",
@ -148,6 +148,7 @@ dependencies = [
"markdownify>=1.2.0",
"captcha>=0.7.1",
"pip>=25.2",
"moodlepy>=0.23.0",
"pypandoc>=1.16",
"pyobvector==0.2.18",
]

View File

@ -37,14 +37,8 @@ from api.db.services.connector_service import ConnectorService, SyncLogsService
from api.db.services.knowledgebase_service import KnowledgebaseService
from common import settings
from common.config_utils import show_configs
from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector
from common.constants import FileSource, TaskStatus
from common.data_source import (
BlobStorageConnector,
DiscordConnector,
GoogleDriveConnector,
JiraConnector,
NotionConnector,
)
from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.confluence_connector import ConfluenceConnector
from common.data_source.interfaces import CheckpointOutputWrapper
@ -418,6 +412,37 @@ class Teams(SyncBase):
pass
class Moodle(SyncBase):
    """Sync source that produces documents through ``MoodleConnector``."""

    SOURCE_NAME: str = FileSource.MOODLE

    async def _generate(self, task: dict):
        """Create the connector and return its document generator.

        A full load is used when ``task["reindex"]`` is ``"1"`` or the task
        has no ``poll_range_start``; otherwise the source is polled from
        ``poll_range_start`` up to the current UTC time.

        Args:
            task: Sync task; reads ``reindex`` and ``poll_range_start``.

        Returns:
            The generator returned by ``load_from_state`` or ``poll_source``.
        """
        self.connector = MoodleConnector(
            moodle_url=self.conf["moodle_url"],
            batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE),
        )
        self.connector.load_credentials(self.conf["credentials"])

        # A single falsy check covers both a missing key and an explicit None,
        # so no separate None branch is needed further down.
        poll_start = task.get("poll_range_start")
        if task["reindex"] == "1" or not poll_start:
            document_generator = self.connector.load_from_state()
            begin_info = "totally"
        else:
            document_generator = self.connector.poll_source(
                poll_start.timestamp(),
                datetime.now(timezone.utc).timestamp(),
            )
            begin_info = "from {}".format(poll_start)

        logging.info("Connect to Moodle: {} {}".format(self.conf["moodle_url"], begin_info))
        return document_generator
func_factory = {
FileSource.S3: S3,
FileSource.NOTION: Notion,
@ -429,6 +454,7 @@ func_factory = {
FileSource.SHAREPOINT: SharePoint,
FileSource.SLACK: Slack,
FileSource.TEAMS: Teams,
FileSource.MOODLE: Moodle
}

5779
uv.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1230.87 315.18">
<path fill="#f98012" d="M289.61 309.77V201.51q0-33.94-28-33.95t-28.06 33.95v108.26H178.4V201.51q0-33.94-27.57-33.95-28.05 0-28 33.95v108.26H67.67V195.12q0-35.43 24.6-53.63 21.66-16.25 58.56-16.25 37.41 0 55.12 19.19 15.26-19.19 55.62-19.19 36.9 0 58.54 16.25 24.6 18.19 24.61 53.63v114.65Zm675.49-.5V0h55.16v309.27Zm-70.3 0v-18.22q-7.39 9.84-25.11 15.76a92.81 92.81 0 0 1-30.05 5.41q-39.4 0-63.28-27.09t-23.89-67c0-26.25 7.76-48.3 23.4-66 13.85-15.65 36.35-26.59 62.29-26.59 29.22 0 46.28 11 56.64 23.63V0h53.68v309.27Zm0-102.92q0-14.78-14-28.33T852 164.47q-21.16 0-33.48 17.24-10.85 15.3-10.84 37.43 0 21.68 10.84 36.94 12.3 17.75 33.48 17.73 12.81 0 27.83-12.07t15-24.86ZM648.57 314.19q-41.87 0-69.19-26.59T552 219.14q0-41.83 27.34-68.45t69.19-26.59q41.85 0 69.44 26.59t27.58 68.45q0 41.88-27.58 68.46t-69.4 26.59Zm0-145.77q-19.94 0-30.65 15.1t-10.71 35.88q0 20.78 10 35.13 11.46 16.34 31.4 16.32T680 254.53q10.46-14.34 10.46-35.13t-10-35.13q-11.46-15.86-31.89-15.85ZM449.13 314.19q-41.86 0-69.2-26.59t-27.33-68.46q0-41.83 27.33-68.45t69.2-26.59q41.83 0 69.44 26.59t27.57 68.45q0 41.88-27.57 68.46t-69.44 26.59Zm0-145.77q-19.94 0-30.66 15.1t-10.71 35.88q0 20.78 10 35.13 11.46 16.34 31.41 16.32t31.39-16.32Q491 240.19 491 219.4t-10-35.13q-11.44-15.86-31.87-15.85Zm636.45 67.47c1.18 13.13 18.25 41.37 46.31 41.37 27.31 0 40.23-15.77 40.87-22.16l58.11-.5c-6.34 19.39-32.1 60.58-100 60.58-28.24 0-54.08-8.79-72.64-26.35s-27.82-40.45-27.82-68.7q0-43.83 27.82-69.68t72.16-25.85q48.25 0 75.34 32 25.13 29.53 25.12 79.28Zm90.13-34c-2.3-11.83-7.23-21.49-14.77-29.06q-12.82-12.3-29.55-12.31-17.25 0-28.82 11.82t-15.5 29.55Z"/>
<path fill="#333" d="m174.74 116.9 54.74-40-.7-2.44C130 86.57 85.08 95.15 0 144.47l.79 2.24 6.76.07c-.62 6.81-1.7 23.64-.32 48.95-9.44 27.32-.24 45.88 8.4 66.07 1.37-21 1.23-44-5.22-66.89-1.35-25.14-.24-41.67.37-48.1l56.4.54a258 258 0 0 0 1.67 33.06c50.4 17.71 101.09-.06 128-43.72-7.47-8.37-22.11-19.79-22.11-19.79Z"/>
</svg>

After

Width:  |  Height:  |  Size: 2.0 KiB

View File

@ -736,9 +736,15 @@ Example: https://fsn1.your-objectstorage.com`,
google_drivePrimaryAdminTip:
'Email address that has access to the Drive content being synced.',
google_driveMyDriveEmailsTip:
'Comma-separated emails whose My Drive contents should be indexed (include the primary admin).',
'Comma-separated emails whose "My Drive" contents should be indexed (include the primary admin).',
google_driveSharedFoldersTip:
'Comma-separated Google Drive folder links to crawl.',
moodleDescription:
'Connect to your Moodle LMS to sync course content, forums, and resources.',
moodleUrlTip:
'The base URL of your Moodle instance (e.g., https://moodle.university.edu). Do not include /webservice or /login.',
moodleTokenTip:
'Generate a web service token in Moodle: Go to Site administration → Server → Web services → Manage tokens. The user must be enrolled in the courses you want to sync.',
jiraDescription:
'Connect your Jira workspace to sync issues, comments, and attachments.',
jiraBaseUrlTip:

View File

@ -9,7 +9,8 @@ export enum DataSourceKey {
NOTION = 'notion',
DISCORD = 'discord',
GOOGLE_DRIVE = 'google_drive',
// GMAIL = 'gmail',
MOODLE = 'moodle',
// GMAIL = 'gmail',
JIRA = 'jira',
// SHAREPOINT = 'sharepoint',
// SLACK = 'slack',
@ -42,6 +43,11 @@ export const DataSourceInfo = {
description: t(`setting.${DataSourceKey.GOOGLE_DRIVE}Description`),
icon: <SvgIcon name={'data-source/google-drive'} width={38} />,
},
[DataSourceKey.MOODLE]: {
name: 'Moodle',
description: t(`setting.${DataSourceKey.MOODLE}Description`),
icon: <SvgIcon name={'data-source/moodle'} width={38} />,
},
[DataSourceKey.JIRA]: {
name: 'Jira',
description: t(`setting.${DataSourceKey.JIRA}Description`),
@ -116,7 +122,7 @@ export const DataSourceFormFields = {
required: false,
placeholder: 'https://fsn1.your-objectstorage.com',
tooltip: t('setting.S3CompatibleEndpointUrlTip'),
shouldRender: (formValues) => {
shouldRender: (formValues: any) => {
return formValues?.config?.bucket_type === 's3_compatible';
},
},
@ -287,6 +293,21 @@ export const DataSourceFormFields = {
defaultValue: 'uploaded',
},
],
[DataSourceKey.MOODLE]: [
{
label: 'Moodle URL',
name: 'config.moodle_url',
type: FormFieldType.Text,
required: true,
placeholder: 'https://moodle.example.com',
},
{
label: 'API Token',
name: 'config.credentials.moodle_token',
type: FormFieldType.Password,
required: true,
},
],
[DataSourceKey.JIRA]: [
{
label: 'Jira Base URL',
@ -456,6 +477,16 @@ export const DataSourceFormDefaultValues = {
},
},
},
[DataSourceKey.MOODLE]: {
name: '',
source: DataSourceKey.MOODLE,
config: {
moodle_url: '',
credentials: {
moodle_token: '',
},
},
},
[DataSourceKey.JIRA]: {
name: '',
source: DataSourceKey.JIRA,

View File

@ -44,6 +44,12 @@ const dataSourceTemplates = [
description: DataSourceInfo[DataSourceKey.NOTION].description,
icon: DataSourceInfo[DataSourceKey.NOTION].icon,
},
{
id: DataSourceKey.MOODLE,
name: DataSourceInfo[DataSourceKey.MOODLE].name,
description: DataSourceInfo[DataSourceKey.MOODLE].description,
icon: DataSourceInfo[DataSourceKey.MOODLE].icon,
},
{
id: DataSourceKey.JIRA,
name: DataSourceInfo[DataSourceKey.JIRA].name,