mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Add Moodle data source integration (#11325)
### What problem does this PR solve? This PR adds a native Moodle connector to sync content (courses, resources, forums, assignments, pages, books) into RAGFlow. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
@ -14,6 +14,7 @@ from .google_drive.connector import GoogleDriveConnector
|
||||
from .jira.connector import JiraConnector
|
||||
from .sharepoint_connector import SharePointConnector
|
||||
from .teams_connector import TeamsConnector
|
||||
from .moodle_connector import MoodleConnector
|
||||
from .config import BlobType, DocumentSource
|
||||
from .models import Document, TextSection, ImageSection, BasicExpertInfo
|
||||
from .exceptions import (
|
||||
@ -36,6 +37,7 @@ __all__ = [
|
||||
"JiraConnector",
|
||||
"SharePointConnector",
|
||||
"TeamsConnector",
|
||||
"MoodleConnector",
|
||||
"BlobType",
|
||||
"DocumentSource",
|
||||
"Document",
|
||||
|
||||
@ -48,6 +48,7 @@ class DocumentSource(str, Enum):
|
||||
GOOGLE_DRIVE = "google_drive"
|
||||
GMAIL = "gmail"
|
||||
DISCORD = "discord"
|
||||
MOODLE = "moodle"
|
||||
S3_COMPATIBLE = "s3_compatible"
|
||||
|
||||
|
||||
|
||||
378
common/data_source/moodle_connector.py
Normal file
378
common/data_source/moodle_connector.py
Normal file
@ -0,0 +1,378 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime, timezone
|
||||
from retry import retry
|
||||
from typing import Any, Optional
|
||||
|
||||
from markdownify import markdownify as md
|
||||
from moodle import Moodle as MoodleClient, MoodleException
|
||||
|
||||
from common.data_source.config import INDEX_BATCH_SIZE
|
||||
from common.data_source.exceptions import (
|
||||
ConnectorMissingCredentialError,
|
||||
CredentialExpiredError,
|
||||
InsufficientPermissionsError,
|
||||
ConnectorValidationError,
|
||||
)
|
||||
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
|
||||
from common.data_source.models import Document
|
||||
from common.data_source.utils import batch_generator, rl_requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MoodleConnector(LoadConnector, PollConnector):
|
||||
"""Moodle LMS connector for accessing course content"""
|
||||
|
||||
def __init__(self, moodle_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||
self.moodle_url = moodle_url.rstrip("/")
|
||||
self.batch_size = batch_size
|
||||
self.moodle_client: Optional[MoodleClient] = None
|
||||
|
||||
def _add_token_to_url(self, file_url: str) -> str:
|
||||
"""Append Moodle token to URL if missing"""
|
||||
if not self.moodle_client:
|
||||
return file_url
|
||||
token = getattr(self.moodle_client, "token", "")
|
||||
if "token=" in file_url.lower():
|
||||
return file_url
|
||||
delimiter = "&" if "?" in file_url else "?"
|
||||
return f"{file_url}{delimiter}token={token}"
|
||||
|
||||
def _log_error(self, context: str, error: Exception, level: str = "warning") -> None:
|
||||
"""Simplified logging wrapper"""
|
||||
msg = f"{context}: {error}"
|
||||
if level == "error":
|
||||
logger.error(msg)
|
||||
else:
|
||||
logger.warning(msg)
|
||||
|
||||
def _get_latest_timestamp(self, *timestamps: int) -> int:
|
||||
"""Return latest valid timestamp"""
|
||||
return max((t for t in timestamps if t and t > 0), default=0)
|
||||
|
||||
def _yield_in_batches(
|
||||
self, generator: Generator[Document, None, None]
|
||||
) -> Generator[list[Document], None, None]:
|
||||
for batch in batch_generator(generator, self.batch_size):
|
||||
yield batch
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> None:
|
||||
token = credentials.get("moodle_token")
|
||||
if not token:
|
||||
raise ConnectorMissingCredentialError("Moodle API token is required")
|
||||
|
||||
try:
|
||||
self.moodle_client = MoodleClient(
|
||||
self.moodle_url + "/webservice/rest/server.php", token
|
||||
)
|
||||
self.moodle_client.core.webservice.get_site_info()
|
||||
except MoodleException as e:
|
||||
if "invalidtoken" in str(e).lower():
|
||||
raise CredentialExpiredError("Moodle token is invalid or expired")
|
||||
raise ConnectorMissingCredentialError(f"Failed to initialize Moodle client: {e}")
|
||||
|
||||
def validate_connector_settings(self) -> None:
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
try:
|
||||
site_info = self.moodle_client.core.webservice.get_site_info()
|
||||
if not site_info.sitename:
|
||||
raise InsufficientPermissionsError("Invalid Moodle API response")
|
||||
except MoodleException as e:
|
||||
msg = str(e).lower()
|
||||
if "invalidtoken" in msg:
|
||||
raise CredentialExpiredError("Moodle token is invalid or expired")
|
||||
if "accessexception" in msg:
|
||||
raise InsufficientPermissionsError(
|
||||
"Insufficient permissions. Ensure web services are enabled and permissions are correct."
|
||||
)
|
||||
raise ConnectorValidationError(f"Moodle validation error: {e}")
|
||||
except Exception as e:
|
||||
raise ConnectorValidationError(f"Unexpected validation error: {e}")
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Data loading & polling
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def load_from_state(self) -> Generator[list[Document], None, None]:
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
logger.info("Starting full load from Moodle workspace")
|
||||
courses = self._get_enrolled_courses()
|
||||
if not courses:
|
||||
logger.warning("No courses found to process")
|
||||
return
|
||||
|
||||
yield from self._yield_in_batches(self._process_courses(courses))
|
||||
|
||||
def poll_source(
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> Generator[list[Document], None, None]:
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
logger.info(
|
||||
f"Polling Moodle updates between {datetime.fromtimestamp(start)} and {datetime.fromtimestamp(end)}"
|
||||
)
|
||||
courses = self._get_enrolled_courses()
|
||||
if not courses:
|
||||
logger.warning("No courses found to poll")
|
||||
return
|
||||
|
||||
yield from self._yield_in_batches(self._get_updated_content(courses, start, end))
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _get_enrolled_courses(self) -> list:
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
try:
|
||||
return self.moodle_client.core.course.get_courses()
|
||||
except MoodleException as e:
|
||||
self._log_error("fetching courses", e, "error")
|
||||
raise ConnectorValidationError(f"Failed to fetch courses: {e}")
|
||||
|
||||
@retry(tries=3, delay=1, backoff=2)
|
||||
def _get_course_contents(self, course_id: int):
|
||||
if not self.moodle_client:
|
||||
raise ConnectorMissingCredentialError("Moodle client not initialized")
|
||||
|
||||
try:
|
||||
return self.moodle_client.core.course.get_contents(courseid=course_id)
|
||||
except MoodleException as e:
|
||||
self._log_error(f"fetching course contents for {course_id}", e)
|
||||
return []
|
||||
|
||||
def _process_courses(self, courses) -> Generator[Document, None, None]:
|
||||
for course in courses:
|
||||
try:
|
||||
contents = self._get_course_contents(course.id)
|
||||
for section in contents:
|
||||
for module in section.modules:
|
||||
doc = self._process_module(course, section, module)
|
||||
if doc:
|
||||
yield doc
|
||||
except Exception as e:
|
||||
self._log_error(f"processing course {course.fullname}", e)
|
||||
|
||||
def _get_updated_content(
|
||||
self, courses, start: float, end: float
|
||||
) -> Generator[Document, None, None]:
|
||||
for course in courses:
|
||||
try:
|
||||
contents = self._get_course_contents(course.id)
|
||||
for section in contents:
|
||||
for module in section.modules:
|
||||
times = [
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
]
|
||||
if hasattr(module, "contents"):
|
||||
times.extend(
|
||||
getattr(c, "timemodified", 0)
|
||||
for c in module.contents
|
||||
if c and getattr(c, "timemodified", 0)
|
||||
)
|
||||
last_mod = self._get_latest_timestamp(*times)
|
||||
if start < last_mod <= end:
|
||||
doc = self._process_module(course, section, module)
|
||||
if doc:
|
||||
yield doc
|
||||
except Exception as e:
|
||||
self._log_error(f"polling course {course.fullname}", e)
|
||||
|
||||
def _process_module(
|
||||
self, course, section, module
|
||||
) -> Optional[Document]:
|
||||
try:
|
||||
mtype = module.modname
|
||||
if mtype in ["label", "url"]:
|
||||
return None
|
||||
if mtype == "resource":
|
||||
return self._process_resource(course, section, module)
|
||||
if mtype == "forum":
|
||||
return self._process_forum(course, section, module)
|
||||
if mtype == "page":
|
||||
return self._process_page(course, section, module)
|
||||
if mtype in ["assign", "quiz"]:
|
||||
return self._process_activity(course, section, module)
|
||||
if mtype == "book":
|
||||
return self._process_book(course, section, module)
|
||||
except Exception as e:
|
||||
self._log_error(f"processing module {getattr(module, 'name', '?')}", e)
|
||||
return None
|
||||
|
||||
def _process_resource(self, course, section, module) -> Optional[Document]:
|
||||
if not getattr(module, "contents", None):
|
||||
return None
|
||||
|
||||
file_info = module.contents[0]
|
||||
if not getattr(file_info, "fileurl", None):
|
||||
return None
|
||||
|
||||
file_name = os.path.basename(file_info.filename)
|
||||
ts = self._get_latest_timestamp(
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
getattr(file_info, "timemodified", 0),
|
||||
)
|
||||
|
||||
try:
|
||||
resp = rl_requests.get(self._add_token_to_url(file_info.fileurl), timeout=60)
|
||||
resp.raise_for_status()
|
||||
blob = resp.content
|
||||
ext = os.path.splitext(file_name)[1] or ".bin"
|
||||
semantic_id = f"{course.fullname} / {section.name} / {file_name}"
|
||||
return Document(
|
||||
id=f"moodle_resource_{module.id}",
|
||||
source="moodle",
|
||||
semantic_identifier=semantic_id,
|
||||
extension=ext,
|
||||
blob=blob,
|
||||
doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc),
|
||||
size_bytes=len(blob),
|
||||
)
|
||||
except Exception as e:
|
||||
self._log_error(f"downloading resource {file_name}", e, "error")
|
||||
return None
|
||||
|
||||
def _process_forum(self, course, section, module) -> Optional[Document]:
|
||||
if not self.moodle_client or not getattr(module, "instance", None):
|
||||
return None
|
||||
|
||||
try:
|
||||
result = self.moodle_client.mod.forum.get_forum_discussions(forumid=module.instance)
|
||||
disc_list = getattr(result, "discussions", [])
|
||||
if not disc_list:
|
||||
return None
|
||||
|
||||
markdown = [f"# {module.name}\n"]
|
||||
latest_ts = self._get_latest_timestamp(
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
)
|
||||
|
||||
for d in disc_list:
|
||||
markdown.append(f"## {d.name}\n\n{md(d.message or '')}\n\n---\n")
|
||||
latest_ts = max(latest_ts, getattr(d, "timemodified", 0))
|
||||
|
||||
blob = "\n".join(markdown).encode("utf-8")
|
||||
semantic_id = f"{course.fullname} / {section.name} / {module.name}"
|
||||
return Document(
|
||||
id=f"moodle_forum_{module.id}",
|
||||
source="moodle",
|
||||
semantic_identifier=semantic_id,
|
||||
extension=".md",
|
||||
blob=blob,
|
||||
doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc),
|
||||
size_bytes=len(blob),
|
||||
)
|
||||
except Exception as e:
|
||||
self._log_error(f"processing forum {module.name}", e)
|
||||
return None
|
||||
|
||||
def _process_page(self, course, section, module) -> Optional[Document]:
|
||||
if not getattr(module, "contents", None):
|
||||
return None
|
||||
|
||||
file_info = module.contents[0]
|
||||
if not getattr(file_info, "fileurl", None):
|
||||
return None
|
||||
|
||||
file_name = os.path.basename(file_info.filename)
|
||||
ts = self._get_latest_timestamp(
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
getattr(file_info, "timemodified", 0),
|
||||
)
|
||||
|
||||
try:
|
||||
resp = rl_requests.get(self._add_token_to_url(file_info.fileurl), timeout=60)
|
||||
resp.raise_for_status()
|
||||
blob = resp.content
|
||||
ext = os.path.splitext(file_name)[1] or ".html"
|
||||
semantic_id = f"{course.fullname} / {section.name} / {module.name}"
|
||||
return Document(
|
||||
id=f"moodle_page_{module.id}",
|
||||
source="moodle",
|
||||
semantic_identifier=semantic_id,
|
||||
extension=ext,
|
||||
blob=blob,
|
||||
doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc),
|
||||
size_bytes=len(blob),
|
||||
)
|
||||
except Exception as e:
|
||||
self._log_error(f"processing page {file_name}", e, "error")
|
||||
return None
|
||||
|
||||
def _process_activity(self, course, section, module) -> Optional[Document]:
|
||||
desc = getattr(module, "description", "")
|
||||
if not desc:
|
||||
return None
|
||||
|
||||
mtype, mname = module.modname, module.name
|
||||
markdown = f"# {mname}\n\n**Type:** {mtype.capitalize()}\n\n{md(desc)}"
|
||||
ts = self._get_latest_timestamp(
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
getattr(module, "added", 0),
|
||||
)
|
||||
|
||||
semantic_id = f"{course.fullname} / {section.name} / {mname}"
|
||||
blob = markdown.encode("utf-8")
|
||||
return Document(
|
||||
id=f"moodle_{mtype}_{module.id}",
|
||||
source="moodle",
|
||||
semantic_identifier=semantic_id,
|
||||
extension=".md",
|
||||
blob=blob,
|
||||
doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc),
|
||||
size_bytes=len(blob),
|
||||
)
|
||||
|
||||
def _process_book(self, course, section, module) -> Optional[Document]:
|
||||
if not getattr(module, "contents", None):
|
||||
return None
|
||||
|
||||
contents = module.contents
|
||||
chapters = [
|
||||
c for c in contents
|
||||
if getattr(c, "fileurl", None) and os.path.basename(c.filename) == "index.html"
|
||||
]
|
||||
if not chapters:
|
||||
return None
|
||||
|
||||
latest_ts = self._get_latest_timestamp(
|
||||
getattr(module, "timecreated", 0),
|
||||
getattr(module, "timemodified", 0),
|
||||
*[getattr(c, "timecreated", 0) for c in contents],
|
||||
*[getattr(c, "timemodified", 0) for c in contents],
|
||||
)
|
||||
|
||||
markdown_parts = [f"# {module.name}\n"]
|
||||
for ch in chapters:
|
||||
try:
|
||||
resp = rl_requests.get(self._add_token_to_url(ch.fileurl), timeout=60)
|
||||
resp.raise_for_status()
|
||||
html = resp.content.decode("utf-8", errors="ignore")
|
||||
markdown_parts.append(md(html) + "\n\n---\n")
|
||||
except Exception as e:
|
||||
self._log_error(f"processing book chapter {ch.filename}", e)
|
||||
|
||||
blob = "\n".join(markdown_parts).encode("utf-8")
|
||||
semantic_id = f"{course.fullname} / {section.name} / {module.name}"
|
||||
return Document(
|
||||
id=f"moodle_book_{module.id}",
|
||||
source="moodle",
|
||||
semantic_identifier=semantic_id,
|
||||
extension=".md",
|
||||
blob=blob,
|
||||
doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc),
|
||||
size_bytes=len(blob),
|
||||
)
|
||||
Reference in New Issue
Block a user