From 962bd5f5df1c078a3ef7b11516f72bc9789bae09 Mon Sep 17 00:00:00 2001 From: Levi <81591061+levischd@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:12:43 +0100 Subject: [PATCH] feat: improve Moodle connector functionality (#11665) ### What problem does this PR solve? Add metadata from moodle data source. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- common/data_source/moodle_connector.py | 185 +++++++++++++++++++++++-- 1 file changed, 173 insertions(+), 12 deletions(-) diff --git a/common/data_source/moodle_connector.py b/common/data_source/moodle_connector.py index 427e7e86c..39efcf07b 100644 --- a/common/data_source/moodle_connector.py +++ b/common/data_source/moodle_connector.py @@ -17,7 +17,11 @@ from common.data_source.exceptions import ( InsufficientPermissionsError, ConnectorValidationError, ) -from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch +from common.data_source.interfaces import ( + LoadConnector, + PollConnector, + SecondsSinceUnixEpoch, +) from common.data_source.models import Document from common.data_source.utils import batch_generator, rl_requests @@ -42,7 +46,9 @@ class MoodleConnector(LoadConnector, PollConnector): delimiter = "&" if "?" in file_url else "?" return f"{file_url}{delimiter}token={token}" - def _log_error(self, context: str, error: Exception, level: str = "warning") -> None: + def _log_error( + self, context: str, error: Exception, level: str = "warning" + ) -> None: """Simplified logging wrapper""" msg = f"{context}: {error}" if level == "error": @@ -73,7 +79,9 @@ class MoodleConnector(LoadConnector, PollConnector): except MoodleException as e: if "invalidtoken" in str(e).lower(): raise CredentialExpiredError("Moodle token is invalid or expired") - raise ConnectorMissingCredentialError(f"Failed to initialize Moodle client: {e}") + raise ConnectorMissingCredentialError( + f"Failed to initialize Moodle client: {e}" + ) def validate_connector_settings(self) -> None: if not self.moodle_client: @@ -125,7 +133,9 @@ class MoodleConnector(LoadConnector, PollConnector): logger.warning("No courses found to poll") return - yield from self._yield_in_batches(self._get_updated_content(courses, start, end)) + yield from self._yield_in_batches( + self._get_updated_content(courses, start, end) + ) @retry(tries=3, delay=1, backoff=2) def _get_enrolled_courses(self) -> list: @@ -187,9 +197,7 @@ class MoodleConnector(LoadConnector, PollConnector): except Exception as e: self._log_error(f"polling course {course.fullname}", e) - def _process_module( - self, course, section, module - ) -> Optional[Document]: + def _process_module(self, course, section, module) -> Optional[Document]: try: mtype = module.modname if mtype in ["label", "url"]: @@ -224,11 +232,37 @@ class MoodleConnector(LoadConnector, PollConnector): ) try: - resp = rl_requests.get(self._add_token_to_url(file_info.fileurl), timeout=60) + resp = rl_requests.get( + self._add_token_to_url(file_info.fileurl), timeout=60 + ) resp.raise_for_status() blob = resp.content ext = os.path.splitext(file_name)[1] or ".bin" semantic_id = f"{course.fullname} / {section.name} / {file_name}" + + # Create metadata dictionary with relevant information + metadata = { + "moodle_url": self.moodle_url, + "course_id": getattr(course, "id", None), + "course_name": getattr(course, "fullname", None), + "course_shortname": getattr(course, "shortname", None), + "section_id": getattr(section, "id", None), + "section_name": getattr(section, "name", None), + "section_number": getattr(section, "section", None), + "module_id": getattr(module, "id", None), + "module_name": getattr(module, "name", None), + "module_type": getattr(module, "modname", None), + "module_instance": getattr(module, "instance", None), + "file_url": getattr(file_info, "fileurl", None), + "file_name": file_name, + "file_size": getattr(file_info, "filesize", len(blob)), + "file_type": getattr(file_info, "mimetype", None), + "time_created": getattr(module, "timecreated", None), + "time_modified": getattr(module, "timemodified", None), + "visible": getattr(module, "visible", None), + "groupmode": getattr(module, "groupmode", None), + } + return Document( id=f"moodle_resource_{module.id}", source="moodle", @@ -237,6 +271,7 @@ class MoodleConnector(LoadConnector, PollConnector): blob=blob, doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc), size_bytes=len(blob), + metadata=metadata, ) except Exception as e: self._log_error(f"downloading resource {file_name}", e, "error") @@ -247,7 +282,9 @@ class MoodleConnector(LoadConnector, PollConnector): return None try: - result = self.moodle_client.mod.forum.get_forum_discussions(forumid=module.instance) + result = self.moodle_client.mod.forum.get_forum_discussions( + forumid=module.instance + ) disc_list = getattr(result, "discussions", []) if not disc_list: return None @@ -264,6 +301,38 @@ class MoodleConnector(LoadConnector, PollConnector): blob = "\n".join(markdown).encode("utf-8") semantic_id = f"{course.fullname} / {section.name} / {module.name}" + + # Create metadata dictionary with relevant information + metadata = { + "moodle_url": self.moodle_url, + "course_id": getattr(course, "id", None), + "course_name": getattr(course, "fullname", None), + "course_shortname": getattr(course, "shortname", None), + "section_id": getattr(section, "id", None), + "section_name": getattr(section, "name", None), + "section_number": getattr(section, "section", None), + "module_id": getattr(module, "id", None), + "module_name": getattr(module, "name", None), + "module_type": getattr(module, "modname", None), + "forum_id": getattr(module, "instance", None), + "discussion_count": len(disc_list), + "time_created": getattr(module, "timecreated", None), + "time_modified": getattr(module, "timemodified", None), + "visible": getattr(module, "visible", None), + "groupmode": getattr(module, "groupmode", None), + "discussions": [ + { + "id": getattr(d, "id", None), + "name": getattr(d, "name", None), + "user_id": getattr(d, "userid", None), + "user_fullname": getattr(d, "userfullname", None), + "time_created": getattr(d, "timecreated", None), + "time_modified": getattr(d, "timemodified", None), + } + for d in disc_list + ], + } + return Document( id=f"moodle_forum_{module.id}", source="moodle", @@ -272,6 +341,7 @@ class MoodleConnector(LoadConnector, PollConnector): blob=blob, doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc), size_bytes=len(blob), + metadata=metadata, ) except Exception as e: self._log_error(f"processing forum {module.name}", e) @@ -293,11 +363,37 @@ class MoodleConnector(LoadConnector, PollConnector): ) try: - resp = rl_requests.get(self._add_token_to_url(file_info.fileurl), timeout=60) + resp = rl_requests.get( + self._add_token_to_url(file_info.fileurl), timeout=60 + ) resp.raise_for_status() blob = resp.content ext = os.path.splitext(file_name)[1] or ".html" semantic_id = f"{course.fullname} / {section.name} / {module.name}" + + # Create metadata dictionary with relevant information + metadata = { + "moodle_url": self.moodle_url, + "course_id": getattr(course, "id", None), + "course_name": getattr(course, "fullname", None), + "course_shortname": getattr(course, "shortname", None), + "section_id": getattr(section, "id", None), + "section_name": getattr(section, "name", None), + "section_number": getattr(section, "section", None), + "module_id": getattr(module, "id", None), + "module_name": getattr(module, "name", None), + "module_type": getattr(module, "modname", None), + "module_instance": getattr(module, "instance", None), + "page_url": getattr(file_info, "fileurl", None), + "file_name": file_name, + "file_size": getattr(file_info, "filesize", len(blob)), + "file_type": getattr(file_info, "mimetype", None), + "time_created": getattr(module, "timecreated", None), + "time_modified": getattr(module, "timemodified", None), + "visible": getattr(module, "visible", None), + "groupmode": getattr(module, "groupmode", None), + } + return Document( id=f"moodle_page_{module.id}", source="moodle", @@ -306,6 +402,7 @@ class MoodleConnector(LoadConnector, PollConnector): blob=blob, doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc), size_bytes=len(blob), + metadata=metadata, ) except Exception as e: self._log_error(f"processing page {file_name}", e, "error") @@ -326,6 +423,29 @@ class MoodleConnector(LoadConnector, PollConnector): semantic_id = f"{course.fullname} / {section.name} / {mname}" blob = markdown.encode("utf-8") + + # Create metadata dictionary with relevant information + metadata = { + "moodle_url": self.moodle_url, + "course_id": getattr(course, "id", None), + "course_name": getattr(course, "fullname", None), + "course_shortname": getattr(course, "shortname", None), + "section_id": getattr(section, "id", None), + "section_name": getattr(section, "name", None), + "section_number": getattr(section, "section", None), + "module_id": getattr(module, "id", None), + "module_name": getattr(module, "name", None), + "module_type": getattr(module, "modname", None), + "activity_type": mtype, + "activity_instance": getattr(module, "instance", None), + "description": desc, + "time_created": getattr(module, "timecreated", None), + "time_modified": getattr(module, "timemodified", None), + "added": getattr(module, "added", None), + "visible": getattr(module, "visible", None), + "groupmode": getattr(module, "groupmode", None), + } + return Document( id=f"moodle_{mtype}_{module.id}", source="moodle", @@ -334,6 +454,7 @@ class MoodleConnector(LoadConnector, PollConnector): blob=blob, doc_updated_at=datetime.fromtimestamp(ts or 0, tz=timezone.utc), size_bytes=len(blob), + metadata=metadata, ) def _process_book(self, course, section, module) -> Optional[Document]: @@ -342,8 +463,10 @@ class MoodleConnector(LoadConnector, PollConnector): contents = module.contents chapters = [ - c for c in contents - if getattr(c, "fileurl", None) and os.path.basename(c.filename) == "index.html" + c + for c in contents + if getattr(c, "fileurl", None) + and os.path.basename(c.filename) == "index.html" ] if not chapters: return None @@ -356,17 +479,54 @@ class MoodleConnector(LoadConnector, PollConnector): ) markdown_parts = [f"# {module.name}\n"] + chapter_info = [] + for ch in chapters: try: resp = rl_requests.get(self._add_token_to_url(ch.fileurl), timeout=60) resp.raise_for_status() html = resp.content.decode("utf-8", errors="ignore") markdown_parts.append(md(html) + "\n\n---\n") + + # Collect chapter information for metadata + chapter_info.append( + { + "chapter_id": getattr(ch, "chapterid", None), + "title": getattr(ch, "title", None), + "filename": getattr(ch, "filename", None), + "fileurl": getattr(ch, "fileurl", None), + "time_created": getattr(ch, "timecreated", None), + "time_modified": getattr(ch, "timemodified", None), + "size": getattr(ch, "filesize", None), + } + ) except Exception as e: self._log_error(f"processing book chapter {ch.filename}", e) blob = "\n".join(markdown_parts).encode("utf-8") semantic_id = f"{course.fullname} / {section.name} / {module.name}" + + # Create metadata dictionary with relevant information + metadata = { + "moodle_url": self.moodle_url, + "course_id": getattr(course, "id", None), + "course_name": getattr(course, "fullname", None), + "course_shortname": getattr(course, "shortname", None), + "section_id": getattr(section, "id", None), + "section_name": getattr(section, "name", None), + "section_number": getattr(section, "section", None), + "module_id": getattr(module, "id", None), + "module_name": getattr(module, "name", None), + "module_type": getattr(module, "modname", None), + "book_id": getattr(module, "instance", None), + "chapter_count": len(chapters), + "chapters": chapter_info, + "time_created": getattr(module, "timecreated", None), + "time_modified": getattr(module, "timemodified", None), + "visible": getattr(module, "visible", None), + "groupmode": getattr(module, "groupmode", None), + } + return Document( id=f"moodle_book_{module.id}", source="moodle", @@ -375,4 +535,5 @@ class MoodleConnector(LoadConnector, PollConnector): blob=blob, doc_updated_at=datetime.fromtimestamp(latest_ts or 0, tz=timezone.utc), size_bytes=len(blob), + metadata=metadata, )