Refa: Clean the folders. (#12890)

### Type of change - [x] Refactoring
2026-02-02 00:25:06 +08:00 · 2026-01-29 14:23:26 +08:00
parent d86b7f9721
commit 32c0161ff1
42 changed files with 50 additions and 52 deletions
--- a/rag/graphrag/general/community_reports_extractor.py
+++ b/rag/graphrag/general/community_reports_extractor.py
@ -0,0 +1,186 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from common.misc_utils import thread_pool_exec
+
+"""
+Reference:
+ - [graphrag](https://github.com/microsoft/graphrag)
+"""
+
+import asyncio
+import logging
+import json
+import os
+import re
+from typing import Callable
+from dataclasses import dataclass
+import networkx as nx
+import pandas as pd
+
+from api.db.services.task_service import has_canceled
+from common.exceptions import TaskCanceledException
+from common.connection_utils import timeout
+from rag.graphrag.general import leiden
+from rag.graphrag.general.community_report_prompt import COMMUNITY_REPORT_PROMPT
+from rag.graphrag.general.extractor import Extractor
+from rag.graphrag.general.leiden import add_community_info2graph
+from rag.llm.chat_model import Base as CompletionLLM
+from rag.graphrag.utils import perform_variable_replacements, dict_has_keys_with_types, chat_limiter
+from common.token_utils import num_tokens_from_string
+
+@dataclass
+class CommunityReportsResult:
+    """Community reports result class definition."""
+
+    output: list[str]
+    structured_output: list[dict]
+
+
+class CommunityReportsExtractor(Extractor):
+    """Community reports extractor class definition."""
+
+    _extraction_prompt: str
+    _output_formatter_prompt: str
+    _max_report_length: int
+
+    def __init__(
+            self,
+            llm_invoker: CompletionLLM,
+            max_report_length: int | None = None,
+    ):
+        super().__init__(llm_invoker)
+        """Init method definition."""
+        self._llm = llm_invoker
+        self._extraction_prompt = COMMUNITY_REPORT_PROMPT
+        self._max_report_length = max_report_length or 1500
+
+    async def __call__(self, graph: nx.Graph, callback: Callable | None = None, task_id: str = ""):
+        enable_timeout_assertion = os.environ.get("ENABLE_TIMEOUT_ASSERTION")
+        for node_degree in graph.degree:
+            graph.nodes[str(node_degree[0])]["rank"] = int(node_degree[1])
+
+        communities: dict[str, dict[str, list]] = leiden.run(graph, {})
+        total = sum([len(comm.items()) for _, comm in communities.items()])
+        res_str = []
+        res_dict = []
+        over, token_count = 0, 0
+        @timeout(120)
+        async def extract_community_report(community):
+            nonlocal res_str, res_dict, over, token_count
+            if task_id:
+                if has_canceled(task_id):
+                    logging.info(f"Task {task_id} cancelled during community report extraction.")
+                    raise TaskCanceledException(f"Task {task_id} was cancelled")
+
+            cm_id, cm = community
+            weight = cm["weight"]
+            ents = cm["nodes"]
+            if len(ents) < 2:
+                return
+            ent_list = [{"entity": ent, "description": graph.nodes[ent]["description"]} for ent in ents]
+            ent_df = pd.DataFrame(ent_list)
+
+            rela_list = []
+            k = 0
+            for i in range(0, len(ents)):
+                if k >= 10000:
+                    break
+                for j in range(i + 1, len(ents)):
+                    if k >= 10000:
+                        break
+                    edge = graph.get_edge_data(ents[i], ents[j])
+                    if edge is None:
+                        continue
+                    rela_list.append({"source": ents[i], "target": ents[j], "description": edge["description"]})
+                    k += 1
+            rela_df = pd.DataFrame(rela_list)
+
+            prompt_variables = {
+                "entity_df": ent_df.to_csv(index_label="id"),
+                "relation_df": rela_df.to_csv(index_label="id")
+            }
+            text = perform_variable_replacements(self._extraction_prompt, variables=prompt_variables)
+            async with chat_limiter:
+                try:
+                    timeout = 180 if enable_timeout_assertion else 1000000000
+                    response = await asyncio.wait_for(thread_pool_exec(self._chat,text,[{"role": "user", "content": "Output:"}],{},task_id),timeout=timeout)
+                except asyncio.TimeoutError:
+                    logging.warning("extract_community_report._chat timeout, skipping...")
+                    return
+                except Exception as e:
+                    logging.error(f"extract_community_report._chat failed: {e}")
+                    return
+            token_count += num_tokens_from_string(text + response)
+            response = re.sub(r"^[^\{]*", "", response)
+            response = re.sub(r"[^\}]*$", "", response)
+            response = re.sub(r"\{\{", "{", response)
+            response = re.sub(r"\}\}", "}", response)
+            logging.debug(response)
+            try:
+                response = json.loads(response)
+            except json.JSONDecodeError as e:
+                logging.error(f"Failed to parse JSON response: {e}")
+                logging.error(f"Response content: {response}")
+                return
+            if not dict_has_keys_with_types(response, [
+                        ("title", str),
+                        ("summary", str),
+                        ("findings", list),
+                        ("rating", float),
+                        ("rating_explanation", str),
+                    ]):
+                return
+            response["weight"] = weight
+            response["entities"] = ents
+            add_community_info2graph(graph, ents, response["title"])
+            res_str.append(self._get_text_output(response))
+            res_dict.append(response)
+            over += 1
+            if callback:
+                callback(msg=f"Communities: {over}/{total}, used tokens: {token_count}")
+
+        st = asyncio.get_running_loop().time()
+        tasks = []
+        for level, comm in communities.items():
+            logging.info(f"Level {level}: Community: {len(comm.keys())}")
+            for community in comm.items():
+                if task_id and has_canceled(task_id):
+                    logging.info(f"Task {task_id} cancelled before community processing.")
+                    raise TaskCanceledException(f"Task {task_id} was cancelled")
+                tasks.append(asyncio.create_task(extract_community_report(community)))
+        try:
+            await asyncio.gather(*tasks, return_exceptions=False)
+        except Exception as e:
+            logging.error(f"Error in community processing: {e}")
+            for t in tasks:
+                t.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
+        if callback:
+            callback(msg=f"Community reports done in {asyncio.get_running_loop().time() - st:.2f}s, used tokens: {token_count}")
+
+        return CommunityReportsResult(
+            structured_output=res_dict,
+            output=res_str,
+        )
+
+    def _get_text_output(self, parsed_output: dict) -> str:
+        title = parsed_output.get("title", "Report")
+        summary = parsed_output.get("summary", "")
+        findings = parsed_output.get("findings", [])
+
+        def finding_summary(finding: dict):
+            if isinstance(finding, str):
+                return finding
+            return finding.get("summary")
+
+        def finding_explanation(finding: dict):
+            if isinstance(finding, str):
+                return ""
+            return finding.get("explanation")
+
+        report_sections = "\n\n".join(
+            f"## {finding_summary(f)}\n\n{finding_explanation(f)}" for f in findings
+        )
+        return f"# {title}\n\n{summary}\n\n{report_sections}"