Add Firecrawl integration for RAGFlow (#10152)

## 🚀 Firecrawl Integration for RAGFlow

This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167

### ✅ Features Implemented

- **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow
- **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface
- **Web Scraping**: Supports single URL scraping, website crawling, and batch processing
- **Content Processing**: Converts scraped content to RAGFlow's document format with chunking
- **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content
- **UI Components**: Complete UI schema and workflow components for RAGFlow integration

### 📁 Files Added

- `intergrations/firecrawl/` - Complete integration package
- `intergrations/firecrawl/integration.py` - RAGFlow integration entry point
- `intergrations/firecrawl/firecrawl_connector.py` - API communication
- `intergrations/firecrawl/firecrawl_config.py` - Configuration management
- `intergrations/firecrawl/firecrawl_processor.py` - Content processing
- `intergrations/firecrawl/firecrawl_ui.py` - UI components
- `intergrations/firecrawl/ragflow_integration.py` - Main integration class
- `intergrations/firecrawl/README.md` - Complete documentation
- `intergrations/firecrawl/example_usage.py` - Usage examples

### 🧪 Testing

The integration has been thoroughly tested with:

- Configuration validation
- Connection testing
- Content processing and chunking
- UI component rendering
- Error handling scenarios

### 📋 Acceptance Criteria Met

- ✅ Integration appears as selectable data source in RAGFlow's data source options
- ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface
- ✅ Successfully scrapes content from provided URLs and imports into RAGFlow's document store
- ✅ Handles common edge cases (rate limits, failed requests, malformed content)
- ✅ Includes basic documentation and README updates
- ✅ Code follows RAGFlow's existing patterns and coding standards

### 🔗 Related Issue

https://github.com/firecrawl/firecrawl/issues/2167

---------

Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
2026-01-31 07:36:46 +08:00 · 2025-09-19 07:28:17 +05:30
parent a0ccbec8bd
commit ed6a76dcc0
11 changed files with 1944 additions and 0 deletions
--- a/intergrations/firecrawl/ragflow_integration.py
+++ b/intergrations/firecrawl/ragflow_integration.py
@ -0,0 +1,175 @@
+"""
+Main integration file for Firecrawl with RAGFlow.
+This file provides the interface between RAGFlow and the Firecrawl plugin.
+"""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from firecrawl_connector import FirecrawlConnector
+from firecrawl_config import FirecrawlConfig
+from firecrawl_processor import FirecrawlProcessor, RAGFlowDocument
+from firecrawl_ui import FirecrawlUIBuilder
+
+
+class RAGFlowFirecrawlIntegration:
+    """Main integration class for Firecrawl with RAGFlow.
+
+    Ties together a ``FirecrawlConnector`` (API calls), a
+    ``FirecrawlProcessor`` (conversion of scraped content into
+    ``RAGFlowDocument`` objects) and ``FirecrawlUIBuilder`` (UI schema,
+    help text and validation rules).
+    """
+
+    def __init__(self, config: FirecrawlConfig):
+        """Store *config* and build the connector, processor and logger."""
+        self.config = config
+        self.connector = FirecrawlConnector(config)
+        self.processor = FirecrawlProcessor()
+        self.logger = logging.getLogger(__name__)
+
+    async def scrape_and_import(
+        self,
+        urls: List[str],
+        formats: Optional[List[str]] = None,
+        extract_options: Optional[Dict[str, Any]] = None,
+    ) -> List[RAGFlowDocument]:
+        """Scrape URLs and convert to RAGFlow documents.
+
+        Args:
+            urls: URLs handed to the connector's ``batch_scrape``.
+            formats: Output formats to request; defaults to
+                ``["markdown", "html"]`` when ``None``.
+            extract_options: Accepted for interface compatibility but not
+                forwarded to the connector by this method; a warning is
+                logged when a value is supplied.
+
+        Returns:
+            The documents produced by the processor's ``process_batch``.
+        """
+        if formats is None:
+            formats = ["markdown", "html"]
+
+        if extract_options is not None:
+            # Make the no-op visible instead of silently dropping the argument.
+            self.logger.warning(
+                "extract_options was provided to scrape_and_import but is ignored"
+            )
+
+        async with self.connector:
+            scraped_contents = await self.connector.batch_scrape(urls, formats)
+            return self.processor.process_batch(scraped_contents)
+
+    async def crawl_and_import(
+        self,
+        start_url: str,
+        limit: int = 100,
+        scrape_options: Optional[Dict[str, Any]] = None,
+    ) -> List[RAGFlowDocument]:
+        """Crawl a website and convert to RAGFlow documents.
+
+        Args:
+            start_url: URL the crawl job starts from.
+            limit: Passed to the connector's ``start_crawl``.
+            scrape_options: Passed to ``start_crawl``; defaults to
+                ``{"formats": ["markdown", "html"]}`` when ``None``.
+
+        Returns:
+            The documents produced from the completed job's data (an empty
+            batch is processed when the job carries no data).
+
+        Raises:
+            RuntimeError: If the job fails to start or finishes with an
+                error. ``RuntimeError`` derives from ``Exception``, so
+                callers that catch ``Exception`` keep working.
+        """
+        if scrape_options is None:
+            scrape_options = {"formats": ["markdown", "html"]}
+
+        async with self.connector:
+            crawl_job = await self.connector.start_crawl(start_url, limit, scrape_options)
+            if crawl_job.error:
+                raise RuntimeError(f"Failed to start crawl: {crawl_job.error}")
+
+            completed_job = await self.connector.wait_for_crawl_completion(crawl_job.job_id)
+            if completed_job.error:
+                raise RuntimeError(f"Crawl failed: {completed_job.error}")
+
+            return self.processor.process_batch(completed_job.data or [])
+
+    def get_ui_schema(self) -> Dict[str, Any]:
+        """Get UI schema for RAGFlow integration."""
+        return FirecrawlUIBuilder.create_ui_schema()
+
+    def validate_config(self, config_dict: Dict[str, Any]) -> Dict[str, Any]:
+        """Validate configuration and return any errors.
+
+        Args:
+            config_dict: Raw configuration values. Recognised keys are
+                ``api_key``, ``api_url``, ``max_retries``, ``timeout`` and
+                ``rate_limit_delay``; missing optional keys fall back to
+                the defaults used below.
+
+        Returns:
+            A mapping of field name to error message. It is empty when
+            every checked field is valid.
+        """
+        errors: Dict[str, Any] = {}
+
+        # API key: required, and must be a string carrying the "fc-" prefix.
+        api_key = config_dict.get("api_key", "")
+        if not api_key:
+            errors["api_key"] = "API key is required"
+        elif not isinstance(api_key, str) or not api_key.startswith("fc-"):
+            errors["api_key"] = "API key must start with 'fc-'"
+
+        # API URL: check the full scheme so the test agrees with the message
+        # (a bare "http" prefix would also accept e.g. "httpfoo://...").
+        api_url = config_dict.get("api_url", "https://api.firecrawl.dev")
+        if not isinstance(api_url, str) or not api_url.startswith(("http://", "https://")):
+            errors["api_url"] = "API URL must start with http:// or https://"
+
+        # Numeric fields. OverflowError covers int(float("inf")); the
+        # "not (low <= x <= high)" form also rejects NaN, which would slip
+        # through separate "<" / ">" comparisons.
+        try:
+            max_retries = int(config_dict.get("max_retries", 3))
+            if not 1 <= max_retries <= 10:
+                errors["max_retries"] = "Max retries must be between 1 and 10"
+        except (ValueError, TypeError, OverflowError):
+            errors["max_retries"] = "Max retries must be a valid integer"
+
+        try:
+            timeout = int(config_dict.get("timeout", 30))
+            if not 5 <= timeout <= 300:
+                errors["timeout"] = "Timeout must be between 5 and 300 seconds"
+        except (ValueError, TypeError, OverflowError):
+            errors["timeout"] = "Timeout must be a valid integer"
+
+        try:
+            rate_limit_delay = float(config_dict.get("rate_limit_delay", 1.0))
+            if not 0.1 <= rate_limit_delay <= 10.0:
+                errors["rate_limit_delay"] = "Rate limit delay must be between 0.1 and 10.0 seconds"
+        except (ValueError, TypeError, OverflowError):
+            errors["rate_limit_delay"] = "Rate limit delay must be a valid number"
+
+        return errors
+
+    def create_config(self, config_dict: Dict[str, Any]) -> FirecrawlConfig:
+        """Create FirecrawlConfig from dictionary."""
+        return FirecrawlConfig.from_dict(config_dict)
+
+    async def test_connection(self, test_url: str = "https://httpbin.org/json") -> Dict[str, Any]:
+        """Test the connection to Firecrawl API.
+
+        Scrapes *test_url* once (requesting ``markdown``) through the
+        connector and reports the outcome as a dictionary instead of
+        raising.
+
+        Args:
+            test_url: URL used for the probe scrape.
+
+        Returns:
+            A dict with ``success`` and ``message``. On failure it also
+            holds ``error``; on success it holds ``test_url`` and a
+            placeholder ``response_time``.
+        """
+        try:
+            async with self.connector:
+                result = await self.connector.scrape_url(test_url, ["markdown"])
+
+                if result.error:
+                    return {
+                        "success": False,
+                        "error": result.error,
+                        "message": "Failed to connect to Firecrawl API"
+                    }
+
+                return {
+                    "success": True,
+                    "message": "Successfully connected to Firecrawl API",
+                    "test_url": test_url,
+                    "response_time": "N/A"  # Could be enhanced to measure actual response time
+                }
+
+        except Exception as e:
+            # Boundary handler: the result dict is the contract, so report
+            # the failure there and leave a trace in the log.
+            self.logger.warning("Firecrawl connection test failed: %s", e)
+            return {
+                "success": False,
+                "error": str(e),
+                "message": "Connection test failed"
+            }
+
+    def get_supported_formats(self) -> List[str]:
+        """Get list of supported output formats."""
+        return ["markdown", "html", "links", "screenshot"]
+
+    def get_supported_scrape_types(self) -> List[str]:
+        """Get list of supported scrape types."""
+        return ["single", "crawl", "batch"]
+
+    def get_help_text(self) -> Dict[str, str]:
+        """Get help text for users."""
+        return FirecrawlUIBuilder.create_help_text()
+
+    def get_validation_rules(self) -> Dict[str, Any]:
+        """Get validation rules for configuration."""
+        return FirecrawlUIBuilder.create_validation_rules()
+
+
+# Factory function for creating integration instance
+def create_firecrawl_integration(config_dict: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
+    """Build an integration instance from a plain configuration mapping.
+
+    The mapping is turned into a ``FirecrawlConfig`` via
+    ``FirecrawlConfig.from_dict`` and handed straight to
+    ``RAGFlowFirecrawlIntegration``.
+    """
+    return RAGFlowFirecrawlIntegration(FirecrawlConfig.from_dict(config_dict))
+
+
+# Public API of this module: the integration entry points defined here,
+# followed by the collaborating classes re-exported for convenience.
+__all__ = [
+    # Defined in this module
+    "RAGFlowFirecrawlIntegration",
+    "create_firecrawl_integration",
+    # Re-exported from the sibling firecrawl_* modules
+    "FirecrawlConfig",
+    "FirecrawlConnector",
+    "FirecrawlProcessor",
+    "RAGFlowDocument",
+]