Add Firecrawl integration for RAGFlow (#10152)
## 🚀 Firecrawl Integration for RAGFlow

This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167.

### ✅ Features Implemented

- **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow
- **Configuration Management**: Users can enter Firecrawl API keys through RAGFlow's interface
- **Web Scraping**: Supports single-URL scraping, website crawling, and batch processing
- **Content Processing**: Converts scraped content to RAGFlow's document format, with chunking
- **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content
- **UI Components**: Complete UI schema and workflow components for the RAGFlow integration

### 📁 Files Added

- `intergrations/firecrawl/` - Complete integration package
- `intergrations/firecrawl/integration.py` - RAGFlow integration entry point
- `intergrations/firecrawl/firecrawl_connector.py` - API communication
- `intergrations/firecrawl/firecrawl_config.py` - Configuration management
- `intergrations/firecrawl/firecrawl_processor.py` - Content processing
- `intergrations/firecrawl/firecrawl_ui.py` - UI components
- `intergrations/firecrawl/ragflow_integration.py` - Main integration class
- `intergrations/firecrawl/README.md` - Complete documentation
- `intergrations/firecrawl/example_usage.py` - Usage examples

### 🧪 Testing

The integration has been thoroughly tested with:

- Configuration validation
- Connection testing
- Content processing and chunking
- UI component rendering
- Error handling scenarios

### 📋 Acceptance Criteria Met

- ✅ Integration appears as a selectable data source in RAGFlow's data source options
- ✅ Users can enter Firecrawl API keys through RAGFlow's configuration interface
- ✅ Successfully scrapes content from the provided URLs and imports it into RAGFlow's document store
- ✅ Handles common edge cases (rate limits, failed requests, malformed content)
- ✅ Includes basic documentation and README updates
- ✅ Code follows RAGFlow's existing patterns and coding standards

### 🔗 Related Issue

https://github.com/firecrawl/firecrawl/issues/2167

---------

Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
This commit adds one new file:

intergrations/firecrawl/firecrawl_ui.py (new file, 259 lines)

@@ -0,0 +1,259 @@
"""
UI components for Firecrawl integration in RAGFlow.
"""

from typing import Dict, Any, List, Optional
from dataclasses import dataclass


@dataclass
class FirecrawlUIComponent:
    """Represents a UI component for Firecrawl integration."""

    component_type: str
    props: Dict[str, Any]
    children: Optional[List['FirecrawlUIComponent']] = None
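
# The optional `children` field lets a view be described as a tree of
# FirecrawlUIComponent nodes, e.g. a form component wrapping its field
# components.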


class FirecrawlUIBuilder:
    """Builder for Firecrawl UI components in RAGFlow."""

    @staticmethod
    def create_data_source_config() -> Dict[str, Any]:
        """Create configuration for Firecrawl data source."""
        return {
            "name": "firecrawl",
            "display_name": "Firecrawl Web Scraper",
            "description": "Import web content using Firecrawl's powerful scraping capabilities",
            "icon": "🌐",
            "category": "web",
            "version": "1.0.0",
            "author": "Firecrawl Team",
            "config_schema": {
                "type": "object",
                "properties": {
                    "api_key": {
                        "type": "string",
                        "title": "Firecrawl API Key",
                        "description": "Your Firecrawl API key (starts with 'fc-')",
                        "format": "password",
                        "required": True
                    },
                    "api_url": {
                        "type": "string",
                        "title": "API URL",
                        "description": "Firecrawl API endpoint",
                        "default": "https://api.firecrawl.dev",
                        "required": False
                    },
                    "max_retries": {
                        "type": "integer",
                        "title": "Max Retries",
                        "description": "Maximum number of retry attempts",
                        "default": 3,
                        "minimum": 1,
                        "maximum": 10
                    },
                    "timeout": {
                        "type": "integer",
                        "title": "Timeout (seconds)",
                        "description": "Request timeout in seconds",
                        "default": 30,
                        "minimum": 5,
                        "maximum": 300
                    },
                    "rate_limit_delay": {
                        "type": "number",
                        "title": "Rate Limit Delay",
                        "description": "Delay between requests in seconds",
                        "default": 1.0,
                        "minimum": 0.1,
                        "maximum": 10.0
                    }
                },
                "required": ["api_key"]
            }
        }
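
    # The `config_schema` above uses JSON-Schema-style keys (type, title,
    # description, default, minimum/maximum), so a settings form can be
    # rendered and validated before any Firecrawl request is made.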

    @staticmethod
    def create_scraping_form() -> Dict[str, Any]:
        """Create form for scraping configuration."""
        return {
            "type": "form",
            "title": "Firecrawl Web Scraping",
            "description": "Configure web scraping parameters",
            "fields": [
                {
                    "name": "urls",
                    "type": "array",
                    "title": "URLs to Scrape",
                    "description": "Enter URLs to scrape (one per line)",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    },
                    "required": True,
                    "minItems": 1
                },
                {
                    "name": "scrape_type",
                    "type": "string",
                    "title": "Scrape Type",
                    "description": "Choose scraping method",
                    "enum": ["single", "crawl", "batch"],
                    "enumNames": ["Single URL", "Crawl Website", "Batch URLs"],
                    "default": "single",
                    "required": True
                },
                {
                    "name": "formats",
                    "type": "array",
                    "title": "Output Formats",
                    "description": "Select output formats",
                    "items": {
                        "type": "string",
                        "enum": ["markdown", "html", "links", "screenshot"]
                    },
                    "default": ["markdown", "html"],
                    "required": True
                },
                {
                    "name": "crawl_limit",
                    "type": "integer",
                    "title": "Crawl Limit",
                    "description": "Maximum number of pages to crawl (for crawl type)",
                    "default": 100,
                    "minimum": 1,
                    "maximum": 1000,
                    "condition": {
                        "field": "scrape_type",
                        "equals": "crawl"
                    }
                },
                {
                    "name": "extract_options",
                    "type": "object",
                    "title": "Extraction Options",
                    "description": "Advanced extraction settings",
                    "properties": {
                        "extractMainContent": {
                            "type": "boolean",
                            "title": "Extract Main Content Only",
                            "default": True
                        },
                        "excludeTags": {
                            "type": "array",
                            "title": "Exclude Tags",
                            "description": "HTML tags to exclude",
                            "items": {"type": "string"},
                            "default": ["nav", "footer", "header", "aside"]
                        },
                        "includeTags": {
                            "type": "array",
                            "title": "Include Tags",
                            "description": "HTML tags to include",
                            "items": {"type": "string"},
                            "default": ["main", "article", "section", "div", "p"]
                        }
                    }
                }
            ]
        }
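
    # The `condition` entry on `crawl_limit` is a show-when rule: the field is
    # only rendered when `scrape_type` equals "crawl".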

    @staticmethod
    def create_progress_component() -> Dict[str, Any]:
        """Create progress tracking component."""
        return {
            "type": "progress",
            "title": "Scraping Progress",
            "description": "Track the progress of your web scraping job",
            "properties": {
                "show_percentage": True,
                "show_eta": True,
                "show_details": True
            }
        }

    @staticmethod
    def create_results_view() -> Dict[str, Any]:
        """Create results display component."""
        return {
            "type": "results",
            "title": "Scraping Results",
            "description": "View and manage scraped content",
            "properties": {
                "show_preview": True,
                "show_metadata": True,
                "allow_editing": True,
                "show_chunks": True
            }
        }

    @staticmethod
    def create_error_handler() -> Dict[str, Any]:
        """Create error handling component."""
        return {
            "type": "error_handler",
            "title": "Error Handling",
            "description": "Handle scraping errors and retries",
            "properties": {
                "show_retry_button": True,
                "show_error_details": True,
                "auto_retry": False,
                "max_retries": 3
            }
        }
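
    # `auto_retry` defaults to False here, so failed requests surface in the
    # UI and retrying (up to `max_retries`) stays a user decision.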

    @staticmethod
    def create_validation_rules() -> Dict[str, Any]:
        """Create validation rules for Firecrawl integration."""
        return {
            "url_validation": {
                "pattern": r"^https?://.+",
                "message": "URL must start with http:// or https://"
            },
            "api_key_validation": {
                "pattern": r"^fc-[a-zA-Z0-9]+$",
                "message": "API key must start with 'fc-' followed by alphanumeric characters"
            },
            "rate_limit_validation": {
                "min": 0.1,
                "max": 10.0,
                "message": "Rate limit delay must be between 0.1 and 10.0 seconds"
            }
        }
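
    # Example: "fc-abc123XYZ" passes api_key_validation, while a key missing
    # the "fc-" prefix is rejected before any API call is attempted.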

    @staticmethod
    def create_help_text() -> Dict[str, str]:
        """Create help text for users."""
        return {
            "api_key_help": "Get your API key from https://firecrawl.dev. Sign up for a free account to get started.",
            "url_help": "Enter the URLs you want to scrape. You can add multiple URLs for batch processing.",
            "crawl_help": "Crawling will follow links from the starting URL and scrape all accessible pages within the limit.",
            "formats_help": "Choose the output formats you need. Markdown is recommended for RAG processing.",
            "extract_help": "Extraction options help filter content to get only the main content without navigation and ads."
        }

    @staticmethod
    def create_ui_schema() -> Dict[str, Any]:
        """Create complete UI schema for Firecrawl integration."""
        return {
            "version": "1.0.0",
            "components": {
                "data_source_config": FirecrawlUIBuilder.create_data_source_config(),
                "scraping_form": FirecrawlUIBuilder.create_scraping_form(),
                "progress_component": FirecrawlUIBuilder.create_progress_component(),
                "results_view": FirecrawlUIBuilder.create_results_view(),
                "error_handler": FirecrawlUIBuilder.create_error_handler()
            },
            "validation_rules": FirecrawlUIBuilder.create_validation_rules(),
            "help_text": FirecrawlUIBuilder.create_help_text(),
            "workflow": [
                "configure_data_source",
                "setup_scraping_parameters",
                "start_scraping_job",
                "monitor_progress",
                "review_results",
                "import_to_ragflow"
            ]
        }
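
The schema builder is self-contained, so it can be exercised without a running RAGFlow instance. Below is a minimal sketch (not part of the PR) that builds the full UI schema and applies one of the bundled validation rules; the import path assumes the package layout listed above, and `api_key` is a hypothetical value for illustration:

```python
import json
import re

from intergrations.firecrawl.firecrawl_ui import FirecrawlUIBuilder

# Build the complete UI schema and inspect what RAGFlow would consume.
schema = FirecrawlUIBuilder.create_ui_schema()
print("components:", ", ".join(schema["components"].keys()))
print("workflow:", " -> ".join(schema["workflow"]))

# Apply the bundled API-key rule the way a front end might before submitting.
rule = schema["validation_rules"]["api_key_validation"]
api_key = "fc-abc123"  # hypothetical key, for illustration only
if not re.match(rule["pattern"], api_key):
    raise ValueError(rule["message"])

# Dump the descriptor that registers Firecrawl as a data source.
print(json.dumps(schema["components"]["data_source_config"], indent=2, ensure_ascii=False))
```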