mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
## 🚀 Firecrawl Integration for RAGFlow This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167 ### ✅ Features Implemented - **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow - **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface - **Web Scraping**: Supports single URL scraping, website crawling, and batch processing - **Content Processing**: Converts scraped content to RAGFlow's document format with chunking - **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content - **UI Components**: Complete UI schema and workflow components for RAGFlow integration ### 📁 Files Added - `intergrations/firecrawl/` - Complete integration package - `intergrations/firecrawl/integration.py` - RAGFlow integration entry point - `intergrations/firecrawl/firecrawl_connector.py` - API communication - `intergrations/firecrawl/firecrawl_config.py` - Configuration management - `intergrations/firecrawl/firecrawl_processor.py` - Content processing - `intergrations/firecrawl/firecrawl_ui.py` - UI components - `intergrations/firecrawl/ragflow_integration.py` - Main integration class - `intergrations/firecrawl/README.md` - Complete documentation - `intergrations/firecrawl/example_usage.py` - Usage examples ### 🧪 Testing The integration has been thoroughly tested with: - Configuration validation - Connection testing - Content processing and chunking - UI component rendering - Error handling scenarios ### 📋 Acceptance Criteria Met - ✅ Integration appears as selectable data source in RAGFlow's data source options - ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface - ✅ Successfully scrapes content from provided URLs and imports into RAGFlow's document store - ✅ Handles common edge cases (rate limits, failed requests, malformed content) - ✅ Includes basic documentation and README 
updates - ✅ Code follows RAGFlow's existing patterns and coding standards ### 🔗 Related Issue https://github.com/firecrawl/firecrawl/issues/2167 --------- Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
80 lines
2.8 KiB
Python
80 lines
2.8 KiB
Python
"""
|
|
Configuration management for Firecrawl integration with RAGFlow.
|
|
"""
|
|
|
|
import os
|
|
from typing import Dict, Any
|
|
from dataclasses import dataclass
|
|
import json
|
|
|
|
|
|
@dataclass
class FirecrawlConfig:
    """Validated settings for the Firecrawl integration.

    Validation runs in ``__post_init__``, so every constructor path
    (direct call, ``from_env``, ``from_dict``, ``from_json``) rejects
    out-of-range values with ``ValueError``.

    Attributes:
        api_key: Firecrawl API key; required, must start with ``fc-``.
        api_url: Base URL of the Firecrawl API.
        max_retries: Retry count, 1 to 10 inclusive.
        timeout: Timeout in seconds, 5 to 300 inclusive.
        rate_limit_delay: Rate limit delay in seconds, 0.1 to 10.0 inclusive.
        max_concurrent_requests: Concurrent request limit, at least 1.
    """

    api_key: str
    api_url: str = "https://api.firecrawl.dev"
    max_retries: int = 3
    timeout: int = 30
    rate_limit_delay: float = 1.0
    max_concurrent_requests: int = 5

    def __post_init__(self):
        """Validate configuration after initialization.

        Raises:
            ValueError: If the API key is missing or malformed, or if any
                numeric setting is outside its allowed range.
        """
        if not self.api_key:
            raise ValueError("Firecrawl API key is required")

        # A non-string key (e.g. a number decoded from JSON) has no
        # ``startswith``; report it as a format error rather than letting an
        # AttributeError escape from validation.
        if not isinstance(self.api_key, str) or not self.api_key.startswith("fc-"):
            raise ValueError("Invalid Firecrawl API key format. Must start with 'fc-'")

        if self.max_retries < 1 or self.max_retries > 10:
            raise ValueError("Max retries must be between 1 and 10")

        if self.timeout < 5 or self.timeout > 300:
            raise ValueError("Timeout must be between 5 and 300 seconds")

        if self.rate_limit_delay < 0.1 or self.rate_limit_delay > 10.0:
            raise ValueError("Rate limit delay must be between 0.1 and 10.0 seconds")

        # Previously the only numeric field left unchecked: zero or a
        # negative value would have been accepted silently.
        if self.max_concurrent_requests < 1:
            raise ValueError("Max concurrent requests must be at least 1")

    @classmethod
    def from_env(cls) -> "FirecrawlConfig":
        """Create configuration from environment variables.

        Reads ``FIRECRAWL_API_KEY`` (required) plus the optional
        ``FIRECRAWL_API_URL``, ``FIRECRAWL_MAX_RETRIES``,
        ``FIRECRAWL_TIMEOUT``, ``FIRECRAWL_RATE_LIMIT_DELAY`` and
        ``FIRECRAWL_MAX_CONCURRENT``; unset optional variables fall back to
        the dataclass defaults.

        Raises:
            ValueError: If the API key variable is unset or empty, if a
                numeric variable cannot be parsed, or if validation fails.
        """
        api_key = os.getenv("FIRECRAWL_API_KEY")
        if not api_key:
            raise ValueError("FIRECRAWL_API_KEY environment variable not set")

        def read_number(name, default, cast):
            # Name the offending variable instead of surfacing a bare
            # "invalid literal for int()" message.
            raw = os.getenv(name, default)
            try:
                return cast(raw)
            except ValueError as err:
                raise ValueError(
                    f"Environment variable {name} must be a valid "
                    f"{cast.__name__}, got {raw!r}"
                ) from err

        return cls(
            api_key=api_key,
            api_url=os.getenv("FIRECRAWL_API_URL", "https://api.firecrawl.dev"),
            max_retries=read_number("FIRECRAWL_MAX_RETRIES", "3", int),
            timeout=read_number("FIRECRAWL_TIMEOUT", "30", int),
            rate_limit_delay=read_number("FIRECRAWL_RATE_LIMIT_DELAY", "1.0", float),
            max_concurrent_requests=read_number("FIRECRAWL_MAX_CONCURRENT", "5", int),
        )

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> "FirecrawlConfig":
        """Create configuration from dictionary.

        The dictionary is expanded into keyword arguments, so its keys must
        be field names of this class.
        """
        return cls(**config_dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary.

        The result contains every field, including ``api_key`` in plain text.
        """
        return {
            "api_key": self.api_key,
            "api_url": self.api_url,
            "max_retries": self.max_retries,
            "timeout": self.timeout,
            "rate_limit_delay": self.rate_limit_delay,
            "max_concurrent_requests": self.max_concurrent_requests,
        }

    def to_json(self) -> str:
        """Convert configuration to JSON string (2-space indented)."""
        return json.dumps(self.to_dict(), indent=2)

    @classmethod
    def from_json(cls, json_str: str) -> "FirecrawlConfig":
        """Create configuration from JSON string."""
        config_dict = json.loads(json_str)
        return cls.from_dict(config_dict)
|