mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
## 🚀 Firecrawl Integration for RAGFlow This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167 ### ✅ Features Implemented - **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow - **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface - **Web Scraping**: Supports single URL scraping, website crawling, and batch processing - **Content Processing**: Converts scraped content to RAGFlow's document format with chunking - **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content - **UI Components**: Complete UI schema and workflow components for RAGFlow integration ### 📁 Files Added - `intergrations/firecrawl/` - Complete integration package - `intergrations/firecrawl/integration.py` - RAGFlow integration entry point - `intergrations/firecrawl/firecrawl_connector.py` - API communication - `intergrations/firecrawl/firecrawl_config.py` - Configuration management - `intergrations/firecrawl/firecrawl_processor.py` - Content processing - `intergrations/firecrawl/firecrawl_ui.py` - UI components - `intergrations/firecrawl/ragflow_integration.py` - Main integration class - `intergrations/firecrawl/README.md` - Complete documentation - `intergrations/firecrawl/example_usage.py` - Usage examples ### 🧪 Testing The integration has been thoroughly tested with: - Configuration validation - Connection testing - Content processing and chunking - UI component rendering - Error handling scenarios ### 📋 Acceptance Criteria Met - ✅ Integration appears as selectable data source in RAGFlow's data source options - ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface - ✅ Successfully scrapes content from provided URLs and imports into RAGFlow's document store - ✅ Handles common edge cases (rate limits, failed requests, malformed content) - ✅ Includes basic documentation and README 
updates - ✅ Code follows RAGFlow's existing patterns and coding standards ### 🔗 Related Issue https://github.com/firecrawl/firecrawl/issues/2167 --------- Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
176 lines
6.8 KiB
Python
176 lines
6.8 KiB
Python
"""
Main integration file for Firecrawl with RAGFlow.
This file provides the interface between RAGFlow and the Firecrawl plugin.
"""
|
|
|
|
import logging
|
|
from typing import List, Dict, Any
|
|
|
|
from firecrawl_connector import FirecrawlConnector
|
|
from firecrawl_config import FirecrawlConfig
|
|
from firecrawl_processor import FirecrawlProcessor, RAGFlowDocument
|
|
from firecrawl_ui import FirecrawlUIBuilder
|
|
|
|
|
|
class RAGFlowFirecrawlIntegration:
    """Main integration class for Firecrawl with RAGFlow.

    Combines a ``FirecrawlConnector`` (API communication), a
    ``FirecrawlProcessor`` (conversion of scraped content into RAGFlow
    documents) and ``FirecrawlUIBuilder`` (UI schema, help text and
    validation rules) behind a single object.
    """

    # Numeric configuration fields checked by ``validate_config``:
    # (key, converter, default, minimum, maximum,
    #  out-of-range message, not-a-number message)
    _NUMERIC_RULES = (
        (
            "max_retries", int, 3, 1, 10,
            "Max retries must be between 1 and 10",
            "Max retries must be a valid integer",
        ),
        (
            "timeout", int, 30, 5, 300,
            "Timeout must be between 5 and 300 seconds",
            "Timeout must be a valid integer",
        ),
        (
            "rate_limit_delay", float, 1.0, 0.1, 10.0,
            "Rate limit delay must be between 0.1 and 10.0 seconds",
            "Rate limit delay must be a valid number",
        ),
    )

    def __init__(self, config: FirecrawlConfig):
        """Initialize the integration.

        Args:
            config: Firecrawl configuration; it is stored on the instance
                and handed to the ``FirecrawlConnector``.
        """
        self.config = config
        self.connector = FirecrawlConnector(config)
        self.processor = FirecrawlProcessor()
        self.logger = logging.getLogger(__name__)

    async def scrape_and_import(self, urls: List[str],
                                formats: List[str] = None,
                                extract_options: Dict[str, Any] = None) -> List[RAGFlowDocument]:
        """Scrape URLs and convert the results to RAGFlow documents.

        Args:
            urls: URLs to scrape in one batch.
            formats: Output formats to request; defaults to
                ``["markdown", "html"]`` when ``None``.
            extract_options: Accepted for interface compatibility; it is
                currently not forwarded to the connector.

        Returns:
            The documents produced by the processor for the scraped batch.
        """
        if formats is None:
            formats = ["markdown", "html"]

        # The connector is used as an async context manager around the call.
        async with self.connector:
            scraped_contents = await self.connector.batch_scrape(urls, formats)
            documents = self.processor.process_batch(scraped_contents)

        return documents

    async def crawl_and_import(self, start_url: str,
                               limit: int = 100,
                               scrape_options: Dict[str, Any] = None) -> List[RAGFlowDocument]:
        """Crawl a website and convert the results to RAGFlow documents.

        Args:
            start_url: URL the crawl starts from.
            limit: Limit value passed to the connector's ``start_crawl``.
            scrape_options: Options passed to ``start_crawl``; defaults to
                ``{"formats": ["markdown", "html"]}`` when ``None``.

        Returns:
            The documents produced by the processor from the job's data
            (an empty batch is processed when the job has no data).

        Raises:
            RuntimeError: If the crawl job reports an error when started or
                after waiting for completion.
        """
        if scrape_options is None:
            scrape_options = {"formats": ["markdown", "html"]}

        async with self.connector:
            crawl_job = await self.connector.start_crawl(start_url, limit, scrape_options)
            if crawl_job.error:
                # RuntimeError is a subclass of Exception, so existing
                # ``except Exception`` handlers keep working.
                raise RuntimeError(f"Failed to start crawl: {crawl_job.error}")

            completed_job = await self.connector.wait_for_crawl_completion(crawl_job.job_id)
            if completed_job.error:
                raise RuntimeError(f"Crawl failed: {completed_job.error}")

            documents = self.processor.process_batch(completed_job.data or [])

        return documents

    def get_ui_schema(self) -> Dict[str, Any]:
        """Get UI schema for RAGFlow integration."""
        return FirecrawlUIBuilder.create_ui_schema()

    def validate_config(self, config_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Validate configuration and return any errors.

        Args:
            config_dict: Raw configuration values keyed by field name.

        Returns:
            A mapping of field name to error message; empty when every
            checked field is valid. Malformed values are reported here
            rather than raised.
        """
        errors = {}

        # API key: required, must be a string beginning with "fc-".
        api_key = config_dict.get("api_key", "")
        if not api_key:
            errors["api_key"] = "API key is required"
        elif not isinstance(api_key, str):
            errors["api_key"] = "API key must be a string"
        elif not api_key.startswith("fc-"):
            errors["api_key"] = "API key must start with 'fc-'"

        # API URL: require a full scheme prefix so that values such as
        # "httpfoo" are rejected, matching the error message.
        api_url = config_dict.get("api_url", "https://api.firecrawl.dev")
        if not isinstance(api_url, str) or not api_url.startswith(("http://", "https://")):
            errors["api_url"] = "API URL must start with http:// or https://"

        # Numeric fields: convert, then range-check.
        for key, convert, default, low, high, range_msg, type_msg in self._NUMERIC_RULES:
            try:
                value = convert(config_dict.get(key, default))
            except (ValueError, TypeError, OverflowError):
                # OverflowError covers int(float("inf")).
                errors[key] = type_msg
                continue
            # Written as a positive range test so NaN (which compares False
            # against everything) is rejected instead of slipping through.
            if not low <= value <= high:
                errors[key] = range_msg

        return errors

    def create_config(self, config_dict: Dict[str, Any]) -> FirecrawlConfig:
        """Create FirecrawlConfig from dictionary."""
        return FirecrawlConfig.from_dict(config_dict)

    async def test_connection(self) -> Dict[str, Any]:
        """Test the connection to Firecrawl API.

        Scrapes a fixed test URL through the connector.

        Returns:
            A dict with ``"success"`` and ``"message"``; on failure it also
            carries ``"error"``, on success ``"test_url"`` and
            ``"response_time"``. Exceptions are caught and reported in the
            returned dict instead of propagating.
        """
        try:
            async with self.connector:
                # Try to scrape a simple URL to test connection
                test_url = "https://httpbin.org/json"
                result = await self.connector.scrape_url(test_url, ["markdown"])

                if result.error:
                    return {
                        "success": False,
                        "error": result.error,
                        "message": "Failed to connect to Firecrawl API"
                    }

                return {
                    "success": True,
                    "message": "Successfully connected to Firecrawl API",
                    "test_url": test_url,
                    "response_time": "N/A"  # Could be enhanced to measure actual response time
                }

        except Exception as e:
            # Boundary handler: report the failure to the caller, but keep a
            # trace in the log so the cause is not lost.
            self.logger.warning("Firecrawl connection test failed: %s", e)
            return {
                "success": False,
                "error": str(e),
                "message": "Connection test failed"
            }

    def get_supported_formats(self) -> List[str]:
        """Get list of supported output formats."""
        return ["markdown", "html", "links", "screenshot"]

    def get_supported_scrape_types(self) -> List[str]:
        """Get list of supported scrape types."""
        return ["single", "crawl", "batch"]

    def get_help_text(self) -> Dict[str, str]:
        """Get help text for users."""
        return FirecrawlUIBuilder.create_help_text()

    def get_validation_rules(self) -> Dict[str, Any]:
        """Get validation rules for configuration."""
        return FirecrawlUIBuilder.create_validation_rules()
|
|
|
|
|
|
# Factory function for creating integration instance
|
|
def create_firecrawl_integration(config_dict: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
    """Build a ready-to-use integration object from a plain configuration dict.

    The dictionary is first turned into a ``FirecrawlConfig`` via
    ``FirecrawlConfig.from_dict`` and that config is then used to construct
    the ``RAGFlowFirecrawlIntegration``.
    """
    return RAGFlowFirecrawlIntegration(FirecrawlConfig.from_dict(config_dict))
|
|
|
|
|
|
# Public API of this module: the integration class, its factory function,
# and the supporting Firecrawl types re-exported for convenience.
__all__ = [
    "RAGFlowFirecrawlIntegration", "create_firecrawl_integration",
    "FirecrawlConfig", "FirecrawlConnector",
    "FirecrawlProcessor", "RAGFlowDocument",
]
|