mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
## 🚀 Firecrawl Integration for RAGFlow This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167 ### ✅ Features Implemented - **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow - **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface - **Web Scraping**: Supports single URL scraping, website crawling, and batch processing - **Content Processing**: Converts scraped content to RAGFlow's document format with chunking - **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content - **UI Components**: Complete UI schema and workflow components for RAGFlow integration ### 📁 Files Added - `intergrations/firecrawl/` - Complete integration package - `intergrations/firecrawl/integration.py` - RAGFlow integration entry point - `intergrations/firecrawl/firecrawl_connector.py` - API communication - `intergrations/firecrawl/firecrawl_config.py` - Configuration management - `intergrations/firecrawl/firecrawl_processor.py` - Content processing - `intergrations/firecrawl/firecrawl_ui.py` - UI components - `intergrations/firecrawl/ragflow_integration.py` - Main integration class - `intergrations/firecrawl/README.md` - Complete documentation - `intergrations/firecrawl/example_usage.py` - Usage examples ### 🧪 Testing The integration has been thoroughly tested with: - Configuration validation - Connection testing - Content processing and chunking - UI component rendering - Error handling scenarios ### 📋 Acceptance Criteria Met - ✅ Integration appears as selectable data source in RAGFlow's data source options - ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface - ✅ Successfully scrapes content from provided URLs and imports into RAGFlow's document store - ✅ Handles common edge cases (rate limits, failed requests, malformed content) - ✅ Includes basic documentation and README 
updates - ✅ Code follows RAGFlow's existing patterns and coding standards ### 🔗 Related Issue https://github.com/firecrawl/firecrawl/issues/2167 --------- Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
150 lines
5.0 KiB
Python
150 lines
5.0 KiB
Python
"""
|
|
RAGFlow Integration Entry Point for Firecrawl
|
|
|
|
This file provides the main entry point for the Firecrawl integration with RAGFlow.
|
|
It follows RAGFlow's integration patterns and provides the necessary interfaces.
|
|
"""
|
|
|
|
from typing import Dict, Any
|
|
import logging
|
|
|
|
from ragflow_integration import RAGFlowFirecrawlIntegration, create_firecrawl_integration
|
|
from firecrawl_ui import FirecrawlUIBuilder
|
|
|
|
# Set up logging
|
|
# Module-level logger named after this module; FirecrawlRAGFlowPlugin uses it
# for its initialization message and for validation/connection error reports.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class FirecrawlRAGFlowPlugin:
    """
    Plugin facade that exposes the Firecrawl integration to RAGFlow.

    The instance carries static plugin metadata. Schema, help-text and
    validation-rule requests are forwarded to ``FirecrawlUIBuilder``;
    configuration validation and connection tests are forwarded to an
    integration object built by ``create_firecrawl_integration``.
    """

    def __init__(self):
        """Fill in the plugin metadata and log that the plugin was initialized."""
        # Identity
        self.name = "firecrawl"
        self.display_name = "Firecrawl Web Scraper"
        self.version = "1.0.0"
        self.author = "Firecrawl Team"

        # Presentation
        self.description = "Import web content using Firecrawl's powerful scraping capabilities"
        self.category = "web"
        self.icon = "🌐"

        logger.info(f"Initialized {self.display_name} plugin v{self.version}")

    def get_plugin_info(self) -> Dict[str, Any]:
        """
        Describe the plugin as a plain dictionary.

        Returns:
            The metadata attributes of this instance (``name``,
            ``display_name``, ``description``, ``version``, ``author``,
            ``category``, ``icon``) followed by the fixed lists
            ``supported_formats`` and ``supported_scrape_types``.
        """
        metadata_fields = (
            "name",
            "display_name",
            "description",
            "version",
            "author",
            "category",
            "icon",
        )
        info: Dict[str, Any] = {field: getattr(self, field) for field in metadata_fields}
        info["supported_formats"] = ["markdown", "html", "links", "screenshot"]
        info["supported_scrape_types"] = ["single", "crawl", "batch"]
        return info

    def get_config_schema(self) -> Dict[str, Any]:
        """Return the ``"config_schema"`` entry of the UI builder's data source config."""
        data_source_config = FirecrawlUIBuilder.create_data_source_config()
        return data_source_config["config_schema"]

    def get_ui_schema(self) -> Dict[str, Any]:
        """Return the UI schema produced by ``FirecrawlUIBuilder.create_ui_schema``."""
        ui_schema = FirecrawlUIBuilder.create_ui_schema()
        return ui_schema

    def validate_config(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate ``config`` through a freshly created integration.

        Args:
            config: Configuration mapping passed both to the integration
                factory and to the integration's ``validate_config``.

        Returns:
            Whatever the integration's ``validate_config`` returns. If
            creating the integration or validating raises, the error is
            logged and ``{"general": <error text>}`` is returned instead.
        """
        try:
            validation_result = create_firecrawl_integration(config).validate_config(config)
        except Exception as exc:
            logger.error(f"Configuration validation error: {exc}")
            return {"general": str(exc)}
        return validation_result

    def test_connection(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run the integration's asynchronous connection test synchronously.

        Args:
            config: Configuration mapping used to create the integration.

        Returns:
            The result of the integration's ``test_connection`` coroutine.
            If anything raises, the error is logged and a dictionary with
            ``success`` set to ``False``, the error text under ``error`` and
            a fixed ``message`` is returned.
        """
        try:
            firecrawl = create_firecrawl_integration(config)
            # The integration's test_connection is a coroutine; drive it here.
            import asyncio

            outcome = asyncio.run(firecrawl.test_connection())
        except Exception as exc:
            logger.error(f"Connection test error: {exc}")
            failure: Dict[str, Any] = {
                "success": False,
                "error": str(exc),
                "message": "Connection test failed",
            }
            return failure
        return outcome

    def create_integration(self, config: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
        """Build an integration instance for ``config`` via ``create_firecrawl_integration``."""
        integration = create_firecrawl_integration(config)
        return integration

    def get_help_text(self) -> Dict[str, str]:
        """Return the help text produced by ``FirecrawlUIBuilder.create_help_text``."""
        help_text = FirecrawlUIBuilder.create_help_text()
        return help_text

    def get_validation_rules(self) -> Dict[str, Any]:
        """Return the rules produced by ``FirecrawlUIBuilder.create_validation_rules``."""
        rules = FirecrawlUIBuilder.create_validation_rules()
        return rules
|
|
|
|
|
|
# RAGFlow integration entry points
|
|
def get_plugin() -> FirecrawlRAGFlowPlugin:
    """Construct and return a new ``FirecrawlRAGFlowPlugin``; each call builds a fresh instance."""
    plugin = FirecrawlRAGFlowPlugin()
    return plugin
|
|
|
|
|
|
def get_integration(config: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
    """
    Build an integration instance for the given configuration.

    Args:
        config: Configuration mapping handed unchanged to
            ``create_firecrawl_integration``.

    Returns:
        The object returned by ``create_firecrawl_integration``.
    """
    integration = create_firecrawl_integration(config)
    return integration
|
|
|
|
|
|
def get_config_schema() -> Dict[str, Any]:
    """Return the ``"config_schema"`` entry of ``FirecrawlUIBuilder.create_data_source_config()``."""
    data_source_config = FirecrawlUIBuilder.create_data_source_config()
    return data_source_config["config_schema"]
|
|
|
|
|
|
def get_ui_schema() -> Dict[str, Any]:
    """Return the UI schema produced by ``FirecrawlUIBuilder.create_ui_schema()``."""
    ui_schema = FirecrawlUIBuilder.create_ui_schema()
    return ui_schema
|
|
|
|
|
|
def validate_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validate a configuration through a freshly created integration.

    Args:
        config: Configuration mapping passed both to
            ``create_firecrawl_integration`` and to the resulting
            integration's ``validate_config``.

    Returns:
        Whatever the integration's ``validate_config`` returns. If creating
        the integration or validating raises, ``{"general": <error text>}``
        is returned instead of propagating the exception.
    """
    try:
        validation_result = create_firecrawl_integration(config).validate_config(config)
    except Exception as exc:
        return {"general": str(exc)}
    return validation_result
|
|
|
|
|
|
def test_connection(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Test connection to Firecrawl API.

    Creates an integration from ``config`` and calls its ``test_connection``.
    ``FirecrawlRAGFlowPlugin.test_connection`` in this module treats that call
    as asynchronous, so a coroutine result is driven to completion here
    instead of being handed back un-awaited. A non-coroutine result is
    returned as-is.

    Args:
        config: Configuration mapping handed to
            ``create_firecrawl_integration``.

    Returns:
        The connection-test result from the integration. If anything raises
        (including ``asyncio.run`` being invoked while an event loop is
        already running), a dictionary with ``success`` set to ``False``, the
        error text under ``error`` and a fixed ``message`` is returned.
    """
    # Local import: keeps this fix self-contained without touching the
    # file-level import block.
    import asyncio

    try:
        integration = create_firecrawl_integration(config)
        result = integration.test_connection()
        if asyncio.iscoroutine(result):
            # Without this the caller would receive a coroutine object
            # rather than the dictionary the signature promises.
            result = asyncio.run(result)
        return result
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "message": "Connection test failed"
        }
|
|
|
|
|
|
# Export main functions and classes
|
|
__all__ = [
    # Plugin class and module-level entry points defined in this file
    "FirecrawlRAGFlowPlugin",
    "get_plugin",
    "get_integration",
    "get_config_schema",
    "get_ui_schema",
    "validate_config",
    "test_connection",
    # Names imported from ragflow_integration and re-exported here
    "RAGFlowFirecrawlIntegration",
    "create_firecrawl_integration",
]
|