Add Firecrawl integration for RAGFlow (#10152)
## 🚀 Firecrawl Integration for RAGFlow

This PR implements the Firecrawl integration for RAGFlow as requested in issue https://github.com/firecrawl/firecrawl/issues/2167.

### ✅ Features Implemented

- **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow
- **Configuration Management**: Users can enter Firecrawl API keys through RAGFlow's interface
- **Web Scraping**: Supports single-URL scraping, website crawling, and batch processing
- **Content Processing**: Converts scraped content to RAGFlow's document format, with chunking
- **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content
- **UI Components**: Complete UI schema and workflow components for the RAGFlow integration

### 📁 Files Added

- `intergrations/firecrawl/` - Complete integration package
- `intergrations/firecrawl/integration.py` - RAGFlow integration entry point
- `intergrations/firecrawl/firecrawl_connector.py` - API communication
- `intergrations/firecrawl/firecrawl_config.py` - Configuration management
- `intergrations/firecrawl/firecrawl_processor.py` - Content processing
- `intergrations/firecrawl/firecrawl_ui.py` - UI components
- `intergrations/firecrawl/ragflow_integration.py` - Main integration class
- `intergrations/firecrawl/README.md` - Complete documentation
- `intergrations/firecrawl/example_usage.py` - Usage examples

### 🧪 Testing

The integration has been thoroughly tested with:

- Configuration validation
- Connection testing
- Content processing and chunking
- UI component rendering
- Error handling scenarios

### 📋 Acceptance Criteria Met

- ✅ Integration appears as a selectable data source in RAGFlow's data source options
- ✅ Users can enter Firecrawl API keys through RAGFlow's configuration interface
- ✅ Successfully scrapes content from the provided URLs and imports it into RAGFlow's document store
- ✅ Handles common edge cases (rate limits, failed requests, malformed content)
- ✅ Includes basic documentation and README updates
- ✅ Code follows RAGFlow's existing patterns and coding standards

### 🔗 Related Issue

https://github.com/firecrawl/firecrawl/issues/2167

---------

Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
This commit adds one new file:

intergrations/firecrawl/firecrawl_ui.py (new file, 259 lines)

@@ -0,0 +1,259 @@
"""
UI components for Firecrawl integration in RAGFlow.
"""

from typing import Dict, Any, List, Optional
from dataclasses import dataclass


@dataclass
class FirecrawlUIComponent:
    """Represents a UI component for Firecrawl integration."""

    component_type: str
    props: Dict[str, Any]
    children: Optional[List['FirecrawlUIComponent']] = None
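
# The optional `children` field lets a view be described as a tree of
# FirecrawlUIComponent nodes, e.g. a form component wrapping its field
# components.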


class FirecrawlUIBuilder:
    """Builder for Firecrawl UI components in RAGFlow."""

    @staticmethod
    def create_data_source_config() -> Dict[str, Any]:
        """Create configuration for Firecrawl data source."""
        return {
            "name": "firecrawl",
            "display_name": "Firecrawl Web Scraper",
            "description": "Import web content using Firecrawl's powerful scraping capabilities",
            "icon": "🌐",
            "category": "web",
            "version": "1.0.0",
            "author": "Firecrawl Team",
            "config_schema": {
                "type": "object",
                "properties": {
                    "api_key": {
                        "type": "string",
                        "title": "Firecrawl API Key",
                        "description": "Your Firecrawl API key (starts with 'fc-')",
                        "format": "password",
                        "required": True
                    },
                    "api_url": {
                        "type": "string",
                        "title": "API URL",
                        "description": "Firecrawl API endpoint",
                        "default": "https://api.firecrawl.dev",
                        "required": False
                    },
                    "max_retries": {
                        "type": "integer",
                        "title": "Max Retries",
                        "description": "Maximum number of retry attempts",
                        "default": 3,
                        "minimum": 1,
                        "maximum": 10
                    },
                    "timeout": {
                        "type": "integer",
                        "title": "Timeout (seconds)",
                        "description": "Request timeout in seconds",
                        "default": 30,
                        "minimum": 5,
                        "maximum": 300
                    },
                    "rate_limit_delay": {
                        "type": "number",
                        "title": "Rate Limit Delay",
                        "description": "Delay between requests in seconds",
                        "default": 1.0,
                        "minimum": 0.1,
                        "maximum": 10.0
                    }
                },
                "required": ["api_key"]
            }
        }
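
    # The `config_schema` above uses JSON-Schema-style keys (type, title,
    # description, default, minimum/maximum), so a settings form can be
    # rendered and validated before any Firecrawl request is made.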

    @staticmethod
    def create_scraping_form() -> Dict[str, Any]:
        """Create form for scraping configuration."""
        return {
            "type": "form",
            "title": "Firecrawl Web Scraping",
            "description": "Configure web scraping parameters",
            "fields": [
                {
                    "name": "urls",
                    "type": "array",
                    "title": "URLs to Scrape",
                    "description": "Enter URLs to scrape (one per line)",
                    "items": {
                        "type": "string",
                        "format": "uri"
                    },
                    "required": True,
                    "minItems": 1
                },
                {
                    "name": "scrape_type",
                    "type": "string",
                    "title": "Scrape Type",
                    "description": "Choose scraping method",
                    "enum": ["single", "crawl", "batch"],
                    "enumNames": ["Single URL", "Crawl Website", "Batch URLs"],
                    "default": "single",
                    "required": True
                },
                {
                    "name": "formats",
                    "type": "array",
                    "title": "Output Formats",
                    "description": "Select output formats",
                    "items": {
                        "type": "string",
                        "enum": ["markdown", "html", "links", "screenshot"]
                    },
                    "default": ["markdown", "html"],
                    "required": True
                },
                {
                    "name": "crawl_limit",
                    "type": "integer",
                    "title": "Crawl Limit",
                    "description": "Maximum number of pages to crawl (for crawl type)",
                    "default": 100,
                    "minimum": 1,
                    "maximum": 1000,
                    "condition": {
                        "field": "scrape_type",
                        "equals": "crawl"
                    }
                },
                {
                    "name": "extract_options",
                    "type": "object",
                    "title": "Extraction Options",
                    "description": "Advanced extraction settings",
                    "properties": {
                        "extractMainContent": {
                            "type": "boolean",
                            "title": "Extract Main Content Only",
                            "default": True
                        },
                        "excludeTags": {
                            "type": "array",
                            "title": "Exclude Tags",
                            "description": "HTML tags to exclude",
                            "items": {"type": "string"},
                            "default": ["nav", "footer", "header", "aside"]
                        },
                        "includeTags": {
                            "type": "array",
                            "title": "Include Tags",
                            "description": "HTML tags to include",
                            "items": {"type": "string"},
                            "default": ["main", "article", "section", "div", "p"]
                        }
                    }
                }
            ]
        }
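
    # The `condition` entry on `crawl_limit` is a show-when rule: the field is
    # only rendered when `scrape_type` equals "crawl".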

    @staticmethod
    def create_progress_component() -> Dict[str, Any]:
        """Create progress tracking component."""
        return {
            "type": "progress",
            "title": "Scraping Progress",
            "description": "Track the progress of your web scraping job",
            "properties": {
                "show_percentage": True,
                "show_eta": True,
                "show_details": True
            }
        }

    @staticmethod
    def create_results_view() -> Dict[str, Any]:
        """Create results display component."""
        return {
            "type": "results",
            "title": "Scraping Results",
            "description": "View and manage scraped content",
            "properties": {
                "show_preview": True,
                "show_metadata": True,
                "allow_editing": True,
                "show_chunks": True
            }
        }

    @staticmethod
    def create_error_handler() -> Dict[str, Any]:
        """Create error handling component."""
        return {
            "type": "error_handler",
            "title": "Error Handling",
            "description": "Handle scraping errors and retries",
            "properties": {
                "show_retry_button": True,
                "show_error_details": True,
                "auto_retry": False,
                "max_retries": 3
            }
        }
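
    # `auto_retry` defaults to False here, so failed requests surface in the
    # UI and retrying (up to `max_retries`) stays a user decision.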

    @staticmethod
    def create_validation_rules() -> Dict[str, Any]:
        """Create validation rules for Firecrawl integration."""
        return {
            "url_validation": {
                "pattern": r"^https?://.+",
                "message": "URL must start with http:// or https://"
            },
            "api_key_validation": {
                "pattern": r"^fc-[a-zA-Z0-9]+$",
                "message": "API key must start with 'fc-' followed by alphanumeric characters"
            },
            "rate_limit_validation": {
                "min": 0.1,
                "max": 10.0,
                "message": "Rate limit delay must be between 0.1 and 10.0 seconds"
            }
        }
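
    # Example: "fc-abc123XYZ" passes api_key_validation, while a key missing
    # the "fc-" prefix is rejected before any API call is attempted.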

    @staticmethod
    def create_help_text() -> Dict[str, str]:
        """Create help text for users."""
        return {
            "api_key_help": "Get your API key from https://firecrawl.dev. Sign up for a free account to get started.",
            "url_help": "Enter the URLs you want to scrape. You can add multiple URLs for batch processing.",
            "crawl_help": "Crawling will follow links from the starting URL and scrape all accessible pages within the limit.",
            "formats_help": "Choose the output formats you need. Markdown is recommended for RAG processing.",
            "extract_help": "Extraction options help filter content to get only the main content without navigation and ads."
        }

    @staticmethod
    def create_ui_schema() -> Dict[str, Any]:
        """Create complete UI schema for Firecrawl integration."""
        return {
            "version": "1.0.0",
            "components": {
                "data_source_config": FirecrawlUIBuilder.create_data_source_config(),
                "scraping_form": FirecrawlUIBuilder.create_scraping_form(),
                "progress_component": FirecrawlUIBuilder.create_progress_component(),
                "results_view": FirecrawlUIBuilder.create_results_view(),
                "error_handler": FirecrawlUIBuilder.create_error_handler()
            },
            "validation_rules": FirecrawlUIBuilder.create_validation_rules(),
            "help_text": FirecrawlUIBuilder.create_help_text(),
            "workflow": [
                "configure_data_source",
                "setup_scraping_parameters",
                "start_scraping_job",
                "monitor_progress",
                "review_results",
                "import_to_ragflow"
            ]
        }
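
The schema builder is self-contained, so it can be exercised without a running RAGFlow instance. Below is a minimal sketch (not part of the PR) that builds the full UI schema and applies one of the bundled validation rules; the import path assumes the package layout listed above, and `api_key` is a hypothetical value for illustration:

```python
import json
import re

from intergrations.firecrawl.firecrawl_ui import FirecrawlUIBuilder

# Build the complete UI schema and inspect what RAGFlow would consume.
schema = FirecrawlUIBuilder.create_ui_schema()
print("components:", ", ".join(schema["components"].keys()))
print("workflow:", " -> ".join(schema["workflow"]))

# Apply the bundled API-key rule the way a front end might before submitting.
rule = schema["validation_rules"]["api_key_validation"]
api_key = "fc-abc123"  # hypothetical key, for illustration only
if not re.match(rule["pattern"], api_key):
    raise ValueError(rule["message"])

# Dump the descriptor that registers Firecrawl as a data source.
print(json.dumps(schema["components"]["data_source_config"], indent=2, ensure_ascii=False))
```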