ragflow/intergrations/firecrawl/firecrawl_processor.py
Ajay ed6a76dcc0 Add Firecrawl integration for RAGFlow (#10152)
## 🚀 Firecrawl Integration for RAGFlow

This PR implements the Firecrawl integration for RAGFlow as requested in
issue https://github.com/firecrawl/firecrawl/issues/2167

### Features Implemented

- **Data Source Integration**: Firecrawl appears as a selectable data
source in RAGFlow
- **Configuration Management**: Users can input Firecrawl API keys
through RAGFlow's interface
- **Web Scraping**: Supports single URL scraping, website crawling, and
batch processing
- **Content Processing**: Converts scraped content to RAGFlow's document
format with chunking (see the sketch after this list)
- **Error Handling**: Comprehensive error handling for rate limits,
failed requests, and malformed content
- **UI Components**: Complete UI schema and workflow components for
RAGFlow integration
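
For orientation, a minimal end-to-end sketch of the intended flow. Only `FirecrawlProcessor` is shown in full in this diff; the connector class name, constructor, and `scrape_url` method are assumptions about `firecrawl_connector.py`:

```python
from firecrawl_connector import FirecrawlConnector  # assumed class name
from firecrawl_processor import FirecrawlProcessor

# Hypothetical flow: scrape one URL, convert it to a RAGFlow document, chunk it.
connector = FirecrawlConnector(api_key="fc-your-key")   # assumed constructor
scraped = connector.scrape_url("https://example.com")   # assumed method name
processor = FirecrawlProcessor()
document = processor.process_content(scraped)           # defined in this file
chunks = processor.chunk_content(document)
```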

### 📁 Files Added

- `intergrations/firecrawl/` - Complete integration package
- `intergrations/firecrawl/integration.py` - RAGFlow integration entry
point
- `intergrations/firecrawl/firecrawl_connector.py` - API communication
- `intergrations/firecrawl/firecrawl_config.py` - Configuration
management
- `intergrations/firecrawl/firecrawl_processor.py` - Content processing
- `intergrations/firecrawl/firecrawl_ui.py` - UI components
- `intergrations/firecrawl/ragflow_integration.py` - Main integration
class
- `intergrations/firecrawl/README.md` - Complete documentation
- `intergrations/firecrawl/example_usage.py` - Usage examples

### 🧪 Testing

The integration has been thoroughly tested with the following (a representative test sketch appears after this list):
- Configuration validation
- Connection testing
- Content processing and chunking
- UI component rendering
- Error handling scenarios
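
As one concrete example, a chunking test along these lines would cover the overlap bookkeeping (a sketch assuming a pytest-style harness; the project's actual tests may differ):

```python
from datetime import datetime
from firecrawl_processor import FirecrawlProcessor, RAGFlowDocument

def test_chunk_overlap():
    processor = FirecrawlProcessor()
    doc = RAGFlowDocument(
        id="firecrawl_test",
        title="Test",
        content="word " * 400,  # 2000 characters forces multiple chunks
        source_url="https://example.com",
        metadata={"source": "firecrawl"},
        created_at=datetime.utcnow(),
        updated_at=datetime.utcnow(),
    )
    chunks = processor.chunk_content(doc, chunk_size=1000, chunk_overlap=200)
    assert len(chunks) > 1
    assert all(c["metadata"]["total_chunks"] == len(chunks) for c in chunks)
```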

### 📋 Acceptance Criteria Met

- Integration appears as a selectable data source in RAGFlow's data
source options
- Users can input Firecrawl API keys through RAGFlow's configuration
interface
- Successfully scrapes content from provided URLs and imports it into
RAGFlow's document store
- Handles common edge cases (rate limits, failed requests, malformed
content)
- Includes basic documentation and README updates
- Code follows RAGFlow's existing patterns and coding standards

### Related Issue

https://github.com/firecrawl/firecrawl/issues/2167

---------

Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
2025-09-19 09:58:17 +08:00


"""
Content processor for converting Firecrawl output to RAGFlow document format.
"""
import re
import hashlib
from typing import List, Dict, Any
from dataclasses import dataclass
import logging
from datetime import datetime
from firecrawl_connector import ScrapedContent
@dataclass
class RAGFlowDocument:
"""Represents a document in RAGFlow format."""
id: str
title: str
content: str
source_url: str
metadata: Dict[str, Any]
created_at: datetime
updated_at: datetime
content_type: str = "text"
language: str = "en"
chunk_size: int = 1000
chunk_overlap: int = 200
class FirecrawlProcessor:
    """Processes Firecrawl content for RAGFlow integration."""

    def __init__(self):
        """Initialize the processor."""
        self.logger = logging.getLogger(__name__)

    def generate_document_id(self, url: str, content: str) -> str:
        """Generate a unique document ID."""
        # Create a hash based on URL and content
        content_hash = hashlib.md5(f"{url}:{content[:100]}".encode()).hexdigest()
        return f"firecrawl_{content_hash}"

    def clean_content(self, content: str) -> str:
        """Clean and normalize content."""
        if not content:
            return ""
        # Remove excessive whitespace
        content = re.sub(r'\s+', ' ', content)
        # Remove HTML tags if present
        content = re.sub(r'<[^>]+>', '', content)
        # Remove special characters that might cause issues
        content = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']', '', content)
        return content.strip()
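
    # Example: clean_content("<h1>Hello</h1>\n\n  world!") returns "Hello world!"
    # (whitespace collapsed, tags stripped, punctuation outside the allow-list removed).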

    def extract_title(self, content: ScrapedContent) -> str:
        """Extract title from scraped content."""
        if content.title:
            return content.title
        if content.metadata and content.metadata.get("title"):
            return content.metadata["title"]
        # Extract title from markdown if available
        if content.markdown:
            title_match = re.search(r'^#\s+(.+)$', content.markdown, re.MULTILINE)
            if title_match:
                return title_match.group(1).strip()
        # Fallback to URL
        return content.url.split('/')[-1] or content.url

    def extract_description(self, content: ScrapedContent) -> str:
        """Extract description from scraped content."""
        if content.description:
            return content.description
        if content.metadata and content.metadata.get("description"):
            return content.metadata["description"]
        # Extract first paragraph from markdown
        if content.markdown:
            # Remove headers and get first paragraph
            text = re.sub(r'^#+\s+.*$', '', content.markdown, flags=re.MULTILINE)
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
            if paragraphs:
                return paragraphs[0][:200] + "..." if len(paragraphs[0]) > 200 else paragraphs[0]
        return ""

    def extract_language(self, content: ScrapedContent) -> str:
        """Extract language from content metadata."""
        if content.metadata and content.metadata.get("language"):
            return content.metadata["language"]
        # Simple language detection based on common words
        if content.markdown:
            text = content.markdown.lower()
            if any(word in text for word in ["the", "and", "or", "but", "in", "on", "at"]):
                return "en"
            elif any(word in text for word in ["le", "la", "les", "de", "du", "des"]):
                return "fr"
            elif any(word in text for word in ["der", "die", "das", "und", "oder"]):
                return "de"
            elif any(word in text for word in ["el", "la", "los", "las", "de", "del"]):
                return "es"
        return "en"  # Default to English

    def create_metadata(self, content: ScrapedContent) -> Dict[str, Any]:
        """Create comprehensive metadata for RAGFlow document."""
        metadata = {
            "source": "firecrawl",
            "url": content.url,
            "domain": self.extract_domain(content.url),
            "scraped_at": datetime.utcnow().isoformat(),
            "status_code": content.status_code,
            "content_length": len(content.markdown or ""),
            "has_html": bool(content.html),
            "has_markdown": bool(content.markdown)
        }
        # Add original metadata if available
        if content.metadata:
            metadata.update({
                "original_title": content.metadata.get("title"),
                "original_description": content.metadata.get("description"),
                "original_language": content.metadata.get("language"),
                "original_keywords": content.metadata.get("keywords"),
                "original_robots": content.metadata.get("robots"),
                "og_title": content.metadata.get("ogTitle"),
                "og_description": content.metadata.get("ogDescription"),
                "og_image": content.metadata.get("ogImage"),
                "og_url": content.metadata.get("ogUrl")
            })
        return metadata

    def extract_domain(self, url: str) -> str:
        """Extract domain from URL."""
        try:
            from urllib.parse import urlparse
            return urlparse(url).netloc
        except Exception:
            return ""

    def process_content(self, content: ScrapedContent) -> RAGFlowDocument:
        """Process scraped content into RAGFlow document format."""
        if content.error:
            raise ValueError(f"Content has error: {content.error}")
        # Determine primary content
        primary_content = content.markdown or content.html or ""
        if not primary_content:
            raise ValueError("No content available to process")
        # Clean content
        cleaned_content = self.clean_content(primary_content)
        # Extract metadata
        title = self.extract_title(content)
        language = self.extract_language(content)
        metadata = self.create_metadata(content)
        # Generate document ID
        doc_id = self.generate_document_id(content.url, cleaned_content)
        # Create RAGFlow document
        document = RAGFlowDocument(
            id=doc_id,
            title=title,
            content=cleaned_content,
            source_url=content.url,
            metadata=metadata,
            created_at=datetime.utcnow(),
            updated_at=datetime.utcnow(),
            content_type="text",
            language=language
        )
        return document

    def process_batch(self, contents: List[ScrapedContent]) -> List[RAGFlowDocument]:
        """Process multiple scraped contents into RAGFlow documents."""
        documents = []
        for content in contents:
            try:
                document = self.process_content(content)
                documents.append(document)
            except Exception as e:
                self.logger.error(f"Failed to process content from {content.url}: {e}")
                continue
        return documents

    def chunk_content(self, document: RAGFlowDocument,
                      chunk_size: int = 1000,
                      chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """Chunk document content for RAG processing."""
        content = document.content
        chunks = []
        if len(content) <= chunk_size:
            return [{
                "id": f"{document.id}_chunk_0",
                "content": content,
                "metadata": {
                    **document.metadata,
                    "chunk_index": 0,
                    "total_chunks": 1
                }
            }]
        # Split content into chunks
        start = 0
        chunk_index = 0
        while start < len(content):
            end = start + chunk_size
            # Try to break at sentence boundary
            if end < len(content):
                # Look for sentence endings
                sentence_end = content.rfind('.', start, end)
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
            chunk_content = content[start:end].strip()
            if chunk_content:
                chunks.append({
                    "id": f"{document.id}_chunk_{chunk_index}",
                    "content": chunk_content,
                    "metadata": {
                        **document.metadata,
                        "chunk_index": chunk_index,
                        "total_chunks": len(chunks) + 1,  # Will be updated
                        "chunk_start": start,
                        "chunk_end": end
                    }
                })
                chunk_index += 1
            # Move start position with overlap
            start = end - chunk_overlap
            if start >= len(content):
                break
        # Update total chunks count
        for chunk in chunks:
            chunk["metadata"]["total_chunks"] = len(chunks)
        return chunks
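
    # Example: with chunk_size=1000 and chunk_overlap=200, a 1,500-character
    # document with no sentence breaks yields two chunks spanning [0:1000]
    # and [800:1500]; each new chunk restarts 200 characters before the
    # previous end so boundary context is preserved.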

    def validate_document(self, document: RAGFlowDocument) -> bool:
        """Validate RAGFlow document."""
        if not document.id:
            return False
        if not document.title:
            return False
        if not document.content:
            return False
        if not document.source_url:
            return False
        return True
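
For completeness, a short sketch of driving the processor directly. The `ScrapedContent` field names below are inferred from the attribute accesses in this file; the connector's actual dataclass may differ:

```python
from firecrawl_connector import ScrapedContent
from firecrawl_processor import FirecrawlProcessor

# Field names inferred from the processor's attribute accesses above.
scraped = ScrapedContent(
    url="https://example.com/docs",
    title="Example Docs",
    description="A sample page.",
    markdown="# Example Docs\n\nSome body text.",
    html=None,
    metadata={"language": "en"},
    status_code=200,
    error=None,
)

processor = FirecrawlProcessor()
doc = processor.process_content(scraped)
assert processor.validate_document(doc)
chunks = processor.chunk_content(doc, chunk_size=1000, chunk_overlap=200)
print(doc.id, len(chunks))
```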