Mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Add Firecrawl integration for RAGFlow (#10152)
## 🚀 Firecrawl Integration for RAGFlow

This PR implements the Firecrawl integration for RAGFlow as requested in https://github.com/firecrawl/firecrawl/issues/2167.

### ✅ Features Implemented

- **Data Source Integration**: Firecrawl appears as a selectable data source in RAGFlow
- **Configuration Management**: Users can input Firecrawl API keys through RAGFlow's interface
- **Web Scraping**: Supports single URL scraping, website crawling, and batch processing
- **Content Processing**: Converts scraped content to RAGFlow's document format with chunking
- **Error Handling**: Comprehensive error handling for rate limits, failed requests, and malformed content
- **UI Components**: Complete UI schema and workflow components for RAGFlow integration

### 📁 Files Added

- `intergrations/firecrawl/` - Complete integration package
- `intergrations/firecrawl/integration.py` - RAGFlow integration entry point
- `intergrations/firecrawl/firecrawl_connector.py` - API communication
- `intergrations/firecrawl/firecrawl_config.py` - Configuration management
- `intergrations/firecrawl/firecrawl_processor.py` - Content processing
- `intergrations/firecrawl/firecrawl_ui.py` - UI components
- `intergrations/firecrawl/ragflow_integration.py` - Main integration class
- `intergrations/firecrawl/README.md` - Complete documentation
- `intergrations/firecrawl/example_usage.py` - Usage examples

### 🧪 Testing

The integration has been tested with:

- Configuration validation
- Connection testing
- Content processing and chunking
- UI component rendering
- Error handling scenarios

### 📋 Acceptance Criteria Met

- ✅ Integration appears as a selectable data source in RAGFlow's data source options
- ✅ Users can input Firecrawl API keys through RAGFlow's configuration interface
- ✅ Successfully scrapes content from provided URLs and imports it into RAGFlow's document store
- ✅ Handles common edge cases (rate limits, failed requests, malformed content)
- ✅ Includes basic documentation and README updates
- ✅ Code follows RAGFlow's existing patterns and coding standards

### Related Issue

https://github.com/firecrawl/firecrawl/issues/2167

---------

Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
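Below is a minimal sketch of the processing and chunking path using `firecrawl_processor.py` from this PR (the full file is shown further down). The `ScrapedContent` keyword arguments are inferred from the fields the processor reads and may not match the actual dataclass in `firecrawl_connector.py`; a real run would obtain the page from the connector (see `example_usage.py`) rather than constructing it by hand, and assumes the working directory is `intergrations/firecrawl/` to match the file's own top-level imports.

```python
from firecrawl_connector import ScrapedContent
from firecrawl_processor import FirecrawlProcessor

processor = FirecrawlProcessor()

# A single scraped page; field names are illustrative, inferred from the processor.
page = ScrapedContent(
    url="https://example.com/docs/intro",
    title="Introduction",
    description="Getting started guide",
    metadata={"language": "en", "ogTitle": "Introduction"},
    markdown="# Introduction\n\nThis guide explains the basics of the product...",
    html=None,
    status_code=200,
    error=None,
)

# Convert the page into a RAGFlowDocument, then split it into overlapping
# chunks ready for indexing in RAGFlow's document store.
document = processor.process_content(page)
chunks = processor.chunk_content(document, chunk_size=1000, chunk_overlap=200)
print(document.id, document.title, document.language, len(chunks))
```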
intergrations/firecrawl/firecrawl_processor.py (new file, 275 lines added)

@@ -0,0 +1,275 @@
"""
Content processor for converting Firecrawl output to RAGFlow document format.
"""

import re
import hashlib
from typing import List, Dict, Any
from dataclasses import dataclass
import logging
from datetime import datetime

from firecrawl_connector import ScrapedContent


@dataclass
class RAGFlowDocument:
    """Represents a document in RAGFlow format."""

    id: str
    title: str
    content: str
    source_url: str
    metadata: Dict[str, Any]
    created_at: datetime
    updated_at: datetime
    content_type: str = "text"
    language: str = "en"
    chunk_size: int = 1000
    chunk_overlap: int = 200


class FirecrawlProcessor:
    """Processes Firecrawl content for RAGFlow integration."""

    def __init__(self):
        """Initialize the processor."""
        self.logger = logging.getLogger(__name__)

    def generate_document_id(self, url: str, content: str) -> str:
        """Generate a unique document ID."""
        # Create a hash based on URL and content
        content_hash = hashlib.md5(f"{url}:{content[:100]}".encode()).hexdigest()
        return f"firecrawl_{content_hash}"

    def clean_content(self, content: str) -> str:
        """Clean and normalize content."""
        if not content:
            return ""

        # Remove excessive whitespace
        content = re.sub(r'\s+', ' ', content)

        # Remove HTML tags if present
        content = re.sub(r'<[^>]+>', '', content)

        # Remove special characters that might cause issues
        content = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']', '', content)

        return content.strip()

    def extract_title(self, content: ScrapedContent) -> str:
        """Extract title from scraped content."""
        if content.title:
            return content.title

        if content.metadata and content.metadata.get("title"):
            return content.metadata["title"]

        # Extract title from markdown if available
        if content.markdown:
            title_match = re.search(r'^#\s+(.+)$', content.markdown, re.MULTILINE)
            if title_match:
                return title_match.group(1).strip()

        # Fallback to URL
        return content.url.split('/')[-1] or content.url

    def extract_description(self, content: ScrapedContent) -> str:
        """Extract description from scraped content."""
        if content.description:
            return content.description

        if content.metadata and content.metadata.get("description"):
            return content.metadata["description"]

        # Extract first paragraph from markdown
        if content.markdown:
            # Remove headers and get first paragraph
            text = re.sub(r'^#+\s+.*$', '', content.markdown, flags=re.MULTILINE)
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
            if paragraphs:
                return paragraphs[0][:200] + "..." if len(paragraphs[0]) > 200 else paragraphs[0]

        return ""

    def extract_language(self, content: ScrapedContent) -> str:
        """Extract language from content metadata."""
        if content.metadata and content.metadata.get("language"):
            return content.metadata["language"]

        # Simple language detection based on common words
        if content.markdown:
            text = content.markdown.lower()
            if any(word in text for word in ["the", "and", "or", "but", "in", "on", "at"]):
                return "en"
            elif any(word in text for word in ["le", "la", "les", "de", "du", "des"]):
                return "fr"
            elif any(word in text for word in ["der", "die", "das", "und", "oder"]):
                return "de"
            elif any(word in text for word in ["el", "la", "los", "las", "de", "del"]):
                return "es"

        return "en"  # Default to English

    def create_metadata(self, content: ScrapedContent) -> Dict[str, Any]:
        """Create comprehensive metadata for RAGFlow document."""
        metadata = {
            "source": "firecrawl",
            "url": content.url,
            "domain": self.extract_domain(content.url),
            "scraped_at": datetime.utcnow().isoformat(),
            "status_code": content.status_code,
            "content_length": len(content.markdown or ""),
            "has_html": bool(content.html),
            "has_markdown": bool(content.markdown)
        }

        # Add original metadata if available
        if content.metadata:
            metadata.update({
                "original_title": content.metadata.get("title"),
                "original_description": content.metadata.get("description"),
                "original_language": content.metadata.get("language"),
                "original_keywords": content.metadata.get("keywords"),
                "original_robots": content.metadata.get("robots"),
                "og_title": content.metadata.get("ogTitle"),
                "og_description": content.metadata.get("ogDescription"),
                "og_image": content.metadata.get("ogImage"),
                "og_url": content.metadata.get("ogUrl")
            })

        return metadata

    def extract_domain(self, url: str) -> str:
        """Extract domain from URL."""
        try:
            from urllib.parse import urlparse
            return urlparse(url).netloc
        except Exception:
            return ""

    def process_content(self, content: ScrapedContent) -> RAGFlowDocument:
        """Process scraped content into RAGFlow document format."""
        if content.error:
            raise ValueError(f"Content has error: {content.error}")

        # Determine primary content
        primary_content = content.markdown or content.html or ""
        if not primary_content:
            raise ValueError("No content available to process")

        # Clean content
        cleaned_content = self.clean_content(primary_content)

        # Extract metadata
        title = self.extract_title(content)
        language = self.extract_language(content)
        metadata = self.create_metadata(content)

        # Generate document ID
        doc_id = self.generate_document_id(content.url, cleaned_content)

        # Create RAGFlow document
        document = RAGFlowDocument(
            id=doc_id,
            title=title,
            content=cleaned_content,
            source_url=content.url,
            metadata=metadata,
            created_at=datetime.utcnow(),
            updated_at=datetime.utcnow(),
            content_type="text",
            language=language
        )

        return document

    def process_batch(self, contents: List[ScrapedContent]) -> List[RAGFlowDocument]:
        """Process multiple scraped contents into RAGFlow documents."""
        documents = []

        for content in contents:
            try:
                document = self.process_content(content)
                documents.append(document)
            except Exception as e:
                self.logger.error(f"Failed to process content from {content.url}: {e}")
                continue

        return documents

    def chunk_content(self, document: RAGFlowDocument,
                      chunk_size: int = 1000,
                      chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """Chunk document content for RAG processing."""
        content = document.content
        chunks = []

        if len(content) <= chunk_size:
            return [{
                "id": f"{document.id}_chunk_0",
                "content": content,
                "metadata": {
                    **document.metadata,
                    "chunk_index": 0,
                    "total_chunks": 1
                }
            }]

        # Split content into chunks
        start = 0
        chunk_index = 0

        while start < len(content):
            end = start + chunk_size

            # Try to break at sentence boundary
            if end < len(content):
                # Look for sentence endings
                sentence_end = content.rfind('.', start, end)
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1

            chunk_content = content[start:end].strip()

            if chunk_content:
                chunks.append({
                    "id": f"{document.id}_chunk_{chunk_index}",
                    "content": chunk_content,
                    "metadata": {
                        **document.metadata,
                        "chunk_index": chunk_index,
                        "total_chunks": len(chunks) + 1,  # Will be updated
                        "chunk_start": start,
                        "chunk_end": end
                    }
                })
                chunk_index += 1

            # Move start position with overlap
            start = end - chunk_overlap
            if start >= len(content):
                break

        # Update total chunks count
        for chunk in chunks:
            chunk["metadata"]["total_chunks"] = len(chunks)

        return chunks

    def validate_document(self, document: RAGFlowDocument) -> bool:
        """Validate RAGFlow document."""
        if not document.id:
            return False

        if not document.title:
            return False

        if not document.content:
            return False

        if not document.source_url:
            return False

        return True
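For reference, a small sketch of how `chunk_content` behaves on a document longer than `chunk_size`: each chunk ends at the last sentence boundary past the midpoint of the window, and consecutive chunks overlap by `chunk_overlap` characters. The input below is synthetic and the values are illustrative only; it assumes the script runs from `intergrations/firecrawl/`, matching the file's own import style.

```python
from datetime import datetime

from firecrawl_processor import FirecrawlProcessor, RAGFlowDocument

# Synthetic document: 40 short sentences, roughly 1200 characters in total.
text = " ".join(f"Sentence number {i} ends here." for i in range(40))
doc = RAGFlowDocument(
    id="firecrawl_demo",
    title="Chunking demo",
    content=text,
    source_url="https://example.com/demo",
    metadata={"source": "firecrawl"},
    created_at=datetime.utcnow(),
    updated_at=datetime.utcnow(),
)

processor = FirecrawlProcessor()
if processor.validate_document(doc):
    # Small window and overlap so the sentence-boundary splitting is visible.
    chunks = processor.chunk_content(doc, chunk_size=300, chunk_overlap=50)
    for chunk in chunks:
        meta = chunk["metadata"]
        print(chunk["id"], meta["chunk_start"], meta["chunk_end"], meta["total_chunks"])
```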