Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-17 19:22:55 +08:00)
Fix: Merge main branch (#10377)
### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
intergrations/firecrawl/ragflow_integration.py (new file, 175 lines)
@@ -0,0 +1,175 @@
"""
Main integration file for Firecrawl with RAGFlow.
This file provides the interface between RAGFlow and the Firecrawl plugin.
"""

import logging
from typing import List, Dict, Any, Optional

from firecrawl_connector import FirecrawlConnector
from firecrawl_config import FirecrawlConfig
from firecrawl_processor import FirecrawlProcessor, RAGFlowDocument
from firecrawl_ui import FirecrawlUIBuilder


class RAGFlowFirecrawlIntegration:
    """Main integration class for Firecrawl with RAGFlow."""

    def __init__(self, config: FirecrawlConfig):
        """Initialize the integration."""
        self.config = config
        self.connector = FirecrawlConnector(config)
        self.processor = FirecrawlProcessor()
        self.logger = logging.getLogger(__name__)

    async def scrape_and_import(self, urls: List[str],
                                formats: Optional[List[str]] = None,
                                extract_options: Optional[Dict[str, Any]] = None) -> List[RAGFlowDocument]:
        """Scrape URLs and convert to RAGFlow documents."""
        if formats is None:
            formats = ["markdown", "html"]

        async with self.connector:
            # Scrape URLs
            scraped_contents = await self.connector.batch_scrape(urls, formats)

            # Process into RAGFlow documents
            documents = self.processor.process_batch(scraped_contents)

            return documents

    async def crawl_and_import(self, start_url: str,
                               limit: int = 100,
                               scrape_options: Optional[Dict[str, Any]] = None) -> List[RAGFlowDocument]:
        """Crawl a website and convert to RAGFlow documents."""
        if scrape_options is None:
            scrape_options = {"formats": ["markdown", "html"]}

        async with self.connector:
            # Start crawl job
            crawl_job = await self.connector.start_crawl(start_url, limit, scrape_options)

            if crawl_job.error:
                raise Exception(f"Failed to start crawl: {crawl_job.error}")

            # Wait for completion
            completed_job = await self.connector.wait_for_crawl_completion(crawl_job.job_id)

            if completed_job.error:
                raise Exception(f"Crawl failed: {completed_job.error}")

            # Process into RAGFlow documents
            documents = self.processor.process_batch(completed_job.data or [])

            return documents

    def get_ui_schema(self) -> Dict[str, Any]:
        """Get UI schema for RAGFlow integration."""
        return FirecrawlUIBuilder.create_ui_schema()

    def validate_config(self, config_dict: Dict[str, Any]) -> Dict[str, Any]:
        """Validate configuration and return any errors."""
        errors = {}

        # Validate API key
        api_key = config_dict.get("api_key", "")
        if not api_key:
            errors["api_key"] = "API key is required"
        elif not api_key.startswith("fc-"):
            errors["api_key"] = "API key must start with 'fc-'"

        # Validate API URL
        api_url = config_dict.get("api_url", "https://api.firecrawl.dev")
        if not api_url.startswith("http"):
            errors["api_url"] = "API URL must start with http:// or https://"

        # Validate numeric fields
        try:
            max_retries = int(config_dict.get("max_retries", 3))
            if max_retries < 1 or max_retries > 10:
                errors["max_retries"] = "Max retries must be between 1 and 10"
        except (ValueError, TypeError):
            errors["max_retries"] = "Max retries must be a valid integer"

        try:
            timeout = int(config_dict.get("timeout", 30))
            if timeout < 5 or timeout > 300:
                errors["timeout"] = "Timeout must be between 5 and 300 seconds"
        except (ValueError, TypeError):
            errors["timeout"] = "Timeout must be a valid integer"

        try:
            rate_limit_delay = float(config_dict.get("rate_limit_delay", 1.0))
            if rate_limit_delay < 0.1 or rate_limit_delay > 10.0:
                errors["rate_limit_delay"] = "Rate limit delay must be between 0.1 and 10.0 seconds"
        except (ValueError, TypeError):
            errors["rate_limit_delay"] = "Rate limit delay must be a valid number"

        return errors

    def create_config(self, config_dict: Dict[str, Any]) -> FirecrawlConfig:
        """Create FirecrawlConfig from dictionary."""
        return FirecrawlConfig.from_dict(config_dict)

    async def test_connection(self) -> Dict[str, Any]:
        """Test the connection to Firecrawl API."""
        try:
            async with self.connector:
                # Try to scrape a simple URL to test connection
                test_url = "https://httpbin.org/json"
                result = await self.connector.scrape_url(test_url, ["markdown"])

                if result.error:
                    return {
                        "success": False,
                        "error": result.error,
                        "message": "Failed to connect to Firecrawl API"
                    }

                return {
                    "success": True,
                    "message": "Successfully connected to Firecrawl API",
                    "test_url": test_url,
                    "response_time": "N/A"  # Could be enhanced to measure actual response time
                }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "message": "Connection test failed"
            }

    def get_supported_formats(self) -> List[str]:
        """Get list of supported output formats."""
        return ["markdown", "html", "links", "screenshot"]

    def get_supported_scrape_types(self) -> List[str]:
        """Get list of supported scrape types."""
        return ["single", "crawl", "batch"]

    def get_help_text(self) -> Dict[str, str]:
        """Get help text for users."""
        return FirecrawlUIBuilder.create_help_text()

    def get_validation_rules(self) -> Dict[str, Any]:
        """Get validation rules for configuration."""
        return FirecrawlUIBuilder.create_validation_rules()


# Factory function for creating integration instance
def create_firecrawl_integration(config_dict: Dict[str, Any]) -> RAGFlowFirecrawlIntegration:
    """Create a Firecrawl integration instance from configuration."""
    config = FirecrawlConfig.from_dict(config_dict)
    return RAGFlowFirecrawlIntegration(config)


# Export main classes and functions
__all__ = [
    "RAGFlowFirecrawlIntegration",
    "create_firecrawl_integration",
    "FirecrawlConfig",
    "FirecrawlConnector",
    "FirecrawlProcessor",
    "RAGFlowDocument"
]
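For context, here is a minimal sketch of how a caller might drive this integration. It assumes the module is importable as ragflow_integration (matching the file name above); the API key, the example URLs, and the main() wrapper are placeholders, and the config keys simply mirror the fields checked by validate_config.

import asyncio

from ragflow_integration import create_firecrawl_integration  # hypothetical import path


async def main():
    # Placeholder configuration; "fc-your-key-here" stands in for a real Firecrawl API key.
    config_dict = {
        "api_key": "fc-your-key-here",
        "api_url": "https://api.firecrawl.dev",
        "max_retries": 3,
        "timeout": 30,
        "rate_limit_delay": 1.0,
    }

    integration = create_firecrawl_integration(config_dict)

    # Reject bad settings before touching the network.
    errors = integration.validate_config(config_dict)
    if errors:
        raise SystemExit(f"Invalid configuration: {errors}")

    # Scrape a couple of pages and hand the resulting RAGFlowDocument
    # objects to whatever ingestion step comes next.
    documents = await integration.scrape_and_import(
        ["https://example.com", "https://example.com/docs"],
        formats=["markdown"],
    )
    print(f"Imported {len(documents)} documents")


asyncio.run(main())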