Fix: Merge main branch (#10377)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
Commit 20b577a72c (parent 4d6ff672eb) by Kevin Hu, 2025-09-30 13:13:15 +08:00, committed by GitHub.
201 changed files with 7929 additions and 1110 deletions.


@@ -0,0 +1,275 @@
"""
Content processor for converting Firecrawl output to RAGFlow document format.
"""
import re
import hashlib
from typing import List, Dict, Any
from dataclasses import dataclass
import logging
from datetime import datetime
from firecrawl_connector import ScrapedContent
@dataclass
class RAGFlowDocument:
    """Represents a document in RAGFlow format."""

    id: str
    title: str
    content: str
    source_url: str
    metadata: Dict[str, Any]
    created_at: datetime
    updated_at: datetime
    content_type: str = "text"
    language: str = "en"
    chunk_size: int = 1000
    chunk_overlap: int = 200

class FirecrawlProcessor:
    """Processes Firecrawl content for RAGFlow integration."""

    def __init__(self):
        """Initialize the processor."""
        self.logger = logging.getLogger(__name__)

    def generate_document_id(self, url: str, content: str) -> str:
        """Generate a unique document ID."""
        # Hash the URL plus a content prefix so re-scrapes of the same
        # page with identical content map to the same ID
        content_hash = hashlib.md5(f"{url}:{content[:100]}".encode()).hexdigest()
        return f"firecrawl_{content_hash}"

    def clean_content(self, content: str) -> str:
        """Clean and normalize content."""
        if not content:
            return ""
        # Collapse runs of whitespace into single spaces
        content = re.sub(r'\s+', ' ', content)
        # Remove HTML tags if present
        content = re.sub(r'<[^>]+>', '', content)
        # Remove special characters that might cause issues downstream
        content = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']', '', content)
        return content.strip()

    def extract_title(self, content: ScrapedContent) -> str:
        """Extract title from scraped content."""
        if content.title:
            return content.title
        if content.metadata and content.metadata.get("title"):
            return content.metadata["title"]
        # Fall back to the first level-one heading in the markdown
        if content.markdown:
            title_match = re.search(r'^#\s+(.+)$', content.markdown, re.MULTILINE)
            if title_match:
                return title_match.group(1).strip()
        # Last resort: derive a title from the URL
        return content.url.split('/')[-1] or content.url

    def extract_description(self, content: ScrapedContent) -> str:
        """Extract description from scraped content."""
        if content.description:
            return content.description
        if content.metadata and content.metadata.get("description"):
            return content.metadata["description"]
        # Fall back to the first paragraph of the markdown body
        if content.markdown:
            # Remove headers and take the first non-empty paragraph
            text = re.sub(r'^#+\s+.*$', '', content.markdown, flags=re.MULTILINE)
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
            if paragraphs:
                first = paragraphs[0]
                return first[:200] + "..." if len(first) > 200 else first
        return ""

    def extract_language(self, content: ScrapedContent) -> str:
        """Extract language from content metadata."""
        if content.metadata and content.metadata.get("language"):
            return content.metadata["language"]
        # Naive stopword-based detection. Match whole words rather than
        # substrings, and note that order matters: stopwords shared between
        # languages (e.g. "la", "de") are claimed by the earlier check.
        if content.markdown:
            words = set(re.findall(r'\b\w+\b', content.markdown.lower()))
            if words & {"the", "and", "or", "but", "in", "on", "at"}:
                return "en"
            if words & {"le", "la", "les", "de", "du", "des"}:
                return "fr"
            if words & {"der", "die", "das", "und", "oder"}:
                return "de"
            if words & {"el", "la", "los", "las", "de", "del"}:
                return "es"
        return "en"  # Default to English

    def create_metadata(self, content: ScrapedContent) -> Dict[str, Any]:
        """Create comprehensive metadata for the RAGFlow document."""
        metadata = {
            "source": "firecrawl",
            "url": content.url,
            "domain": self.extract_domain(content.url),
            "scraped_at": datetime.utcnow().isoformat(),
            "status_code": content.status_code,
            "content_length": len(content.markdown or ""),
            "has_html": bool(content.html),
            "has_markdown": bool(content.markdown)
        }
        # Carry over the original page metadata if available
        if content.metadata:
            metadata.update({
                "original_title": content.metadata.get("title"),
                "original_description": content.metadata.get("description"),
                "original_language": content.metadata.get("language"),
                "original_keywords": content.metadata.get("keywords"),
                "original_robots": content.metadata.get("robots"),
                "og_title": content.metadata.get("ogTitle"),
                "og_description": content.metadata.get("ogDescription"),
                "og_image": content.metadata.get("ogImage"),
                "og_url": content.metadata.get("ogUrl")
            })
        return metadata

    def extract_domain(self, url: str) -> str:
        """Extract domain from URL."""
        try:
            return urlparse(url).netloc
        except Exception:
            return ""

    def process_content(self, content: ScrapedContent) -> RAGFlowDocument:
        """Process scraped content into RAGFlow document format."""
        if content.error:
            raise ValueError(f"Content has error: {content.error}")
        # Prefer markdown over raw HTML as the primary content
        primary_content = content.markdown or content.html or ""
        if not primary_content:
            raise ValueError("No content available to process")
        # Clean content
        cleaned_content = self.clean_content(primary_content)
        # Extract metadata
        title = self.extract_title(content)
        language = self.extract_language(content)
        metadata = self.create_metadata(content)
        # Generate document ID
        doc_id = self.generate_document_id(content.url, cleaned_content)
        # Assemble the RAGFlow document
        document = RAGFlowDocument(
            id=doc_id,
            title=title,
            content=cleaned_content,
            source_url=content.url,
            metadata=metadata,
            created_at=datetime.utcnow(),
            updated_at=datetime.utcnow(),
            content_type="text",
            language=language
        )
        return document

    def process_batch(self, contents: List[ScrapedContent]) -> List[RAGFlowDocument]:
        """Process multiple scraped contents into RAGFlow documents."""
        documents = []
        for content in contents:
            try:
                document = self.process_content(content)
                documents.append(document)
            except Exception as e:
                # Skip pages that failed to scrape or clean; log and continue
                self.logger.error(f"Failed to process content from {content.url}: {e}")
                continue
        return documents

    def chunk_content(self, document: RAGFlowDocument,
                      chunk_size: int = 1000,
                      chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """Chunk document content for RAG processing."""
        content = document.content
        chunks = []
        # Short documents fit in a single chunk
        if len(content) <= chunk_size:
            return [{
                "id": f"{document.id}_chunk_0",
                "content": content,
                "metadata": {
                    **document.metadata,
                    "chunk_index": 0,
                    "total_chunks": 1
                }
            }]
        # Split content into overlapping chunks
        start = 0
        chunk_index = 0
        while start < len(content):
            end = start + chunk_size
            # Prefer to break at a sentence boundary, but only if that
            # keeps the chunk at least half the target size
            if end < len(content):
                sentence_end = content.rfind('.', start, end)
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
            chunk_content = content[start:end].strip()
            if chunk_content:
                chunks.append({
                    "id": f"{document.id}_chunk_{chunk_index}",
                    "content": chunk_content,
                    "metadata": {
                        **document.metadata,
                        "chunk_index": chunk_index,
                        "total_chunks": len(chunks) + 1,  # placeholder, fixed below
                        "chunk_start": start,
                        "chunk_end": end
                    }
                })
                chunk_index += 1
            # Step forward with overlap; always advance by at least one
            # character so a large overlap cannot cause an infinite loop
            start = max(end - chunk_overlap, start + 1)
        # Update total chunks count now that the final count is known
        for chunk in chunks:
            chunk["metadata"]["total_chunks"] = len(chunks)
        return chunks

    def validate_document(self, document: RAGFlowDocument) -> bool:
        """Validate RAGFlow document."""
        if not document.id:
            return False
        if not document.title:
            return False
        if not document.content:
            return False
        if not document.source_url:
            return False
        return True
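
A minimal usage sketch follows. It is illustrative only: the module name `firecrawl_processor` is an assumption (the file name is not shown in this diff), and `ScrapedContent` is assumed to be a dataclass from `firecrawl_connector` whose fields match the ones this processor reads (`url`, `title`, `description`, `markdown`, `html`, `metadata`, `status_code`, `error`).

```python
from firecrawl_connector import ScrapedContent
from firecrawl_processor import FirecrawlProcessor  # assumed module name for the file above

# Hypothetical scraped page; field names mirror what process_content reads.
scraped = ScrapedContent(
    url="https://example.com/docs/intro",
    title="Introduction",
    description=None,
    markdown="# Introduction\n\nRAGFlow ingests web content scraped by Firecrawl.",
    html=None,
    metadata={"language": "en"},
    status_code=200,
    error=None,
)

processor = FirecrawlProcessor()
document = processor.process_content(scraped)

if processor.validate_document(document):
    # With the defaults (chunk_size=1000, chunk_overlap=200), successive
    # chunks start about 800 characters apart: 0, ~800, ~1600, ...
    chunks = processor.chunk_content(document)
    print(document.id, document.title, len(chunks))
```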