Fix: Merge main branch (#10377)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
Commit 20b577a72c (parent 4d6ff672eb) by Kevin Hu, 2025-09-30 13:13:15 +08:00, committed by GitHub.
201 changed files with 7929 additions and 1110 deletions.


@@ -0,0 +1,275 @@
"""
Content processor for converting Firecrawl output to RAGFlow document format.
"""
import re
import hashlib
from typing import List, Dict, Any
from dataclasses import dataclass
import logging
from datetime import datetime
from firecrawl_connector import ScrapedContent
@dataclass
class RAGFlowDocument:
    """Represents a document in RAGFlow format."""

    id: str
    title: str
    content: str
    source_url: str
    metadata: Dict[str, Any]
    created_at: datetime
    updated_at: datetime
    content_type: str = "text"
    language: str = "en"
    chunk_size: int = 1000
    chunk_overlap: int = 200

class FirecrawlProcessor:
    """Processes Firecrawl content for RAGFlow integration."""

    def __init__(self):
        """Initialize the processor."""
        self.logger = logging.getLogger(__name__)

    def generate_document_id(self, url: str, content: str) -> str:
        """Generate a unique document ID."""
        # Hash the URL plus a content prefix so re-scrapes of the same
        # page with identical content map to the same ID
        content_hash = hashlib.md5(f"{url}:{content[:100]}".encode()).hexdigest()
        return f"firecrawl_{content_hash}"

    def clean_content(self, content: str) -> str:
        """Clean and normalize content."""
        if not content:
            return ""
        # Collapse runs of whitespace into single spaces
        content = re.sub(r'\s+', ' ', content)
        # Remove HTML tags if present
        content = re.sub(r'<[^>]+>', '', content)
        # Remove special characters that might cause issues downstream
        content = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']', '', content)
        return content.strip()

    def extract_title(self, content: ScrapedContent) -> str:
        """Extract title from scraped content."""
        if content.title:
            return content.title
        if content.metadata and content.metadata.get("title"):
            return content.metadata["title"]
        # Fall back to the first level-one heading in the markdown
        if content.markdown:
            title_match = re.search(r'^#\s+(.+)$', content.markdown, re.MULTILINE)
            if title_match:
                return title_match.group(1).strip()
        # Last resort: derive a title from the URL
        return content.url.split('/')[-1] or content.url

    def extract_description(self, content: ScrapedContent) -> str:
        """Extract description from scraped content."""
        if content.description:
            return content.description
        if content.metadata and content.metadata.get("description"):
            return content.metadata["description"]
        # Fall back to the first paragraph of the markdown body
        if content.markdown:
            # Remove headers and take the first non-empty paragraph
            text = re.sub(r'^#+\s+.*$', '', content.markdown, flags=re.MULTILINE)
            paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
            if paragraphs:
                first = paragraphs[0]
                return first[:200] + "..." if len(first) > 200 else first
        return ""

    def extract_language(self, content: ScrapedContent) -> str:
        """Extract language from content metadata."""
        if content.metadata and content.metadata.get("language"):
            return content.metadata["language"]
        # Naive stopword-based detection. Match whole words rather than
        # substrings, and note that order matters: stopwords shared between
        # languages (e.g. "la", "de") are claimed by the earlier check.
        if content.markdown:
            words = set(re.findall(r'\b\w+\b', content.markdown.lower()))
            if words & {"the", "and", "or", "but", "in", "on", "at"}:
                return "en"
            if words & {"le", "la", "les", "de", "du", "des"}:
                return "fr"
            if words & {"der", "die", "das", "und", "oder"}:
                return "de"
            if words & {"el", "la", "los", "las", "de", "del"}:
                return "es"
        return "en"  # Default to English

    def create_metadata(self, content: ScrapedContent) -> Dict[str, Any]:
        """Create comprehensive metadata for the RAGFlow document."""
        metadata = {
            "source": "firecrawl",
            "url": content.url,
            "domain": self.extract_domain(content.url),
            "scraped_at": datetime.utcnow().isoformat(),
            "status_code": content.status_code,
            "content_length": len(content.markdown or ""),
            "has_html": bool(content.html),
            "has_markdown": bool(content.markdown)
        }
        # Carry over the original page metadata if available
        if content.metadata:
            metadata.update({
                "original_title": content.metadata.get("title"),
                "original_description": content.metadata.get("description"),
                "original_language": content.metadata.get("language"),
                "original_keywords": content.metadata.get("keywords"),
                "original_robots": content.metadata.get("robots"),
                "og_title": content.metadata.get("ogTitle"),
                "og_description": content.metadata.get("ogDescription"),
                "og_image": content.metadata.get("ogImage"),
                "og_url": content.metadata.get("ogUrl")
            })
        return metadata

    def extract_domain(self, url: str) -> str:
        """Extract domain from URL."""
        try:
            return urlparse(url).netloc
        except Exception:
            return ""

    def process_content(self, content: ScrapedContent) -> RAGFlowDocument:
        """Process scraped content into RAGFlow document format."""
        if content.error:
            raise ValueError(f"Content has error: {content.error}")
        # Prefer markdown over raw HTML as the primary content
        primary_content = content.markdown or content.html or ""
        if not primary_content:
            raise ValueError("No content available to process")
        # Clean content
        cleaned_content = self.clean_content(primary_content)
        # Extract metadata
        title = self.extract_title(content)
        language = self.extract_language(content)
        metadata = self.create_metadata(content)
        # Generate document ID
        doc_id = self.generate_document_id(content.url, cleaned_content)
        # Assemble the RAGFlow document
        document = RAGFlowDocument(
            id=doc_id,
            title=title,
            content=cleaned_content,
            source_url=content.url,
            metadata=metadata,
            created_at=datetime.utcnow(),
            updated_at=datetime.utcnow(),
            content_type="text",
            language=language
        )
        return document

    def process_batch(self, contents: List[ScrapedContent]) -> List[RAGFlowDocument]:
        """Process multiple scraped contents into RAGFlow documents."""
        documents = []
        for content in contents:
            try:
                document = self.process_content(content)
                documents.append(document)
            except Exception as e:
                # Skip pages that failed to scrape or clean; log and continue
                self.logger.error(f"Failed to process content from {content.url}: {e}")
                continue
        return documents

    def chunk_content(self, document: RAGFlowDocument,
                      chunk_size: int = 1000,
                      chunk_overlap: int = 200) -> List[Dict[str, Any]]:
        """Chunk document content for RAG processing."""
        content = document.content
        chunks = []
        # Short documents fit in a single chunk
        if len(content) <= chunk_size:
            return [{
                "id": f"{document.id}_chunk_0",
                "content": content,
                "metadata": {
                    **document.metadata,
                    "chunk_index": 0,
                    "total_chunks": 1
                }
            }]
        # Split content into overlapping chunks
        start = 0
        chunk_index = 0
        while start < len(content):
            end = start + chunk_size
            # Prefer to break at a sentence boundary, but only if that
            # keeps the chunk at least half the target size
            if end < len(content):
                sentence_end = content.rfind('.', start, end)
                if sentence_end > start + chunk_size // 2:
                    end = sentence_end + 1
            chunk_content = content[start:end].strip()
            if chunk_content:
                chunks.append({
                    "id": f"{document.id}_chunk_{chunk_index}",
                    "content": chunk_content,
                    "metadata": {
                        **document.metadata,
                        "chunk_index": chunk_index,
                        "total_chunks": len(chunks) + 1,  # placeholder, fixed below
                        "chunk_start": start,
                        "chunk_end": end
                    }
                })
                chunk_index += 1
            # Step forward with overlap; always advance by at least one
            # character so a large overlap cannot cause an infinite loop
            start = max(end - chunk_overlap, start + 1)
        # Update total chunks count now that the final count is known
        for chunk in chunks:
            chunk["metadata"]["total_chunks"] = len(chunks)
        return chunks

    def validate_document(self, document: RAGFlowDocument) -> bool:
        """Validate RAGFlow document."""
        if not document.id:
            return False
        if not document.title:
            return False
        if not document.content:
            return False
        if not document.source_url:
            return False
        return True
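
A minimal usage sketch follows. It is illustrative only: the module name `firecrawl_processor` is an assumption (the file name is not shown in this diff), and `ScrapedContent` is assumed to be a dataclass from `firecrawl_connector` whose fields match the ones this processor reads (`url`, `title`, `description`, `markdown`, `html`, `metadata`, `status_code`, `error`).

```python
from firecrawl_connector import ScrapedContent
from firecrawl_processor import FirecrawlProcessor  # assumed module name for the file above

# Hypothetical scraped page; field names mirror what process_content reads.
scraped = ScrapedContent(
    url="https://example.com/docs/intro",
    title="Introduction",
    description=None,
    markdown="# Introduction\n\nRAGFlow ingests web content scraped by Firecrawl.",
    html=None,
    metadata={"language": "en"},
    status_code=200,
    error=None,
)

processor = FirecrawlProcessor()
document = processor.process_content(scraped)

if processor.validate_document(document):
    # With the defaults (chunk_size=1000, chunk_overlap=200), successive
    # chunks start about 800 characters apart: 0, ~800, ~1600, ...
    chunks = processor.chunk_content(document)
    print(document.id, document.title, len(chunks))
```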