mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Add support for HTML file (#973)
### What problem does this PR solve?

Add support for HTML files.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -4,3 +4,4 @@ from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
|
||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||
|
||||
27
deepdoc/parser/html_parser.py
Normal file
27
deepdoc/parser/html_parser.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from rag.nlp import find_codec
|
||||
import readability
|
||||
import html_text
|
||||
import chardet
|
||||
|
||||
def get_encoding(file):
    """Guess the text encoding of the file at path ``file``.

    The whole file is read in binary mode and handed to ``chardet.detect``.

    Args:
        file: Path of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        reports no encoding (for example on an empty file).
    """
    with open(file, "rb") as f:
        detected = chardet.detect(f.read())
    # chardet reports None when it cannot guess. Passing None on to open()
    # would silently select the platform locale encoding, so fall back to a
    # deterministic default instead.
    return detected["encoding"] or "utf-8"
|
||||
|
||||
class RAGFlowHtmlParser:
    """Callable parser that turns an HTML document into a list of text lines."""

    def __call__(self, fnm, binary=None):
        """Extract the title and main text of an HTML document.

        Args:
            fnm: Path of the HTML file. Only read when ``binary`` is falsy.
            binary: Optional raw bytes of the document. When truthy, these
                are decoded (codec chosen by ``find_codec``) and ``fnm`` is
                not opened.

        Returns:
            A list of strings: the title produced by ``readability`` followed
            by the extracted text, split on ``"\n"``.
        """
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            # Ignore undecodable bytes here too, matching the binary branch:
            # the detected encoding is only a guess, and a single bad byte
            # should not abort parsing of the whole document.
            with open(fnm, "r", encoding=get_encoding(fnm),
                      errors="ignore") as f:
                txt = f.read()

        html_doc = readability.Document(txt)
        title = html_doc.title()
        # summary(html_partial=True) is requested as an HTML fragment, which
        # html_text then converts to plain text.
        content = html_text.extract_text(html_doc.summary(html_partial=True))
        return f"{title}\n{content}".split("\n")
|
||||
Reference in New Issue
Block a user