diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py index 81183cf71..71bbb706a 100644 --- a/deepdoc/parser/html_parser.py +++ b/deepdoc/parser/html_parser.py @@ -15,35 +15,200 @@ # limitations under the License. # -from rag.nlp import find_codec -import readability -import html_text +from rag.nlp import find_codec, rag_tokenizer +import uuid import chardet - +from bs4 import BeautifulSoup, NavigableString, Tag, Comment +import html def get_encoding(file): with open(file,'rb') as f: tmp = chardet.detect(f.read()) return tmp['encoding'] +BLOCK_TAGS = [ + "h1", "h2", "h3", "h4", "h5", "h6", + "p", "div", "article", "section", "aside", + "ul", "ol", "li", + "table", "pre", "code", "blockquote", + "figure", "figcaption" +] +TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"} + class RAGFlowHtmlParser: - def __call__(self, fnm, binary=None): + def __call__(self, fnm, binary=None, chunk_token_num=None): if binary: encoding = find_codec(binary) txt = binary.decode(encoding, errors="ignore") else: with open(fnm, "r",encoding=get_encoding(fnm)) as f: txt = f.read() - return self.parser_txt(txt) + return self.parser_txt(txt, chunk_token_num) @classmethod - def parser_txt(cls, txt): + def parser_txt(cls, txt, chunk_token_num): if not isinstance(txt, str): raise TypeError("txt type should be string!") - html_doc = readability.Document(txt) - title = html_doc.title() - content = html_text.extract_text(html_doc.summary(html_partial=True)) - txt = f"{title}\n{content}" - sections = txt.split("\n") + + temp_sections = [] + soup = BeautifulSoup(txt, "html5lib") + # delete