diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py
index 81183cf71..71bbb706a 100644
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -15,35 +15,200 @@
# limitations under the License.
#
-from rag.nlp import find_codec
-import readability
-import html_text
+from rag.nlp import find_codec, rag_tokenizer
+import uuid
import chardet
-
+from bs4 import BeautifulSoup, NavigableString, Tag, Comment
+import html
def get_encoding(file):
with open(file,'rb') as f:
tmp = chardet.detect(f.read())
return tmp['encoding']
+BLOCK_TAGS = [
+ "h1", "h2", "h3", "h4", "h5", "h6",
+ "p", "div", "article", "section", "aside",
+ "ul", "ol", "li",
+ "table", "pre", "code", "blockquote",
+ "figure", "figcaption"
+]
+TITLE_TAGS = {"h1": "#", "h2": "##", "h3": "###", "h4": "####", "h5": "#####", "h6": "######"}  # heading tag -> Markdown prefix, one "#" per heading level
+
class RAGFlowHtmlParser:
- def __call__(self, fnm, binary=None):
+ def __call__(self, fnm, binary=None, chunk_token_num=None):  # chunk_token_num is optional; it is only passed through to parser_txt
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else:
with open(fnm, "r",encoding=get_encoding(fnm)) as f:
txt = f.read()
- return self.parser_txt(txt)
+ return self.parser_txt(txt, chunk_token_num)  # parse the decoded text, forwarding chunk_token_num unchanged
@classmethod
- def parser_txt(cls, txt):
+ def parser_txt(cls, txt, chunk_token_num):
if not isinstance(txt, str):
raise TypeError("txt type should be string!")
- html_doc = readability.Document(txt)
- title = html_doc.title()
- content = html_text.extract_text(html_doc.summary(html_partial=True))
- txt = f"{title}\n{content}"
- sections = txt.split("\n")
+
+ temp_sections = []
+ soup = BeautifulSoup(txt, "html5lib")
+ # delete