mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refactor parser code (#9042)
### What problem does this PR solve?

Refactor code

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -33,7 +33,7 @@ class RAGFlowDocxParser:
|
|||||||
def __compose_table_content(self, df):
|
def __compose_table_content(self, df):
|
||||||
|
|
||||||
def blockType(b):
|
def blockType(b):
|
||||||
patt = [
|
pattern = [
|
||||||
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
||||||
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
||||||
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
|
(r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
|
||||||
@ -47,7 +47,7 @@ class RAGFlowDocxParser:
|
|||||||
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
|
(r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
|
||||||
(r"^.{1}$", "Sg")
|
(r"^.{1}$", "Sg")
|
||||||
]
|
]
|
||||||
for p, n in patt:
|
for p, n in pattern:
|
||||||
if re.search(p, b):
|
if re.search(p, b):
|
||||||
return n
|
return n
|
||||||
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
|
tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
|
||||||
|
|||||||
@ -34,7 +34,7 @@ class RAGFlowExcelParser:
|
|||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
|
|
||||||
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
|
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
|
||||||
logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
|
logging.info("Not an Excel file, converting CSV to Excel Workbook")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
@ -42,18 +42,18 @@ class RAGFlowExcelParser:
|
|||||||
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
||||||
|
|
||||||
except Exception as e_csv:
|
except Exception as e_csv:
|
||||||
raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
|
raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return load_workbook(file_like_object,data_only= True)
|
return load_workbook(file_like_object,data_only= True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
|
logging.info(f"openpyxl load error: {e}, try pandas instead")
|
||||||
try:
|
try:
|
||||||
file_like_object.seek(0)
|
file_like_object.seek(0)
|
||||||
df = pd.read_excel(file_like_object)
|
df = pd.read_excel(file_like_object)
|
||||||
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
||||||
except Exception as e_pandas:
|
except Exception as e_pandas:
|
||||||
raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _dataframe_to_workbook(df):
|
def _dataframe_to_workbook(df):
|
||||||
|
|||||||
@ -41,7 +41,7 @@ class RAGFlowHtmlParser:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def parser_txt(cls, txt):
|
def parser_txt(cls, txt):
|
||||||
if not isinstance(txt, str):
|
if not isinstance(txt, str):
|
||||||
raise TypeError("txt type should be str!")
|
raise TypeError("txt type should be string!")
|
||||||
html_doc = readability.Document(txt)
|
html_doc = readability.Document(txt)
|
||||||
title = html_doc.title()
|
title = html_doc.title()
|
||||||
content = html_text.extract_text(html_doc.summary(html_partial=True))
|
content = html_text.extract_text(html_doc.summary(html_partial=True))
|
||||||
|
|||||||
Reference in New Issue
Block a user