From 03daf4618c3b2469eb0646805c2114c4374d17a2 Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Fri, 25 Jul 2025 12:04:07 +0800 Subject: [PATCH] Refactor parser code (#9042) ### What problem does this PR solve? Refactor code ### Type of change - [x] Refactoring Signed-off-by: Jin Hai --- deepdoc/parser/docx_parser.py | 4 ++-- deepdoc/parser/excel_parser.py | 8 ++++---- deepdoc/parser/html_parser.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index f37119615..2a65841e2 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -33,7 +33,7 @@ class RAGFlowDocxParser: def __compose_table_content(self, df): def blockType(b): - patt = [ + pattern = [ ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"), (r"^(20|19)[0-9]{2}年$", "Dt"), (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"), @@ -47,7 +47,7 @@ class RAGFlowDocxParser: (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"), (r"^.{1}$", "Sg") ] - for p, n in patt: + for p, n in pattern: if re.search(p, b): return n tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1] diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 8eb726a08..a6040e4e1 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -34,7 +34,7 @@ class RAGFlowExcelParser: file_like_object.seek(0) if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')): - logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook") + logging.info("Not an Excel file, converting CSV to Excel Workbook") try: file_like_object.seek(0) @@ -42,18 +42,18 @@ class RAGFlowExcelParser: return RAGFlowExcelParser._dataframe_to_workbook(df) except Exception as e_csv: - raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}") + raise Exception(f"Failed to parse CSV and convert to Excel Workbook: {e_csv}") try: return load_workbook(file_like_object,data_only= True) except Exception as e: - logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead") + logging.info(f"openpyxl load error: {e}, try pandas instead") try: file_like_object.seek(0) df = pd.read_excel(file_like_object) return RAGFlowExcelParser._dataframe_to_workbook(df) except Exception as e_pandas: - raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") + raise Exception(f"pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") @staticmethod def _dataframe_to_workbook(df): diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py index 29cc43a1d..ee1c9b2d6 100644 --- a/deepdoc/parser/html_parser.py +++ b/deepdoc/parser/html_parser.py @@ -41,7 +41,7 @@ class RAGFlowHtmlParser: @classmethod def parser_txt(cls, txt): if not isinstance(txt, str): - raise TypeError("txt type should be str!") + raise TypeError("txt type should be string!") html_doc = readability.Document(txt) title = html_doc.title() content = html_text.extract_text(html_doc.summary(html_partial=True))