mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Support table for markdown file in general parser (#1278)
### What problem does this PR solve?

Support extracting tables from Markdown files in the general parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -16,4 +16,5 @@ from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
||||
44
deepdoc/parser/markdown_parser.py
Normal file
44
deepdoc/parser/markdown_parser.py
Normal file
@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import re
|
||||
|
||||
class RAGFlowMarkdownParser:
    """Separate the tables in a Markdown document from the rest of its text.

    Two table flavours are recognised, both by regular expression:

    * bordered tables, where every row starts with ``|``;
    * borderless tables, where rows merely contain ``|`` and do not start
      with whitespace.

    In both cases a table is a header row, a delimiter row made of ``-`` /
    ``:`` characters, and at least one body row.
    """

    # Bordered table. The final body row may be terminated by the end of the
    # text instead of a newline, so a table closing the document is kept whole.
    _BORDERED_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?\n)
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    # Borderless table, with the same end-of-text allowance for the last row.
    _BORDERLESS_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\S.*?\|.*?\n)
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
        (?:\S.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    def __init__(self, chunk_token_num=128):
        """Store the chunk size.

        Args:
            chunk_token_num: Value convertible with ``int()``; kept on the
                instance as ``chunk_token_num``. It is not consulted by
                ``extract_tables_and_remainder``.
        """
        self.chunk_token_num = int(chunk_token_num)

    @staticmethod
    def _cut_tables(pattern, text, tables):
        """Remove every match of *pattern* from *text*, appending each to *tables*."""

        def _collect(match):
            block = match.group()
            tables.append(block)
            # The match swallows the line break that ended the preceding
            # line. Put it back so the text before and after the table is
            # not fused into a single line.
            return "\n" if block.startswith("\n") else ""

        return pattern.sub(_collect, text)

    def extract_tables_and_remainder(self, markdown_text):
        """Pull Markdown tables out of ``markdown_text``.

        Bordered tables are extracted first; borderless tables are then
        searched for in what is left.

        Args:
            markdown_text: The Markdown source as a string.

        Returns:
            A ``(remainder, tables)`` tuple. ``remainder`` is the input with
            all detected tables removed. ``tables`` is a list of the raw
            matched table strings (bordered ones first), each including the
            leading newline when the table did not start the text.
        """
        tables = []
        # Standard (bordered) Markdown tables.
        remainder = self._cut_tables(
            self._BORDERED_TABLE_RE, markdown_text, tables)
        # Borderless Markdown tables, looked for only in the leftover text.
        remainder = self._cut_tables(
            self._BORDERLESS_TABLE_RE, remainder, tables)
        return remainder, tables
|
||||
Reference in New Issue
Block a user