mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
refine page ranges (#147)
This commit is contained in:
@ -1074,15 +1074,15 @@ class HuParser:
|
||||
|
||||
|
||||
class PlainParser(object):
|
||||
def __call__(self, filename, **kwargs):
|
||||
def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
|
||||
self.outlines = []
|
||||
lines = []
|
||||
try:
|
||||
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
|
||||
outlines = self.pdf.outline
|
||||
for page in self.pdf.pages:
|
||||
for page in self.pdf.pages[from_page:to_page]:
|
||||
lines.extend([t for t in page.extract_text().split("\n")])
|
||||
|
||||
outlines = self.pdf.outline
|
||||
def dfs(arr, depth):
|
||||
for a in arr:
|
||||
if isinstance(a, dict):
|
||||
|
||||
@ -15,6 +15,7 @@ import re
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
import numpy as np
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from api.db import ParserType
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
@ -36,7 +37,8 @@ class LayoutRecognizer(Recognizer):
|
||||
"Equation",
|
||||
]
|
||||
def __init__(self, domain):
|
||||
super().__init__(self.labels, domain, os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
||||
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
|
||||
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
||||
self.garbage_layouts = ["footer", "header", "reference"]
|
||||
|
||||
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
|
||||
|
||||
Reference in New Issue
Block a user