mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
fix plainPdf bugs (#152)
This commit is contained in:
@ -68,7 +68,7 @@ class Pdf(PdfParser):
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
|
||||
for txt, poss in pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback):
|
||||
from_page=from_page, to_page=to_page, callback=callback)[0]:
|
||||
sections.append(txt + poss)
|
||||
|
||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||
|
||||
Reference in New Issue
Block a user