remove PyMuPDF (#618)

### What problem does this PR solve?

#613

### Type of change

- [x] Other (please describe):
2026-01-23 03:26:53 +08:00 · 2024-04-30 12:38:09 +08:00
parent 7059ec2298
commit cab274f560
5 changed files with 13 additions and 37 deletions
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -2,7 +2,6 @@
 import os
 import random

-import fitz
 import xgboost as xgb
 from io import BytesIO
 import torch
@@ -922,9 +921,7 @@ class RAGFlowPdfParser:
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception as e:
-            pdf = fitz.open(fnm) if not binary else fitz.open(
-                stream=fnm, filetype="pdf")
-            return len(pdf)
+            logging.error(str(e))

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
@@ -946,23 +943,7 @@ class RAGFlowPdfParser:
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
-            self.pdf = fitz.open(fnm) if isinstance(
-                fnm, str) else fitz.open(
-                stream=fnm, filetype="pdf")
-            self.page_images = []
-            self.page_chars = []
-            mat = fitz.Matrix(zoomin, zoomin)
-            self.total_page = len(self.pdf)
-            for i, page in enumerate(self.pdf):
-                if i < page_from:
-                    continue
-                if i >= page_to:
-                    break
-                pix = page.get_pixmap(matrix=mat)
-                img = Image.frombytes("RGB", [pix.width, pix.height],
-                                      pix.samples)
-                self.page_images.append(img)
-                self.page_chars.append([])
+            logging.error(str(e))

        self.outlines = []
        try: