mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: In ragflow/rag/app/naive.py, if there are multiple images in one line, every image after the first is lost (#9968)
### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/9966

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
@ -41,37 +41,43 @@ class Docx(DocxParser):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def get_picture(self, document, paragraph):
|
def get_picture(self, document, paragraph):
|
||||||
img = paragraph._element.xpath('.//pic:pic')
|
imgs = paragraph._element.xpath('.//pic:pic')
|
||||||
if not img:
|
if not imgs:
|
||||||
return None
|
return None
|
||||||
img = img[0]
|
res_img = None
|
||||||
|
for img in imgs:
|
||||||
embed = img.xpath('.//a:blip/@r:embed')
|
embed = img.xpath('.//a:blip/@r:embed')
|
||||||
if not embed:
|
if not embed:
|
||||||
return None
|
continue
|
||||||
embed = embed[0]
|
embed = embed[0]
|
||||||
try:
|
try:
|
||||||
related_part = document.part.related_parts[embed]
|
related_part = document.part.related_parts[embed]
|
||||||
image_blob = related_part.image.blob
|
image_blob = related_part.image.blob
|
||||||
except UnrecognizedImageError:
|
except UnrecognizedImageError:
|
||||||
logging.info("Unrecognized image format. Skipping image.")
|
logging.info("Unrecognized image format. Skipping image.")
|
||||||
return None
|
continue
|
||||||
except UnexpectedEndOfFileError:
|
except UnexpectedEndOfFileError:
|
||||||
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
|
||||||
return None
|
continue
|
||||||
except InvalidImageStreamError:
|
except InvalidImageStreamError:
|
||||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||||
return None
|
continue
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||||
return None
|
continue
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||||
return None
|
continue
|
||||||
try:
|
try:
|
||||||
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||||
return image
|
if res_img is None:
|
||||||
|
res_img = image
|
||||||
|
else:
|
||||||
|
res_img = concat_img(res_img, image)
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
continue
|
||||||
|
|
||||||
|
return res_img
|
||||||
|
|
||||||
def __clean(self, line):
|
def __clean(self, line):
|
||||||
line = re.sub(r"\u3000", " ", line).strip()
|
line = re.sub(r"\u3000", " ", line).strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user