Fix docx images (#2756)

### What problem does this PR solve?

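Fixes #2755: DOCX images whose stream is truncated or corrupted are now skipped instead of raising an unhandled error during parsing.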

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
lidp
2024-10-09 19:37:32 +08:00
committed by GitHub
parent 2df15742fc
commit 20e63f8ec4

View File

@ -16,14 +16,15 @@ from docx import Document
from timeit import default_timer as timer from timeit import default_timer as timer
import re import re
from deepdoc.parser.pdf_parser import PlainParser from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, \
naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
from rag.settings import cron_logger from rag.settings import cron_logger
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
from PIL import Image from PIL import Image
from functools import reduce from functools import reduce
from markdown import markdown from markdown import markdown
from docx.image.exceptions import UnrecognizedImageError from docx.image.exceptions import UnrecognizedImageError, UnexpectedEndOfFileError, InvalidImageStreamError
class Docx(DocxParser): class Docx(DocxParser):
@ -42,6 +43,12 @@ class Docx(DocxParser):
except UnrecognizedImageError: except UnrecognizedImageError:
print("Unrecognized image format. Skipping image.") print("Unrecognized image format. Skipping image.")
return None return None
except UnexpectedEndOfFileError:
print("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
return None
except InvalidImageStreamError:
print("The recognized image stream appears to be corrupted. Skipping image.")
return None
try: try:
image = Image.open(BytesIO(image_blob)).convert('RGB') image = Image.open(BytesIO(image_blob)).convert('RGB')
return image return image
@ -101,7 +108,7 @@ class Docx(DocxParser):
while i < len(r.cells): while i < len(r.cells):
span = 1 span = 1
c = r.cells[i] c = r.cells[i]
for j in range(i+1, len(r.cells)): for j in range(i + 1, len(r.cells)):
if c.text == r.cells[j].text: if c.text == r.cells[j].text:
span += 1 span += 1
i = j i = j
@ -136,9 +143,9 @@ class Pdf(PdfParser):
self._text_merge() self._text_merge()
callback(0.67, "Text merging finished") callback(0.67, "Text merging finished")
tbls = self._extract_table_figure(True, zoomin, True, True) tbls = self._extract_table_figure(True, zoomin, True, True)
#self._naive_vertical_merge() # self._naive_vertical_merge()
self._concat_downward() self._concat_downward()
#self._filter_forpages() # self._filter_forpages()
cron_logger.info("layouts: {}".format(timer() - start)) cron_logger.info("layouts: {}".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin)) return [(b["text"], self._line_tag(b, zoomin))
@ -158,8 +165,8 @@ class Markdown(MarkdownParser):
tbls = [] tbls = []
for sec in remainder.split("\n"): for sec in remainder.split("\n"):
if num_tokens_from_string(sec) > 10 * self.chunk_token_num: if num_tokens_from_string(sec) > 10 * self.chunk_token_num:
sections.append((sec[:int(len(sec)/2)], "")) sections.append((sec[:int(len(sec) / 2)], ""))
sections.append((sec[int(len(sec)/2):], "")) sections.append((sec[int(len(sec) / 2):], ""))
else: else:
sections.append((sec, "")) sections.append((sec, ""))
print(tables) print(tables)
@ -191,7 +198,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
sections, tbls = Docx()(filename, binary) sections, tbls = Docx()(filename, binary)
res = tokenize_table(tbls, doc, eng) # just for table res = tokenize_table(tbls, doc, eng) # just for table
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
st = timer() st = timer()
@ -276,7 +283,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
def dummy(prog=None, msg=""): def dummy(prog=None, msg=""):
pass pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)