Support displaying images in the chunks of docx files when using general parser (#1253)

### What problem does this PR solve?

Support displaying images in chunks of docx files when using general
parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Zhedong Cen
2024-06-24 16:29:36 +08:00
committed by GitHub
parent 9a0736b20f
commit 38bd02f402
3 changed files with 121 additions and 28 deletions

View File

@ -16,16 +16,28 @@ from docx import Document
from timeit import default_timer as timer
import re
from deepdoc.parser.pdf_parser import PlainParser
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
from rag.settings import cron_logger
from rag.utils import num_tokens_from_string
from PIL import Image
from functools import reduce
class Docx(DocxParser):
def __init__(self):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
if not img:
return None
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
related_part = document.part.related_parts[embed]
image = related_part.image
image = Image.open(BytesIO(image.blob)).convert('RGB')
return image
def __clean(self, line):
line = re.sub(r"\u3000", " ", line).strip()
return line
@ -35,17 +47,41 @@ class Docx(DocxParser):
filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
for p in self.doc.paragraphs:
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
if from_page <= pn < to_page:
current_image = None
if p.text.strip():
if p.style.name == 'Caption':
former_image = None
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
former_image = lines[-1][1].pop()
elif last_image:
former_image = last_image
last_image = None
lines.append((self.__clean(p.text), [former_image], p.style.name))
else:
current_image = self.get_picture(self.doc, p)
image_list = [current_image]
if last_image:
image_list.insert(0, last_image)
last_image = None
lines.append((self.__clean(p.text), image_list, p.style.name))
else:
if current_image := self.get_picture(self.doc, p):
if lines:
lines[-1][1].append(current_image)
else:
last_image = current_image
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
tbls = []
for tb in self.doc.tables:
html= "<table>"
@ -64,7 +100,7 @@ class Docx(DocxParser):
html += "</tr>"
html += "</table>"
tbls.append(((None, html), ""))
return [(l, "") for l in lines if l], tbls
return new_line, tbls
class Pdf(PdfParser):
@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections, tbls = Docx()(filename, binary)
res = tokenize_table(tbls, doc, eng)
res = tokenize_table(tbls, doc, eng) # just for table
callback(0.8, "Finish parsing.")
st = timer()
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf(