mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Support displaying images in the chunks of docx files when using general parser (#1253)
### What problem does this PR solve? Support displaying images in chunks of docx files when using general parser ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -24,6 +24,7 @@ import copy
|
||||
import roman_numbers as r
|
||||
from word2number import w2n
|
||||
from cn2an import cn2an
|
||||
from PIL import Image
|
||||
|
||||
all_codecs = [
|
||||
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
||||
@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
||||
return res
|
||||
|
||||
|
||||
def tokenize_chunks_docx(chunks, doc, eng, images):
|
||||
res = []
|
||||
# wrap up as es documents
|
||||
for ck, image in zip(chunks, images):
|
||||
if len(ck.strip()) == 0:continue
|
||||
print("--", ck)
|
||||
d = copy.deepcopy(doc)
|
||||
d["image"] = image
|
||||
tokenize(d, ck, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
|
||||
def tokenize_table(tbls, doc, eng, batch_size=10):
|
||||
res = []
|
||||
# add tables
|
||||
@ -504,4 +518,54 @@ def docx_question_level(p):
|
||||
if p.style.name.startswith('Heading'):
|
||||
return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
|
||||
else:
|
||||
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
||||
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
||||
|
||||
def concat_img(img1, img2):
|
||||
if img1 and not img2:
|
||||
return img1
|
||||
if not img1 and img2:
|
||||
return img2
|
||||
if not img1 and not img2:
|
||||
return None
|
||||
width1, height1 = img1.size
|
||||
width2, height2 = img2.size
|
||||
|
||||
new_width = max(width1, width2)
|
||||
new_height = height1 + height2
|
||||
new_image = Image.new('RGB', (new_width, new_height))
|
||||
|
||||
new_image.paste(img1, (0, 0))
|
||||
new_image.paste(img2, (0, height1))
|
||||
|
||||
return new_image
|
||||
|
||||
def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
if not sections:
|
||||
return []
|
||||
|
||||
cks = [""]
|
||||
images = [None]
|
||||
tk_nums = [0]
|
||||
|
||||
def add_chunk(t, image, pos=""):
|
||||
nonlocal cks, tk_nums, delimiter
|
||||
tnum = num_tokens_from_string(t)
|
||||
if tnum < 8:
|
||||
pos = ""
|
||||
if tk_nums[-1] > chunk_token_num:
|
||||
if t.find(pos) < 0:
|
||||
t += pos
|
||||
cks.append(t)
|
||||
images.append(image)
|
||||
tk_nums.append(tnum)
|
||||
else:
|
||||
if cks[-1].find(pos) < 0:
|
||||
t += pos
|
||||
cks[-1] += t
|
||||
images[-1] = concat_img(images[-1], image)
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
for sec, image in sections:
|
||||
add_chunk(sec, image, '')
|
||||
|
||||
return cks, images
|
||||
Reference in New Issue
Block a user