Fix: table tag on chunks. (#12126)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu
2025-12-24 09:32:19 +08:00
committed by GitHub
parent 17b8bb62b6
commit c33134ea2c
5 changed files with 26 additions and 17 deletions

View File

@@ -348,7 +348,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
if d["content_with_weight"].find("<tr>") < 0:
d["doc_type_kwd"] = "image"
if poss:
add_positions(d, poss)
res.append(d)
@@ -361,7 +362,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
if d["content_with_weight"].find("<tr>") < 0:
d["doc_type_kwd"] = "image"
add_positions(d, poss)
res.append(d)
return res