Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

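Rename the chunk fields `page_num_list`, `top_list`, and `position_list` to
`page_num_int`, `top_int`, and `position_int`. The renamed fields now hold
plain Python lists instead of `json.dumps`-encoded strings.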

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-10 16:32:58 +08:00
committed by GitHub
parent 87e46b4425
commit 03f00c9e6f
11 changed files with 89 additions and 42 deletions

View File

@ -20,7 +20,6 @@ from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
import json
class Ppt(PptParser):
@ -109,9 +108,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["page_num_list"] = json.dumps([pn + 1])
d["top_list"] = json.dumps([0])
d["position_list"] = json.dumps([(pn + 1, 0, img.size[0], 0, img.size[1])])
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
@ -125,10 +124,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pn += from_page
if img:
d["image"] = img
d["page_num_list"] = json.dumps([pn + 1])
d["top_list"] = json.dumps([0])
d["position_list"] = json.dumps([
(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res