Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

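Rename the fields page_num_list, top_list, and position_list to
page_num_int, top_int, and position_int. The renamed fields hold plain
lists instead of JSON-encoded strings, so the json.dumps/json.loads
round trip is removed.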

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-10 16:32:58 +08:00
committed by GitHub
parent 87e46b4425
commit 03f00c9e6f
11 changed files with 89 additions and 42 deletions

View File

@@ -22,7 +22,6 @@ from rag.utils import num_tokens_from_string
from . import rag_tokenizer
import re
import copy
import json
import roman_numbers as r
from word2number import w2n
from cn2an import cn2an
@@ -311,16 +310,16 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
def add_positions(d, poss):
if not poss:
return
page_num_list = []
position_list = []
top_list = []
page_num_int = []
position_int = []
top_int = []
for pn, left, right, top, bottom in poss:
page_num_list.append(int(pn + 1))
top_list.append(int(top))
position_list.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
d["page_num_list"] = json.dumps(page_num_list)
d["position_list"] = json.dumps(position_list)
d["top_list"] = json.dumps(top_list)
page_num_int.append(int(pn + 1))
top_int.append(int(top))
position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
d["page_num_int"] = page_num_int
d["position_int"] = position_int
d["top_int"] = top_int
def remove_contents_table(sections, eng=False):

View File

@@ -16,7 +16,6 @@
import logging
import re
import json
from dataclasses import dataclass
from rag.utils import rmSpace
@@ -74,7 +73,7 @@ class Dealer:
offset, limit = pg * ps, (pg + 1) * ps
src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
"doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
"available_int", "content_with_weight", "pagerank_fea"])
kwds = set([])
@@ -82,6 +81,8 @@ class Dealer:
q_vec = []
if not qst:
if req.get("sort"):
orderBy.asc("page_num_int")
orderBy.asc("top_int")
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
@@ -340,7 +341,7 @@ class Dealer:
chunk = sres.field[id]
dnm = chunk["docnm_kwd"]
did = chunk["doc_id"]
position_list = chunk.get("position_list", "[]")
position_int = chunk.get("position_int", [])
d = {
"chunk_id": id,
"content_ltks": chunk["content_ltks"],
@@ -354,7 +355,7 @@ class Dealer:
"vector_similarity": vsim[i],
"term_similarity": tsim[i],
"vector": chunk.get(vector_column, zero_vector),
"positions": json.loads(position_list)
"positions": position_int,
}
if highlight and sres.highlight:
if id in sres.highlight: