Accelerate term weight calculation (#3206)

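### What problem does this PR solve?

Adds an optional `preprocess` parameter (default `True`, preserving the existing behavior) to `Dealer.weights`. When `preprocess` is false, the per-token `pretoken`/`tokenMerge` step is skipped and the IDF, NER and POS-tag weights are computed directly over the given tokens in a single pass.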

### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu
2024-11-05 13:11:26 +08:00
committed by GitHub
parent 677f02c2a7
commit 55953819c1
3 changed files with 18 additions and 13 deletions

View File

@@ -1,4 +1,4 @@
#
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -158,7 +158,7 @@ class Dealer:
tks.append(t)
return tks
def weights(self, tks):
def weights(self, tks, preprocess=True):
def skill(t):
if t not in self.sk:
return 1
@@ -222,14 +222,20 @@ class Dealer:
def idf(s, N): return math.log10(10 + ((N - s + 0.5) / (s + 0.5)))
tw = []
for tk in tks:
tt = self.tokenMerge(self.pretoken(tk, True))
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
if not preprocess:
idf1 = np.array([idf(freq(t), 10000000) for t in tks])
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tt])
tw.extend(zip(tt, wts))
np.array([ner(t) * postag(t) for t in tks])
tw = zip(tks, wts)
else:
for tk in tks:
tt = self.tokenMerge(self.pretoken(tk, True))
idf1 = np.array([idf(freq(t), 10000000) for t in tt])
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
wts = (0.3 * idf1 + 0.7 * idf2) * \
np.array([ner(t) * postag(t) for t in tt])
tw.extend(zip(tt, wts))
S = np.sum([s for _, s in tw])
return [(t, s / S) for t, s in tw]