Deal with the problem of the stop/finish reason being "length" (truncated LLM responses) (#109)

This commit is contained in:
KevinHuSh
2024-03-07 16:12:01 +08:00
committed by GitHub
parent b69b5dd4e5
commit 2d7c9080f4
6 changed files with 59 additions and 27 deletions

View File

@ -73,12 +73,13 @@ class Pdf(PdfParser):
return res
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
"""
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -98,8 +99,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
d = copy.deepcopy(doc)
d["image"] = img
d["page_num_obj"] = [pn+1]
tokenize(d, txt, pdf_parser.is_english)
d["page_num_int"] = [pn+1]
d["top_int"] = [0]
d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1]))
tokenize(d, txt, eng)
res.append(d)
return res

View File

@ -14,9 +14,13 @@
# limitations under the License.
#
from abc import ABC
from copy import deepcopy
from openai import OpenAI
import openai
from rag.nlp import is_english
class Base(ABC):
def __init__(self, key, model_name):
@ -34,13 +38,17 @@ class GptTurbo(Base):
def chat(self, system, history, gen_conf):
if system: history.insert(0, {"role": "system", "content": system})
try:
res = self.client.chat.completions.create(
response = self.client.chat.completions.create(
model=self.model_name,
messages=history,
**gen_conf)
return res.choices[0].message.content.strip(), res.usage.completion_tokens
ans = response.output.choices[0]['message']['content'].strip()
if response.output.choices[0].get("finish_reason", "") == "length":
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
return ans, response.usage.completion_tokens
except openai.APIError as e:
return "ERROR: "+str(e), 0
return "**ERROR**: "+str(e), 0
from dashscope import Generation
@ -59,9 +67,16 @@ class QWenChat(Base):
result_format='message',
**gen_conf
)
ans = ""
tk_count = 0
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.output_tokens
return "ERROR: " + response.message, 0
ans += response.output.choices[0]['message']['content']
tk_count += response.usage.output_tokens
if response.output.choices[0].get("finish_reason", "") == "length":
ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
return ans, tk_count
return "**ERROR**: " + response.message, tk_count
from zhipuai import ZhipuAI
@ -73,11 +88,16 @@ class ZhipuChat(Base):
def chat(self, system, history, gen_conf):
from http import HTTPStatus
if system: history.insert(0, {"role": "system", "content": system})
response = self.client.chat.completions.create(
self.model_name,
messages=history,
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.completion_tokens
return "ERROR: " + response.message, 0
try:
response = self.client.chat.completions.create(
self.model_name,
messages=history,
**gen_conf
)
ans = response.output.choices[0]['message']['content'].strip()
if response.output.choices[0].get("finish_reason", "") == "length":
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
return ans, response.usage.completion_tokens
except Exception as e:
return "**ERROR**: " + str(e), 0

View File

@ -224,12 +224,13 @@ class Dealer:
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
if mx < 0.35:
if mx < 0.66:
continue
cites[idx[i]] = list(
set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
res = ""
seted = set([])
for i, p in enumerate(pieces):
res += p
if i not in idx:
@ -237,7 +238,10 @@ class Dealer:
if i not in cites:
continue
for c in cites[i]: assert int(c) < len(chunk_v)
for c in cites[i]: res += f" ##{c}$$"
for c in cites[i]:
if c in seted:continue
res += f" ##{c}$$"
seted.add(c)
return res
@ -318,7 +322,7 @@ class Dealer:
if dnm not in ranks["doc_aggs"]:
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
ranks["doc_aggs"][dnm]["count"] += 1
ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
return ranks