LLM configuration refine and retrieval test API refine (#40)

This commit is contained in:
KevinHuSh
2024-01-19 19:51:57 +08:00
committed by GitHub
parent f3dd131403
commit 484e5abc1f
39 changed files with 160 additions and 121 deletions

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -19,7 +19,7 @@ from .cv_model import *
# Lookup table from a name string to the embedding implementation used for it.
# NOTE(review): keys look like model-supplier names as stored in configuration
# (the last key is the Chinese name of Tongyi Qianwen / Qwen) -- confirm against callers.
EmbeddingModel = {
"local": HuEmbedding,
"Infiniflow": HuEmbedding,
"OpenAI": OpenAIEmbed,
"通义千问": QWenEmbed,
}
@ -27,12 +27,14 @@ EmbeddingModel = {
# Lookup table from a name string to the implementation used for it.
# "Infiniflow" and "OpenAI" resolve to the same implementation (GptV4).
# NOTE(review): presumably these are image/vision ("CV") models keyed by
# supplier name -- confirm against callers.
CvModel = {
"OpenAI": GptV4,
"Infiniflow": GptV4,
"通义千问": QWenCV,
}
# Lookup table from a name string to the chat implementation used for it.
# "Infiniflow" and "OpenAI" resolve to the same implementation (GptTurbo).
# NOTE(review): keys are presumably supplier names matching the other model
# tables in this module -- confirm against callers.
ChatModel = {
"OpenAI": GptTurbo,
"Infiniflow": GptTurbo,
"通义千问": QWenChat,
}

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -24,6 +24,9 @@ import numpy as np
from rag.utils import num_tokens_from_string
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())
class Base(ABC):
def __init__(self, key, model_name):
@ -47,9 +50,7 @@ class HuEmbedding(Base):
^_-
"""
self.model = FlagModel("BAAI/bge-large-zh-v1.5",
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
use_fp16=torch.cuda.is_available())
self.model = flag_model
def encode(self, texts: list, batch_size=32):

View File

@ -42,7 +42,7 @@ class EsQueryer:
def question(self, txt, tbl="qa", min_match="60%"):
txt = re.sub(
r"[ \t,,。??/`!&]+",
r"[ \r\n\t,,。??/`!&]+",
" ",
huqie.tradi2simp(
huqie.strQ2B(

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import fitz
import xgboost as xgb
from io import BytesIO
import torch
@ -1527,8 +1528,6 @@ class HuParser:
return "\n\n".join(res)
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
self.pdf = pdfplumber.open(fnm) if isinstance(
fnm, str) else pdfplumber.open(BytesIO(fnm))
self.lefted_chars = []
self.mean_height = []
self.mean_width = []
@ -1536,13 +1535,26 @@ class HuParser:
self.garbages = {}
self.page_cum_height = [0]
self.page_layout = []
self.page_images = [p.to_image(
resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[:299])]
try:
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
self.page_images = [p.to_image(resolution=72*zoomin).annotated for i,p in enumerate(self.pdf.pages[:299])]
self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in range(len(self.page_images))]
except Exception as e:
self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
self.page_images = []
self.page_chars = []
mat = fitz.Matrix(zoomin, zoomin)
for page in self.pdf:
pix = page.getPixmap(matrix = mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
self.page_images.append(img)
self.page_chars.append([])
logging.info("Images converted.")
logging.info("Table processed.")
for i, img in enumerate(self.page_images):
chars = [c for c in self.pdf.pages[i].chars if self._has_color(c)]
chars = self.page_chars[i]
self.mean_height.append(
np.median(sorted([c["height"] for c in chars])) if chars else 0
)

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -1,5 +1,5 @@
#
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.