mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
llm configuration refine and retrievalTest API refine (#40)
This commit is contained in:
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -19,7 +19,7 @@ from .cv_model import *
|
||||
|
||||
|
||||
EmbeddingModel = {
|
||||
"local": HuEmbedding,
|
||||
"Infiniflow": HuEmbedding,
|
||||
"OpenAI": OpenAIEmbed,
|
||||
"通义千问": QWenEmbed,
|
||||
}
|
||||
@ -27,12 +27,14 @@ EmbeddingModel = {
|
||||
|
||||
CvModel = {
|
||||
"OpenAI": GptV4,
|
||||
"Infiniflow": GptV4,
|
||||
"通义千问": QWenCV,
|
||||
}
|
||||
|
||||
|
||||
ChatModel = {
|
||||
"OpenAI": GptTurbo,
|
||||
"Infiniflow": GptTurbo,
|
||||
"通义千问": QWenChat,
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -24,6 +24,9 @@ import numpy as np
|
||||
|
||||
from rag.utils import num_tokens_from_string
|
||||
|
||||
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
use_fp16=torch.cuda.is_available())
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
|
||||
@ -47,9 +50,7 @@ class HuEmbedding(Base):
|
||||
^_-
|
||||
|
||||
"""
|
||||
self.model = FlagModel("BAAI/bge-large-zh-v1.5",
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
use_fp16=torch.cuda.is_available())
|
||||
self.model = flag_model
|
||||
|
||||
|
||||
def encode(self, texts: list, batch_size=32):
|
||||
|
||||
@ -42,7 +42,7 @@ class EsQueryer:
|
||||
|
||||
def question(self, txt, tbl="qa", min_match="60%"):
|
||||
txt = re.sub(
|
||||
r"[ \t,,。??/`!!&]+",
|
||||
r"[ \r\n\t,,。??/`!!&]+",
|
||||
" ",
|
||||
huqie.tradi2simp(
|
||||
huqie.strQ2B(
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import fitz
|
||||
import xgboost as xgb
|
||||
from io import BytesIO
|
||||
import torch
|
||||
@ -1527,8 +1528,6 @@ class HuParser:
|
||||
return "\n\n".join(res)
|
||||
|
||||
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(
|
||||
fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
self.lefted_chars = []
|
||||
self.mean_height = []
|
||||
self.mean_width = []
|
||||
@ -1536,13 +1535,26 @@ class HuParser:
|
||||
self.garbages = {}
|
||||
self.page_cum_height = [0]
|
||||
self.page_layout = []
|
||||
self.page_images = [p.to_image(
|
||||
resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[:299])]
|
||||
try:
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
self.page_images = [p.to_image(resolution=72*zoomin).annotated for i,p in enumerate(self.pdf.pages[:299])]
|
||||
self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in range(len(self.page_images))]
|
||||
except Exception as e:
|
||||
self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
|
||||
self.page_images = []
|
||||
self.page_chars = []
|
||||
mat = fitz.Matrix(zoomin, zoomin)
|
||||
for page in self.pdf:
|
||||
pix = page.getPixmap(matrix = mat)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples)
|
||||
self.page_images.append(img)
|
||||
self.page_chars.append([])
|
||||
|
||||
logging.info("Images converted.")
|
||||
logging.info("Table processed.")
|
||||
|
||||
for i, img in enumerate(self.page_images):
|
||||
chars = [c for c in self.pdf.pages[i].chars if self._has_color(c)]
|
||||
chars = self.page_chars[i]
|
||||
self.mean_height.append(
|
||||
np.median(sorted([c["height"] for c in chars])) if chars else 0
|
||||
)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright 2019 The InfiniFlow Authors. All Rights Reserved.
|
||||
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
||||
Reference in New Issue
Block a user