mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
finish add thumbnail to video,image,pdf files (#18)
This commit is contained in:
118
python/svr/add_thumbnail2file.py
Normal file
118
python/svr/add_thumbnail2file.py
Normal file
@ -0,0 +1,118 @@
|
||||
import sys, datetime, random, re, cv2
|
||||
from os.path import dirname, realpath
|
||||
sys.path.append(dirname(realpath(__file__)) + "/../")
|
||||
from util.db_conn import Postgres
|
||||
from util.minio_conn import HuMinio
|
||||
from util import findMaxDt
|
||||
import base64
|
||||
from io import BytesIO
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
import pdfplumber
|
||||
|
||||
|
||||
# Module-wide database handle, created at import time; set_thumbnail() issues
# updates through it and collect() runs its select through it.
# NOTE(review): the two string arguments are presumably connection/database
# identifiers -- confirm against util.db_conn.Postgres.
PG = Postgres("infiniflow", "docgpt")
|
||||
# Module-wide object-store client, created at import time; build() fetches
# uploaded file bytes and presigned URLs through it.
# NOTE(review): meaning of the string argument is not visible here -- confirm
# against util.minio_conn.HuMinio.
MINIO = HuMinio("infiniflow")
|
||||
def set_thumbnail(did, base64):
    """Write a thumbnail payload into ``doc_info.thumbnail_base64`` for one row.

    Args:
        did: Document id of the row to update; must be convertible to ``int``.
        base64: Text to store. Callers in this file pass either a
            base64-encoded image or the marker ``"_"``. The parameter name
            shadows the ``base64`` module inside this function only.

    Raises:
        TypeError, ValueError: If ``did`` cannot be converted to an integer.
    """
    # The statement is assembled as text, so make both interpolated values
    # safe first: the id is forced to an integer and any single quote in the
    # payload is doubled (the standard SQL string-literal escape). Regular
    # base64 text and "_" pass through unchanged.
    doc_id = int(did)
    payload = str(base64).replace("'", "''")
    sql = f"""
    update doc_info set thumbnail_base64='{payload}'
    where
        did={doc_id}
    """
    PG.update(sql)
|
||||
|
||||
|
||||
def collect(comm, mod, tm):
    """Fetch the next batch of documents that still have an empty thumbnail.

    Args:
        comm: Divisor used in ``MOD(did, comm)`` to shard rows across workers.
        mod: Remainder this worker is responsible for.
        tm: Lower bound (inclusive) applied to ``updated_at``.

    Returns:
        Whatever ``PG.select`` returns for the query (at most 10 rows, oldest
        ``updated_at`` first), or an empty ``pandas.DataFrame`` when nothing
        matched.
    """
    query = f"""
    select
        did, uid, doc_name, location, updated_at
    from doc_info
    where
        updated_at >= '{tm}'
        and MOD(did, {comm}) = {mod}
        and is_deleted=false
        and type <> 'folder'
        and thumbnail_base64=''
    order by updated_at asc
    limit 10
    """
    pending = PG.select(query)
    if len(pending) == 0:
        return pd.DataFrame()

    # Progress line: batch size and the newest timestamp in it, trimmed to
    # the first 19 characters of its string form.
    newest = str(pending["updated_at"].max())[:19]
    print("TOTAL:", len(pending), "To: ", newest)
    return pending
|
||||
|
||||
|
||||
def build(row):
    """Create a thumbnail for one document row and store it via set_thumbnail.

    The file type is decided from the extension of ``row["doc_name"]``
    (lower-cased and stripped):

    * unsupported extension -> the marker ``"_"`` is stored and nothing else
      happens;
    * video -> a frame is read through a presigned URL with OpenCV, after
      skipping a random number (1..11) of leading frames;
    * PDF -> the first page is rendered with pdfplumber;
    * image -> the file is opened with PIL.

    Args:
        row: Mapping-like record providing ``did``, ``uid``, ``doc_name`` and
            ``location``.

    Notes:
        If a video cannot be opened or ends before the chosen frame, no
        thumbnail is written for that row.
    """
    doc_name = row["doc_name"].lower().strip()
    if not re.search(r"\.(pdf|jpg|jpeg|png|gif|svg|apng|icon|ico|webp|mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$",
                     doc_name):
        set_thumbnail(row["did"], "_")
        return

    def thumbnail(img, SIZE=128):
        """Shrink ``img`` in place to fit SIZE x SIZE and return it base64-encoded.

        JPEG is tried first, then PNG; an empty string is returned when
        neither encoder accepts the image.
        """
        w, h = img.size
        p = SIZE / max(w, h)
        # Clamp to 1px so extremely elongated images never request a
        # zero-sized bounding box.
        w, h = max(1, int(w * p)), max(1, int(h * p))
        img.thumbnail((w, h))
        for fmt in ("JPEG", "PNG"):
            # Fresh buffer per attempt: a failed encoder must not leave
            # partial bytes in front of the fallback output.
            buffered = BytesIO()
            try:
                img.save(buffered, format=fmt)
            except Exception as e:
                print(f"Fail to encode thumbnail as {fmt}: {e}")
                continue
            return base64.b64encode(buffered.getvalue()).decode("utf-8")
        return ""

    did = row["did"]
    bucket = "%s-upload" % str(row["uid"])

    if re.search(r"\.(mpg|mpeg|avi|rm|rmvb|mov|wmv|mp4)$", doc_name):
        # Videos are streamed from a short-lived presigned URL, so the object
        # itself is not downloaded for this branch.
        url = MINIO.get_presigned_url(bucket,
                                      row["location"],
                                      expires=datetime.timedelta(seconds=60)
                                      )
        cap = cv2.VideoCapture(url)
        try:
            succ = cap.isOpened()
            i = random.randint(1, 11)
            while succ:
                ret, frame = cap.read()
                if not ret:
                    break
                if i > 0:
                    i -= 1
                    continue
                img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                print(img.size)
                set_thumbnail(did, thumbnail(img))
                # One frame is enough; without this the loop would decode the
                # whole video and rewrite the row for every remaining frame.
                break
        finally:
            cap.release()
            cv2.destroyAllWindows()
        return

    iobytes = BytesIO(MINIO.get(bucket, row["location"]))

    if re.search(r"\.pdf$", doc_name):
        # Encode while the document is still open, then let the context
        # manager close it before touching the database.
        with pdfplumber.open(iobytes) as pdf:
            img = pdf.pages[0].to_image().annotated
            encoded = thumbnail(img)
        set_thumbnail(did, encoded)
        return

    if re.search(r"\.(jpg|jpeg|png|gif|svg|apng|webp|icon|ico)$", doc_name):
        img = Image.open(iobytes)
        set_thumbnail(did, thumbnail(img))
|
||||
|
||||
|
||||
def main(comm, mod):
    """Process one batch of pending documents for a single worker shard.

    Args:
        comm: Total number of workers; used as the modulus when selecting rows.
        mod: This worker's remainder within ``comm``.

    The checkpoint file ``res/thumbnail-{comm}-{mod}.tm`` is passed to
    ``findMaxDt`` to obtain the starting timestamp (presumably the latest
    value recorded in it -- confirm against util.findMaxDt). After each row
    is built, its ``updated_at`` is appended to that file.
    """
    tm_fnm = f"res/thumbnail-{comm}-{mod}.tm"
    tm = findMaxDt(tm_fnm)
    rows = collect(comm, mod, tm)
    if len(rows) == 0:
        return

    # Context manager so the checkpoint file is flushed and closed even when
    # build() raises part-way through the batch.
    with open(tm_fnm, "a+") as tmf:
        for _, r in rows.iterrows():
            build(r)
            tmf.write(str(r["updated_at"]) + "\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the MPI world size and this process's rank are
    # handed to main() as the shard modulus and remainder.
    from mpi4py import MPI

    world = MPI.COMM_WORLD
    n_workers = world.Get_size()
    rank = world.Get_rank()
    main(n_workers, rank)
|
||||
|
||||
@ -54,11 +54,24 @@ class HuMinio(object):
|
||||
r = self.conn.get_object(bucket, fnm)
|
||||
return r.read()
|
||||
except Exception as e:
|
||||
logging.error(f"Fail get {bucket}/{fnm}: "+str(e))
|
||||
logging.error(f"fail get {bucket}/{fnm}: "+str(e))
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return
|
||||
|
||||
|
||||
def get_presigned_url(self, bucket, fnm, expires):
    """Ask the underlying client for a presigned GET URL, retrying on error.

    Up to 10 attempts are made. After each failed attempt the error is
    logged, ``self.__open__()`` is called and the method sleeps for one
    second before trying again.

    Args:
        bucket: Bucket name.
        fnm: Object name inside the bucket.
        expires: Expiry value forwarded unchanged to the client.

    Returns:
        The value returned by ``self.conn.get_presigned_url``, or ``None``
        when every attempt failed.
    """
    attempts_left = 10
    while attempts_left > 0:
        attempts_left -= 1
        try:
            url = self.conn.get_presigned_url("GET", bucket, fnm, expires)
        except Exception as err:
            logging.error(f"fail get {bucket}/{fnm}: " + str(err))
            self.__open__()
            time.sleep(1)
        else:
            return url
    return None
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
conn = HuMinio("infiniflow")
|
||||
fnm = "/opt/home/kevinhu/docgpt/upload/13/11-408.jpg"
|
||||
|
||||
Reference in New Issue
Block a user