mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-17 11:09:06 +08:00
use minio to store uploaded files; build dialog server; (#16)
* format code
* use minio to store uploaded files; build dialog server;
This commit is contained in:
@ -3,6 +3,7 @@ import re
|
||||
import pandas as pd
|
||||
from collections import Counter
|
||||
from nlp import huqie
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class HuDocxParser:
|
||||
@ -97,7 +98,7 @@ class HuDocxParser:
|
||||
return ["\n".join(lines)]
|
||||
|
||||
def __call__(self, fnm):
|
||||
self.doc = Document(fnm)
|
||||
self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
|
||||
secs = [(p.text, p.style.name) for p in self.doc.paragraphs]
|
||||
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
||||
return secs, tbls
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
from openpyxl import load_workbook
|
||||
import sys
|
||||
from io import BytesIO
|
||||
|
||||
|
||||
class HuExcelParser:
|
||||
def __call__(self, fnm):
|
||||
wb = load_workbook(fnm)
|
||||
if isinstance(fnm, str):wb = load_workbook(fnm)
|
||||
else: wb = load_workbook(BytesIO(fnm))
|
||||
res = []
|
||||
for sheetname in wb.sheetnames:
|
||||
ws = wb[sheetname]
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import xgboost as xgb
|
||||
from io import BytesIO
|
||||
import torch
|
||||
import re
|
||||
import pdfplumber
|
||||
@ -1525,7 +1526,7 @@ class HuParser:
|
||||
return "\n\n".join(res)
|
||||
|
||||
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
|
||||
self.pdf = pdfplumber.open(fnm)
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
self.lefted_chars = []
|
||||
self.mean_height = []
|
||||
self.mean_width = []
|
||||
|
||||
Reference in New Issue
Block a user