mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
22
python/README.md
Normal file
22
python/README.md
Normal file
@ -0,0 +1,22 @@
|
||||
|
||||
```shell
|
||||
|
||||
docker pull postgres
|
||||
|
||||
LOCAL_POSTGRES_DATA=./postgres-data
|
||||
|
||||
docker run \
  --name docass-postgres \
  -p 5455:5432 \
  -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data \
  -e POSTGRES_USER=root \
  -e POSTGRES_PASSWORD=infiniflow_docass \
  -e POSTGRES_DB=docass \
  -d \
  postgres
|
||||
|
||||
docker network create elastic
|
||||
docker pull elasticsearch:8.11.3;
|
||||
docker pull docker.elastic.co/kibana/kibana:8.11.3
|
||||
|
||||
```
|
||||
@ -1,4 +1,8 @@
|
||||
[online]
|
||||
es=127.0.0.1:9200
|
||||
idx_nm=toxic
|
||||
pgdb_usr=root
|
||||
pgdb_pwd=infiniflow_docass
|
||||
pgdb_host=127.0.0.1
|
||||
pgdb_port=5432
|
||||
|
||||
|
||||
@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
|
||||
|
||||
|
||||
class DocxChunker(HuChunker):
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self, doc_parser):
|
||||
self.doc = doc_parser
|
||||
super().__init__()
|
||||
@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
|
||||
|
||||
|
||||
class ExcelChunker(HuChunker):
|
||||
|
||||
@dataclass
|
||||
class Fields:
|
||||
text_chunks: List = None
|
||||
table_chunks: List = None
|
||||
|
||||
def __init__(self, excel_parser):
|
||||
self.excel = excel_parser
|
||||
super().__init__()
|
||||
@ -354,10 +366,10 @@ if __name__ == "__main__":
|
||||
from parser import PdfParser
|
||||
ckr = PdfChunker(PdfParser())
|
||||
if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
|
||||
from .parser import DocxParser
|
||||
from parser import DocxParser
|
||||
ckr = DocxChunker(DocxParser())
|
||||
if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
|
||||
from .parser import ExcelParser
|
||||
from parser import ExcelParser
|
||||
ckr = ExcelChunker(ExcelParser())
|
||||
|
||||
# ckr.html(sys.argv[1])
|
||||
|
||||
@ -323,7 +323,7 @@ class HuParser:
|
||||
return layouts
|
||||
|
||||
def __table_paddle(self, images):
|
||||
tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
|
||||
tbls = self.tbl_det([img for img in images], threshold=0.5)
|
||||
res = []
|
||||
# align left&right for rows, align top&bottom for columns
|
||||
for tbl in tbls:
|
||||
|
||||
44
python/util/db_conn.py
Normal file
44
python/util/db_conn.py
Normal file
@ -0,0 +1,44 @@
|
||||
import logging
|
||||
import time
|
||||
from util import config
|
||||
import pandas as pd
|
||||
|
||||
class Postgre(object):
    """Small PostgreSQL helper holding one connection and retrying reads.

    Connection settings are read from the object returned by
    ``config.init(env)`` through its ``get`` method, using the keys
    ``pgdb_usr``, ``pgdb_pwd``, ``pgdb_host`` and ``pgdb_port``.
    """

    def __init__(self, env, dbnm):
        """Load the configuration for *env* and connect to database *dbnm*.

        A failed connection attempt is logged, not raised; ``self.conn``
        is then ``None``.
        """
        self.config = config.init(env)
        self.conn = None
        self.dbnm = dbnm
        self.__open__()

    def __open__(self):
        """(Re)open the connection, closing any existing one first.

        On failure the error is logged and ``self.conn`` is left as
        ``None`` so that the attribute always exists.
        """
        import psycopg2

        # Drop the previous connection, if any. ``__close__`` logs its own
        # errors, so nothing needs to be swallowed silently here.
        if getattr(self, "conn", None):
            self.__close__()
        # Reset instead of ``del`` so a failed connect below does not leave
        # the instance without a ``conn`` attribute.
        self.conn = None

        try:
            # Keyword arguments instead of a hand-built DSN string: values
            # containing spaces or quotes would otherwise corrupt the DSN.
            self.conn = psycopg2.connect(
                dbname=self.dbnm,
                user=self.config.get("pgdb_usr"),
                password=self.config.get("pgdb_pwd"),
                host=self.config.get("pgdb_host"),
                port=self.config.get("pgdb_port"),
            )
        except Exception as e:
            logging.error(
                "Fail to connect %s " % self.config.get("pgdb_host") + str(e))

    def __close__(self):
        """Close the current connection; failures are logged, not raised."""
        try:
            self.conn.close()
        except Exception as e:
            logging.error(
                "Fail to close %s " % self.config.get("pgdb_host") + str(e))

    def select(self, sql):
        """Run *sql* and return the result as a ``pandas.DataFrame``.

        The query is attempted up to 10 times. After each failure the
        error is logged, the connection is reopened and the call sleeps
        for one second. If every attempt fails an empty ``DataFrame`` is
        returned.

        NOTE(review): *sql* is handed to ``pandas.read_sql`` verbatim, so
        callers must not build it from untrusted input.
        """
        for _ in range(10):
            try:
                return pd.read_sql(sql, self.conn)
            except Exception as e:
                logging.error(f"Fail to exec {sql} " + str(e))
                self.__open__()
                time.sleep(1)

        return pd.DataFrame()
|
||||
|
||||
Reference in New Issue
Block a user