diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
new file mode 100644
index 000000000..6bdc0f472
--- /dev/null
+++ b/docker/docker-compose.yml
@@ -0,0 +1,70 @@
+version: '2.2'
+services:
+  es01:
+    container_name: docass-es-01
+    image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
+    volumes:
+      - esdata01:/usr/share/elasticsearch/data
+    ports:
+      - ${ES_PORT}:9200
+    environment:
+      - node.name=es01
+      - cluster.name=${CLUSTER_NAME}
+      - cluster.initial_master_nodes=es01
+      - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
+      - bootstrap.memory_lock=false
+      - xpack.security.enabled=false
+    mem_limit: ${MEM_LIMIT}
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    networks:
+      - docass
+    restart: always
+
+  kibana:
+    depends_on:
+      - es01
+    image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
+    container_name: docass-kibana
+    volumes:
+      - kibanadata:/usr/share/kibana/data
+    ports:
+      - ${KIBANA_PORT}:5601
+    environment:
+      - SERVERNAME=kibana
+      - ELASTICSEARCH_HOSTS=http://es01:9200
+    mem_limit: ${MEM_LIMIT}
+    networks:
+      - docass
+
+  postgres:
+    image: postgres
+    container_name: docass-postgres
+    environment:
+      - POSTGRES_USER=${POSTGRES_USER}
+      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
+      - POSTGRES_DB=${POSTGRES_DB}
+    ports:
+      # postgres listens on 5432 inside the container; publish it on 5455
+      - 5455:5432
+    volumes:
+      # postgres keeps its data under /var/lib/postgresql/data
+      - pg_data:/var/lib/postgresql/data
+    networks:
+      - docass
+    restart: always
+
+
+volumes:
+  esdata01:
+    driver: local
+  kibanadata:
+    driver: local
+  pg_data:
+    driver: local
+
+networks:
+  docass:
+    driver: bridge
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 000000000..4f351eb9a
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,22 @@
+
+```shell
+
+docker pull postgres
+
+LOCAL_POSTGRES_DATA=./postgres-data
+
+docker run \
+  --name docass-postgres \
+  -p 5455:5432 \
+  -v $LOCAL_POSTGRES_DATA:/var/lib/postgresql/data \
+  -e POSTGRES_USER=root \
+  -e POSTGRES_PASSWORD=infiniflow_docass \
+  -e POSTGRES_DB=docass \
+  -d \
+  postgres
+
+docker network create elastic
+docker pull elasticsearch:8.11.3
+docker pull docker.elastic.co/kibana/kibana:8.11.3
+
+```
diff --git a/python/conf/sys.cnf b/python/conf/sys.cnf
index 375573651..fc0d64c41 100755
--- a/python/conf/sys.cnf
+++ b/python/conf/sys.cnf
@@ -1,4 +1,8 @@
 [online]
 es=127.0.0.1:9200
 idx_nm=toxic
+pgdb_usr=root
+pgdb_pwd=infiniflow_docass
+pgdb_host=127.0.0.1
+pgdb_port=5455
 
diff --git a/python/nlp/huchunk.py b/python/nlp/huchunk.py
index ba47545fc..619640227 100644
--- a/python/nlp/huchunk.py
+++ b/python/nlp/huchunk.py
@@ -291,6 +291,12 @@ class PdfChunker(HuChunker):
 
 
 class DocxChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
     def __init__(self, doc_parser):
         self.doc = doc_parser
         super().__init__()
@@ -336,6 +342,12 @@ class DocxChunker(HuChunker):
 
 
 class ExcelChunker(HuChunker):
+
+    @dataclass
+    class Fields:
+        text_chunks: List = None
+        table_chunks: List = None
+
     def __init__(self, excel_parser):
         self.excel = excel_parser
         super().__init__()
@@ -354,10 +366,10 @@ if __name__ == "__main__":
         from parser import PdfParser
         ckr = PdfChunker(PdfParser())
     if sys.argv[1].split(".")[-1].lower().find("doc") >= 0:
-        from .parser import DocxParser
+        from parser import DocxParser
         ckr = DocxChunker(DocxParser())
     if sys.argv[1].split(".")[-1].lower().find("xlsx") >= 0:
-        from .parser import ExcelParser
+        from parser import ExcelParser
         ckr = ExcelChunker(ExcelParser())
 
     # ckr.html(sys.argv[1])
diff --git a/python/parser/pdf_parser.py b/python/parser/pdf_parser.py
index 744716aab..7fd341518 100644
--- a/python/parser/pdf_parser.py
+++ b/python/parser/pdf_parser.py
@@ -323,7 +323,7 @@ class HuParser:
         return layouts
 
     def __table_paddle(self, images):
-        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
+        tbls = self.tbl_det(list(images), threshold=0.5)
         res = []
         # align left&right for rows, align top&bottom for columns
         for tbl in tbls:
diff --git a/python/util/db_conn.py b/python/util/db_conn.py
new file mode 100644
index 000000000..b67e13e92
--- /dev/null
+++ b/python/util/db_conn.py
@@ -0,0 +1,50 @@
+import logging
+import time
+from util import config
+import pandas as pd
+
+
+class Postgre(object):
+    def __init__(self, env, dbnm):
+        self.config = config.init(env)
+        self.conn = None
+        self.dbnm = dbnm
+        self.__open__()
+
+    def __open__(self):
+        import psycopg2
+        try:
+            if self.conn:
+                self.__close__()
+            del self.conn
+        except Exception:
+            pass
+
+        try:
+            self.conn = psycopg2.connect(
+                f"dbname={self.dbnm} "
+                f"user={self.config.get('pgdb_usr')} "
+                f"password={self.config.get('pgdb_pwd')} "
+                f"host={self.config.get('pgdb_host')} "
+                f"port={self.config.get('pgdb_port')}")
+        except Exception as e:
+            logging.error("Failed to connect to %s: %s", self.config.get("pgdb_host"), e)
+
+    def __close__(self):
+        try:
+            self.conn.close()
+        except Exception as e:
+            logging.error("Failed to close connection to %s: %s", self.config.get("pgdb_host"), e)
+
+    def select(self, sql):
+        # Retry up to 10 times, reopening the connection after each failure;
+        # fall back to an empty DataFrame if every attempt fails.
+        for _ in range(10):
+            try:
+                return pd.read_sql(sql, self.conn)
+            except Exception as e:
+                logging.error("Failed to execute %s: %s", sql, e)
+                self.__open__()
+                time.sleep(1)
+
+        return pd.DataFrame()
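A minimal usage sketch (not part of the patch) of the new `util.db_conn.Postgre` helper, assuming `config.init("online")` resolves the `pgdb_*` keys added to `python/conf/sys.cnf` above and that the postgres container from the compose file is running:

```python
# Usage sketch only; "online" and "docass" come from the config and compose
# changes in this diff, everything else follows the Postgre class as written.
from util.db_conn import Postgre

db = Postgre("online", "docass")   # __init__ opens the connection via __open__()

# select() retries up to 10 times, reopening the connection between attempts,
# and returns an empty DataFrame if every attempt fails.
df = db.select("SELECT 1 AS ok")
print(df)

db.__close__()
```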