README refined (#156)

This commit is contained in:
KevinHuSh
2024-03-27 13:14:36 +08:00
committed by GitHub
parent fd7fcb5baf
commit 37185466e2
3 changed files with 19 additions and 16 deletions

View File

@ -42,7 +42,9 @@ class Pdf(PdfParser):
self._text_merge()
callback(0.67, "Text merging finished")
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
#self._naive_vertical_merge()
self._concat_downward()
#self._filter_forpages()
cron_logger.info("paddle layouts:".format(
(timer() - start) / (self.total_page + 0.1)))
@ -79,7 +81,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf(
) if parser_config["layout_recognize"] else PlainParser()
) if parser_config.get("layout_recognize", True) else PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tbls, doc, eng)

View File

@ -7,7 +7,6 @@ from elasticsearch_dsl import Q, Search
from typing import List, Optional, Dict, Union
from dataclasses import dataclass
from api.settings import chat_logger
from rag.settings import es_logger
from rag.utils import rmSpace
from rag.nlp import huqie, query
@ -365,6 +364,7 @@ class Dealer:
return ranks
def sql_retrieval(self, sql, fetch_size=128, format="json"):
from api.settings import chat_logger
sql = re.sub(r"[ ]+", " ", sql)
sql = sql.replace("%", "")
es_logger.info(f"Get es sql: {sql}")