Enlarge the term weight difference (#3435)

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement
This commit is contained in:
Kevin Hu
2024-11-15 15:41:50 +08:00
committed by GitHub
parent 6d451dbe06
commit ca9e97d2f2
3 changed files with 15 additions and 2 deletions

View File

@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
#
import os.path
import pathlib
import re
@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
from api.utils.web_utils import html2pdf, is_valid_url
from api.constants import IMG_BASE64_PREFIX
@ -529,15 +530,25 @@ def parse():
if not is_valid_url(url):
return get_json_result(
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from selenium.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', {
'download.default_directory': download_path,
'download.prompt_for_download': False,
'download.directory_upgrade': True,
'safebrowsing.enabled': True
})
driver = Chrome(options=options)
driver.get(url)
print(driver.get_downloadable_files())
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
driver.close()
return get_json_result(data="\n".join(sections))
if 'file' not in request.files: