mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Supports obtaining PDF documents from web pages (#1107)
### What problem does this PR solve?

Knowledge base management now supports crawling content from web pages and generating PDF documents from it.

### Type of change

- [x] New Feature (support documents from web pages)
This commit is contained in:
82
api/utils/web_utils.py
Normal file
82
api/utils/web_utils.py
Normal file
@ -0,0 +1,82 @@
|
||||
import re
|
||||
import json
|
||||
import base64
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.common.exceptions import TimeoutException
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support.expected_conditions import staleness_of
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
|
||||
def html2pdf(
    source: str,
    timeout: int = 2,
    install_driver: bool = True,
    print_options: "dict | None" = None,
):
    """Render a web page to PDF and return the PDF content.

    Thin public wrapper that forwards its arguments to
    ``__get_pdf_from_html``.

    Args:
        source: Address of the page to load in the browser.
        timeout: Seconds passed to the page-settle wait before printing.
        install_driver: When true, a ChromeDriver binary is obtained through
            ``webdriver_manager``; otherwise Selenium resolves the driver
            on its own.
        print_options: Extra print options merged over the built-in
            defaults. ``None`` (the default) means no overrides.

    Returns:
        Whatever ``__get_pdf_from_html`` returns (the decoded PDF bytes).
    """
    # A ``None`` sentinel replaces the former ``{}`` default so that no dict
    # instance is shared between calls.
    options = {} if print_options is None else print_options
    return __get_pdf_from_html(source, timeout, install_driver, options)
|
||||
|
||||
|
||||
def __send_devtools(driver, cmd, params=None):
    """Send a DevTools command through the driver's HTTP endpoint.

    The command is POSTed to the session's
    ``/chromium/send_command_and_get_result`` resource using the driver's
    own command executor.

    NOTE(review): this relies on the private ``command_executor._url`` and
    ``command_executor._request`` attributes; confirm they exist in the
    Selenium version in use.

    Args:
        driver: Object exposing ``session_id`` and ``command_executor``.
        cmd: DevTools command name, e.g. ``"Page.printToPDF"``.
        params: Command parameters; ``None`` is sent as an empty object.

    Returns:
        The ``"value"`` entry of the response.

    Raises:
        RuntimeError: If the executor returns an empty or missing response.
    """
    # Avoid a shared mutable default; the wire format still carries ``{}``.
    if params is None:
        params = {}

    resource = f"/session/{driver.session_id}/chromium/send_command_and_get_result"
    url = driver.command_executor._url + resource
    body = json.dumps({"cmd": cmd, "params": params})
    response = driver.command_executor._request("POST", url, body)

    if not response:
        # The original dereferenced the falsy response here, which raised
        # AttributeError for ``None`` and carried no message for ``{}``.
        raise RuntimeError(
            f"DevTools command {cmd!r} returned an empty response: {response!r}"
        )

    return response.get("value")
|
||||
|
||||
|
||||
def __get_pdf_from_html(
    path: str,
    timeout: int,
    install_driver: bool,
    print_options: dict
):
    """Load ``path`` in headless Chrome and return the printed page as PDF bytes.

    Args:
        path: Address handed to ``driver.get``.
        timeout: Maximum number of seconds to wait for the initially loaded
            ``<html>`` element to go stale before printing.
        install_driver: When true, ChromeDriver is installed via
            ``ChromeDriverManager`` and passed in as a ``Service``;
            otherwise ``webdriver.Chrome`` is created without one.
        print_options: Overrides merged on top of the default
            ``Page.printToPDF`` parameters. ``None`` is treated as empty.

    Returns:
        The base64-decoded ``"data"`` field of the ``Page.printToPDF`` result.
    """
    webdriver_options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        webdriver_options.add_argument(flag)
    # NOTE(review): content setting ``images: 2`` presumably blocks image
    # loading in Chrome - confirm against the Chrome preference reference.
    webdriver_options.experimental_options["prefs"] = {
        "profile.default_content_settings": {"images": 2},
    }

    if install_driver:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=webdriver_options)
    else:
        driver = webdriver.Chrome(options=webdriver_options)

    # Everything after the browser is started runs under ``finally`` so the
    # Chrome process is shut down even if navigation or printing fails.
    try:
        driver.get(path)

        try:
            WebDriverWait(driver, timeout).until(
                staleness_of(driver.find_element(by=By.TAG_NAME, value="html"))
            )
        except TimeoutException:
            # The root element stayed attached for the whole wait; the page
            # is printed as it currently stands.
            pass

        # Printing happens whether or not the wait timed out, so the
        # function always yields a PDF instead of depending on the timeout.
        calculated_print_options = {
            "landscape": False,
            "displayHeaderFooter": False,
            "printBackground": True,
            "preferCSSPageSize": True,
        }
        calculated_print_options.update(print_options or {})

        result = __send_devtools(
            driver, "Page.printToPDF", calculated_print_options)
        return base64.b64decode(result["data"])
    finally:
        driver.quit()
|
||||
|
||||
|
||||
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` begins with an http(s), ftp or file URL.

    The pattern is anchored only at the start of the string (``re.match``),
    so text following a matching prefix does not affect the result. The
    scheme comparison is case-sensitive.
    """
    url_pattern = re.compile(
        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    )
    found = url_pattern.match(url)
    return found is not None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user