selenium snapshot

This commit is contained in:
2025-07-28 22:33:31 +02:00
parent 8329d7ed68
commit b386a375b5

View File

@@ -1,12 +1,72 @@
import time
from functools import partial
from config import APP
from config import APP, URL
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import dagster as dg
# Project-local asset decorator: `dg.asset` with `key_prefix=APP` already
# bound, so assets declared with `@asset` in this module share that prefix.
asset = partial(dg.asset, key_prefix=APP)
@asset
def raw_html() -> None:
    """Placeholder asset: prints ``todo`` and produces no value."""
    print("todo")
def _scroll_through_page(driver, step: int = 500, pause: float = 0.1) -> None:
    """Scroll the window from the top towards the bottom of the document.

    The document height is re-read after every step, so the loop keeps going
    if ``document.body.scrollHeight`` grows while the page is being scrolled.

    Args:
        driver: Selenium WebDriver whose current page is scrolled.
        step: Number of pixels to advance per scroll.
        pause: Seconds to sleep after each scroll.
    """
    height_script = "return document.body.scrollHeight"
    total_height = driver.execute_script(height_script)
    position = 0
    while position < total_height:
        driver.execute_script(f"window.scrollTo(0, {position});")
        time.sleep(pause)
        position += step
        total_height = driver.execute_script(height_script)


@asset(
    io_manager_key="html_io_manager",
    name="raw",
    tags={"dagster/image": "dagster-code-stocks:playwright"},
)
def raw_html(context: dg.AssetExecutionContext) -> str:
    """Load ``URL`` in headless Chrome, click every toggle button, return the HTML.

    The page is opened, every element with class ``toggle-btn`` is clicked in
    turn (scrolling through the page between clicks), and the resulting page
    source is returned.

    Args:
        context: Dagster execution context; only its logger is used.

    Returns:
        The page source after all toggle buttons have been processed.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``toggle-btn``
            element appears within 20 seconds of loading the page.
    """
    # NOTE(review): the annotation was ``-> None`` although the HTML is
    # returned. Dagster is documented to treat a ``None`` annotation as
    # "no output", which would bypass ``html_io_manager`` — confirm against
    # the Dagster version in use.

    # Imported locally so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1000, 2000)
        driver.get(URL)

        # Block until at least one toggle button exists in the DOM.
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn"))
        )
        driver.execute_script("document.body.style.zoom='50%'")

        toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
        context.log.info(f"Found {len(toggle_buttons)} toggle buttons")

        last_index = len(toggle_buttons) - 1
        for i, btn in enumerate(toggle_buttons):
            try:
                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
                btn.click()
                time.sleep(1)  # short pause for content to load
                # No scroll pass is needed after the final button.
                if i != last_index:
                    _scroll_through_page(driver)
            except WebDriverException as e:
                # Best effort: a button that cannot be clicked is skipped so
                # the remaining ones are still processed.
                context.log.info(f"Skipped button due to error: {e}")

        return driver.page_source
    finally:
        # Always shut the browser down, including when the initial wait
        # times out or page_source fails, so no Chrome process is leaked.
        driver.quit()