selenium snapshot

This commit is contained in:
2025-07-28 22:33:31 +02:00
parent 8329d7ed68
commit b386a375b5

View File

@@ -1,12 +1,72 @@
import time
from functools import partial

from config import APP, URL
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import dagster as dg

# Module-local asset decorator: identical to ``dg.asset`` except that every
# asset declared through it gets ``APP`` as its key prefix.
asset = partial(dg.asset, key_prefix=APP)
def _scroll_through_page(driver, step: int = 500, pause: float = 0.1) -> None:
    """Scroll from the top of the page to the bottom in ``step``-pixel jumps.

    The document height is re-read after every jump, so the loop keeps going
    if the page grows while it is being scrolled.
    """
    page_height = driver.execute_script("return document.body.scrollHeight")
    offset = 0
    while offset < page_height:
        driver.execute_script(f"window.scrollTo(0, {offset});")
        time.sleep(pause)
        offset += step
        page_height = driver.execute_script("return document.body.scrollHeight")


@asset(
    io_manager_key="html_io_manager",
    name="raw",
    tags={"dagster/image": "dagster-code-stocks:playwright"},
)
def raw_html(context: dg.AssetExecutionContext) -> str:
    """Render ``URL`` in headless Chrome, expand its toggles, and return the HTML.

    Every element with class ``toggle-btn`` is clicked in turn; after each
    click except the last, the page is scrolled from top to bottom. A button
    that cannot be clicked is logged and skipped rather than failing the run.

    Args:
        context: Dagster execution context, used here only for logging.

    Returns:
        The page source after all toggle buttons have been processed. The
        value is handed to the ``html_io_manager`` I/O manager (presumably
        configured in the project's Definitions -- not visible here).

    Raises:
        selenium.common.exceptions.TimeoutException: if no ``toggle-btn``
            element appears within 20 seconds of loading the page.
    """
    # Start a headless browser
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1000, 2000)

        # Load the page
        driver.get(URL)

        # Wait until at least one toggle button is present
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn"))
        )
        # NOTE(review): zooming out presumably keeps more of the page inside
        # the viewport so buttons stay clickable -- confirm.
        driver.execute_script("document.body.style.zoom='50%'")

        # Find all toggle buttons (the wait above ensures the DOM is ready)
        toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
        context.log.info(f"Found {len(toggle_buttons)} toggle buttons")

        last_index = len(toggle_buttons) - 1
        for i, btn in enumerate(toggle_buttons):
            # Deliberately best-effort: one broken button must not abort the
            # whole snapshot, so any failure is logged and the loop moves on.
            try:
                # Wait until clickable and click
                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
                btn.click()
                time.sleep(1)  # short pause for content to load
                if i == last_index:
                    # No scroll pass is needed after the final button.
                    break
                _scroll_through_page(driver)
            except Exception as e:
                context.log.info(f"Skipped button due to error: {e}")

        # All sections that could be expanded are expanded; capture the DOM.
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading or waiting failed,
        # so no headless Chrome process is left behind.
        driver.quit()