selenium snapshot
This commit is contained in:
@@ -1,12 +1,72 @@
|
||||
import time
|
||||
from functools import partial
|
||||
|
||||
from config import APP
|
||||
from config import APP, URL
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
import dagster as dg
|
||||
|
||||
# Decorator shorthand: dg.asset with key_prefix pre-bound to APP, so assets
# declared with @asset in this module share that key prefix.
asset = partial(dg.asset, key_prefix=APP)
|
||||
|
||||
|
||||
def _scroll_through_page(driver, step: int = 500, pause: float = 0.1) -> None:
    """Scroll the window from the top of the page to the bottom in fixed steps.

    The document height is re-read after every step, so the loop keeps going
    if the page grows while it is being scrolled.

    Args:
        driver: Selenium WebDriver instance controlling the page.
        step: Number of pixels to advance per scroll.
        pause: Seconds to sleep after each scroll.
    """
    position = 0
    total_height = driver.execute_script("return document.body.scrollHeight")
    while position < total_height:
        driver.execute_script(f"window.scrollTo(0, {position});")
        time.sleep(pause)
        position += step
        total_height = driver.execute_script("return document.body.scrollHeight")


@asset(
    io_manager_key="html_io_manager",
    name="raw",
    tags={"dagster/image": "dagster-code-stocks:playwright"},
)
def raw_html(context: dg.AssetExecutionContext) -> str:
    """Load URL in headless Chrome, expand every toggle section, return the HTML.

    Every element with class ``toggle-btn`` is clicked in turn; a button that
    cannot be clicked is logged and skipped rather than failing the run.

    Args:
        context: Dagster execution context, used for logging.

    Returns:
        The page source after the toggle buttons have been clicked. Annotated
        as ``str`` (not ``None``) because the value is returned for the
        configured ``html_io_manager`` to handle.

    Raises:
        selenium.common.exceptions.TimeoutException: If no toggle button
            appears within 20 seconds of loading the page.
    """
    # Start a headless browser
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    # Everything after the driver exists runs under try/finally so the browser
    # process is shut down even when loading or waiting raises.
    try:
        driver.set_window_size(1000, 2000)

        # Load the page
        driver.get(URL)

        # Wait until at least one toggle button is present
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))

        driver.execute_script("document.body.style.zoom='50%'")

        # Find all toggle buttons (the wait above ensures the DOM is ready)
        toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
        context.log.info(f"Found {len(toggle_buttons)} toggle buttons")

        last_index = len(toggle_buttons) - 1
        for i, btn in enumerate(toggle_buttons):
            # Best-effort per button: any failure is logged and the loop
            # moves on to the next button.
            try:
                # Wait until clickable and click
                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
                btn.click()
                time.sleep(1)  # short pause for content to load

                # Scroll through the page between clicks; nothing is clicked
                # after the last button, so the final scroll is skipped.
                # NOTE(review): presumably this triggers lazy-loaded content
                # and brings the next button into view - confirm.
                if i != last_index:
                    _scroll_through_page(driver)
            except Exception as e:
                context.log.info(f"Skipped button due to error: {e}")

        # All sections are expanded; capture the rendered page.
        return driver.page_source
    finally:
        # Close browser when done, on success and on failure alike.
        driver.quit()
|
||||
|
||||
Reference in New Issue
Block a user