selenium snapshot
This commit is contained in:
@@ -1,12 +1,72 @@
|
|||||||
|
import time
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
from config import APP
|
from config import APP, URL
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
|
||||||
import dagster as dg
|
import dagster as dg
|
||||||
|
|
||||||
asset = partial(dg.asset, key_prefix=APP)
|
# Module-local asset decorator: dg.asset with key_prefix pre-bound to APP.
asset = partial(dg.asset, key_prefix=APP)
|
||||||
|
|
||||||
|
|
||||||
def _scroll_through_page(
    driver: "webdriver.Chrome", step: int = 500, pause: float = 0.1
) -> None:
    """Scroll the page from the top to the bottom in fixed increments.

    Args:
        driver: The Selenium driver controlling the page.
        step: Number of pixels to advance on every scroll.
        pause: Seconds to sleep after each scroll.
    """
    position = 0
    total_height = driver.execute_script("return document.body.scrollHeight")
    while position < total_height:
        driver.execute_script(f"window.scrollTo(0, {position});")
        time.sleep(pause)
        position += step
        # Re-read the height on every pass because it may change while
        # scrolling (e.g. after a section has just been expanded).
        total_height = driver.execute_script("return document.body.scrollHeight")


@asset(
    io_manager_key="html_io_manager",
    name="raw",
    tags={"dagster/image": "dagster-code-stocks:playwright"},
)
def raw_html(context: dg.AssetExecutionContext) -> str:
    """Fetch URL in headless Chrome, click every toggle button, return the HTML.

    The page is loaded, every element with class ``toggle-btn`` is clicked
    (best effort: a button that fails is logged and skipped), and the page
    source at that point is returned.

    Args:
        context: Dagster execution context; only its logger is used.

    Returns:
        The page source as reported by the driver after the toggle buttons
        have been clicked.

    Raises:
        selenium.common.exceptions.TimeoutException: If no ``toggle-btn``
            element appears within 20 seconds of loading the page.
    """
    # Start a headless browser
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)

    # From here on the browser process exists, so always shut it down, even
    # when the page load or the initial wait raises.
    try:
        driver.set_window_size(1000, 2000)

        # Load the page
        driver.get(URL)

        # Wait until at least one toggle button is present
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))

        driver.execute_script("document.body.style.zoom='50%'")

        # Find all toggle buttons (the wait above ensures at least one exists)
        toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
        context.log.info(f"Found {len(toggle_buttons)} toggle buttons")

        # Click each toggle button once it is clickable
        last_index = len(toggle_buttons) - 1
        for i, btn in enumerate(toggle_buttons):
            try:
                # Wait until clickable and click
                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
                btn.click()
                time.sleep(1)  # short pause for content to load

                # No scroll pass is needed after the final button.
                if i == last_index:
                    break

                _scroll_through_page(driver)
            except Exception as e:
                # Best effort: one failing button must not abort the snapshot.
                context.log.info(f"Skipped button due to error: {e}")

        # Capture the markup after all sections have been expanded
        return driver.page_source
    finally:
        # Close browser when done
        driver.quit()
|
||||||
|
|||||||
Reference in New Issue
Block a user