diff --git a/apps/stocks/src/assets.py b/apps/stocks/src/assets.py
index 5ab9ee0..fad1ff3 100644
--- a/apps/stocks/src/assets.py
+++ b/apps/stocks/src/assets.py
@@ -1,12 +1,89 @@
+import time
 from functools import partial
 
-from config import APP
+from config import APP, URL
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
 
 import dagster as dg
 
 asset = partial(dg.asset, key_prefix=APP)
 
 
-@asset
-def raw_html() -> None:
-    print("todo")
+def _scroll_through_page(driver, step: int = 500, pause: float = 0.1) -> None:
+    """Scroll down the page in fixed steps until the bottom is reached.
+
+    The document height is re-read after every step, so the loop keeps going
+    if the page grows while it is being scrolled.
+
+    Args:
+        driver: Selenium WebDriver whose current page is scrolled.
+        step: Number of pixels to advance per scroll.
+        pause: Seconds to sleep after each scroll.
+    """
+    position = 0
+    total_height = driver.execute_script("return document.body.scrollHeight")
+    while position < total_height:
+        driver.execute_script(f"window.scrollTo(0, {position});")
+        time.sleep(pause)
+        position += step
+        total_height = driver.execute_script("return document.body.scrollHeight")
+
+
+@asset(
+    io_manager_key="html_io_manager",
+    name="raw",
+    tags={"dagster/image": "dagster-code-stocks:playwright"},
+)
+def raw_html(context: dg.AssetExecutionContext) -> str:
+    """Load ``URL`` in headless Chrome, expand its toggles, and return the HTML.
+
+    Every element with class ``toggle-btn`` is clicked in turn; after each click
+    except the last, the page is scrolled from top to bottom. Any exception
+    raised while handling a button is logged and that button is skipped.
+
+    Args:
+        context: Dagster execution context, used here only for logging.
+
+    Returns:
+        The browser's page source after all toggle buttons were processed.
+    """
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless")
+    driver = webdriver.Chrome(options=options)
+
+    # Quit the browser on every exit path so a failed run does not leave a
+    # Chrome process behind.
+    try:
+        driver.set_window_size(1000, 2000)
+        driver.get(URL)
+
+        # Wait (up to 20 s) until at least one toggle button is in the DOM.
+        WebDriverWait(driver, 20).until(
+            EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn"))
+        )
+
+        driver.execute_script("document.body.style.zoom='50%'")
+
+        toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
+        context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
+
+        last_index = len(toggle_buttons) - 1
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
+                btn.click()
+                time.sleep(1)  # short pause for content to load
+
+                # No scroll pass is needed after the final button.
+                if i != last_index:
+                    _scroll_through_page(driver)
+            except Exception as e:
+                # Best effort: skip this button and carry on with the rest.
+                context.log.warning(f"Skipped button due to error: {e}")
+
+        return driver.page_source
+    finally:
+        driver.quit()