use playwright to scrape stocks page
@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 
 import dagster as dg
 
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
 
 
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
 
-    # Load the page
-    driver.get(URL)
+        # Load the page
+        await page.goto(URL, timeout=60000)
 
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
 
-    driver.execute_script("document.body.style.zoom='50%'")
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
 
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
 
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
 
-            if i == len(toggle_buttons) - 1:
-                break
+                if i == len(toggle_buttons) - 1:
+                    break
 
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
 
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
 
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
 
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
+        # Get the page content
+        page_source = await page.content()
 
-    # Close browser when done
-    driver.quit()
+        # Close the browser
+        await browser.close()
 
-    return html
+        # Continue scraping logic here...
+        print("Scraping done")
+
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+        return page_source
+
+
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())
@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"
@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
 from resources import HtmlIOManager
 
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 APP = os.environ["APP"]
 
 storage_dir = os.environ.get("STORAGE_DIR", "/storage")
 
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
             tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
         "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
 from datetime import date
 from datetime import datetime
 
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 
 import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
 
-    yield dg.RunRequest()
+    response = requests.get(URL)
+    response.raise_for_status()
 
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d ")
+                context.log.info(f"Found date: {date_str}")
+
+                if date_str > context.cursor:
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                    return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}") as fp:
+        fp.write(response.text)