move utils
@@ -1,19 +0,0 @@
import re
from collections.abc import Iterator
from datetime import date, datetime

from bs4 import BeautifulSoup


def extract_date(page_source: str) -> Iterator[date]:
    # Parse with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")

    # Find the first <div> after </header>
    if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
        # Extract date part using regex
        match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text)
        if match:
            day, _, month, year = match.groups()
            date_obj = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
            yield date_obj
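For clarity, a hedged example of an input the date regex accepts and what extract_date would yield for it; the HTML snippet is made up, not taken from the scraped site:

# Hypothetical input; "3rd March 2024" matches the ordinal-aware pattern
# and strptime("%d %B %Y") parses the month name.
from datetime import datetime

html = "<header></header><div>Published 3rd March 2024</div>"
assert next(extract_date(html)) == datetime(2024, 3, 3)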
0 apps/stocks/src/utils/__init__.py Normal file
55 apps/stocks/src/utils/extracter.py Normal file
@@ -0,0 +1,55 @@
import re
from collections.abc import Iterator
from datetime import date, datetime

import pandas as pd
from bs4 import BeautifulSoup
from pandas import DataFrame


def extract_date(page_source: str) -> Iterator[date]:
    # Parse with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")

    # Find the first <div> after </header>
    if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
        # Extract date part using regex
        match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text)
        if match:
            day, _, month, year = match.groups()
            date_obj = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
            yield date_obj


def extract_tables(
    page_source: str,
) -> Iterator[tuple[str | None, str | None, DataFrame]]:
    soup = BeautifulSoup(page_source, "html.parser")
    accordion_items = soup.find_all("div", class_="accordion-item")

    for item in accordion_items:
        # Extract the title
        header = item.find("div", class_="accordion-header")
        title = header.find("h2").get_text(strip=True) if header else None

        # Extract the description
        description_block = item.find("div", class_="accordion-description")
        description = (
            description_block.find("p").get_text(strip=True)
            if description_block
            else None
        )

        # Extract the table
        table = item.find("table")
        if table:
            rows = []
            for row in table.find_all("tr"):
                cells = [
                    cell.get_text(strip=True) for cell in row.find_all(["th", "td"])
                ]
                rows.append(cells)

            if rows:
                df = pd.DataFrame(rows[1:], columns=rows[0])
                yield title, description, df
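A minimal usage sketch for extract_tables, assuming the page HTML has already been saved to the cache path used by scraper.py; the import path and the accordion markup of the page are assumptions, not part of this commit:

# Hypothetical usage -- import path and cache file are assumptions.
from utils.extracter import extract_tables

with open("/cache/scraped_page.html") as fp:
    page_source = fp.read()

# One (title, description, DataFrame) triple per accordion item with a <table>.
for title, description, df in extract_tables(page_source):
    print(title or "untitled", df.shape)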
60 apps/stocks/src/utils/scraper.py Normal file
@@ -0,0 +1,60 @@
async def scrape(url: str) -> str:
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
        page = await context.new_page()

        await page.goto(url, timeout=60000)

        # Wait until at least one toggle button is present
        await page.wait_for_selector(".toggle-btn", timeout=20000)

        # Set zoom
        await page.evaluate("document.body.style.zoom='50%'")

        # Find all toggle buttons
        toggle_buttons = await page.query_selector_all(".toggle-btn")
        print(f"Found {len(toggle_buttons)} toggle buttons")

        for i, btn in enumerate(toggle_buttons):
            try:
                # Ensure it's visible and enabled
                if await btn.is_visible() and await btn.is_enabled():
                    await btn.click()
                    await page.wait_for_timeout(1000)

                    if i == len(toggle_buttons) - 1:
                        break

                    # Scroll down gradually
                    scroll_step = 500
                    total_height = await page.evaluate("() => document.body.scrollHeight")
                    current_position = 0

                    while current_position < total_height:
                        await page.evaluate(f"window.scrollTo(0, {current_position});")
                        await page.wait_for_timeout(100)
                        current_position += scroll_step
                        total_height = await page.evaluate(
                            "() => document.body.scrollHeight"
                        )

            except Exception as e:
                print(f"Skipped button due to error: {e}")

        # Get the page content
        page_source = await page.content()

        # Close the browser
        await browser.close()

        # Continue scraping logic here...
        print("Scraping done")

        # Save the page content to a file
        with open("/cache/scraped_page.html", "w") as fp:
            fp.write(page_source)

        return page_source
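One hedged way to drive the coroutine end to end; the URL is a placeholder and asyncio.run is simply the standard entry point, not necessarily how the stocks app invokes it:

# Hypothetical driver -- URL and import paths are assumptions.
import asyncio

from utils.extracter import extract_tables
from utils.scraper import scrape


async def main() -> None:
    page_source = await scrape("https://example.com/stocks")
    for title, _description, df in extract_tables(page_source):
        print(title, len(df))


if __name__ == "__main__":
    asyncio.run(main())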
11 apps/stocks/src/utils/text.py Normal file
@@ -0,0 +1,11 @@
import re
import unicodedata


def slugify(text: str) -> str:
    # Normalize unicode characters
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
    # Replace non-word characters with hyphens
    text = re.sub(r"[^\w\s-]", "", text).strip().lower()
    # Replace spaces and repeated hyphens with a single hyphen
    return re.sub(r"[-\s]+", "-", text)
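Two hedged examples of what slugify produces; the inputs are made up, and the outputs follow from the three steps above:

# Sample behaviour; accents are stripped by the NFKD/ascii step,
# punctuation by the first re.sub, and whitespace collapses to hyphens.
assert slugify("Crème Brûlée & Co.") == "creme-brulee-co"
assert slugify("  Q3 2024  Results ") == "q3-2024-results"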