Files
dagster/apps/vinyl/src/sounds/fetch.py

111 lines
3.0 KiB
Python

#!/usr/bin/python3
import time
from datetime import datetime
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_page_count(html_content):
    """Return the highest page number shown in a listing page's pagination.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        The largest numeric label among the ``ul.pagination li a`` links,
        or 1 when the page contains no numeric pagination links.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # The page number is read from each link's visible text (not its href);
    # non-numeric labels are skipped.
    page_numbers = [
        int(label)
        for link in soup.select("ul.pagination li a")
        if (label := link.get_text().strip()).isdigit()
    ]
    # Without any numeric link, max() on the empty list would raise
    # ValueError; the page that was just parsed still counts as one page.
    return max(page_numbers, default=1)
def _split_artist_title(name):
    """Split a product heading into an ``(artist, title)`` pair."""
    # Prefer the spaced separator so hyphens inside a name or title survive;
    # fall back to a bare hyphen for headings written without spaces.
    for separator in (" - ", "-"):
        artist, found, title = name.partition(separator)
        if found:
            return artist.strip(), title.strip()
    # No separator at all: keep the whole heading as the artist instead of
    # raising IndexError.
    return name.strip(), None


def parse_page(html_content):
    """Parse one listing page into a DataFrame with one row per product.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``artist``, ``title``,
        ``price``, ``supply``, ``release``, ``genre`` and ``detail``. Fields
        that a product does not provide are left missing. A page without
        any ``div.search-product`` element yields an empty frame with the
        same columns.
    """
    columns = [
        "id",
        "name",
        "artist",
        "title",
        "price",
        "supply",
        "release",
        "genre",
        "detail",
    ]
    entries = []
    soup = BeautifulSoup(html_content, "html.parser")
    for product in soup.find_all("div", {"class": "search-product"}):
        # The id is the first value of the ``rel`` attribute of the first
        # link that carries one.
        item_id = product.find("a", rel=True)["rel"][0]
        name = product.find("h5").text.strip()
        artist, title = _split_artist_title(name)
        # NOTE(review): the original call was ``.replace("", "")``, a no-op;
        # the removed character is presumably the euro sign that was lost to
        # an encoding problem - confirm against the live markup.
        price = (
            product.find("span", class_="product-price")
            .text.replace("\u20ac", "")
            .strip()
        )
        entry = {
            "id": item_id,
            "name": name,
            "artist": artist,
            "title": title,
            "price": price,
        }
        if detail := product.find("h6", {"class": "hide-for-small"}):
            entry["detail"] = detail.text
        if supply := product.find("div", {"class": "product-voorraad"}):
            entry["supply"] = supply.text
        for info in product.find_all("div", {"class": "product-info"}):
            # Rows look like "<label>: <value>"; partition() keeps the full
            # value and tolerates rows that contain no colon.
            label, _, value = info.text.partition(":")
            if "Genre" in label:
                entry["genre"] = value.strip()
            if "Releasedatum" in label:
                entry["release"] = value.strip()
        entries.append(entry)
    # reindex() fixes the column order and adds the optional columns even
    # when no product on the page supplied them.
    return pd.DataFrame(entries).reindex(columns=columns)
def fetch_deals():
    """Scrape every page of the sounds.nl LP sale listing.

    Prints the detected page count and shows a tqdm progress bar while
    fetching. Sleeps between requests to throttle the scrape.

    Returns:
        The per-page DataFrames from ``parse_page`` concatenated in page
        order, or an empty frame with the columns ``id``, ``name`` and
        ``price`` when no page was fetched.

    Raises:
        requests.exceptions.RequestException: if a request fails or does
            not answer within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    timeout_seconds = 30

    # Get page count
    # NOTE(review): this URL ends in "/art" while the per-page URLs below do
    # not - confirm both address the same listing.
    first_page = requests.get(
        "https://www.sounds.nl/uitverkoop/1/lp/all/art", timeout=timeout_seconds
    )
    page_count = get_page_count(first_page.text)
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    # Pages are numbered from 1 (see the first-page URL above), so walk
    # 1..page_count inclusive; range(page_count) requested a page 0 and
    # never reached the last page.
    for page_number in tqdm(range(1, page_count + 1)):
        response = requests.get(
            base_url.format(page_number=page_number), timeout=timeout_seconds
        )
        dfs.append(parse_page(response.text))
        time.sleep(2)

    # Combine dfs
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])
if __name__ == "__main__":
    # Scrape the complete sale listing and report how many rows came back.
    df = fetch_deals()
    print(f"Found {len(df)} deals")

    # Preview the first ten rows after ordering by the "price" column.
    preview = df.sort_values(by="price").head(10)
    print(preview)

    # Store the full table in a CSV file named after the current local time.
    prefix = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    directory = "/home/bram/src/python"
    filepath = f"{directory}/{prefix}_sounds.csv"
    print(f"Writing data to {filepath}")
    df.to_csv(filepath)