#!/usr/bin/python3
"""Scrape the LP "uitverkoop" (deals) listing on sounds.nl into a CSV file."""
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Seconds to wait for the server before a request is abandoned; without a
# timeout `requests` may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def get_page_count(html_content):
    """Return the highest page number found in the pagination links.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        The largest numeric label among the ``ul.pagination li a`` links, or
        1 when the page has no numeric pagination links (a listing that fits
        on a single page).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Non-numeric links (e.g. "next"/"previous" arrows) are ignored.
    page_numbers = [
        int(label)
        for link in soup.select("ul.pagination li a")
        if (label := link.get_text().strip()).isdigit()
    ]
    return max(page_numbers, default=1)


def _split_name(name):
    """Split a product name of the form "Artist - Title" into its two parts.

    The split happens once, on the first " - " (or on the first "-" when the
    name has no spaced separator), so hyphens inside the artist or the title
    are preserved. A name without any hyphen yields an empty title.
    """
    separator = " - " if " - " in name else "-"
    artist, _, title = name.partition(separator)
    return artist.strip(), title.strip()


def parse_page(html_content):
    """Extract every product on a listing page into a DataFrame.

    Args:
        html_content: HTML text of a listing page.

    Returns:
        A DataFrame with one row per ``div.search-product`` element and the
        columns id, name, artist, title, price, supply, release, genre and
        detail. Optional fields that are absent for a product are left
        missing (NaN). The price is kept as the page's text with the euro
        sign removed; it is not converted to a number.
    """
    entries = []
    soup = BeautifulSoup(html_content, "html.parser")

    for product in soup.find_all("div", {"class": "search-product"}):
        # BeautifulSoup exposes `rel` as a list of values; the first one is
        # used as the item id.
        item_id = product.find("a", rel=True)["rel"][0]
        name = product.find("h5").text.strip()
        artist, title = _split_name(name)
        price = (
            product.find("span", class_="product-price")
            .text.strip()
            .replace("€", "")
            .strip()
        )

        entry = {
            "id": item_id,
            "name": name,
            "artist": artist,
            "title": title,
            "price": price,
        }

        if detail := product.find("h6", {"class": "hide-for-small"}):
            entry["detail"] = detail.text
        if supply := product.find("div", {"class": "product-voorraad"}):
            entry["supply"] = supply.text

        for info in product.find_all("div", {"class": "product-info"}):
            # Rows look like "Label: value"; split on the first colon only so
            # a value that itself contains a colon is kept whole.
            label, colon, value = info.text.partition(":")
            if not colon:
                continue
            if "Genre" in label:
                entry["genre"] = value.strip()
            if "Releasedatum" in label:
                entry["release"] = value.strip()

        entries.append(entry)

    return pd.DataFrame(entries).reindex(
        columns=[
            "id",
            "name",
            "artist",
            "title",
            "price",
            "supply",
            "release",
            "genre",
            "detail",
        ]
    )


def fetch_deals():
    """Download every page of the LP deals listing and combine the results.

    The first page is fetched once to read the page count, then each page is
    fetched and parsed in turn, pausing between requests.

    Returns:
        A DataFrame with the rows of all pages as produced by ``parse_page``
        (each page keeps its own row index), or an empty frame with the
        columns id, name and price when no page was parsed.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    # Get page count
    # NOTE(review): this URL carries an extra "/art" segment that the per-page
    # URL below does not — confirm both address the same listing.
    first_page = requests.get(
        "https://www.sounds.nl/uitverkoop/1/lp/all/art",
        timeout=REQUEST_TIMEOUT,
    ).text
    page_count = get_page_count(first_page)
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages. Page numbers on the site start at 1, so iterate
    # 1..page_count inclusive rather than 0..page_count-1.
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    for page_number in tqdm(range(1, page_count + 1)):
        response = requests.get(
            base_url.format(page_number=page_number),
            timeout=REQUEST_TIMEOUT,
        )
        dfs.append(parse_page(response.text))
        time.sleep(2)

    # Combine dfs
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])


def _price_as_number(prices):
    """Convert a Series of price strings to numbers for sorting.

    Assumes prices use a decimal comma (e.g. "19,99") — TODO confirm against
    the live page. Values that cannot be parsed become NaN and sort last.
    """
    normalized = prices.astype(str).str.replace(",", ".", regex=False)
    return pd.to_numeric(normalized, errors="coerce")


if __name__ == "__main__":
    df = fetch_deals()
    print(f"Found {len(df)} deals")

    # Show current deals, cheapest first. The price column holds strings, so
    # sort on a numeric key; a plain string sort would put "10,99" before
    # "9,99".
    print(df.sort_values(by="price", key=_price_as_number).head(10))

    # Write to file
    now = datetime.now()
    prefix = now.strftime("%Y-%m-%d_%H:%M:%S")
    directory = "/home/bram/src/python"
    filepath = f"{directory}/{prefix}_sounds.csv"
    print(f"Writing data to {filepath}")
    df.to_csv(filepath)