def _build_page_url(base_url: str, page: int) -> str:
    """Return *base_url* rewritten to address WordPress listing page *page*.

    The ``/page/N/`` segment is inserted into the URL *path*, so any query
    string (``?s=term``, ``?filter=latest``) or fragment stays after it
    instead of ending up in front of the page segment.

    Args:
        base_url: Listing, category or search URL, with or without an
            existing trailing ``/page/N/`` segment.
        page: 1-based page number. Values of 1 or less return ``base_url``
            unchanged.

    Returns:
        The URL to fetch for the requested page.
    """
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    if page <= 1:
        return base_url

    try:
        parts = urlsplit(base_url)
    except ValueError:
        # Unparseable URL (e.g. malformed brackets in the host): fall back to
        # plain string concatenation rather than raising.
        return base_url.rstrip("/") + f"/page/{page}/"

    # Drop an existing trailing /page/N so the segment is never duplicated.
    path = re.sub(r"/page/\d+$", "", parts.path.rstrip("/"))

    # Re-encode the query; keep_blank_values so parameters such as "s=" survive.
    query = urlencode(parse_qsl(parts.query, keep_blank_values=True))

    return urlunsplit(
        (parts.scheme, parts.netloc, f"{path}/page/{page}/", query, parts.fragment)
    )


async def list_videos(base_url: str = BASE_URL, page: int = 1, limit: int = 20) -> list[dict[str, object]]:
    """List videos from a category, search result or the homepage.

    Args:
        base_url: Listing URL to read. Any value that does not contain
            ``"fapnut.net"`` is replaced by ``BASE_URL``.
        page: 1-based page number. For pages above 1 a ``/page/N/`` segment
            is added to the URL path.
        limit: Accepted for interface compatibility; it is not applied, so
            every item found on the fetched page is returned.

    Returns:
        One dict per ``article.thumb-block`` element that has both a link
        and a title, with the keys ``url``, ``title``, ``thumbnail_url``,
        ``duration``, ``views``, ``uploader_name`` and ``upload_time``
        (the last three are always ``None``). An empty list is returned
        when the page cannot be fetched.
    """
    if "fapnut.net" not in base_url:
        base_url = BASE_URL

    url = _build_page_url(base_url, page)
    print(f"Fetching list from: {url}")

    try:
        html = await fetch_html(url)
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(html, "html.parser")
    videos: list[dict[str, object]] = []

    for article in soup.select("article.thumb-block"):
        # Best effort: one malformed card must not abort the whole listing.
        try:
            link_tag = article.select_one("a")
            if not link_tag:
                continue

            video_url = link_tag.get("href")
            title = link_tag.get("title")

            # Prefer the thumbnail advertised on the article itself, then
            # fall back to the (possibly lazy-loaded) first image.
            thumb_url = article.get("data-main-thumb")
            if not thumb_url:
                img_tag = article.select_one("img")
                if img_tag:
                    thumb_url = img_tag.get("data-lazy-src") or img_tag.get("src")

            duration_tag = article.select_one(".duration")
            duration = duration_tag.get_text(strip=True) if duration_tag else None

            if video_url and title:
                videos.append({
                    "url": video_url,
                    "title": title,
                    "thumbnail_url": thumb_url,
                    "duration": duration,
                    "views": None,  # Not extracted from the listing markup
                    "uploader_name": None,  # Not extracted from the listing markup
                    "upload_time": None,
                })
        except Exception as e:
            print(f"Error parsing video item: {e}")
            continue

    return videos

tag_links = soup.select(".tags-list a.label") for link in tag_links: tags.append(link.get_text(strip=True)) # Actors #

... Actor Name ...

actors = [] actor_links = soup.select("#video-actors a") for link in actor_links: actors.append(link.get_text(strip=True)) # Video extraction # The iframe src contains base64 encoded params #