Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 0 additions & 43 deletions scrapers/AdultTime/AdultTime.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,47 +44,6 @@ def sitename_from_url(_url: str) -> str | None:
return None


# Maps a site name (compared against the result of sitename_from_url) to the
# domain used as the netloc of that site's scene preview pages; read by
# preview_urls.
# NOTE(review): "womensworld" maps to adulttimepilots.net while
# "adulttimepilots" maps to adulttimepilots.com -- confirm both TLDs are intended.
preview_site_map = {
"adulttimepilots": "adulttimepilots.com",
"all-sexstudio": "allsexstudio.net",
"caughtfapping": "caughtfapping.com",
"daddysboy": "daddysboy.org",
"dareweshare": "dareweshare.net",
"gostuckyourself": "gostuckyourself.net",
"gostuckyourself-channel": "gostuckyourself.net",
"kissmefuckme": "kissmefuckme.net",
"milfoverload-channel": "milfoverload.net",
"mommysboy": "mommysboy.net",
"preggoworld-channel": "preggoworld.net",
"shewantshim": "shewantshim.net",
"watchyoucheat": "watchyoucheat.net",
"womensworld": "adulttimepilots.net",
}


def preview_urls(urls: list[str]) -> list[str]:
    """
    Builds the preview-page URL for every given URL whose site name is a key of
    preview_site_map; returns an empty list when none of them match.

    Some sites have scene preview pages using the url_title as the path, e.g.
    - https://adulttimepilots.com/Expose-Her-Therapy/
    - https://daddysboy.org/A-Bets-A-Bet-Pop/
    - https://dareweshare.net/Thats-Good-Teamwork/
    """
    # collect matches grouped by site (in map order), then by input order
    candidates = []
    for known_site in preview_site_map:
        for candidate in urls:
            if sitename_from_url(candidate) == known_site:
                candidates.append(urlparse(candidate))

    previews = []
    for parts in candidates:
        # swap the host for the preview domain and the path for the lowercased url_title
        preview_host = preview_site_map.get(sitename_from_url(parts.path).lower())
        preview_path = url_title_from_path(parts.path).lower()
        previews.append(parts._replace(netloc=preview_host, path=preview_path).geturl())
    return previews


def _is_valid_url(_url: str, highest_status_code: int = 299):
"""
Checks if an URL is valid by making a HEAD request and ensuring the response status code is
Expand Down Expand Up @@ -330,8 +289,6 @@ def postprocess_scene(scene: ScrapedScene, api_scene: dict[str, Any]) -> Scraped
log.debug(f'scene"[urls]" (before): {scene["urls"]}')
scene["urls"] = [fix_url(url) for url in urls]
log.debug(f'scene"[urls]" (after fix): {scene["urls"]}')
scene["urls"].extend(preview_urls(scene["urls"]))
log.debug(f'scene"[urls]" (after extend with preview): {scene["urls"]}')

if action_tags := api_scene.get("action_tags"):
process_action_tags(action_tags)
Expand Down
24 changes: 22 additions & 2 deletions scrapers/AdultTime/AdultTime.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# yaml-language-server: $schema=../../validator/scraper.schema.json
# requires: AlgoliaAPI
name: "AdultTime"
name: AdultTime
sceneByURL:
- action: script
url:
Expand Down Expand Up @@ -97,6 +97,26 @@ sceneByURL:
- AdultTime.py
- girlsway
- scene-by-url
- action: script
url:
# these are affiliate sites that link to the same scenes as the main sites above
# but with different URLs, so we need to scrape them separately to get the correct scene URLs
- adulttimepilots.com
- allsexstudio.net
- daddysboy.org
- dareweshare.net
- gostuckyourself.net
- kissmefuckme.net
- milfoverload.net
- mommysboy.net
- preggoworld.net
- watchyoucheat.net
script:
- python
- AdultTime.py
- affiliate
- girlsway
- scene-by-url
sceneByFragment:
action: script
script:
Expand Down Expand Up @@ -247,4 +267,4 @@ performerByFragment:
- AdultTime.py
- girlsway
- performer-by-fragment
# Last Updated May 22, 2025
# Last Updated April 2, 2026
56 changes: 52 additions & 4 deletions scrapers/AlgoliaAPI/AlgoliaAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""
from base64 import b64decode, b64encode
import configparser
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from difflib import SequenceMatcher
import json
import os
Expand Down Expand Up @@ -460,11 +462,54 @@ def scene_from_url(
postprocess: Callable[[ScrapedScene, dict], ScrapedScene] = default_postprocess
) -> ScrapedScene | None:
"Scrapes a scene from a URL, running an optional postprocess function on the result"
if isinstance(sites, list) and "affiliate" in sites:
sites.remove("affiliate")
return scene_from_affiliate_url(_url, sites, fragment, postprocess)
clip_id = id_from_url(_url)
site = sites[0] # TODO: handle multiple sites
log.debug(f"Clip ID: {clip_id}, Site: {site}")
return scene_from_id(clip_id, [site], fragment, postprocess)

def _affiliate_title_from_slug(_url: str) -> str:
    "Derives a title-cased title from the last non-empty path segment of an affiliate URL"
    # affiliate URLs can end with a slash (e.g. https://daddysboy.org/A-Bets-A-Bet-Pop/),
    # so strip it first, otherwise the last segment would be an empty string
    slug = urlparse(_url).path.rstrip("/").rsplit("/", 1)[-1]
    return slug.replace("-", " ").title()


def _affiliate_date_to_iso(date_text: str) -> str | None:
    "Converts an affiliate page date to YYYY-MM-DD, or returns None if it cannot be parsed"
    # relative formats, e.g. "Yesterday" -> the day before the current local date
    relative_days = {"today": 0, "yesterday": 1}
    relative_key = date_text.strip().lower()
    if relative_key in relative_days:
        day = datetime.now() - timedelta(days=relative_days[relative_key])
        return day.strftime("%Y-%m-%d")
    # absolute formats, e.g. "March 18th, 2026": drop the ordinal suffix
    # (st, nd, rd, th) that follows the day number so strptime can parse it
    clean_date_text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_text)
    try:
        return datetime.strptime(clean_date_text, "%B %d, %Y").strftime("%Y-%m-%d")
    except ValueError as e:
        log.error(f"Failed to parse date from affiliate page: {e}")
        return None


def scene_from_affiliate_url(
    _url: str,
    sites: list[str],
    fragment: dict[str, Any] = None,
    postprocess: Callable[[ScrapedScene, dict], ScrapedScene] = default_postprocess
) -> ScrapedScene | None:
    """
    Scrapes a scene from an affiliate URL, running an optional postprocess function on the result

    The affiliate page is fetched and its title (first <h2>) and date
    (#title-single > span) are used to build a search fragment, which is then
    passed to scene_from_fragment together with `sites` and `postprocess`.

    The `fragment` argument is accepted for signature parity with scene_from_url
    but is not used: the search fragment is always built from the page itself.

    Returns None when the page cannot be fetched (network error or non-200 status).
    """
    try:
        r = requests.get(_url, timeout=10)
    except requests.RequestException as e:
        # timeouts, DNS failures etc. are reported the same way as a bad status code
        log.error(f"Failed to fetch affiliate URL: {e}")
        return None
    if r.status_code != 200:
        log.error(f"Failed to fetch affiliate URL: {r.status_code}")
        return None
    soup = bs(r.text, features='html.parser')

    # title and date extracted from the page are used to match the correct scene
    page_fragment = {}

    # title is in //h2 tag; fall back to the URL slug when the tag is missing or
    # empty, which is less reliable but better than nothing
    h2 = soup.find("h2")
    title = h2.get_text(strip=True) if h2 else ""
    page_fragment["title"] = title or _affiliate_title_from_slug(_url)

    # date is in //div[@id="title-single"]/span
    # format is usually like "March 18th, 2026" but can also be relative like "Yesterday"
    if date_span := soup.select_one("#title-single > span"):
        if iso_date := _affiliate_date_to_iso(date_span.get_text(strip=True)):
            page_fragment["date"] = iso_date

    return scene_from_fragment(page_fragment, sites, postprocess)

def scene_url_from_photoset(photoset_from_api: dict[str, Any], site: str) -> str | None:
"Extracts scene URL from API photoset properties"
if (
Expand Down Expand Up @@ -743,10 +788,13 @@ def scene_search(
if len(api_scenes := [hit.to_dict() for hit in response.hits]) == 1: # single search result
return [postprocess(to_scraped_scene(api_scenes[0], site), api_scenes[0])]
if len(api_scenes) > 1: # multiple search results
return [
postprocess(to_scraped_scene(api_scene, site), api_scene)
for api_scene in sort_api_scenes_by_match(api_scenes, fragment) # sort
]
sorted_scenes = sort_api_scenes_by_match(api_scenes, fragment) # sort
# postprocess concurrently to save time
with ThreadPoolExecutor() as executor:
results = list(executor.map(
lambda s: postprocess(to_scraped_scene(s, site), s), sorted_scenes
))
return results
return []

def add_photoset_match_metadata(
Expand Down
4 changes: 1 addition & 3 deletions scrapers/Mypervmom.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ sceneByURL:
url:
- boyfriendsharing.com
- brattyfamily.com
- gostuckyourself.net
- hugecockbreak.com
- littlefromasia.com
- mommysboy.net
- momxxx.org
- mybadmilfs.com
- mydaughterswap.com
Expand Down Expand Up @@ -43,4 +41,4 @@ xPathScrapers:
- map:
BrattySis: Bratty Sis
PervMom: Perv Mom
# Last Updated November 05, 2022
# Last Updated April 2, 2026