From 74f22ee69e20f3f442a2d66de4ce014503c93d42 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 2 Apr 2026 13:12:19 +0100 Subject: [PATCH 1/3] don't add affiliate site URL --- scrapers/AdultTime/AdultTime.py | 43 -------------------------------- scrapers/AdultTime/AdultTime.yml | 24 ++++++++++++++++-- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/scrapers/AdultTime/AdultTime.py b/scrapers/AdultTime/AdultTime.py index 91df10652..345420155 100644 --- a/scrapers/AdultTime/AdultTime.py +++ b/scrapers/AdultTime/AdultTime.py @@ -44,47 +44,6 @@ def sitename_from_url(_url: str) -> str | None: return None -preview_site_map = { - "adulttimepilots": "adulttimepilots.com", - "all-sexstudio": "allsexstudio.net", - "caughtfapping": "caughtfapping.com", - "daddysboy": "daddysboy.org", - "dareweshare": "dareweshare.net", - "gostuckyourself": "gostuckyourself.net", - "gostuckyourself-channel": "gostuckyourself.net", - "kissmefuckme": "kissmefuckme.net", - "milfoverload-channel": "milfoverload.net", - "mommysboy": "mommysboy.net", - "preggoworld-channel": "preggoworld.net", - "shewantshim": "shewantshim.net", - "watchyoucheat": "watchyoucheat.net", - "womensworld": "adulttimepilots.net", -} - - -def preview_urls(urls: list[str]) -> list[str]: - """ - some sites have scene preview pages using the url_title as the path, e.g. - - https://adulttimepilots.com/Expose-Her-Therapy/ - - https://daddysboy.org/A-Bets-A-Bet-Pop/ - - https://dareweshare.net/Thats-Good-Teamwork/ - """ - if matching_urls := [ - urlparse(url) - for sitename in preview_site_map - for url in urls - if sitename_from_url(url) == sitename - ]: - return [ - parsed_url._replace( - netloc=preview_site_map.get(sitename_from_url(parsed_url.path).lower()), - path=url_title_from_path(parsed_url.path).lower(), - ).geturl() - for parsed_url in matching_urls - ] - return [] - - def _is_valid_url(_url: str, highest_status_code: int = 299): """ Checks if an URL is valid by making a HEAD request and ensuring the response status code is @@ -330,8 +289,6 @@ def postprocess_scene(scene: ScrapedScene, api_scene: dict[str, Any]) -> Scraped log.debug(f'scene"[urls]" (before): {scene["urls"]}') scene["urls"] = [fix_url(url) for url in urls] log.debug(f'scene"[urls]" (after fix): {scene["urls"]}') - scene["urls"].extend(preview_urls(scene["urls"])) - log.debug(f'scene"[urls]" (after extend with preview): {scene["urls"]}') if action_tags := api_scene.get("action_tags"): process_action_tags(action_tags) diff --git a/scrapers/AdultTime/AdultTime.yml b/scrapers/AdultTime/AdultTime.yml index 2d645e9fe..69e7e7ff3 100644 --- a/scrapers/AdultTime/AdultTime.yml +++ b/scrapers/AdultTime/AdultTime.yml @@ -1,6 +1,6 @@ # yaml-language-server: $schema=../../validator/scraper.schema.json # requires: AlgoliaAPI -name: "AdultTime" +name: AdultTime sceneByURL: - action: script url: @@ -97,6 +97,26 @@ sceneByURL: - AdultTime.py - girlsway - scene-by-url + - action: script + url: + # these are affiliate sites that link to the same scenes as the main sites above + # but with different URLs, so we need to scrape them separately to get the correct scene URLs + - adulttimepilots.com + - allsexstudio.net + - daddysboy.org + - dareweshare.net + - gostuckyourself.net + - kissmefuckme.net + - milfoverload.net + - mommysboy.net + - preggoworld.net + - watchyoucheat.net + script: + - python + - AdultTime.py + - affiliate + - girlsway + - scene-by-url sceneByFragment: action: script script: @@ -247,4 +267,4 @@ performerByFragment: - AdultTime.py - girlsway - performer-by-fragment -# Last Updated May 22, 2025 +# Last Updated April 2, 2026 From 06a6e9a06e096fcb0cb86a015ce37c03872f0e7d Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 2 Apr 2026 13:13:02 +0100 Subject: [PATCH 2/3] handle affiliate scene by URL --- scrapers/AlgoliaAPI/AlgoliaAPI.py | 56 ++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/scrapers/AlgoliaAPI/AlgoliaAPI.py b/scrapers/AlgoliaAPI/AlgoliaAPI.py index bedfd6d91..f4937db1e 100644 --- a/scrapers/AlgoliaAPI/AlgoliaAPI.py +++ b/scrapers/AlgoliaAPI/AlgoliaAPI.py @@ -3,6 +3,8 @@ """ from base64 import b64decode, b64encode import configparser +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timedelta from difflib import SequenceMatcher import json import os @@ -460,11 +462,54 @@ def scene_from_url( postprocess: Callable[[ScrapedScene, dict], ScrapedScene] = default_postprocess ) -> ScrapedScene | None: "Scrapes a scene from a URL, running an optional postprocess function on the result" + if isinstance(sites, list) and "affiliate" in sites: + sites.remove("affiliate") + return scene_from_affiliate_url(_url, sites, fragment, postprocess) clip_id = id_from_url(_url) site = sites[0] # TODO: handle multiple sites log.debug(f"Clip ID: {clip_id}, Site: {site}") return scene_from_id(clip_id, [site], fragment, postprocess) +def scene_from_affiliate_url( + _url: str, + sites: list[str], + fragment: dict[str, Any] = None, + postprocess: Callable[[ScrapedScene, dict], ScrapedScene] = default_postprocess +) -> ScrapedScene | None: + "Scrapes a scene from an affiliate URL, running an optional postprocess function on the result" + r = requests.get(_url, timeout=10) + if r.status_code != 200: + log.error(f"Failed to fetch affiliate URL: {r.status_code}") + return None + soup = bs(r.text, features='html.parser') + # attempt to extract title and date from the affiliate page, to use as a fragment for matching the correct scene + fragment = {} + # title is in //h2 tag + if h2 := soup.find("h2"): + fragment["title"] = h2.get_text(strip=True) + else: + # fallback to using the URL slug as the title, which is less reliable but better than nothing + parsed_url = urlparse(_url) + slug = parsed_url.path.split("/")[-1] + fragment["title"] = slug.replace("-", " ").title() + # date is in //div[@id="title-single"]/span + # format is usually like "March 18th, 2026" but can also be relative like "Yesterday" + if date_span := soup.select_one("#title-single > span"): + date_text = date_span.get_text(strip=True) + # parse relative date formats to absolute date, e.g. "Yesterday" -> "2026-03-17" + if date_text.lower() == "yesterday": + fragment["date"] = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") + # parse absolute date formats to "YYYY-MM-DD" + else: + # This regex looks for digits (\d+) followed by st, nd, rd, or th + # and replaces just the suffix with an empty string. + clean_date_text = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", date_text) + try: + fragment["date"] = datetime.strptime(clean_date_text, "%B %d, %Y").strftime("%Y-%m-%d") + except ValueError as e: + log.error(f"Failed to parse date from affiliate page: {e}") + return scene_from_fragment(fragment, sites, postprocess) + def scene_url_from_photoset(photoset_from_api: dict[str, Any], site: str) -> str | None: "Extracts scene URL from API photoset properties" if ( @@ -743,10 +788,13 @@ def scene_search( if len(api_scenes := [hit.to_dict() for hit in response.hits]) == 1: # single search result return [postprocess(to_scraped_scene(api_scenes[0], site), api_scenes[0])] if len(api_scenes) > 1: # multiple search results - return [ - postprocess(to_scraped_scene(api_scene, site), api_scene) - for api_scene in sort_api_scenes_by_match(api_scenes, fragment) # sort - ] + sorted_scenes = sort_api_scenes_by_match(api_scenes, fragment) # sort + # postprocess concurrently to save time + with ThreadPoolExecutor() as executor: + results = list(executor.map( + lambda s: postprocess(to_scraped_scene(s, site), s), sorted_scenes + )) + return results return [] def add_photoset_match_metadata( From 0a7b8fd1cbc002c04e6aae0feb48a747c4c4f4f2 Mon Sep 17 00:00:00 2001 From: nrg101 Date: Thu, 2 Apr 2026 13:13:58 +0100 Subject: [PATCH 3/3] remove affiliate promo sites from Mypervmom that are now handled by AdultTime --- scrapers/Mypervmom.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scrapers/Mypervmom.yml b/scrapers/Mypervmom.yml index c8a6b1679..bb04ac23e 100644 --- a/scrapers/Mypervmom.yml +++ b/scrapers/Mypervmom.yml @@ -4,10 +4,8 @@ sceneByURL: url: - boyfriendsharing.com - brattyfamily.com - - gostuckyourself.net - hugecockbreak.com - littlefromasia.com - - mommysboy.net - momxxx.org - mybadmilfs.com - mydaughterswap.com @@ -43,4 +41,4 @@ xPathScrapers: - map: BrattySis: Bratty Sis PervMom: Perv Mom -# Last Updated November 05, 2022 +# Last Updated April 2, 2026