From 53acd87f6f4e589297e4d3719ac02c14bb663345 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrew=20Kour=C3=A9?= <andrew@koure.org>
Date: Thu, 12 Feb 2026 17:08:04 +0000
Subject: [PATCH] Added docstrings to ridership

---
 ingestor/chalicelib/ridership/arcgis.py     |  9 ++
 ingestor/chalicelib/ridership/dynamo.py     |  6 ++
 ingestor/chalicelib/ridership/gtfs.py       |  5 ++
 ingestor/chalicelib/ridership/ingest.py     | 20 +++++
 ingestor/chalicelib/ridership/process.py    | 99 +++++++++++++++++++++
 ingestor/chalicelib/ridership/sharepoint.py | 85 +++++++++++-------
 6 files changed, 194 insertions(+), 30 deletions(-)

diff --git a/ingestor/chalicelib/ridership/arcgis.py b/ingestor/chalicelib/ridership/arcgis.py
index 96d71df..ba746d3 100644
--- a/ingestor/chalicelib/ridership/arcgis.py
+++ b/ingestor/chalicelib/ridership/arcgis.py
@@ -40,6 +40,15 @@ def ride_update_cache():
 
 
 def download_latest_ridership_files() -> Tuple[str | None, str | None, str | None, str | None, str | None]:
+    """Download the latest ridership files for all transit modes.
+
+    Fetches subway and bus data from SharePoint, and commuter rail, ferry,
+    and The RIDE data from ArcGIS.
+
+    Returns:
+        Tuple of file paths (subway, bus, commuter rail, ferry, The RIDE),
+        where each element may be None if the download failed.
+    """
     sharepoint = SharepointConnection()
 
     cr_tmp_path = NamedTemporaryFile().name
diff --git a/ingestor/chalicelib/ridership/dynamo.py b/ingestor/chalicelib/ridership/dynamo.py
index b2fe3a6..a53d3c5 100644
--- a/ingestor/chalicelib/ridership/dynamo.py
+++ b/ingestor/chalicelib/ridership/dynamo.py
@@ -6,6 +6,12 @@
 
 
 def ingest_ridership_to_dynamo(entries_by_line_id: Dict[str, List[Dict]]):
+    """Batch write ridership entries to the DynamoDB Ridership table.
+
+    Args:
+        entries_by_line_id: Mapping of line IDs to lists of ridership entry dicts,
+            each containing 'date' (YYYY-MM-DD) and 'count' keys.
+    """
     dynamodb = boto3.resource("dynamodb")
     Ridership = dynamodb.Table(DYNAMO_TABLE_NAME)
     with Ridership.batch_writer() as batch:
diff --git a/ingestor/chalicelib/ridership/gtfs.py b/ingestor/chalicelib/ridership/gtfs.py
index 631aa0d..25aa8ae 100644
--- a/ingestor/chalicelib/ridership/gtfs.py
+++ b/ingestor/chalicelib/ridership/gtfs.py
@@ -8,6 +8,11 @@
 
 
 def get_routes_by_line_id() -> Dict[str, Route]:
+    """Fetch GTFS route data from S3 and group routes by their line ID.
+
+    Returns:
+        Mapping of line IDs to lists of Route objects from the latest GTFS feed.
+    """
     s3 = boto3.resource("s3")
     archive = MbtaGtfsArchive(
         local_archive_path=TemporaryDirectory().name,
diff --git a/ingestor/chalicelib/ridership/ingest.py b/ingestor/chalicelib/ridership/ingest.py
index 20f56ec..379e2d3 100644
--- a/ingestor/chalicelib/ridership/ingest.py
+++ b/ingestor/chalicelib/ridership/ingest.py
@@ -13,6 +13,21 @@ def get_ridership_by_line_id(
     ridership_by_route_id: Dict[str, Dict],
     routes_by_line_id: Dict[str, Route],
 ):
+    """Aggregate ridership data from route-level to line-level.
+
+    Sums ridership counts across all routes belonging to each line, grouping
+    by date. Handles Green Line branch aggregation and adds The RIDE as a
+    separate line entry.
+
+    Args:
+        ridership_by_route_id: Mapping of route IDs to lists of ridership
+            entry dicts with 'date' and 'count' keys.
+        routes_by_line_id: Mapping of line IDs to lists of Route objects.
+
+    Returns:
+        Mapping of line IDs to sorted lists of ridership entries with
+        summed counts per date.
+    """
     by_line_id = {}
     for line_id, routes in routes_by_line_id.items():
         route_entries = [entry for route in routes for entry in ridership_by_route_id.get(route.route_id, [])]
@@ -34,6 +49,11 @@ def get_ridership_by_line_id(
 
 
 def ingest_ridership_data():
+    """Run the full ridership ingestion pipeline.
+
+    Downloads the latest ridership files for all transit modes, processes
+    and aggregates them by line ID, and writes the results to DynamoDB.
+    """
     routes = get_routes_by_line_id()
     cr_update_cache()
     ferry_update_cache()
diff --git a/ingestor/chalicelib/ridership/process.py b/ingestor/chalicelib/ridership/process.py
index bdf2cce..c2149f3 100644
--- a/ingestor/chalicelib/ridership/process.py
+++ b/ingestor/chalicelib/ridership/process.py
@@ -64,6 +64,22 @@ def pre_process_csv(
     count_key: str,
     route_name: str | None = None,
 ):
+    """Pre-process a CSV file by aggregating daily ridership into weekly totals.
+
+    Reads the CSV, groups records by ISO year, week, and route, sums the count
+    values, and writes the result to a temporary CSV file.
+
+    Args:
+        path_to_csv_file: Path to the input CSV file.
+        date_key: Column name containing date values.
+        route_key: Column name containing route identifiers. If None and
+            route_name is provided, a 'Route' column is added with route_name.
+        count_key: Column name containing ridership counts.
+        route_name: Constant route name to assign when route_key is None.
+
+    Returns:
+        Path to a temporary CSV file containing the weekly aggregated data.
+    """
     if route_key is None and route_name is not None:
         route_key = "Route"
         df = pd.read_csv(path_to_csv_file, usecols=[date_key, count_key])
@@ -93,6 +109,21 @@ def format_ridership_csv(
     count_key: str,
     route_ids_map: Union[None, Dict[str, str]] = None,
 ):
+    """Format a ridership CSV into a dict of weekly average peak-day counts by route.
+
+    Reads ridership data, filters to weekday non-holiday (peak) days, computes
+    weekly averages per route, and returns the results grouped by route ID.
+
+    Args:
+        path_to_csv_file: Path to the input CSV file.
+        date_key: Column name containing date values.
+        route_key: Column name containing route identifiers.
+        count_key: Column name containing ridership counts.
+        route_ids_map: Optional mapping from raw route names to canonical route IDs.
+
+    Returns:
+        Dict mapping route IDs to lists of dicts with 'date' and 'count' keys.
+    """
     # read data, convert to datetime
     df = pd.read_csv(path_to_csv_file)
     df[date_key] = pd.to_datetime(df[date_key])
@@ -141,6 +172,17 @@ def format_ridership_csv(
 
 
 def format_subway_data(path_to_csv_file: str):
+    """Format subway gated station validation data into weekly peak-day averages.
+
+    Reads the subway CSV, filters to peak weekdays, computes weekly average
+    validations per route/line, and maps route names to canonical IDs.
+
+    Args:
+        path_to_csv_file: Path to the subway ridership CSV file.
+
+    Returns:
+        Dict mapping route IDs to lists of dicts with 'date' and 'count' keys.
+    """
     # read data, convert to datetime
     df = pd.read_csv(path_to_csv_file)
     df["servicedate"] = pd.to_datetime(df["servicedate"])
@@ -192,6 +234,17 @@ def format_subway_data(path_to_csv_file: str):
 
 
 def format_bus_data(path_to_excel_file: str):
+    """Format bus ridership data from an Excel file into weekly counts by route.
+
+    Reads the 'Weekly by Route' sheet, normalizes column names, and groups
+    ridership data by route with mapped canonical route IDs.
+
+    Args:
+        path_to_excel_file: Path to the bus ridership Excel file.
+
+    Returns:
+        Dict mapping route IDs to lists of dicts with 'date' and 'count' keys.
+    """
     # read data - new format doesn't need skiprows
     df = pd.read_excel(
         path_to_excel_file,
@@ -231,6 +284,14 @@ def format_bus_data(path_to_excel_file: str):
 
 
 def format_cr_data(path_to_ridershp_file: str):
+    """Format commuter rail ridership data into weekly peak-day averages by line.
+
+    Args:
+        path_to_ridershp_file: Path to the commuter rail ridership CSV file.
+
+    Returns:
+        Dict mapping CR route IDs to lists of dicts with 'date' and 'count' keys.
+    """
     ridership_by_route = format_ridership_csv(
         path_to_csv_file=path_to_ridershp_file,
         date_key="servicedate",
@@ -242,6 +303,17 @@ def format_cr_data(path_to_ridershp_file: str):
 
 
 def format_ferry_data(path_to_ridership_file: str):
+    """Format ferry ridership data into weekly peak-day averages by route.
+
+    Pre-processes daily departure data into weekly aggregates, then formats
+    into peak-day averages with canonical ferry route IDs.
+
+    Args:
+        path_to_ridership_file: Path to the ferry ridership CSV file.
+
+    Returns:
+        Dict mapping ferry route IDs to lists of dicts with 'date' and 'count' keys.
+    """
     preprocess = pre_process_csv(
         path_to_csv_file=path_to_ridership_file,
         date_key="actual_departure",
@@ -259,6 +331,17 @@ def format_ferry_data(path_to_ridership_file: str):
 
 
 def format_the_ride_data(path_to_ridership_file: str):
+    """Format The RIDE paratransit ridership data into weekly peak-day averages.
+
+    Pre-processes daily completed trip data into weekly aggregates, then
+    formats into peak-day averages under a single 'RIDE' route.
+
+    Args:
+        path_to_ridership_file: Path to The RIDE ridership CSV file.
+
+    Returns:
+        Dict mapping 'RIDE' to a list of dicts with 'date' and 'count' keys.
+    """
     preprocess = pre_process_csv(
         path_to_csv_file=path_to_ridership_file,
         date_key="Date",
@@ -282,6 +365,22 @@ def get_ridership_by_route_id(
     path_to_ferry_file: str | None,
     path_to_ride_file: str | None,
 ):
+    """Process all ridership files and merge into a single dict keyed by route ID.
+
+    Formats each transit mode's data file (if provided) and combines the results
+    into a unified mapping of route IDs to ridership entries.
+
+    Args:
+        path_to_subway_file: Path to the subway ridership CSV, or None to skip.
+        path_to_bus_file: Path to the bus ridership Excel file, or None to skip.
+        path_to_cr_file: Path to the commuter rail ridership CSV, or None to skip.
+        path_to_ferry_file: Path to the ferry ridership CSV, or None to skip.
+        path_to_ride_file: Path to The RIDE ridership CSV, or None to skip.
+
+    Returns:
+        Dict mapping route IDs to lists of dicts with 'date' and 'count' keys,
+        spanning all provided transit modes.
+    """
     subway = format_subway_data(path_to_subway_file) if path_to_subway_file else {}
     bus = format_bus_data(path_to_bus_file) if path_to_bus_file else {}
     cr = format_cr_data(path_to_cr_file) if path_to_cr_file else {}
diff --git a/ingestor/chalicelib/ridership/sharepoint.py b/ingestor/chalicelib/ridership/sharepoint.py
index fa0f8e7..bbed858 100644
--- a/ingestor/chalicelib/ridership/sharepoint.py
+++ b/ingestor/chalicelib/ridership/sharepoint.py
@@ -18,25 +18,39 @@
 
 class SharepointConnection:
     def __init__(self, user_agent: str = DEFAULT_USER_AGENT, base_url=BASE_URL, prefix="mbta") -> None:
+        """Initialize a SharePoint connection with session and configuration.
+
+        Args:
+            user_agent: User-Agent string for HTTP requests.
+            base_url: Base SharePoint URL for folder browsing.
+            prefix: SharePoint tenant prefix used in download URLs.
+        """
         self.session = self.setup_session(user_agent)
         self.base_url = base_url
         self.all_files = []
         self.prefix = prefix
 
     def setup_session(self, user_agent: str) -> requests.Session:
+        """Create and configure an HTTP session with the given User-Agent.
+
+        Args:
+            user_agent: User-Agent header value for the session.
+
+        Returns:
+            Configured requests.Session instance.
+        """
         session = requests.Session()
         session.headers.update({"User-Agent": user_agent})
         return session
 
     def get_sharepoint_folder_contents_anonymous(self, share_url):
-        """
-        Get contents of a SharePoint folder using anonymous access via sharing link.
+        """Get contents of a SharePoint folder using anonymous access via sharing link.
 
         Args:
-            share_url: The SharePoint 'anyone with the link' URL
+            share_url: The SharePoint 'anyone with the link' URL.
 
         Returns:
-            List of dictionaries containing file information, or None on error
+            List of dictionaries containing file information, or None on error.
         """
         # Follow the sharing link
         response = self.session.get(share_url, allow_redirects=True)
@@ -51,6 +65,17 @@ def get_sharepoint_folder_contents_anonymous(self, share_url):
         return files
 
     def parse_g_data(self, html: str):
+        """Parse the g_listData JavaScript variable from a SharePoint HTML page.
+
+        Extracts the embedded JSON file listing from the page source by locating
+        and parsing the g_listData variable assignment.
+
+        Args:
+            html: Raw HTML content from a SharePoint folder page.
+
+        Returns:
+            List of file info dictionaries, or None if parsing fails.
+        """
         # Extract g_listData which contains the file list
         # Find the start of g_listData
         start_marker = "g_listData = "
@@ -118,15 +143,13 @@ def parse_g_data(self, html: str):
             return None
 
     def get_folder_by_path(self, folder_path):
-        """
-        Get contents of a specific folder by its server-relative path.
+        """Get contents of a specific folder by its server-relative path.
 
         Args:
-            session: requests.Session with cookies
-            folder_path: Server-relative path like '/sites/PublicData/Shared Documents/...'
+            folder_path: Server-relative path like '/sites/PublicData/Shared Documents/...'.
 
         Returns:
-            List of file info dictionaries
+            List of file info dictionaries, or None on error.
         """
         # Construct the URL to view that specific folder
 
@@ -142,14 +165,13 @@ def get_folder_by_path(self, folder_path):
         return files
 
     def list_all_files_recursive(self, folder_path):
-        """
-        Recursively list all files in a folder and its subfolders.
+        """Recursively list all files in a folder and its subfolders.
 
         Args:
-            folder_path: Server-relative path to start from
+            folder_path: Server-relative path to start from.
 
         Returns:
-            List of all files (not folders) found
+            List of all file info dictionaries (not folders) found.
         """
         files = self.get_folder_by_path(folder_path)
         if not files:
@@ -178,15 +200,14 @@ def list_all_files_recursive(self, folder_path):
         return all_files
 
     def download_sharepoint_file_anonymous(self, file_ref, output_path):
-        """
-        Download a file from SharePoint using an existing session.
+        """Download a file from SharePoint using an existing session.
 
         Args:
-            file_ref: The FileRef path from the file list
-            output_path: Local path to save the file
+            file_ref: The FileRef path from the file list.
+            output_path: Local path to save the file.
 
         Returns:
-            True if successful, False otherwise
+            True if successful, False otherwise.
         """
         # Construct download URL
         download_url = f"https://{self.prefix}.sharepoint.com{file_ref}?download=1"
@@ -203,17 +224,22 @@ def download_sharepoint_file_anonymous(self, file_ref, output_path):
             return False
 
     def fetch_sharepoint_file(self, file_regex=None, share_url=None, target_date=None, bus_data=True):
-        """
-        Downloads files from Sharepoint matching a regex pattern.
+        """Download files from SharePoint matching a regex pattern.
 
         Args:
-            file_regex (str or Pattern): Regular expression pattern to match against filenames. If None, uses default patterns based on bus_data.
-            share_url (str): SharePoint sharing URL to download from. If None, uses default URLs based on bus_data.
-            target_date (date): Date object specifying which file to download. Used for default subway data pattern matching. Optional for Bus data, required for Subway Data when file_regex is None.
-            bus_data (bool): Whether to download Bus Data (True) or Subway Data (False). Only used when file_regex is None.
+            file_regex: Regular expression pattern (str or compiled Pattern) to match
+                against filenames. If None, uses default patterns based on bus_data.
+            share_url: SharePoint sharing URL to download from. If None, uses default
+                URLs based on bus_data.
+            target_date: Date object specifying which file to download. Used for default
+                subway data pattern matching. Optional for bus data, required for subway
+                data when file_regex is None.
+            bus_data: Whether to download bus data (True) or subway data (False). Only
+                used when file_regex is None.
 
         Returns:
-            str: Path to named Temporary File with data.
+            Path to a named temporary file containing the downloaded data, or None
+            if no matching file is found.
         """
         # Determine share URL
         if share_url is None:
@@ -289,16 +315,15 @@ def fetch_sharepoint_file(self, file_regex=None, share_url=None, target_date=Non
 
 
 def get_file_matching_date_pattern(files: List[dict], pattern: Pattern, target_date: Optional[date] = None):
-    """
-    Find a file matching a date pattern and extract the date from its name.
+    """Find a file matching a date pattern and extract the date from its name.
 
     Args:
-        files: List of file dictionaries with 'name' key
-        pattern: Compiled regex pattern with three capture groups for year, month, day
+        files: List of file dictionaries with 'name' key.
+        pattern: Compiled regex pattern with three capture groups for year, month, day.
         target_date: Specific date to match. If None, returns the newest matching file.
 
     Returns:
-        Tuple of (file_dict, date) if match found, None otherwise
+        Tuple of (file_dict, date) if a match is found, None otherwise.
     """
     newest_file = None
     newest_date = None