Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions mbta-performance/chalicelib/historic/backfill/bus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ..constants import BUS_ARCGIS_IDS
from ..process import process_bus_events

from ..download import download_all_bus_data
Expand All @@ -10,13 +11,13 @@ def backfill_bus_data(years: list = None, routes: list = None, output_dir: str =
This replaces the bash script functionality.

Args:
years: List of years to process (default: 2018-2025)
years: List of years to process (default: all years in BUS_ARCGIS_IDS)
routes: List of route IDs to process (default: common bus routes)
output_dir: Output directory for processed data
nozip: Whether to skip gzipping files
"""
if years is None:
years = list(range(2018, 2026)) # 2018-2025
years = [int(y) for y in BUS_ARCGIS_IDS.keys()]

pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

Expand Down
22 changes: 19 additions & 3 deletions mbta-performance/chalicelib/historic/backfill/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from datetime import date

from ..constants import ARCGIS_IDS, HISTORIC_COLUMNS_LAMP, HISTORIC_COLUMNS_PRE_LAMP
from ..download import download_historic_data, list_files_in_dir, prep_local_dir, unzip_historic_data
from ..process import process_events


def backfill_single_year(year: str):
def backfill_single_year(year: str, start_date: date = None, end_date: date = None):
print(f"Backfilling year {year}")
# download the data
zip_file = download_historic_data(year)
Expand All @@ -13,9 +15,11 @@ def backfill_single_year(year: str):
for file in list_files_in_dir(input_dir):
# in 2024 data moved to LAMP and the format changed
if int(year) >= 2024:
process_events(file, "data/output", columns=HISTORIC_COLUMNS_LAMP)
process_events(file, "data/output", columns=HISTORIC_COLUMNS_LAMP, start_date=start_date, end_date=end_date)
else:
process_events(file, "data/output", columns=HISTORIC_COLUMNS_PRE_LAMP)
process_events(
file, "data/output", columns=HISTORIC_COLUMNS_PRE_LAMP, start_date=start_date, end_date=end_date
)
print(f"Finished backfilling year {year}")


Expand All @@ -28,5 +32,17 @@ def backfill_all_years():
backfill_single_year(year)


def backfill_date_range(start_date: date, end_date: date):
    """Backfill only data within [start_date, end_date], spanning whichever years are needed.

    Every calendar year from ``start_date.year`` through ``end_date.year`` is
    passed to ``backfill_single_year`` together with the full date range.
    Years without an entry in ``ARCGIS_IDS`` are reported and skipped.

    Args:
        start_date: First date of the range (inclusive).
        end_date: Last date of the range (inclusive).

    Raises:
        ValueError: If ``start_date`` is later than ``end_date``.
    """
    # Reject an inverted range up front, before prep_local_dir() runs: such a
    # range cannot match any data, so failing loudly beats a silent no-op.
    if start_date > end_date:
        raise ValueError(f"start_date ({start_date}) must not be after end_date ({end_date})")

    prep_local_dir()

    for year in range(start_date.year, end_date.year + 1):
        year_str = str(year)
        if year_str not in ARCGIS_IDS:
            print(f"Skipping year {year_str}: no ARCGIS_ID configured")
            continue
        backfill_single_year(year_str, start_date=start_date, end_date=end_date)


# Script entry point: when this module is executed directly, run backfill_all_years().
if __name__ == "__main__":
backfill_all_years()
2 changes: 2 additions & 0 deletions mbta-performance/chalicelib/historic/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"2023": "9a7f5634db72459ab731b6a9b274a1d4",
"2024": "0711756aa5e1400891e79b984a94b495",
"2025": "e2344a2297004b36b82f57772926ed1a",
"2026": "a6d2a50d28fc43d8a4f86f73085d17b0",
}

# ARCGIS_IDS for each year, bus data
Expand All @@ -21,6 +22,7 @@
"2023": "b7b36fdb7b3a4728af2fccc78c2ca5b7",
"2024": "96c77138c3144906bce93d0257531b6a",
"2025": "924df13d845f4907bb6a6c3ed380d57a",
"2026": "9d8a8cad277545c984c1b25ed10b7d3c",
}


Expand Down
2 changes: 1 addition & 1 deletion mbta-performance/chalicelib/historic/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def clean_unicode_bom(file_path: str):


def download_all_bus_data():
"""Download all bus data files (2018-2025)."""
"""Download all bus data files for every year in BUS_ARCGIS_IDS."""
prep_local_dir()

# Download bus data for each year
Expand Down
16 changes: 15 additions & 1 deletion mbta-performance/chalicelib/historic/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,14 @@
from .gtfs_archive import add_gtfs_headways


def process_events(input_csv: str, outdir: str, nozip: bool = False, columns: list = HISTORIC_COLUMNS):
def process_events(
input_csv: str,
outdir: str,
nozip: bool = False,
columns: list = HISTORIC_COLUMNS,
start_date=None,
end_date=None,
):
df = pd.read_csv(
input_csv,
usecols=columns,
Expand All @@ -31,6 +38,13 @@ def process_events(input_csv: str, outdir: str, nozip: bool = False, columns: li
},
)

if start_date is not None:
df = df[df["service_date"].dt.date >= start_date]
if end_date is not None:
df = df[df["service_date"].dt.date <= end_date]
if df.empty:
return

df["event_time"] = df["service_date"] + pd.to_timedelta(df["event_time_sec"], unit="s")
df.drop(columns=["event_time_sec"], inplace=True)

Expand Down
Loading