Skip to content

Commit 3f74bd6

Browse files
authored
feat: add dataset acquisition script and set up dependencies (#2)
- Rename fetch script to download_sbb_gtfs.py for clearer intent.
- Enhance download script with standard English comments and error handling.
- Update requirements.txt to include requests for fetching data.
- Ignore data/raw_data/ directory in .gitignore to avoid tracking large files.
1 parent 8ec4e43 commit 3f74bd6

3 files changed

Lines changed: 58 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,6 @@ coverage/
4242
# Misc
4343
.vercel
4444
.netlify
45+
46+
# Data
47+
data/raw_data/

data/scripts/download_sbb_gtfs.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import requests
2+
import zipfile
3+
import os
4+
5+
# Define absolute paths based on this script's location so the script
# works regardless of the current working directory it is invoked from.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# DATA_DIR represents the 'data' folder (the parent of this scripts directory).
DATA_DIR = os.path.dirname(SCRIPT_DIR)
# Default extraction target: <data>/raw_data (git-ignored; holds the GTFS files).
DEFAULT_EXTRACT_DIR = os.path.join(DATA_DIR, "raw_data")
10+
11+
def download_sbb_gtfs(extract_dir=DEFAULT_EXTRACT_DIR):
    """Download the latest SBB GTFS static dataset and extract it.

    Fetches the official 2026 GTFS zip via the opentransportdata.swiss
    permalink (no API key required; the permalink redirects to the latest
    version), extracts it into ``extract_dir``, and removes the temporary
    zip file afterwards. Errors are reported on stdout rather than raised.

    Args:
        extract_dir: Target directory for the extracted GTFS files.
            Created if it does not exist. Defaults to <data>/raw_data.
    """
    # Official SBB 2026 GTFS static data permalink.
    # No API key required, directly downloads the latest version.
    permalink_url = "https://data.opentransportdata.swiss/dataset/timetable-2026-gtfs2020/permalink"

    print("Connecting to official SBB permalink to fetch the latest GTFS data...")

    # Save the temporary zip in the data directory, next to the extraction target.
    zip_filename = os.path.join(DATA_DIR, "sbb_gtfs_temp.zip")

    try:
        print("Starting dataset download...")

        # stream=True downloads the archive in chunks instead of holding it
        # all in memory; allow_redirects=True follows the permalink redirect
        # to the actual zip file URL. The timeout prevents an unresponsive
        # server from hanging the script forever (original had none).
        with requests.get(permalink_url, stream=True, allow_redirects=True,
                          timeout=60) as r:
            r.raise_for_status()
            with open(zip_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # extract_dir is an absolute path; print it as-is (the original
        # prefixed it with "./", producing a misleading path).
        print(f"Download completed. Extracting to directory: {extract_dir}")

        # Extract the contents of the zip file.
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

        print("Done. The dataset is ready for use.")

        # List a few extracted files to verify extraction.
        extracted_files = os.listdir(extract_dir)
        print(f"Total files extracted: {len(extracted_files)}. Examples: {', '.join(extracted_files[:5])}...")

    except requests.exceptions.RequestException as e:
        print(f"Network request failed: {e}")
    except zipfile.BadZipFile:
        print("Error: The downloaded ZIP file is corrupted.")
    except Exception as e:
        print(f"An unknown error occurred: {e}")
    finally:
        # Always clean up the temporary zip, even when extraction fails.
        # The original removed it only on the success path, leaking the
        # partial/corrupt archive after a BadZipFile or extraction error.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)
# Script entry point: when run directly, download and extract the dataset
# into the default location (<data>/raw_data).
if __name__ == "__main__":
    download_sbb_gtfs()

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
requests

0 commit comments

Comments
 (0)