Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions docs/france_readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
## France Solar Data Pipeline for PVNet
This contribution adds support for France RTE solar generation data to the project.

## Changes
- Added France data processing script
- Created admin region metadata CSV
- Updated data pipeline to use integer location_ids
- Added inspection script for validation

## Data API
The definitive datasets follow the format:
https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{Region}_Annuel-Definitif_{Year}.zip

The consolidated datasets follow the format:
https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{Region}_En-cours-Consolide.zip

Note that TCH (le Taux de CHarge), which refers to the actual production compared to installed solar capacity, is only available from 2020 onwards. Hence, initially we use 2020 to 2024 (5 years) of data.

## Summer Time Behavior
When transitioning to summer time (e.g. 26 Mar 2023 2:00 to 03:00), entries between 2:00 and 3:00 are duplicated.
When transitioning back to winter time (e.g. 29 Oct 2023 3:00 to 2:00), data entries are ambiguous and 2 timesteps will be missing.

### ZARR File
The converted zarr file is available on huggingface, link:
https://huggingface.co/datasets/hhhn2/France_PV_data

## Testing
- Ran process_france_data.py successfully
- Validated output with inspect_france_training_pipeline.py

## Data Processing Results
Data Quality

Generation (MW):
Shape: (12, 87696)
Range: [0.00, 4002.00] MW
Mean: 174.98 MW
NaN count: 120 (0.01%)

Capacity (MWp):
Shape: (12, 87696)
Range: [122.70, 6000.00] MWp
Mean: 1170.15 MWp
NaN count: 0 (0.00%)

Per-Region Statistics

0:
Generation: [0.0, 2194.0] MW, Mean: 238.2 MW, NaN: 0.0%
Capacity: 1655.3 MWp, NaN: 0.0%

1:
Generation: [0.0, 883.0] MW, Mean: 78.7 MW, NaN: 0.0%
Capacity: 537.8 MWp, NaN: 0.0%

2:
Generation: [0.0, 568.0] MW, Mean: 49.3 MW, NaN: 0.0%
Capacity: 364.3 MWp, NaN: 0.0%

3:
Generation: [0.0, 975.0] MW, Mean: 97.5 MW, NaN: 0.0%
Capacity: 665.7 MWp, NaN: 0.0%

4:
Generation: [0.0, 1337.0] MW, Mean: 134.2 MW, NaN: 0.0%
Capacity: 998.7 MWp, NaN: 0.0%

5:
Generation: [0.0, 629.0] MW, Mean: 49.0 MW, NaN: 0.0%
Capacity: 361.4 MWp, NaN: 0.0%

6:
Generation: [0.0, 306.0] MW, Mean: 26.8 MW, NaN: 0.0%
Capacity: 218.7 MWp, NaN: 0.0%

7:
Generation: [0.0, 464.0] MW, Mean: 32.1 MW, NaN: 0.0%
Capacity: 247.4 MWp, NaN: 0.0%

8:
Generation: [0.0, 4002.0] MW, Mean: 534.9 MW, NaN: 0.0%
Capacity: 3524.9 MWp, NaN: 0.0%

9:
Generation: [0.0, 3287.0] MW, Mean: 438.6 MW, NaN: 0.0%
Capacity: 2799.3 MWp, NaN: 0.0%

10:
Generation: [0.0, 1213.0] MW, Mean: 118.8 MW, NaN: 0.0%
Capacity: 860.4 MWp, NaN: 0.0%

11:
Generation: [0.0, 1942.0] MW, Mean: 301.8 MW, NaN: 0.0%
Capacity: 1807.9 MWp, NaN: 0.0%

## Next Steps
- Make a NOAA GFS pipeline for France
- Compare with baseline model
13 changes: 13 additions & 0 deletions src/open_data_pvnet/configs/admin_region_lat_lon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
location_id,region,principal_municipality,latitude,longitude
0,"Auvergne-Rhône-Alpes","Lyon",45.7640,4.8357
1,"Bourgogne-Franche-Comté","Dijon",47.3220,5.0415
2,"Bretagne","Rennes",48.1173,-1.6778
3,"Centre-Val-de-Loire","Orléans",47.9030,1.9093
4,"Grand-Est","Strasbourg",48.5734,7.7521
5,"Hauts-de-France","Lille",50.6292,3.0573
6,"Ile-de-France","Paris",48.8566,2.3522
7,"Normandie","Rouen",49.4432,1.0993
8,"Nouvelle-Aquitaine","Bordeaux",44.8378,-0.5792
9,"Occitanie","Toulouse",43.6047,1.4442
10,"Pays-de-la-Loire","Nantes",47.2184,-1.5536
11,"PACA","Marseille",43.2965,5.3698
1 change: 0 additions & 1 deletion src/open_data_pvnet/scripts/fetch_pvlive_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pvlive_api import PVLive
import logging


logger = logging.getLogger(__name__)


Expand Down
15 changes: 9 additions & 6 deletions src/open_data_pvnet/scripts/generate_combined_gsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@

from src.open_data_pvnet.scripts.fetch_pvlive_data import PVLiveData

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def main(
start_year: int = typer.Option(2020, help="Start year for data collection"),
end_year: int = typer.Option(2025, help="End year for data collection"),
output_folder: str = typer.Option("data", help="Output folder for the zarr dataset")
output_folder: str = typer.Option("data", help="Output folder for the zarr dataset"),
):
"""
Generate combined GSP data for all GSPs and save as a zarr dataset.
Expand All @@ -51,15 +52,15 @@ def main(
all_dataframes = []

# Changed range to start from 0 to include gsp_id=0
for gsp_id in range(0, 319):
for gsp_id in range(0, 319):
logging.info(f"Processing GSP ID {gsp_id}")
df = data_source.get_data_between(
start=range_start,
end=range_end,
entity_id=gsp_id,
extra_fields="capacity_mwp,installedcapacity_mwp"
extra_fields="capacity_mwp,installedcapacity_mwp",
)

if df is not None and not df.empty:
# Add gsp_id column to the dataframe
df["gsp_id"] = gsp_id
Expand Down Expand Up @@ -87,7 +88,9 @@ def main(
xr_pv.to_zarr(output_path, mode="w", consolidated=True)

logging.info(f"Successfully saved combined GSP dataset to {output_path}")
logging.info(f"Dataset contains GSPs 0-318 for period {range_start.date()} to {range_end.date()}")
logging.info(
f"Dataset contains GSPs 0-318 for period {range_start.date()} to {range_end.date()}"
)


if __name__ == "__main__":
Expand Down
147 changes: 147 additions & 0 deletions src/open_data_pvnet/scripts/get_generation_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
France PVNet Data Download Script

This script downloads and processes mainland France solar generation data from RTE's éCO2mix platform for PVNet training.

Data source:
RTE éCO2mix Dataset: https://www.rte-france.com/en/data-publications/eco2mix/download-indicators
- Half Hourly data for the 12 administrative regions of France, from Jan 2020 to Dec 2023 (definitive data)
- Consolidated data for Jan to Dec 2024 (in-progress data)
- Capacity (TCH) data available from Jan 2020

Usage:
python get_generation_csv.py --start_yr 2019 --end_yr 2023 --consolidate_yr 2024
# where users need to determine the consolidate year for assignment of a year in the file name
# based on the latest available data on RTE. This way filenames will be consistent with the year of data they contain.
"""

import argparse
import io
import logging
import os
import zipfile
from time import sleep

import pandas as pd
import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Output directory for the generated CSVs: a "tmp" folder three directory
# levels above the current working directory.
# NOTE(review): this assumes the script is run from its own directory
# (e.g. src/open_data_pvnet/scripts) so that "tmp" lands next to the
# repository root — confirm the intended run location.
base_dir = os.getcwd()
parent_3_levels_up = os.path.dirname(os.path.dirname(os.path.dirname(base_dir)))
output_dir = os.path.join(parent_3_levels_up, "tmp")

# The 12 mainland-France administrative regions, spelled exactly as they
# appear in RTE's éCO2mix download URLs (accents and hyphens included).
admin_region_list = [
    "Auvergne-Rhône-Alpes",
    "Bourgogne-Franche-Comté",
    "Bretagne",
    "Centre-Val-de-Loire",
    "Grand-Est",
    "Hauts-de-France",
    "Ile-de-France",
    "Normandie",
    "Nouvelle-Aquitaine",
    "Occitanie",
    "Pays-de-la-Loire",
    "PACA",
]


def get_region_generation_csv(region, year, consolidated=False) -> None:
    """Download and extract the annual generation CSV for a given region and year.

    Fetches the RTE éCO2mix ZIP archive for the region, reads the contained
    ``.xls`` member (which is actually tab-separated text, not Excel) and
    writes it out as ``eCO2mix_RTE_{region}_Annuel_{year}.csv`` in
    ``output_dir``. All failures (network error, bad HTTP status, invalid
    ZIP, missing XLS member) are logged and the function returns without
    raising, so a batch run can continue with the remaining regions.

    Args:
        region (str): The name of the region, spelled as in RTE's URLs.
        year (int): The year for which to download the data. For consolidated
            data the year only determines the output file name.
        consolidated (bool): If True, download consolidated (in-progress)
            data, otherwise download definitive data.
    """
    if consolidated:
        url = f"https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{region}_En-cours-Consolide.zip"
        data_type = "Consolidated"
    else:
        url = f"https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{region}_Annuel-Definitif_{year}.zip"
        data_type = "Definitive"

    # Download the ZIP file. A timeout is essential: without one a stalled
    # connection would hang the whole batch run indefinitely.
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        logger.error(f"Failed to download {region} {year} ({data_type}): {exc}")
        return

    # Check if request was successful
    if response.status_code != 200:
        logger.error(
            f"Failed to download {region} {year} ({data_type}): HTTP {response.status_code}"
        )
        return

    # Process the archive entirely in memory. The previous version wrote a
    # temporary ZIP and extracted the XLS into the current directory, and
    # leaked both files if a later step raised.
    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            # Find the XLS member (assuming there's one XLS file in the zip)
            xls_names = [f for f in zip_ref.namelist() if f.endswith((".xls", ".xlsx"))]
            if not xls_names:
                logger.warning(f"Skipping {region} {year} ({data_type}): No XLS file found in ZIP")
                return
            # The .xls file is actually tab-separated text, not Excel format.
            # Read as CSV with tab delimiter and latin-1 encoding for French
            # characters.
            with zip_ref.open(xls_names[0]) as xls_handle:
                df = pd.read_csv(xls_handle, sep="\t", encoding="latin-1", low_memory=False)
    except zipfile.BadZipFile:
        logger.warning(f"Skipping {region} {year} ({data_type}): Not a valid ZIP file")
        return

    # Save as CSV in the output directory
    os.makedirs(output_dir, exist_ok=True)
    csv_filename = os.path.join(output_dir, f"eCO2mix_RTE_{region}_Annuel_{year}.csv")
    df.to_csv(csv_filename, index=False)

    logger.info(f"Saved {csv_filename}")
    # Pause between downloads to avoid hammering the RTE server.
    sleep(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download France RTE éCO2mix generation data for specified years"
)
parser.add_argument(
"--start_yr", type=int, help="Start year for definitive data download (default: 2019)"
)
parser.add_argument(
"--end_yr", type=int, help="End year for definitive data download (default: 2023)"
)
parser.add_argument(
"--consolidate_yr",
type=int,
help="Year for consolidated (in-progress) data download (default: 2024)",
)

args = parser.parse_args()

year_list = [year for year in range(args.start_yr, args.end_yr + 1)]

# Run for consolidated data
for region in admin_region_list:
get_region_generation_csv(region, args.consolidate_yr, consolidated=True)

# Run for all regions and definitive years
for region in admin_region_list:
for year in year_list:
get_region_generation_csv(region, year)
Loading