Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions docs/france_readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
## France Solar Data Pipeline for PVNet
This contribution adds support for France RTE solar generation data to the project.

## Changes
- Added France data processing script
- Created admin region metadata CSV
- Updated data pipeline to use integer location_ids
- Added inspection script for validation

## Data API
The definitive datasets follow the format:
https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{Region}_Annuel-Definitif_{Year}.zip

The consolidated datasets follow the format:
https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{Region}_En-cours-Consolide.zip

Note that TCH (le Taux de CHarge), which refers to the actual production compared to installed solar capacity, is only available from 2020 onwards. Hence, initially we use 2020 to 2024 (5 years) of data.

## Summer Time Behavior
When transitioning to summer time (e.g. 26 Mar 2023 2:00 to 03:00), entries between 2:00 and 3:00 are duplicated.
When transitioning back to winter time (e.g. 29 Oct 2023 3:00 to 2:00), data entries are ambiguous and 2 timesteps will be missing.

### ZARR File
The converted zarr file is available on huggingface, link:
https://huggingface.co/datasets/hhhn2/France_PV_data

## Testing
- Ran process_france_data.py successfully
- Validated output with inspect_france_training_pipeline.py

## Data Processing Results
Data Quality

Generation (MW):
Shape: (12, 87696)
Range: [0.00, 4002.00] MW
Mean: 174.98 MW
NaN count: 120 (0.01%)

Capacity (MWp):
Shape: (12, 87696)
Range: [122.70, 6000.00] MWp
Mean: 1170.15 MWp
NaN count: 0 (0.00%)

Per-Region Statistics

0:
Generation: [0.0, 2194.0] MW, Mean: 238.2 MW, NaN: 0.0%
Capacity: 1655.3 MWp, NaN: 0.0%

1:
Generation: [0.0, 883.0] MW, Mean: 78.7 MW, NaN: 0.0%
Capacity: 537.8 MWp, NaN: 0.0%

2:
Generation: [0.0, 568.0] MW, Mean: 49.3 MW, NaN: 0.0%
Capacity: 364.3 MWp, NaN: 0.0%

3:
Generation: [0.0, 975.0] MW, Mean: 97.5 MW, NaN: 0.0%
Capacity: 665.7 MWp, NaN: 0.0%

4:
Generation: [0.0, 1337.0] MW, Mean: 134.2 MW, NaN: 0.0%
Capacity: 998.7 MWp, NaN: 0.0%

5:
Generation: [0.0, 629.0] MW, Mean: 49.0 MW, NaN: 0.0%
Capacity: 361.4 MWp, NaN: 0.0%

6:
Generation: [0.0, 306.0] MW, Mean: 26.8 MW, NaN: 0.0%
Capacity: 218.7 MWp, NaN: 0.0%

7:
Generation: [0.0, 464.0] MW, Mean: 32.1 MW, NaN: 0.0%
Capacity: 247.4 MWp, NaN: 0.0%

8:
Generation: [0.0, 4002.0] MW, Mean: 534.9 MW, NaN: 0.0%
Capacity: 3524.9 MWp, NaN: 0.0%

9:
Generation: [0.0, 3287.0] MW, Mean: 438.6 MW, NaN: 0.0%
Capacity: 2799.3 MWp, NaN: 0.0%

10:
Generation: [0.0, 1213.0] MW, Mean: 118.8 MW, NaN: 0.0%
Capacity: 860.4 MWp, NaN: 0.0%

11:
Generation: [0.0, 1942.0] MW, Mean: 301.8 MW, NaN: 0.0%
Capacity: 1807.9 MWp, NaN: 0.0%

## Next Steps
- Make a NOAA GFS pipeline for France
- Compare with baseline model
13 changes: 13 additions & 0 deletions src/open_data_pvnet/configs/admin_region_lat_lon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
location_id,region,principal_municipality,latitude,longitude
0,"Auvergne-Rhône-Alpes","Lyon",45.7640,4.8357
1,"Bourgogne-Franche-Comté","Dijon",47.3220,5.0415
2,"Bretagne","Rennes",48.1173,-1.6778
3,"Centre-Val-de-Loire","Orléans",47.9030,1.9093
4,"Grand-Est","Strasbourg",48.5734,7.7521
5,"Hauts-de-France","Lille",50.6292,3.0573
6,"Ile-de-France","Paris",48.8566,2.3522
7,"Normandie","Rouen",49.4432,1.0993
8,"Nouvelle-Aquitaine","Bordeaux",44.8378,-0.5792
9,"Occitanie","Toulouse",43.6047,1.4442
10,"Pays-de-la-Loire","Nantes",47.2184,-1.5536
11,"PACA","Marseille",43.2965,5.3698
1 change: 0 additions & 1 deletion src/open_data_pvnet/scripts/fetch_pvlive_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pvlive_api import PVLive
import logging


logger = logging.getLogger(__name__)


Expand Down
15 changes: 9 additions & 6 deletions src/open_data_pvnet/scripts/generate_combined_gsp.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,13 @@

from src.open_data_pvnet.scripts.fetch_pvlive_data import PVLiveData

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


def main(
start_year: int = typer.Option(2020, help="Start year for data collection"),
end_year: int = typer.Option(2025, help="End year for data collection"),
output_folder: str = typer.Option("data", help="Output folder for the zarr dataset")
output_folder: str = typer.Option("data", help="Output folder for the zarr dataset"),
):
"""
Generate combined GSP data for all GSPs and save as a zarr dataset.
Expand All @@ -51,15 +52,15 @@ def main(
all_dataframes = []

# Changed range to start from 0 to include gsp_id=0
for gsp_id in range(0, 319):
for gsp_id in range(0, 319):
logging.info(f"Processing GSP ID {gsp_id}")
df = data_source.get_data_between(
start=range_start,
end=range_end,
entity_id=gsp_id,
extra_fields="capacity_mwp,installedcapacity_mwp"
extra_fields="capacity_mwp,installedcapacity_mwp",
)

if df is not None and not df.empty:
# Add gsp_id column to the dataframe
df["gsp_id"] = gsp_id
Expand Down Expand Up @@ -87,7 +88,9 @@ def main(
xr_pv.to_zarr(output_path, mode="w", consolidated=True)

logging.info(f"Successfully saved combined GSP dataset to {output_path}")
logging.info(f"Dataset contains GSPs 0-318 for period {range_start.date()} to {range_end.date()}")
logging.info(
f"Dataset contains GSPs 0-318 for period {range_start.date()} to {range_end.date()}"
)


if __name__ == "__main__":
Expand Down
147 changes: 147 additions & 0 deletions src/open_data_pvnet/scripts/get_generation_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
France PVNet Data Download Script

This script downloads and processes mainland France solar generation data from RTE's éCO2mix platform for PVNet training.

Data source:
RTE éCO2mix Dataset: https://www.rte-france.com/en/data-publications/eco2mix/download-indicators
- Half Hourly data for the 12 administrative regions of France, from Jan 2020 to Dec 2023 (definitive data)
- Consolidated data for Jan to Dec 2024 (in-progress data)
- Capacity (TCH) data available from Jan 2020

Usage:
python get_generation_csv.py --start_yr 2019 --end_yr 2023 --consolidate_yr 2024
# where users need to determine the consolidate year for assignment of a year in the file name
# based on the latest available data on RTE. This way filenames will be consistent with the year of data they contain.
"""

import argparse
import io
import logging
import os
import zipfile
from time import sleep

import pandas as pd
import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Output directory for the generated CSVs: a "tmp" folder three directory
# levels above the current working directory.
# NOTE(review): this assumes the script is run from its own directory
# (e.g. src/open_data_pvnet/scripts) so that "tmp" lands next to the
# repository root — confirm the intended run location.
base_dir = os.getcwd()
parent_3_levels_up = os.path.dirname(os.path.dirname(os.path.dirname(base_dir)))
output_dir = os.path.join(parent_3_levels_up, "tmp")

# The 12 mainland-France administrative regions, spelled exactly as they
# appear in RTE's éCO2mix download URLs (accents and hyphens included).
admin_region_list = [
    "Auvergne-Rhône-Alpes",
    "Bourgogne-Franche-Comté",
    "Bretagne",
    "Centre-Val-de-Loire",
    "Grand-Est",
    "Hauts-de-France",
    "Ile-de-France",
    "Normandie",
    "Nouvelle-Aquitaine",
    "Occitanie",
    "Pays-de-la-Loire",
    "PACA",
]


def get_region_generation_csv(region, year, consolidated=False) -> None:
    """Download and extract the annual generation CSV for a given region and year.

    Fetches the RTE éCO2mix ZIP archive for the region, reads the contained
    ``.xls`` member (which is actually tab-separated text, not Excel) and
    writes it out as ``eCO2mix_RTE_{region}_Annuel_{year}.csv`` in
    ``output_dir``. All failures (network error, bad HTTP status, invalid
    ZIP, missing XLS member) are logged and the function returns without
    raising, so a batch run can continue with the remaining regions.

    Args:
        region (str): The name of the region, spelled as in RTE's URLs.
        year (int): The year for which to download the data. For consolidated
            data the year only determines the output file name.
        consolidated (bool): If True, download consolidated (in-progress)
            data, otherwise download definitive data.
    """
    if consolidated:
        url = f"https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{region}_En-cours-Consolide.zip"
        data_type = "Consolidated"
    else:
        url = f"https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{region}_Annuel-Definitif_{year}.zip"
        data_type = "Definitive"

    # Download the ZIP file. A timeout is essential: without one a stalled
    # connection would hang the whole batch run indefinitely.
    try:
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        logger.error(f"Failed to download {region} {year} ({data_type}): {exc}")
        return

    # Check if request was successful
    if response.status_code != 200:
        logger.error(
            f"Failed to download {region} {year} ({data_type}): HTTP {response.status_code}"
        )
        return

    # Process the archive entirely in memory. The previous version wrote a
    # temporary ZIP and extracted the XLS into the current directory, and
    # leaked both files if a later step raised.
    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            # Find the XLS member (assuming there's one XLS file in the zip)
            xls_names = [f for f in zip_ref.namelist() if f.endswith((".xls", ".xlsx"))]
            if not xls_names:
                logger.warning(f"Skipping {region} {year} ({data_type}): No XLS file found in ZIP")
                return
            # The .xls file is actually tab-separated text, not Excel format.
            # Read as CSV with tab delimiter and latin-1 encoding for French
            # characters.
            with zip_ref.open(xls_names[0]) as xls_handle:
                df = pd.read_csv(xls_handle, sep="\t", encoding="latin-1", low_memory=False)
    except zipfile.BadZipFile:
        logger.warning(f"Skipping {region} {year} ({data_type}): Not a valid ZIP file")
        return

    # Save as CSV in the output directory
    os.makedirs(output_dir, exist_ok=True)
    csv_filename = os.path.join(output_dir, f"eCO2mix_RTE_{region}_Annuel_{year}.csv")
    df.to_csv(csv_filename, index=False)

    logger.info(f"Saved {csv_filename}")
    # Pause between downloads to avoid hammering the RTE server.
    sleep(1)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Download France RTE éCO2mix generation data for specified years"
)
parser.add_argument(
"--start_yr", type=int, help="Start year for definitive data download (default: 2019)"
)
parser.add_argument(
"--end_yr", type=int, help="End year for definitive data download (default: 2023)"
)
parser.add_argument(
"--consolidate_yr",
type=int,
help="Year for consolidated (in-progress) data download (default: 2024)",
)

args = parser.parse_args()

year_list = [year for year in range(args.start_yr, args.end_yr + 1)]

# Run for consolidated data
for region in admin_region_list:
get_region_generation_csv(region, args.consolidate_yr, consolidated=True)

# Run for all regions and definitive years
for region in admin_region_list:
for year in year_list:
get_region_generation_csv(region, year)
Loading