Skip to content

Commit d0b06f7

Browse files
committed
improve handling of the new ERA5 format and review default surface variables
1 parent 89d0327 commit d0b06f7

File tree

3 files changed

+61
-46
lines changed

3 files changed

+61
-46
lines changed

TopoPyScale/fetch_era5.py

Lines changed: 39 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,7 @@
4141
'surface_solar_radiation_downwards':'ssrd',
4242
'surface_pressure':'sp',
4343
'total_precipitation':'tp',
44-
'2m_temperature':'t2m',
45-
'toa_incident_solar_radiation':'tisr',
46-
'friction_velocity':'zust',
47-
'instantaneous_moisture_flux':'ie',
48-
'instantaneous_surface_sensible_heat_flux':'ishf'}
44+
'2m_temperature':'t2m'}
4945

5046
var_plev_name_google = {'geopotential':'z',
5147
'temperature':'t',
@@ -282,7 +278,7 @@ def fetch_era5_google_from_zarr(eraDir, startDate, endDate, lonW, latS, lonE, la
282278

283279
def retrieve_era5(product, startDate, endDate, eraDir, latN, latS, lonE, lonW, step,
284280
num_threads=10, surf_plev='surf', plevels=None, realtime=False,
285-
output_format='netcdf', download_format="unarchived", new_CDS_API=True, rm_daily=False, store_as_zarr='ERA5.zarr'):
281+
output_format='netcdf', download_format="unarchived", new_CDS_API=True, rm_daily=False):
286282
""" Sets up era5 surface retrieval.
287283
* Creates list of year/month pairs to iterate through.
288284
* MARS retrievals are most efficient when subset by time.
@@ -305,7 +301,6 @@ def retrieve_era5(product, startDate, endDate, eraDir, latN, latS, lonE, lonW, s
305301
download_format (str): default "unarchived". Can be "zip"
306302
new_CDS_API: flag to handle new formatting of SURF files with the new CDS API (2024).
307303
rm_daily: remove the folder containing all daily ERA5 files. Option to clear space once data have been converted to yearly files.
308-
store_as_zarr (str): name of the Zarr store. None if you want to use the old system with Netcdf
309304
310305
Returns:
311306
Monthly era surface files stored in disk.
@@ -456,10 +451,10 @@ def retrieve_era5(product, startDate, endDate, eraDir, latN, latS, lonE, lonW, s
456451
except:
457452
raise IOError('deletion of daily files issue')
458453

459-
print("===> ERA5 files ready")
454+
print("===> ERA5 netcdf files ready")
460455

461456

462-
def era5_request_surf(dataset, year, month, day, bbox, target, product, time, output_format= "netcdf", download_format="unarchived"):
457+
def era5_request_surf(dataset, year, month, day, bbox, target, product, time, varoi=None, output_format= "netcdf", download_format="unarchived"):
463458
"""CDS surface api call
464459
465460
Args:
@@ -470,23 +465,26 @@ def era5_request_surf(dataset, year, month, day, bbox, target, product, time, ou
470465
target (str): filename
471466
product (str): type of model run. default: reanalysis
472467
time (str or list): hours for which to download data
468+
varoi (list): list of variables of interest to download. Defaults to None, which falls back to the minimum set needed for TPS
473469
output_format (str): default is "netcdf", can be "grib".
474470
download_format (str): default "unarchived". Can be "zip"
475471
476472
Returns:
477473
Store to disk dataset as indicated
478474
479475
"""
480-
481-
varnames = ['geopotential', '2m_dewpoint_temperature', 'surface_thermal_radiation_downwards',
482-
'surface_solar_radiation_downwards','surface_pressure',
483-
'total_precipitation', '2m_temperature', 'toa_incident_solar_radiation',
484-
'friction_velocity', 'instantaneous_moisture_flux', 'instantaneous_surface_sensible_heat_flux']
485-
476+
if varoi is None:
477+
varoi = ['geopotential',
478+
'2m_dewpoint_temperature',
479+
'surface_thermal_radiation_downwards',
480+
'surface_solar_radiation_downwards','surface_pressure',
481+
'total_precipitation',
482+
'2m_temperature']
483+
486484
c = cdsapi.Client()
487485
c.retrieve(
488486
dataset,
489-
{'variable': varnames,
487+
{'variable': varoi,
490488
'product_type': [product],
491489
"area": bbox,
492490
'year': year,
@@ -502,7 +500,7 @@ def era5_request_surf(dataset, year, month, day, bbox, target, product, time, ou
502500
unzip_file(str(target))
503501

504502

505-
def era5_request_plev(dataset, year, month, day, bbox, target, product, time, plevels, output_format= "netcdf", download_format="unarchived"):
503+
def era5_request_plev(dataset, year, month, day, bbox, target, product, time, plevels, varoi=None, output_format= "netcdf", download_format="unarchived"):
506504
"""CDS plevel api call
507505
508506
Args:
@@ -514,23 +512,30 @@ def era5_request_plev(dataset, year, month, day, bbox, target, product, time, pl
514512
product (str): type of model run. default: reanalysis
515513
time (str or list): hours to query
516514
plevels (str or list): pressure levels to query
515+
varoi (list): list of variables of interest to download. Defaults to None, which falls back to the minimum set needed for TPS
517516
output_format (str): default is "netcdf", can be "grib".
518517
download_format (str): default "unarchived". Can be "zip"
519518
520519
Returns:
521520
Store to disk dataset as indicated
522521
523522
"""
523+
if varoi is None:
524+
varoi = ['geopotential',
525+
'temperature',
526+
'u_component_of_wind',
527+
'v_component_of_wind',
528+
'specific_humidity',
529+
'relative_humidity']
530+
531+
524532
c = cdsapi.Client()
525533
c.retrieve(
526534
dataset,
527535
{
528536
'product_type': [product],
529537
"area": bbox,
530-
'variable': [
531-
'geopotential', 'temperature', 'u_component_of_wind',
532-
'v_component_of_wind', 'specific_humidity', 'relative_humidity'
533-
],
538+
'variable': varoi,
534539
'pressure_level': plevels,
535540
'year': year,
536541
'month': '%02d'%(month),
@@ -684,7 +689,7 @@ def unzip_file(file_path):
684689
merged_file_path = os.path.join(os.path.dirname(zip_file_path), os.path.basename(zip_file_path).replace('.zip', '.nc'))
685690
try:
686691
# Combine all `.nc` files
687-
datasets = [xr.open_dataset(nc_file) for nc_file in nc_files]
692+
datasets = [xr.open_dataset(nc_file, engine='netcdf4') for nc_file in nc_files]
688693
merged_ds = xr.merge(datasets) # Adjust dimension as needed
689694
merged_ds.to_netcdf(merged_file_path)
690695
print(f"Merged .nc files into {merged_file_path}.")
@@ -721,12 +726,12 @@ def remap_CDSbeta(file_pattern, file_type='SURF'):
721726
try:
722727
ds = ds.drop_vars('number')
723728
except:
724-
print("variables not found")
729+
print("Coordinate 'number' not found")
725730

726731
try:
727732
ds = ds.drop_vars('expver')
728733
except:
729-
print("variables not found")
734+
print("Coordinate 'expver' not found")
730735

731736
ds.to_netcdf(nc_file+ "_remap", mode='w')
732737
# move remap back to orig name
@@ -740,25 +745,24 @@ def remap_CDSbeta(file_pattern, file_type='SURF'):
740745
ds = xr.open_dataset(nc_file)
741746
ds = ds.rename({ 'valid_time' : 'time'})
742747

743-
try:
744-
#cdo delname,number,ishf,ie,zust,tisr SURF_20240925.nc SURF_clean.nc
745-
#ds2 = ds.swap_dims({'valid_time': 'time'})
746-
ds = ds.drop_vars('ishf')
747-
ds = ds.drop_vars('ie')
748-
ds = ds.drop_vars('zust')
749-
ds = ds.drop_vars('tisr')
750-
except:
751-
print("variables not found")
748+
#try:
749+
# #cdo delname,number,ishf,ie,zust,tisr SURF_20240925.nc SURF_clean.nc
750+
# ds = ds.drop_vars('ishf')
751+
# ds = ds.drop_vars('ie')
752+
# ds = ds.drop_vars('zust')
753+
# ds = ds.drop_vars('tisr')
754+
#except:
755+
# print("variables not found")
752756

753757
try:
754758
ds = ds.drop_vars('number')
755759
except:
756-
print("variables not found")
760+
print("Coordinate 'number' not found")
757761

758762
try:
759763
ds = ds.drop_vars('expver')
760764
except:
761-
print("variables not found")
765+
print("Coordinate 'expver' not found")
762766

763767
ds.to_netcdf(nc_file + "_remap", mode='w')
764768
# move remap back to orig name

TopoPyScale/topoclass.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,13 @@ def extract_topo_cluster_param(self):
315315
else:
316316
if not os.path.isabs(mask_file):
317317
mask_file = Path(self.config.project.directory, mask_file)
318+
if not mask_file.exists():
319+
raise ValueError(f'File {mask_file} does not exist.')
320+
318321
# read mask TIFF
319-
ds_mask = rio.open_rasterio(mask_file).to_dataset('band').rename({1: 'mask'})
322+
ds_mask = xr.open_dataset(mask_file, engine='rasterio').band_data.isel(band=0).drop_vars('band').to_dataset()
323+
ds_mask = ds_mask.rename({'band_data': 'mask'})
324+
self.toposub.ds_param['mask'] = ds_mask.mask.drop_vars('spatial_ref')
320325

321326
# check if bounds and resolution match
322327
if not ds_mask.rio.bounds() == self.toposub.ds_param.rio.bounds() or not ds_mask.rio.resolution() == self.toposub.ds_param.rio.resolution():
@@ -340,9 +345,12 @@ def extract_topo_cluster_param(self):
340345
split_clustering = True
341346
if not os.path.isabs(groups_file):
342347
groups_file = Path(self.config.project.directory, groups_file)
348+
if not groups_file.exists():
349+
raise ValueError(f'File {groups_file} does not exist.')
343350

344351
# read cluster TIFF
345-
ds_group = rio.open_rasterio(groups_file).to_dataset('band').rename({1: 'group'})
352+
ds_group = xr.open_dataset(groups_file, engine='rasterio').band_data.isel(band=0).drop_vars('band').to_dataset()
353+
ds_group = ds_group.rename({'band_data': 'group'})
346354

347355
# check if bounds and resolution match
348356
if not ds_group.rio.bounds() == self.toposub.ds_param.rio.bounds() or not ds_group.rio.resolution() == self.toposub.ds_param.rio.resolution():
@@ -784,8 +792,7 @@ def get_era5(self):
784792
output_format=output_format,
785793
download_format=download_format,
786794
new_CDS_API=True,
787-
rm_daily=self.config.climate[self.config.project.climate].rm_daily,
788-
store_as_zarr=self.config.climate.era5.zarr_store
795+
rm_daily=self.config.climate[self.config.project.climate].rm_daily
789796
)
790797
# retrieve era5 plevels
791798
fe.retrieve_era5(
@@ -802,8 +809,7 @@ def get_era5(self):
802809
output_format=output_format,
803810
download_format=download_format,
804811
new_CDS_API=True,
805-
rm_daily=self.config.climate[self.config.project.climate].rm_daily,
806-
store_as_zarr=self.config.climate.era5.zarr_store
812+
rm_daily=self.config.climate[self.config.project.climate].rm_daily
807813
)
808814

809815
elif data_repository == 'google_cloud_storage':

doc/docs/03_configurationFile.md

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,15 @@ my_project/
77
├── inputs/
88
├── dem/
99
├── my_dem.tif
10-
└── pts_list.csv (OPTIONAL: to downscale to specific points)
10+
├── my_dem_mask.tif (OPTIONAL: to mask part of the DEM)
11+
├── my_dem_groups.tif (OPTIONAL: to delineate groups of clusters)
12+
└── pts_list.csv (OPTIONAL: to downscale to specific points)
1113
└── climate/
12-
├── PLEV*.nc
13-
└── SURF*.nc
14+
├── daily/
15+
├── ERA5.zarr (OPTIONAL: to downscale with zarr optimization)
16+
└── yearly/
17+
├── PLEV*.nc
18+
└── SURF*.nc
1419
├── outputs/
1520
├── tmp/
1621
└── downscaled/
@@ -116,8 +121,8 @@ sampling:
116121
n_clusters: 50 # number of cluster to segment the DEM
117122
random_seed: 2 # random seed for the K-mean clustering
118123
clustering_features: { 'x': 1, 'y': 1, 'elevation': 1, 'slope': 1, 'aspect_cos': 1, 'aspect_sin': 1, 'svf': 1 } # dictionary of the features of choice to use in clustering, with their relative importance. Relative importance is a multiplier applied after scaling
119-
clustering_mask: clustering/catchment_mask.tif # optional path to tif containing a mask (0/1)
120-
clustering_groups: clustering/groups.tif # optional path to a tif containing cluster groups (int values), e.g. land cover
124+
clustering_mask: inputs/dem/catchment_mask.tif # optional path to tif containing a mask (0/1)
125+
clustering_groups: inputs/dem/groups.tif # optional path to a tif containing cluster groups (int values), e.g. land cover
121126

122127
#.....................................................................................................
123128
toposcale:

0 commit comments

Comments
 (0)