99import numpy as np
1010from os import PathLike
1111from pandas import DataFrame
12+ import zipfile
13+ import io
1214
1315from constants import (
1416 CRS ,
2123from _helpers import mock_snakemake , configure_logging
2224
2325NATURAL_EARTH_RESOLUTION = "10m"
26+ GDAM_LV1 = "NAME_1"
27+ GDAM_LV2 = "NAME_2"
2428
2529logger = logging .getLogger (__name__ )
2630
@@ -37,7 +41,8 @@ def fetch_natural_earth_shape(
3741
3842 Example:
3943 china country: build_natural_earth_shape("admin_0_countries", "ADMIN", "China")
40- china provinces: build_natural_earth_shape("admin_1_states_provinces", "iso_a2", "CN", region_key = "name_en")
44+ china provinces: build_natural_earth_shape("admin_1_states_provinces",
45+ "iso_a2", "CN", region_key="name_en")
4146
4247 Returns:
4348 gpd.GeoDataFrame: the filtered records
@@ -121,7 +126,7 @@ def fetch_maritime_eez(zone_name: str) -> gpd.GeoDataFrame:
121126
122127 def find_record_id (zone_name : str ) -> int :
123128 # get Maritime Gazette record ID for the country
124- # the eez ID is 70: see https://www.marineregions.org/gazetteer.php?p=webservices&type=rest#/
129+ # eez ID is 70: see https://www.marineregions.org/gazetteer.php?p=webservices&type=rest#/
125130 url = f"https://www.marineregions.org/rest/getGazetteerRecordsByName.json/{ zone_name } /?like=true&fuzzy=false&typeID=70&offset=0&count=100"
126131 response = requests .get (url )
127132 if response .status_code != 200 :
@@ -142,13 +147,15 @@ def find_record_id(zone_name: str) -> int:
142147 # URL of the WFS service
143148 url = "https://geo.vliz.be/geoserver/wfs"
144149 # WFS request parameters + record ID filter
150+ base_filter_ = "<Filter><PropertyIsEqualTo><PropertyName>mrgid_eez</PropertyName><Literal>"
151+ filter_ = base_filter_ + f"{ mgrid } </Literal></PropertyIsEqualTo></Filter>"
145152 params = dict (
146153 service = "WFS" ,
147154 version = "1.1.0" ,
148155 request = "GetFeature" ,
149156 typeName = "MarineRegions:eez" ,
150157 outputFormat = "json" ,
151- filter = f"<Filter><PropertyIsEqualTo><PropertyName>mrgid_eez</PropertyName><Literal> { mgrid } </Literal></PropertyIsEqualTo></Filter>" ,
158+ filter = filter_ ,
152159 )
153160
154161 # Fetch data from WFS using requests
@@ -169,6 +176,143 @@ def find_record_id(zone_name: str) -> int:
169176 return eez .set_crs (epsg = crs )
170177
171178
def fetch_gadm(country_code="CHN", level=2, shp_dir="resources/data/province_shapes"):
    """
    Fetch the GADM shapefile for a given country and administrative level.

    Data source: https://gadm.org/download_country.html

    Args:
        country_code (str): ISO3 country code (e.g. 'CHN', 'USA').
        level (int): administrative level (0=country, 1=province/state,
            2=prefecture, ...).
        shp_dir (str): directory the downloaded archive is extracted into.
            Defaults to the project's province-shapes resource folder.

    Returns:
        gpd.GeoDataFrame: the requested administrative boundaries.

    Raises:
        ValueError: if the download fails or the requested level is not
            contained in the archive.
    """
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/shp/gadm41_{country_code}_shp.zip"

    # download the zip archive for the whole country (all admin levels)
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(
            f"Failed to download data for {country_code} - Status code: {response.status_code}"
        )

    # extract the archive in memory, then read only the requested level
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        level_filename = f"gadm41_{country_code}_{level}.shp"
        if level_filename not in z.namelist():
            raise ValueError(f"Level {level} shapefile not found for {country_code}.")

        z.extractall(shp_dir)
        gdf = gpd.read_file(f"{shp_dir}/{level_filename}")
    return gdf
212+
213+
def fetch_prefecture_shapes(fixes=None):
    """
    Fetch prefecture-level (admin2) shapes for China and normalise names.

    Args:
        fixes (dict | None): optional mapping {column: {old_name: new_name}}
            applied to the GADM attribute table. Defaults to the standard
            renames aligning GADM province names with the project's naming
            (e.g. "Nei Mongol" -> "InnerMongolia"). Pass {} to disable.

    Returns:
        gpd.GeoDataFrame: columns COUNTRY, NAME_1, NAME_2 and geometry.
    """
    # NOTE: default is built inside the function body — a dict literal as a
    # default argument would be shared (and mutable) across calls
    if fixes is None:
        fixes = {
            GDAM_LV1: {
                "Nei Mongol": "InnerMongolia",
                "Xinjiang Uygur": "Xinjiang",
                "Hong Kong": "HongKong",
                "Ningxia Hui": "Ningxia",
            }
        }

    gdf = fetch_gadm(country_code="CHN", level=2)
    for col, fix_dict in fixes.items():
        # exact-value replace; avoids fragile f-string .query() building
        # (names containing quotes would break the query string)
        gdf[col] = gdf[col].replace(fix_dict)
    return gdf[["COUNTRY", "NAME_1", "NAME_2", "geometry"]]
236+
237+
def build_nodes(
    prefectures: gpd.GeoDataFrame,
    nodes_cfg: dict,
) -> gpd.GeoSeries:
    """Build the model nodes from prefecture shapes.

    Nodes are either whole provinces (admin1, the default) or user-configured
    splits of provinces into groups of admin2 subregions.

    Args:
        prefectures (gpd.GeoDataFrame): all prefecture (admin2) shapes with
            the GDAM_LV1/GDAM_LV2 name columns.
        nodes_cfg (dict): node build configuration. Keys used:
            "split_provinces" (bool), "splits" (dict, required when
            splitting) and "exclude_provinces" (str or list of str,
            optional; only used when not splitting).

    Returns:
        gpd.GeoSeries: node geometries indexed by node name.
    """
    gdf = prefectures.copy()
    if nodes_cfg.get("split_provinces", False):
        validate_split_cfg(nodes_cfg["splits"], gdf)
        return split_provinces(gdf, nodes_cfg)

    # merge prefectures up to provinces (was `provs = provs = ...`)
    provs = gdf.dissolve(GDAM_LV1)
    # accept a single province name or a list of names; default to nothing
    excluded = nodes_cfg.get("exclude_provinces", [])
    if isinstance(excluded, str):
        excluded = [excluded]
    provs = provs.drop(excluded)
    return provs.rename_axis("node")["geometry"]
254+
255+
def validate_split_cfg(split_cfg: dict, gdf: gpd.GeoDataFrame):
    """Validate the province split configuration.

    Each province (admin level 1) is split into subregions made of admin
    level 2 entries: {province: {subregion: [prefecture names], ...}, ...}.
    The prefecture names must be unique and together cover exactly all
    admin2 regions of the province.

    Args:
        split_cfg (dict): the configuration for the province split
        gdf (gpd.GeoDataFrame): the geodataframe with prefecture shapes
    Raises:
        ValueError: if a province is unknown, a prefecture is listed more
            than once, or the listed prefectures do not cover the province.
    """
    for admin1 in split_cfg:
        known_provinces = gdf[GDAM_LV1].unique()
        if admin1 not in known_provinces:
            raise ValueError(
                f"Invalid admin1 entry {admin1} not found in provinces {known_provinces}"
            )

        # flatten the subregion lists into one list of prefecture names
        admin2 = [name for members in split_cfg[admin1].values() for name in members]

        # check uniqueness first (duplicates would otherwise surface as a
        # confusing completeness error); the original pop()-based check
        # destroyed the list and was unreachable in practice
        if len(set(admin2)) != len(admin2):
            raise ValueError(f"Duplicated prefecture names in {admin1}: {admin2}")

        # check completeness against the actual admin2 regions of the province
        all_admin2 = gdf.query(f'{GDAM_LV1} == "{admin1}"')[GDAM_LV2].unique().tolist()
        if sorted(admin2) != sorted(all_admin2):
            raise ValueError(
                f"{admin1} prefectures do not match expected:\ngot {admin2}\nvs\n{all_admin2}"
            )
289+
290+
# TODO consider returning country and province
def split_provinces(
    prefectures: gpd.GeoDataFrame,
    node_config: dict,
) -> gpd.GeoDataFrame:
    """
    Split provinces into sub-province nodes based on prefecture groups.

    Each province listed in node_config["splits"] is replaced by nodes named
    "<province>_<subregion>" whose geometry is the union of the listed
    prefectures (e.g. Inner Mongolia into East and West regions).

    Args:
        prefectures (gpd.GeoDataFrame): all Chinese prefectures (admin2).
        node_config (dict): node build configuration with a "splits" entry
            of the form {province: {subregion: [prefecture names]}}.
    Returns:
        gpd.GeoDataFrame: geometries dissolved by node name ("node" index).
    """
    gdf = prefectures.copy()
    for admin1, splits in node_config["splits"].items():
        mask = gdf.query(f"{GDAM_LV1} == '{admin1}'").index
        # invert {subregion: [prefectures]} into {prefecture: "<province>_<subregion>"}
        splits_inv = {vv: admin1 + "_" + k for k, v in splits.items() for vv in v}
        # use the GDAM_LV2 constant (was the hard-coded literal "NAME_2")
        gdf.loc[mask, GDAM_LV1] = gdf.loc[mask, GDAM_LV2].map(splits_inv)

    # merge prefecture geometries into one geometry per node
    gdf.rename(columns={GDAM_LV1: "node"}, inplace=True)
    return gdf[["node", "geometry"]].dissolve(by="node", aggfunc="sum")
314+
315+
172316def cut_smaller_from_larger (
173317 row : gpd .GeoSeries , gdf : gpd .GeoDataFrame , overlaps : DataFrame
174318) -> gpd .GeoSeries :
@@ -192,7 +336,7 @@ def cut_smaller_from_larger(
192336 for idx in ovrlap_idx :
193337 geom = gdf .iloc [idx ].geometry
194338 if row .geometry .area > geom .area :
195- row . geometry = row . geometry .difference (geom )
339+ row [ " geometry" ] = row [ " geometry" ] .difference (geom )
196340 elif row .geometry .area == geom .area :
197341 raise ValueError (f"Equal area overlap between { row .name } and { idx } - unhandled" )
198342 return row
@@ -286,22 +430,35 @@ def eez_by_region(
286430 snakemake = mock_snakemake ("fetch_region_shapes" )
287431 configure_logging (snakemake , logger = logger )
288432
433+ nodes_config = snakemake .config .get ("nodes" , {"split_provinces" : False })
434+ tol = snakemake .config ["fetch_regions" ]["simplify_tol" ]
435+
289436 logger .info (f"Fetching country shape { COUNTRY_NAME } from cartopy" )
290437 fetch_country_shape (snakemake .output .country_shape )
291438 logger .info (f"Country shape saved to { snakemake .output .country_shape } " )
292439
293440 logger .info (f"Fetching province shapes for { COUNTRY_ISO } from cartopy" )
294441 # TODO it would be better to filter by set regions after making the voronoi polygons
295- regions = fetch_province_shapes ()
442+ if not nodes_config .get ("split_provinces" , False ):
443+ regions = fetch_province_shapes ()
444+ else :
445+ logger .info ("Splitting provinces into user defined nodes" )
446+ prefectures = fetch_prefecture_shapes ()
447+ nodes = build_nodes (prefectures , nodes_config )
448+ nodes .simplify (tol ["land" ]).to_file (snakemake .output .province_shapes .replace (".geojson" , "_nodestest.geojson" ), driver = "GeoJSON" )
449+
450+ raise NotImplementedError (
451+ "Province splitting is not implemented accross the whole workflow yet."
452+ )
453+
296454 regions .to_file (snakemake .output .province_shapes , driver = "GeoJSON" )
297455 regions .to_file (snakemake .output .prov_shpfile )
298456 logger .info (f"Province shapes saved to { snakemake .output .province_shapes } " )
299457
300458 logger .info (f"Fetching maritime zones for EEZ prefix { EEZ_PREFIX } " )
301459 eez_country = fetch_maritime_eez (EEZ_PREFIX )
302460 logger .info ("Breaking by reion" )
303- tol = snakemake .config ["fetch_regions" ]["simplify_tol" ]
304- eez_by_region (eez_country , regions , prov_key = "province" , simplify_tol = tol ).to_file (
461+ eez_by_region (eez_country , regions , prov_key = "province" , simplify_tol = tol ["eez" ]).to_file (
305462 snakemake .output .offshore_shapes , driver = "GeoJSON"
306463 )
307464
0 commit comments