99import numpy as np
1010from os import PathLike
1111from pandas import DataFrame
12+ import zipfile
13+ import io
1214
1315from constants import (
1416 CRS ,
2123from _helpers import mock_snakemake , configure_logging
2224
2325NATURAL_EARTH_RESOLUTION = "10m"
26+ GDAM_LV1 = "NAME_1"
27+ GDAM_LV2 = "NAME_2"
2428
2529logger = logging .getLogger (__name__ )
2630
@@ -37,7 +41,8 @@ def fetch_natural_earth_shape(
3741
3842 Example:
3943 china country: build_natural_earth_shape("admin_0_countries", "ADMIN", "China")
40- china provinces: build_natural_earth_shape("admin_1_states_provinces", "iso_a2", "CN", region_key = "name_en")
44+ china provinces: build_natural_earth_shape("admin_1_states_provinces",
45+ "iso_a2", "CN", region_key="name_en")
4146
4247 Returns:
4348 gpd.GeoDataFrame: the filtered records
@@ -121,7 +126,7 @@ def fetch_maritime_eez(zone_name: str) -> gpd.GeoDataFrame:
121126
122127 def find_record_id (zone_name : str ) -> int :
123128 # get Maritime Gazette record ID for the country
124- # the eez ID is 70: see https://www.marineregions.org/gazetteer.php?p=webservices&type=rest#/
129+ # eez ID is 70: see https://www.marineregions.org/gazetteer.php?p=webservices&type=rest#/
125130 url = f"https://www.marineregions.org/rest/getGazetteerRecordsByName.json/{ zone_name } /?like=true&fuzzy=false&typeID=70&offset=0&count=100"
126131 response = requests .get (url )
127132 if response .status_code != 200 :
@@ -142,13 +147,15 @@ def find_record_id(zone_name: str) -> int:
142147 # URL of the WFS service
143148 url = "https://geo.vliz.be/geoserver/wfs"
144149 # WFS request parameters + record ID filter
150+ base_filter_ = "<Filter><PropertyIsEqualTo><PropertyName>mrgid_eez</PropertyName><Literal>"
151+ filter_ = base_filter_ + f"{ mgrid } </Literal></PropertyIsEqualTo></Filter>"
145152 params = dict (
146153 service = "WFS" ,
147154 version = "1.1.0" ,
148155 request = "GetFeature" ,
149156 typeName = "MarineRegions:eez" ,
150157 outputFormat = "json" ,
151- filter = f"<Filter><PropertyIsEqualTo><PropertyName>mrgid_eez</PropertyName><Literal> { mgrid } </Literal></PropertyIsEqualTo></Filter>" ,
158+ filter = filter_ ,
152159 )
153160
154161 # Fetch data from WFS using requests
@@ -169,6 +176,143 @@ def find_record_id(zone_name: str) -> int:
169176 return eez .set_crs (epsg = crs )
170177
171178
def fetch_gadm(country_code="CHN", level=2, shp_dir="resources/data/province_shapes"):
    """
    Fetch the GADM shapefile for a given country and administrative level.

    Data source: https://gadm.org/download_country.html

    Args:
        country_code (str): ISO3 country code (e.g. 'CHN', 'USA').
        level (int): administrative level (0=country, 1=province/state,
            2=prefecture, ...).
        shp_dir (str): directory the downloaded archive is extracted into.
            Defaults to the project's province-shapes resource folder.

    Returns:
        gpd.GeoDataFrame: the requested administrative boundaries.

    Raises:
        ValueError: if the download fails or the requested level is not
            contained in the archive.
    """
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/shp/gadm41_{country_code}_shp.zip"

    # download the zip archive for the whole country (all admin levels)
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError(
            f"Failed to download data for {country_code} - Status code: {response.status_code}"
        )

    # extract the archive in memory, then read only the requested level
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        level_filename = f"gadm41_{country_code}_{level}.shp"
        if level_filename not in z.namelist():
            raise ValueError(f"Level {level} shapefile not found for {country_code}.")

        z.extractall(shp_dir)
        gdf = gpd.read_file(f"{shp_dir}/{level_filename}")
    return gdf
212+
213+
def fetch_prefecture_shapes(fixes=None):
    """
    Fetch prefecture-level (admin2) shapes for China and normalise names.

    Args:
        fixes (dict | None): optional mapping {column: {old_name: new_name}}
            applied to the GADM attribute table. Defaults to the standard
            renames aligning GADM province names with the project's naming
            (e.g. "Nei Mongol" -> "InnerMongolia"). Pass {} to disable.

    Returns:
        gpd.GeoDataFrame: columns COUNTRY, NAME_1, NAME_2 and geometry.
    """
    # NOTE: default is built inside the function body — a dict literal as a
    # default argument would be shared (and mutable) across calls
    if fixes is None:
        fixes = {
            GDAM_LV1: {
                "Nei Mongol": "InnerMongolia",
                "Xinjiang Uygur": "Xinjiang",
                "Hong Kong": "HongKong",
                "Ningxia Hui": "Ningxia",
            }
        }

    gdf = fetch_gadm(country_code="CHN", level=2)
    for col, fix_dict in fixes.items():
        # exact-value replace; avoids fragile f-string .query() building
        # (names containing quotes would break the query string)
        gdf[col] = gdf[col].replace(fix_dict)
    return gdf[["COUNTRY", "NAME_1", "NAME_2", "geometry"]]
236+
237+
def build_nodes(
    prefectures: gpd.GeoDataFrame,
    nodes_cfg: dict,
) -> gpd.GeoSeries:
    """Build the model nodes from prefecture shapes.

    Nodes are either whole provinces (admin1, the default) or user-configured
    splits of provinces into groups of admin2 subregions.

    Args:
        prefectures (gpd.GeoDataFrame): all prefecture (admin2) shapes with
            the GDAM_LV1/GDAM_LV2 name columns.
        nodes_cfg (dict): node build configuration. Keys used:
            "split_provinces" (bool), "splits" (dict, required when
            splitting) and "exclude_provinces" (str or list of str,
            optional; only used when not splitting).

    Returns:
        gpd.GeoSeries: node geometries indexed by node name.
    """
    gdf = prefectures.copy()
    if nodes_cfg.get("split_provinces", False):
        validate_split_cfg(nodes_cfg["splits"], gdf)
        return split_provinces(gdf, nodes_cfg)

    # merge prefectures up to provinces (was `provs = provs = ...`)
    provs = gdf.dissolve(GDAM_LV1)
    # accept a single province name or a list of names; default to nothing
    excluded = nodes_cfg.get("exclude_provinces", [])
    if isinstance(excluded, str):
        excluded = [excluded]
    provs = provs.drop(excluded)
    return provs.rename_axis("node")["geometry"]
254+
255+
def validate_split_cfg(split_cfg: dict, gdf: gpd.GeoDataFrame):
    """Validate the province split configuration.

    Each province (admin level 1) is split into subregions made of admin
    level 2 entries: {province: {subregion: [prefecture names], ...}, ...}.
    The prefecture names must be unique and together cover exactly all
    admin2 regions of the province.

    Args:
        split_cfg (dict): the configuration for the province split
        gdf (gpd.GeoDataFrame): the geodataframe with prefecture shapes
    Raises:
        ValueError: if a province is unknown, a prefecture is listed more
            than once, or the listed prefectures do not cover the province.
    """
    for admin1 in split_cfg:
        known_provinces = gdf[GDAM_LV1].unique()
        if admin1 not in known_provinces:
            raise ValueError(
                f"Invalid admin1 entry {admin1} not found in provinces {known_provinces}"
            )

        # flatten the subregion lists into one list of prefecture names
        admin2 = [name for members in split_cfg[admin1].values() for name in members]

        # check uniqueness first (duplicates would otherwise surface as a
        # confusing completeness error); the original pop()-based check
        # destroyed the list and was unreachable in practice
        if len(set(admin2)) != len(admin2):
            raise ValueError(f"Duplicated prefecture names in {admin1}: {admin2}")

        # check completeness against the actual admin2 regions of the province
        all_admin2 = gdf.query(f'{GDAM_LV1} == "{admin1}"')[GDAM_LV2].unique().tolist()
        if sorted(admin2) != sorted(all_admin2):
            raise ValueError(
                f"{admin1} prefectures do not match expected:\ngot {admin2}\nvs\n{all_admin2}"
            )
289+
290+
# TODO consider returning country and province
def split_provinces(
    prefectures: gpd.GeoDataFrame,
    node_config: dict,
) -> gpd.GeoDataFrame:
    """
    Split provinces into sub-province nodes based on prefecture groups.

    Each province listed in node_config["splits"] is replaced by nodes named
    "<province>_<subregion>" whose geometry is the union of the listed
    prefectures (e.g. Inner Mongolia into East and West regions).

    Args:
        prefectures (gpd.GeoDataFrame): all Chinese prefectures (admin2).
        node_config (dict): node build configuration with a "splits" entry
            of the form {province: {subregion: [prefecture names]}}.
    Returns:
        gpd.GeoDataFrame: geometries dissolved by node name ("node" index).
    """
    gdf = prefectures.copy()
    for admin1, splits in node_config["splits"].items():
        mask = gdf.query(f"{GDAM_LV1} == '{admin1}'").index
        # invert {subregion: [prefectures]} into {prefecture: "<province>_<subregion>"}
        splits_inv = {vv: admin1 + "_" + k for k, v in splits.items() for vv in v}
        # use the GDAM_LV2 constant (was the hard-coded literal "NAME_2")
        gdf.loc[mask, GDAM_LV1] = gdf.loc[mask, GDAM_LV2].map(splits_inv)

    # merge prefecture geometries into one geometry per node
    gdf.rename(columns={GDAM_LV1: "node"}, inplace=True)
    return gdf[["node", "geometry"]].dissolve(by="node", aggfunc="sum")
314+
315+
172316def cut_smaller_from_larger (
173317 row : gpd .GeoSeries , gdf : gpd .GeoDataFrame , overlaps : DataFrame
174318) -> gpd .GeoSeries :
@@ -192,7 +336,7 @@ def cut_smaller_from_larger(
192336 for idx in ovrlap_idx :
193337 geom = gdf .iloc [idx ].geometry
194338 if row .geometry .area > geom .area :
195- row . geometry = row . geometry .difference (geom )
339+ row [ " geometry" ] = row [ " geometry" ] .difference (geom )
196340 elif row .geometry .area == geom .area :
197341 raise ValueError (f"Equal area overlap between { row .name } and { idx } - unhandled" )
198342 return row
@@ -286,22 +430,35 @@ def eez_by_region(
286430 snakemake = mock_snakemake ("fetch_region_shapes" )
287431 configure_logging (snakemake , logger = logger )
288432
433+ nodes_config = snakemake .config .get ("nodes" , {"split_provinces" : False })
434+ tol = snakemake .config ["fetch_regions" ]["simplify_tol" ]
435+
289436 logger .info (f"Fetching country shape { COUNTRY_NAME } from cartopy" )
290437 fetch_country_shape (snakemake .output .country_shape )
291438 logger .info (f"Country shape saved to { snakemake .output .country_shape } " )
292439
293440 logger .info (f"Fetching province shapes for { COUNTRY_ISO } from cartopy" )
294441 # TODO it would be better to filter by set regions after making the voronoi polygons
295- regions = fetch_province_shapes ()
442+ if not nodes_config .get ("split_provinces" , False ):
443+ regions = fetch_province_shapes ()
444+ else :
445+ logger .info ("Splitting provinces into user defined nodes" )
446+ prefectures = fetch_prefecture_shapes ()
447+ nodes = build_nodes (prefectures , nodes_config )
448+ nodes .simplify (tol ["land" ]).to_file (snakemake .output .province_shapes .replace (".geojson" , "_nodestest.geojson" ), driver = "GeoJSON" )
449+
450+ raise NotImplementedError (
451+ "Province splitting is not implemented accross the whole workflow yet."
452+ )
453+
296454 regions .to_file (snakemake .output .province_shapes , driver = "GeoJSON" )
297455 regions .to_file (snakemake .output .prov_shpfile )
298456 logger .info (f"Province shapes saved to { snakemake .output .province_shapes } " )
299457
300458 logger .info (f"Fetching maritime zones for EEZ prefix { EEZ_PREFIX } " )
301459 eez_country = fetch_maritime_eez (EEZ_PREFIX )
302460 logger .info ("Breaking by reion" )
303- tol = snakemake .config ["fetch_regions" ]["simplify_tol" ]
304- eez_by_region (eez_country , regions , prov_key = "province" , simplify_tol = tol ).to_file (
461+ eez_by_region (eez_country , regions , prov_key = "province" , simplify_tol = tol ["eez" ]).to_file (
305462 snakemake .output .offshore_shapes , driver = "GeoJSON"
306463 )
307464
0 commit comments