Skip to content

Commit ad4fdf8

Browse files
authored
Merge pull request #18 from quantifyearth/mwd-add-occurrence-validation
Add occurrence validation to run script
2 parents afdba4d + 8049518 commit ad4fdf8

File tree

9 files changed

+146
-51
lines changed

9 files changed

+146
-51
lines changed

prepare_layers/convert_crosswalk.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,10 @@ def convert_crosswalk(
5050
for code in codes:
5151
res.append([hab, code])
5252

53+
# This addition is for the islands correction layer, which will
54+
# be added as an effective lcc_0.tif
55+
res.append(["islands", "0"])
56+
5357
df = pd.DataFrame(res, columns=["code", "value"])
5458
df.to_csv(output_path, index=False)
5559

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import argparse
2+
import os
3+
from pathlib import Path
4+
5+
import yirgacheffe as yg
6+
7+
def remove_nans_from_mask(
8+
input_path: Path,
9+
output_path: Path,
10+
) -> None:
11+
os.makedirs(output_path.parent, exist_ok=True)
12+
with yg.read_raster(input_path) as layer:
13+
converted = layer.nan_to_num()
14+
converted.to_geotiff(output_path)
15+
16+
def main() -> None:
17+
parser = argparse.ArgumentParser(description="Convert NaNs to zeros in mask layers")
18+
parser.add_argument(
19+
'--original',
20+
type=Path,
21+
help="Original raster",
22+
required=True,
23+
dest="original_path",
24+
)
25+
parser.add_argument(
26+
'--output',
27+
type=Path,
28+
help='Destination raster',
29+
required=True,
30+
dest='output_path',
31+
)
32+
args = parser.parse_args()
33+
34+
remove_nans_from_mask(
35+
args.original_path,
36+
args.output_path
37+
)
38+
39+
if __name__ == "__main__":
40+
main()

prepare_species/common.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
"threats",
2929
"category",
3030
"category_weight",
31-
"geometry"
31+
"geometry",
32+
"in_star",
3233
]
3334

3435
# From Muir et al: For each species, a global STAR threat abatement (START) score
@@ -84,6 +85,8 @@ class SpeciesReport:
8485
"keeps_habitats",
8586
"has_geometries",
8687
"keeps_geometries",
88+
"has_category",
89+
"in_star",
8790
"filename",
8891
]
8992

@@ -224,6 +227,16 @@ def tidy_reproject_save(
224227
os.makedirs(output_directory_path, exist_ok=True)
225228
output_path = output_directory_path / f"{grow.id_no}.geojson"
226229
res = gpd.GeoDataFrame(grow.to_frame().transpose(), crs=src_crs, geometry="geometry")
230+
231+
# Ensure proper dtypes for JSON serialization
232+
int_columns = ['id_no', 'assessment_id', 'assessment_year', 'elevation_lower', 'elevation_upper', 'category_weight']
233+
for col in int_columns:
234+
if col in res.columns:
235+
res[col] = res[col].astype('Int64') # Nullable integer type
236+
237+
if 'in_star' in res.columns:
238+
res['in_star'] = res['in_star'].astype(bool)
239+
227240
res_projected = res.to_crs(target_crs)
228241
res_projected.to_file(output_path, driver="GeoJSON")
229242
report.filename = output_path

prepare_species/extract_species_data_psql.py

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
logging.basicConfig()
3131
logger.setLevel(logging.DEBUG)
3232

33+
# Note that this query returns more species than would be accepted into STAR based on
34+
# threat category, but for model checking we want all AOHs for a given taxon, so the
35+
# pipeline must process them all, and then just include the correct set when processing
36+
# STAR
3337
MAIN_STATEMENT = """
3438
SELECT
3539
assessments.sis_taxon_id as id_no,
@@ -54,7 +58,7 @@
5458
AND taxons.class_name = %s
5559
AND taxons.infra_type is NULL -- no subspecies
5660
AND taxons.metadata->>'taxon_level' = 'Species'
57-
AND red_list_category_lookup.code IN ('NT', 'VU', 'EN', 'CR')
61+
AND red_list_category_lookup.code IN ('DD', 'LC', 'NT', 'VU', 'EN', 'CR')
5862
"""
5963

6064
SYSTEMS_STATEMENT = """
@@ -138,20 +142,10 @@ def process_row(
138142
presence += (4,)
139143
report.possibly_extinct = True # pylint: disable=W0201
140144

141-
cursor.execute(SYSTEMS_STATEMENT, (assessment_id,))
142-
systems_data = cursor.fetchall()
143-
try:
144-
systems = process_systems(systems_data, report)
145-
except ValueError as exc:
146-
logger.debug("Dropping %s: %s", id_no, str(exc))
147-
return report
148-
149-
cursor.execute(THREATS_STATEMENT, (assessment_id,))
150-
raw_threats = cursor.fetchall()
151-
threats = process_threats(raw_threats, report)
152-
if len(threats) == 0:
153-
return report
145+
include_in_star = category in ('NT', 'VU', 'EN', 'CR')
146+
report.has_category = include_in_star
154147

148+
# First checks are the ones that rule out being able to make an AOH at all
155149
cursor.execute(HABITATS_STATEMENT, (assessment_id,))
156150
raw_habitats = cursor.fetchall()
157151
try:
@@ -170,6 +164,37 @@ def process_row(
170164
except ValueError as _exc:
171165
return report
172166

167+
# Second checks are whether it is good for STAR, so from hereon we should
168+
# output a GeoJSON regardless as we should make an AOH for validation with this
169+
cursor.execute(SYSTEMS_STATEMENT, (assessment_id,))
170+
systems_data = cursor.fetchall()
171+
try:
172+
systems = process_systems(systems_data, report)
173+
except ValueError as exc:
174+
logger.debug("Dropping %s: %s", id_no, str(exc))
175+
include_in_star = False
176+
try:
177+
systems = systems_data[0][0]
178+
except IndexError:
179+
systems = []
180+
181+
cursor.execute(THREATS_STATEMENT, (assessment_id,))
182+
raw_threats = cursor.fetchall()
183+
threats = process_threats(raw_threats, report)
184+
if len(threats) == 0:
185+
include_in_star = False
186+
187+
report.in_star = include_in_star
188+
189+
try:
190+
category_weight = CATEGORY_WEIGHTS[category]
191+
except KeyError:
192+
assert include_in_star is False
193+
category_weight = 0
194+
195+
# This is a fix as per the method to include the missing islands layer:
196+
habitats_list = list(habitats) + ["islands"]
197+
173198
gdf = gpd.GeoDataFrame(
174199
[[
175200
id_no,
@@ -179,14 +204,15 @@ def process_row(
179204
systems,
180205
int(elevation_lower) if elevation_lower is not None else None,
181206
int(elevation_upper) if elevation_upper is not None else None,
182-
'|'.join(list(habitats)),
207+
'|'.join(habitats_list),
183208
scientific_name,
184209
family_name,
185210
class_name,
186211
json.dumps(threats),
187212
category,
188-
CATEGORY_WEIGHTS[category],
189-
geometry
213+
category_weight,
214+
geometry,
215+
include_in_star,
190216
]],
191217
columns=COLUMNS,
192218
crs=target_projection or 'epsg:4326'

prepare_species/extract_species_data_redlist.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,8 @@ def process_species(
243243
logger.debug("Dropping %s: %s", id_no, exc)
244244
return report
245245

246+
habitats_list = list(habitats) + ["islands"]
247+
246248
# Create GeoDataFrame with all data
247249
gdf = gpd.GeoDataFrame(
248250
[[
@@ -253,7 +255,7 @@ def process_species(
253255
systems,
254256
elevation_lower,
255257
elevation_upper,
256-
'|'.join(list(habitats)),
258+
'|'.join(habitats_list),
257259
scientific_name,
258260
family_name,
259261
class_name,

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ pymer4
77
pyproj
88
scikit-image
99
redlistapi
10-
yirgacheffe>=1.9
11-
aoh[validation]>=1.0
10+
yirgacheffe>=1.12,<2.0
11+
aoh[validation]>=2.0.1,<3.0
1212

1313
# GDAL should be installed manually to match the version of the library installed on your machine
1414
gdal[numpy]

scripts/run.sh

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -63,32 +63,20 @@ if [ ! -d "${DATADIR}"/habitat_layers ]; then
6363

6464
echo "Processing habitat map..."
6565
aoh-habitat-process --habitat "${DATADIR}"/habitat/raw.tif \
66-
--scale 1000.0 \
66+
--scale 992.292720200000133 \
6767
--projection "ESRI:54009" \
6868
--output "${DATADIR}"/tmp_habitat_layers/current
6969
mv "${DATADIR}"/tmp_habitat_layers "${DATADIR}"/habitat_layers
7070
fi
7171

72-
if [ ! -d "${DATADIR}"/masks ]; then
73-
echo "Processing masks..."
74-
python3 ./prepare_layers/make_masks.py --habitat_layers "${DATADIR}"/habitat_layers/current \
75-
--output_directory "${DATADIR}"/masks
72+
if [ ! -f "${DATADIR}"/habitat_layers/current/lcc_0.tif ]; then
73+
cp "${DATADIR}/Zenodo/MissingLandcover_1km_cover.tif" "${DATADIR}"/habitat_layers/current/lcc_0.tif
7674
fi
7775

78-
# Fetch and prepare the elevation layers
79-
if [[ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif || ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]]; then
80-
if [ ! -f "${DATADIR}"/elevation/elevation.tif ]; then
81-
echo "Fetching elevation map..."
82-
reclaimer zenodo --zenodo_id 5719984 --filename dem-100m-esri54017.tif --output "${DATADIR}"/elevation/elevation.tif
83-
fi
84-
if [ ! -f "${DATADIR}"/elevation/elevation-max-1k.tif ]; then
85-
echo "Generating elevation max layer..."
86-
gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r max -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-max-1k.tif
87-
fi
88-
if [ ! -f "${DATADIR}"/elevation/elevation-min-1k.tif ]; then
89-
echo "Generating elevation min layer..."
90-
gdalwarp -t_srs ESRI:54009 -tr 1000 -1000 -r min -co COMPRESS=LZW -wo NUM_THREADS=40 "${DATADIR}"/elevation/elevation.tif "${DATADIR}"/elevation/elevation-min-1k.tif
91-
fi
76+
if [ ! -f "${DATADIR}"/masks/CGLS100Inland_withGADMIslands.tif ]; then
77+
echo "Processing masks..."
78+
python3 ./prepare_layers/remove_nans_from_mask.py --original "${DATADIR}"/Zenodo/CGLS100Inland_withGADMIslands.tif \
79+
--output "${DATADIR}"/masks/CGLS100Inland_withGADMIslands.tif
9280
fi
9381

9482
# Generate the crosswalk table
@@ -102,7 +90,10 @@ for TAXA in "${TAXALIST[@]}"
10290
do
10391
if [ ! -d "${DATADIR}"/species-info/"${TAXA}"/ ]; then
10492
echo "Extracting species data for ${TAXA}..."
105-
python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" --output "${DATADIR}"/species-info/"${TAXA}"/ --projection "ESRI:54009" --excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv
93+
python3 ./prepare_species/extract_species_data_psql.py --class "${TAXA}" \
94+
--output "${DATADIR}"/species-info/"${TAXA}"/ \
95+
--projection "ESRI:54009" \
96+
--excludes "${DATADIR}"/SpeciesList_generalisedRangePolygons.csv
10697
fi
10798
done
10899

@@ -133,6 +124,18 @@ aoh-collate-data --aoh_results "${DATADIR}"/aohs/current/ \
133124
echo "Calculating model validation..."
134125
aoh-validate-prevalence --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
135126
--output "${DATADIR}"/validation/model_validation.csv
127+
for TAXA in "${TAXALIST[@]}"
128+
do
129+
echo "Fetching GBIF data for ${TAXA}..."
130+
aoh-fetch-gbif-data --collated_aoh_data "${DATADIR}"/validation/aohs.csv \
131+
--taxa "${TAXA}" \
132+
--output_dir "${DATADIR}"/validation/occurrences/
133+
echo "Validating occurrences for ${TAXA}..."
134+
aoh-validate-occurrences --gbif_data_path "${DATADIR}"/validation/occurrences/"${TAXA}" \
135+
--species_data "${DATADIR}"/species-info/"${TAXA}"/current/ \
136+
--aoh_results "${DATADIR}"/aohs/current/"${TAXA}"/ \
137+
--output "${DATADIR}"/validation/occurrences/"${TAXA}".csv
138+
done
136139

137140
# Threats
138141
echo "Generating threat task list..."

utils/aoh_generator.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,21 @@ def aoh_generator(
2222
for species_path in species_paths:
2323
res.append([
2424
data_dir / "habitat_layers" / scenario,
25-
data_dir / "elevation" / "elevation-max-1k.tif",
26-
data_dir / "elevation" / "elevation-min-1k.tif",
25+
data_dir / "Zenodo" / "FABDEM_1km_max_patched.tif",
26+
data_dir / "Zenodo" / "FABDEM_1km_min_patched.tif",
2727
data_dir / "crosswalk.csv",
2828
species_path,
29-
data_dir / "masks" / "terrestrial_mask.tif",
29+
data_dir / "masks" / "CGLS100Inland_withGADMIslands.tif",
3030
data_dir / "aohs" / scenario / taxa_dir_path.name,
3131
])
3232

3333
df = pd.DataFrame(res, columns=[
34-
'--habitats',
34+
'--fractional_habitats',
3535
'--elevation-max',
3636
'--elevation-min',
3737
'--crosswalk',
3838
'--speciesdata',
39-
'--area',
39+
'--weights',
4040
'--output'
4141
])
4242
output_dir, _ = os.path.split(output_csv_path)

utils/threats_generator.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22

33
import argparse
4+
import json
45
import os
56
from pathlib import Path
67

@@ -23,11 +24,17 @@ def threats_generator(
2324
taxon_id = species_path.stem
2425
aoh_path = data_dir / "aohs" / source / taxa_dir_path.name / f"{taxon_id}_all.tif"
2526
if aoh_path.exists():
26-
res.append([
27-
species_path,
28-
aoh_path,
29-
data_dir / "threat_rasters" / taxa_dir_path.name,
30-
])
27+
# Check if species should be included in STAR. We don't use geopandas here
28+
# because it will parse the geometry, which can be quite slow, and we just
29+
# want a single bool field
30+
with open(species_path, 'r', encoding="UTF-8") as f:
31+
data = json.load(f)
32+
if data['features'][0]['properties']['in_star']:
33+
res.append([
34+
species_path,
35+
aoh_path,
36+
data_dir / "threat_rasters" / taxa_dir_path.name,
37+
])
3138

3239
df = pd.DataFrame(res, columns=[
3340
'--speciesdata',

0 commit comments

Comments
 (0)