Make ref table function way simpler and more efficient (#209)

ehinman · jzemmels · web-flow · commit 6328e09973a5 · 2026-01-22T11:41:53.000-06:00
* make ref table function way simpler and more efficient

* add more documentation, an example

* add in the deduplication line

* add required packages for gpd.explore in docs and move a requirement to docs section of pyproject.toml

* Update dataretrieval/waterdata/api.py

Co-authored-by: Joe Zemmels (he/him) <jzemmels@gmail.com>

* update notebook to pip install all required packages and fix small change

---------

Co-authored-by: Joe Zemmels (he/him) <jzemmels@gmail.com>
diff --git a/.github/workflows/sphinx-docs.yml b/.github/workflows/sphinx-docs.yml
@@ -18,7 +18,6 @@ jobs:
         shell: bash -l {0}
         run: |
           python -m pip install --upgrade pip
-          pip install "docutils<0.22"
           pip install .[doc,nldi]
           ipython kernel install --name "python3" --user
           sudo apt update -y && sudo apt install -y latexmk texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended dvipng pandoc
diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py
@@ -1417,6 +1417,7 @@ def get_field_measurements(
 
     return get_ogc_data(args, output_id, service)
 
+
 def get_reference_table(
         collection: str,
         limit: Optional[int] = None,
@@ -1441,6 +1442,27 @@ def get_reference_table(
         allowable limit is 50000. It may be beneficial to set this number lower
         if your internet connection is spotty. The default (None) will set the
         limit to the maximum allowable limit for the service.
+    
+    Returns
+    -------
+    df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame``
+        Formatted data returned from the API query. The primary metadata
+        of each reference table will show up in the first column, where
+        the name of the column is the singular form of the collection name,
+        separated by underscores (e.g. the "medium-codes" reference table
+        has a column called "medium_code", which contains all possible
+        medium code values).
+    md : :obj:`dataretrieval.utils.Metadata`
+        A custom metadata object including the URL request and query time.
+    
+    Examples
+    --------
+    .. code::
+
+        >>> # Get table of USGS parameter codes
+        >>> ref, md = dataretrieval.waterdata.get_reference_table(
+        ...     collection="parameter-codes"
+        ... )
     """
     valid_code_services = get_args(METADATA_COLLECTIONS)
     if collection not in valid_code_services:
@@ -1449,29 +1471,19 @@ def get_reference_table(
             f"Valid options are: {valid_code_services}."
         )
     
-    req = _construct_api_requests(
-        service=collection,
-        limit=limit,
-        skip_geometry=True,
-    )
-    # Run API request and iterate through pages if needed
-    return_list, response = _walk_pages(
-        geopd=False, req=req
-    )
-
-    # Give ID column a more meaningful name
-    if collection.endswith("s"):
-        return_list = return_list.rename(
-            columns={"id": f"{collection[:-1].replace('-', '_')}_id"}
-            )
+    # Give ID column the collection name with underscores
+    if collection.endswith("s") and collection != "counties":
+        output_id = f"{collection[:-1].replace('-', '_')}"
+    elif collection == "counties":
+        output_id = "county"
     else:
-        return_list = return_list.rename(
-            columns={"id": f"{collection.replace('-', '_')}_id"}
-            )
-
-    # Create metadata object from response
-    metadata = BaseMetadata(response)
-    return return_list, metadata
+        output_id = f"{collection.replace('-', '_')}"
+    
+    return get_ogc_data(
+        args={},
+        output_id=output_id,
+        service=collection
+        )
 
 
 def get_codes(code_service: CODE_SERVICES) -> pd.DataFrame:
diff --git a/demos/WaterData_demo.ipynb b/demos/WaterData_demo.ipynb
@@ -87,20 +87,17 @@
    "metadata": {},
    "source": [
     "## Examples\n",
-    "Let's get into some examples using the functions listed above. First, we need to load the `waterdata` module and a few other packages and functions to go through the examples. To run the entirety of this notebook, you will need to install `dataretrieval`, `matplotlib`, and `geopandas` packages. `matplotlib` is needed to create the plots, and `geopandas` is needed to create the interactive maps."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cd626a14",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install necessary packages to run notebook\n",
+    "Let's get into some examples using the functions listed above. First, we need to load the `waterdata` module and a few other packages and functions to go through the examples. To run the entirety of this notebook, you will need to install `dataretrieval`, `matplotlib`, and `geopandas` packages (plus dependencies). `matplotlib` is needed to create the plots, and `geopandas` is needed to create the interactive maps.\n",
+    "\n",
+    "Note that if you use conda rather than pip, you do not need to install folium and mapclassify separately, as they are included in the conda-forge geopandas install.\n",
+    "\n",
+    "```python\n",
     "!pip install dataretrieval\n",
     "!pip install matplotlib\n",
-    "!pip install geopandas"
+    "!pip install geopandas\n",
+    "!pip install folium\n",
+    "!pip install mapclassify\n",
+    "``` "
    ]
   },
   {
@@ -156,7 +153,7 @@
    "outputs": [],
    "source": [
     "streamflow_pcodes = pcodes[pcodes['parameter_name'].str.contains('streamflow|discharge', case=False, na=False)]\n",
-    "display(streamflow_pcodes[['parameter_code_id', 'parameter_name']])"
+    "display(streamflow_pcodes[['parameter_code', 'parameter_name']])"
    ]
   },
   {
@@ -599,7 +596,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "waterdata-demo",
+   "display_name": "waterdata-demo-pip",
    "language": "python",
    "name": "python3"
   },
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,13 +39,16 @@ test = [
   "flake8",
 ]
 doc = [
+  "docutils<0.22",
   "sphinx",
   "sphinx-rtd-theme",
   "nbsphinx",
   "nbsphinx_link",
   "ipython",
   "ipykernel",
   "matplotlib",
+  "folium>=0.12",
+  "mapclassify"
 ]
 nldi = [
   'geopandas>=0.10'
diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
@@ -248,7 +248,7 @@ def test_get_time_series_metadata():
 
 def test_get_reference_table():
     df, md = get_reference_table("agency-codes")
-    assert "agency_code_id" in df.columns
+    assert "agency_code" in df.columns
     assert df.shape[0] > 0
     assert hasattr(md, 'url')
     assert hasattr(md, 'query_time')