cassiebuhler commited on
Commit
b0bb178
·
1 Parent(s): 70d09b9

zonal stats + splitting geometry scripts

Browse files
.gitignore CHANGED
@@ -196,3 +196,8 @@ k8s/secret-deployment.yaml
196
  duck.db
197
  query_log.csv
198
  **/*.zip
 
 
 
 
 
 
196
  duck.db
197
  query_log.csv
198
  **/*.zip
199
+ **/*.shp..xml
200
+ **/*.TablesByName..atx
201
+ **/*.shp..xml
202
+ **/*.gpkg
203
+ **/*.lyrx
preprocess/split_data/combine.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+ import ibis.expr.datatypes as dt
4
+
5
@ibis.udf.scalar.builtin
def ST_IsEmpty(geom: dt.geometry) -> dt.boolean:
    """True if *geom* is an empty geometry.

    This is a binding to the backend's built-in ``ST_IsEmpty`` SQL
    function: the Python body of a ``@ibis.udf.scalar.builtin`` function
    is never executed, so it is conventionally left as ``...``.  The
    previous body returned an f-string, which misleadingly suggested it
    ran at call time.
    """
    ...
8
+
9
def combine_habitat_and_climate(data1_url, data2_url, con):
    """Combine habitat-type and climate-zone polygon layers into one table.

    Produces one row per distinct (habitat_type, climate_zone) piece:
    the pairwise intersections, plus habitat-only remainders (climate_zone
    = "None") and climate-only remainders (habitat_type = "None").

    Parameters
    ----------
    data1_url : str
        Parquet URL of the habitat layer; must have ``habitat_type`` and
        ``geom`` columns.
    data2_url : str
        Parquet URL of the climate layer; must have ``climate_zone`` and
        ``geom`` columns.
    con : ibis connection
        Backend with spatial support (e.g. DuckDB + spatial extension).

    Returns
    -------
    ibis table expression with columns habitat_type, climate_zone, geom,
    acres.  Acres are derived from geometry area in square meters, so the
    inputs are assumed to be in a metric CRS (EPSG:3310 upstream) —
    TODO confirm against the caller.
    """
    SQM_PER_ACRE = 4046.8564224

    def _finalize(expr):
        # Shared tail of all three branches: drop invalid geometries and
        # attach the acreage column.
        return expr.filter(_.geom.is_valid()).mutate(
            acres=(_.geom.area() / SQM_PER_ACRE).round(4)
        )

    t1 = con.read_parquet(data1_url).select(_.habitat_type, _.geom)
    t2 = con.read_parquet(data2_url).select(_.climate_zone, _.geom)

    # intersection areas: where habitat and climate overlap
    # (keyword arguments to select() already name the columns, so the
    # previous redundant .name("geom") calls were dropped)
    intersected = _finalize(
        t1.inner_join(t2, t1.geom.intersects(t2.geom)).select(
            habitat_type=t1.habitat_type,
            climate_zone=t2.climate_zone,
            geom=t1.geom.intersection(t2.geom),
        )
    )

    # habitat only: subtract the union of all overlapping climate
    # geometries from each habitat polygon
    overlapping_climate = (
        t1.cross_join(t2)
        .filter(t1.geom.intersects(t2.geom))
        .select(t2.geom)
        .aggregate(union_geom=_.geom.unary_union())
    )
    habitat_only = _finalize(
        t1.cross_join(overlapping_climate).select(
            habitat_type=_.habitat_type,
            climate_zone=ibis.literal("None"),
            geom=_.geom.difference(_.union_geom),
        )
    )

    # climate only: subtract the union of all overlapping habitat
    # geometries from each climate polygon (symmetric to the above)
    overlapping_habitat = (
        t2.cross_join(t1)
        .filter(t2.geom.intersects(t1.geom))
        .select(t1.geom)
        .aggregate(union_geom=_.geom.unary_union())
    )
    climate_only = _finalize(
        t2.cross_join(overlapping_habitat).select(
            habitat_type=ibis.literal("None"),
            climate_zone=_.climate_zone,
            geom=_.geom.difference(_.union_geom),
        )
    )

    # combine all three partitions
    result = intersected.union(habitat_only).union(climate_only)
    # NOTE(review): empty (but valid) geometries are NOT filtered out —
    # the ST_IsEmpty filter below was left disabled by the author.
    # result = result.filter(~ST_IsEmpty(_.geom))
    return result
preprocess/split_data/combine_habitat_climate.ipynb ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "8b314df6-c4b9-4be5-9a34-2ab5236f072b",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Combine habitat types and climate zones\n",
9
+ "To split the protected areas into habitat types and climate zones, it's best to dissolve the habitat type/climate zone geometries into a single dataset, and then intersect on protected areas. \n",
10
+ "\n",
11
+ "To do so, we combine the data to get distinct geometries for each habitat type + climate zone combonation. "
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "id": "baa63640-33a7-4b99-a4aa-46125beb5976",
18
+ "metadata": {
19
+ "editable": true,
20
+ "slideshow": {
21
+ "slide_type": ""
22
+ },
23
+ "tags": []
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "from combine import * \n",
28
+ "import os\n",
29
+ "import sys\n",
30
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
31
+ "if base_dir not in sys.path:\n",
32
+ " sys.path.insert(0, base_dir)\n",
33
+ " \n",
34
+ "from minio_utils import * \n",
35
+ "con, _ = connect_minio()"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "5000eb46-1e51-49e3-bd2d-3230e829628f",
42
+ "metadata": {
43
+ "editable": true,
44
+ "slideshow": {
45
+ "slide_type": ""
46
+ },
47
+ "tags": []
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "%%time \n",
52
+ "#Wall time: 3h 48min 41s\n",
53
+ "# prior to running, I simplified each vector by 10 meters. \n",
54
+ "data1_url = 's3://public-ca30x30/CBN/Habitat/dissolved_geoms/CWHR13_dissolved_geoms_simplify10m.parquet'\n",
55
+ "data2_url = 's3://public-ca30x30/CBN/Climate_zones/dissolved_geoms/Climate_zones_dissolved_geoms_simplify10m.parquet'\n",
56
+ "result = combine_habitat_and_climate(data1_url, data2_url, con)\n",
57
+ "\n",
58
+ "new_url = 's3://public-ca30x30/CBN/Habitat/dissolved_geoms/CWHR13_climate_dissolved_geoms_simplify10m.parquet'\n",
59
+ "result.to_parquet(new_url)"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": "Python 3 (ipykernel)",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.12.10"
80
+ }
81
+ },
82
+ "nbformat": 4,
83
+ "nbformat_minor": 5
84
+ }
preprocess/split_data/split.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+
4
def get_ecoregion(index):
    """Return the name of the California ecoregion at position *index*.

    The ordering is fixed (alphabetical) so notebooks can iterate over
    subsets by integer index.  Raises IndexError for out-of-range values.
    """
    ecoregions = (
        'Central_California_Coast',
        'Central_Valley_Coast_Ranges',
        'Colorado_Desert',
        'Great_Valley_North',
        'Great_Valley_South',
        'Klamath_Mountains',
        'Modoc_Plateau',
        'Mojave_Desert',
        'Mono',
        'Northern_California_Coast',
        'Northern_California_Coast_Ranges',
        'Northern_California_Interior_Coast_Ranges',
        'Northwestern_Basin_and_Range',
        'Sierra_Nevada',
        'Sierra_Nevada_Foothills',
        'Sonoran_Desert',
        'Southeastern_Great_Basin',
        'Southern_California_Coast',
        'Southern_California_Mountains_and_Valleys',
        'Southern_Cascades',
    )
    return ecoregions[index]
27
+
28
def split_layer(data3_url, con):
    """Split each protected-area feature by habitat type and climate zone.

    Every input polygon is intersected with the pre-combined habitat x
    climate geometry layer (built by combine.py).  Each resulting piece
    keeps the parent ``id`` and gains a ``sub_id`` of the form
    ``"<id>_<habitat_letter><climate_letter>"``.  Parts of a parent
    polygon that overlap no habitat/climate geometry are emitted with
    habitat_type = climate_zone = "None" (sub_id suffix ``"_nk"``:
    'n' is the "None" habitat letter, 'k' the "None" climate letter).

    Parameters
    ----------
    data3_url : str
        Parquet URL of a protected-areas subset with the columns listed
        in the select() below, including ``geom``.
    con : ibis connection with spatial support.

    Returns
    -------
    ibis table expression: matched pieces + fully-unmatched parents +
    per-parent residual geometry, unioned into one table.  Acres come
    from geometry area in square meters, so a metric CRS (EPSG:3310
    upstream) is assumed — TODO confirm.
    """
    # pre-combined habitat x climate layer (includes "None" rows)
    overlap_url = 's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/Habitat_and_Climate_zones/CWHR13_climate_dissolved_geoms_simplify10m_includesNA.parquet'
    overlap_table = con.read_parquet(overlap_url)
    SQM_PER_ACRE = 4046.8564224  # m^2 per acre; geometry areas are in m^2
    t3 = con.read_parquet(data3_url).select("id", "name", "manager", "manager_type", "county", "gap_code",
                                            "status", "land_tenure", "ecoregion", "access_type", "geom")

    # append each id with the habitat + climate zone combo as its "sub_id";
    # single letters keep sub_id compact and unambiguous
    habitat_letter_map = {
        "Agriculture": "a",
        "Barren/Other": "b",
        "Conifer Forest": "c",
        "Conifer Woodland": "d",
        "Desert Shrub": "e",
        "Desert Woodland": "f",
        "Hardwood Forest": "g",
        "Hardwood Woodland": "h",
        "Herbaceous": "i",
        "Shrub": "j",
        "Urban": "k",
        "Water": "l",
        "Wetland": "m",
        "None": "n"
    }

    climate_letter_map = {
        "Zone 1": "a",
        "Zone 2": "b",
        "Zone 3": "c",
        "Zone 4": "d",
        "Zone 5": "e",
        "Zone 6": "f",
        "Zone 7": "g",
        "Zone 8": "h",
        "Zone 9": "i",
        "Zone 10": "j",
        "None": "k",
    }

    # lift the Python dicts into in-memory ibis tables so the letter
    # lookup happens inside the query rather than row-by-row in Python
    habitat_letter_table = ibis.memtable([{"habitat_type": k, "habitat_letter": v} for k, v in habitat_letter_map.items()])
    climate_letter_table = ibis.memtable([{"climate_zone": k, "climate_letter": v} for k, v in climate_letter_map.items()])

    # join mappings to overlap table
    overlap_labeled = (
        overlap_table
        .inner_join(habitat_letter_table, "habitat_type")
        .inner_join(climate_letter_table, "climate_zone")
    )

    # cross join and spatial intersection
    joined = t3.cross_join(overlap_labeled)
    joined = joined.mutate(intersects=t3.geom.intersects(overlap_labeled.geom))

    # keep only pairs that actually overlap
    matched = joined.filter(joined.intersects)

    # calculate sub_id and the clipped geometry for each match;
    # NOTE(review): the intersection is computed twice (geom and acres) —
    # presumably the backend dedupes the subexpression; verify if slow
    matched = matched.select(
        id=t3.id,
        sub_id=t3.id.cast("string")
        .concat("_")
        .concat(overlap_labeled.habitat_letter)
        .concat(overlap_labeled.climate_letter)
        .name("sub_id"),
        habitat_type=overlap_labeled.habitat_type,
        climate_zone=overlap_labeled.climate_zone,
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=t3.geom.intersection(overlap_labeled.geom),
        acres=(t3.geom.intersection(overlap_labeled.geom).area() / SQM_PER_ACRE).round(4)
    )

    # find unmatched records (parents with no overlap at all)
    matched_ids = matched.select("id").distinct()

    # left join + null filter acts as an anti-join
    unmatched = t3.left_join(matched_ids, "id").filter(matched_ids.id.isnull())
    unmatched = unmatched.select(
        id=t3.id,
        sub_id=t3.id.cast("string").concat("_n").concat("k").name("sub_id"),
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=t3.geom,
        acres=(t3.geom.area() / SQM_PER_ACRE).round(4)
    )

    # compute unmatched residuals by subtracting all intersected parts
    # from each parent geometry (a parent can be partially matched)
    matched_geoms = matched.group_by("id").aggregate(
        matched_union=matched.geom.unary_union()
    )
    # join to get original geom
    residuals = t3.inner_join(matched_geoms, "id").mutate(
        residual_geom=t3.geom.difference(matched_geoms.matched_union)
    )
    # only keep meaningful residual geoms, not empty geoms.
    residuals = (residuals
                 .filter((_.residual_geom.is_valid())
                         & ((_.residual_geom.area()) > 0 )
                         ))

    # residual parts get the same "None"/"None" labeling as fully
    # unmatched parents (sub_id suffix "_nk")
    residual_rows = residuals.select(
        id=t3.id,
        sub_id=t3.id.cast("string").concat("_n").concat("k").name("sub_id"),
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=_.residual_geom,
        acres=(_.residual_geom.area() / SQM_PER_ACRE).round(4)
    )
    result = matched.union(unmatched).union(residual_rows)
    return result
164
+
165
def check_results(con, url, save_url):
    """Sanity-check a split output against its source layer.

    Reports how many parent ids were dropped by the split and how much
    total acreage was lost, then returns the list of missing ids.
    """
    source_ids = con.read_parquet(url).select('id').distinct().execute()['id']
    split_ids = con.read_parquet(save_url).select('id').distinct().execute()['id']
    missing_ids = list(set(source_ids) - set(split_ids))
    print(f'# of missing IDs: {len(missing_ids)}')

    source_acres = con.read_parquet(url).select('acres').execute()['acres'].sum()
    split_acres = con.read_parquet(save_url).select('acres').execute()['acres'].sum()
    acres_loss = source_acres - split_acres
    print(f'Acres loss: {acres_loss}\n')
    ratio = split_acres / source_acres
    print(f'Ratio: {ratio}\n')
    return missing_ids
preprocess/split_data/split_geoms.ipynb ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "96bbe4c4-a600-437b-b096-dfb4ba2cc8fe",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Split geometries into habitat types and climate zones \n",
9
+ "To assign each feature a habitat type and climate zone, we split up protected areas that span multiple"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "id": "d7150257-5f70-4419-a8fb-63bb12dd0963",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from split import * \n",
20
+ "import os\n",
21
+ "import sys\n",
22
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
23
+ "if base_dir not in sys.path:\n",
24
+ " sys.path.insert(0, base_dir)\n",
25
+ " \n",
26
+ "from minio_utils import * \n",
27
+ "con, _ = connect_minio()"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "id": "3b3c5bb2-86d1-419d-8d3d-0a99fe18f442",
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "%%time\n",
38
+ "# run for all ecoregions + gap codes. \n",
39
+ "# If you don't process the data in subsets (gap codes and ecoregions), it'll take a few days and you'll need 64GB+ of memory \n",
40
+ "eco = get_ecoregion(10)\n",
41
+ "label = 'gap2'\n",
42
+ "print(label)\n",
43
+ "print(eco)\n",
44
+ "\n",
45
+ "url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/base/{label}/{eco}_epsg3310.parquet'\n",
46
+ "result = split_layer(url, con)\n",
47
+ "save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}/{label}_{eco}_habitat_climate.parquet'\n",
48
+ "result.to_parquet(save_url)\n",
49
+ "check_results(con, url,save_url)"
50
+ ]
51
+ }
52
+ ],
53
+ "metadata": {
54
+ "kernelspec": {
55
+ "display_name": "Python 3 (ipykernel)",
56
+ "language": "python",
57
+ "name": "python3"
58
+ },
59
+ "language_info": {
60
+ "codemirror_mode": {
61
+ "name": "ipython",
62
+ "version": 3
63
+ },
64
+ "file_extension": ".py",
65
+ "mimetype": "text/x-python",
66
+ "name": "python",
67
+ "nbconvert_exporter": "python",
68
+ "pygments_lexer": "ipython3",
69
+ "version": "3.12.10"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 5
74
+ }
preprocess/zonal_stats/join_zonals.ipynb ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "ef62b0fd-67b0-4ad3-bad0-bdaec434acaf",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Joining zonal data\n",
9
+ "We split up into gap codes to compute zonal stats, so we need to join them into a single table again"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "id": "e792c164-38bc-4821-823e-31274db6abec",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "import ibis \n",
20
+ "from ibis import _\n",
21
+ "import os\n",
22
+ "import sys\n",
23
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
24
+ "if base_dir not in sys.path:\n",
25
+ " sys.path.insert(0, base_dir)\n",
26
+ " \n",
27
+ "from minio_utils import * \n",
28
+ "con, _ = connect_minio()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "5c34a00b-ccea-4be8-add4-db242a6aba74",
35
+ "metadata": {
36
+ "scrolled": true
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "labels = ['gap1','gap2','gap3','gap4','nonconserved']\n",
41
+ "\n",
42
+ "for label in labels:\n",
43
+ " names = ['id',\n",
44
+ " 'pct_top_amphibian_richness','mean_amphibian_richness',\n",
45
+ " 'pct_top_reptile_richness','mean_reptile_richness',\n",
46
+ " 'pct_top_bird_richness','mean_bird_richness',\n",
47
+ " 'pct_top_mammal_richness','mean_mammal_richness',\n",
48
+ " 'pct_top_freshwater_richness','mean_top_freshwater_richness',\n",
49
+ " 'pct_wetlands','pct_fire','pct_farmland','pct_grazing',\n",
50
+ " 'pct_disadvantaged_community','pct_low_income_community',\n",
51
+ " 'mean_plant_richness','pct_top_plant_richness'\n",
52
+ " ]\n",
53
+ " \n",
54
+ " agg_dict = {\n",
55
+ " name: _[name].first() for name in names\n",
56
+ " }\n",
57
+ " stats_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/**.parquet'\n",
58
+ " a = (con.read_parquet(stats_url, union_by_name = True)\n",
59
+ " .drop('geom')\n",
60
+ " .group_by('sub_id')\n",
61
+ " .aggregate(**agg_dict)\n",
62
+ " )\n",
63
+ " \n",
64
+ " url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'\n",
65
+ " base = con.read_parquet(url)\n",
66
+ " joined = base.inner_join(a,['sub_id','id'])\n",
67
+ " save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}_habitat_climate_stats.parquet'\n",
68
+ " joined.to_parquet(save_url)"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "b6c2451f-aa6a-4d4e-b233-8b8b590fbfe0",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "cols = ['id',\n",
79
+ " 'sub_id',\n",
80
+ " 'name',\n",
81
+ " 'manager',\n",
82
+ " 'manager_type',\n",
83
+ " 'gap_code',\n",
84
+ " 'status',\n",
85
+ " 'land_tenure',\n",
86
+ " 'access_type',\n",
87
+ " 'county',\n",
88
+ " 'ecoregion',\n",
89
+ " 'habitat_type',\n",
90
+ " 'climate_zone',\n",
91
+ " 'mean_amphibian_richness',\n",
92
+ " 'mean_reptile_richness',\n",
93
+ " 'mean_bird_richness',\n",
94
+ " 'mean_mammal_richness',\n",
95
+ " 'mean_plant_richness',\n",
96
+ " 'mean_freshwater_richness',\n",
97
+ " 'pct_top_amphibian_richness',\n",
98
+ " 'pct_top_reptile_richness',\n",
99
+ " 'pct_top_bird_richness',\n",
100
+ " 'pct_top_mammal_richness',\n",
101
+ " 'pct_top_plant_richness',\n",
102
+ " 'pct_top_freshwater_richness',\n",
103
+ " 'pct_wetlands',\n",
104
+ " 'pct_fire',\n",
105
+ " 'pct_farmland',\n",
106
+ " 'pct_grazing_lands',\n",
107
+ " 'pct_disadvantaged_community',\n",
108
+ " 'pct_low_income_community',\n",
109
+ " 'acres',\n",
110
+ " 'geom']"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "id": "0c986c99-a403-4266-b031-3a882d93c1fe",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "stats_joined_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/*_habitat_climate_stats.parquet'\n",
121
+ "joined_stats = (con\n",
122
+ " .read_parquet(stats_joined_url)\n",
123
+ " .mutate(geom = _.geom.convert('epsg:3310','epsg:4326'))\n",
124
+ " .rename(pct_grazing_lands = \"pct_grazing\")\n",
125
+ " .mutate(gap_code = _.gap_code.substitute({'Non-Conservation Area':'None'}))\n",
126
+ " .mutate(name = _.name.fill_null('None'))\n",
127
+ " .mutate(manager = _.manager.fill_null('None'))\n",
128
+ " .mutate(manager_type = _.manager_type.fill_null('None'))\n",
129
+ " .mutate(gap_code = _.gap_code.fill_null('None'))\n",
130
+ " .mutate(status = _.status.fill_null('None'))\n",
131
+ " .mutate(land_tenure = _.land_tenure.fill_null('None'))\n",
132
+ " .mutate(access_type = _.access_type.fill_null('None'))\n",
133
+ " .mutate(county = _.county.fill_null('None'))\n",
134
+ " .mutate(ecoregion = _.ecoregion.fill_null('None'))\n",
135
+ " .mutate(habitat_type = _.habitat_type.fill_null('None'))\n",
136
+ " .mutate(climate_zone = _.climate_zone.fill_null('None'))\n",
137
+ ")\n",
138
+ "\n",
139
+ "url1 = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/ca30x30_habitat_climate_stats.parquet'\n",
140
+ "url2= f's3://public-ca30x30/ca30x30_cbn_v3.parquet'\n",
141
+ "data = joined_stats.select(cols).order_by('gap_code','county','name','id','sub_id')\n",
142
+ "data.to_parquet(url1)\n",
143
+ "data.to_parquet(url2)\n"
144
+ ]
145
+ }
146
+ ],
147
+ "metadata": {
148
+ "kernelspec": {
149
+ "display_name": "Python 3 (ipykernel)",
150
+ "language": "python",
151
+ "name": "python3"
152
+ },
153
+ "language_info": {
154
+ "codemirror_mode": {
155
+ "name": "ipython",
156
+ "version": 3
157
+ },
158
+ "file_extension": ".py",
159
+ "mimetype": "text/x-python",
160
+ "name": "python",
161
+ "nbconvert_exporter": "python",
162
+ "pygments_lexer": "ipython3",
163
+ "version": "3.12.10"
164
+ }
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 5
168
+ }
preprocess/zonal_stats/raster_stats.ipynb ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "68cd13a8-3eb1-461c-b5e1-4f8a356abfeb",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute zonal stats for protected areas with raster data "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "1852e008-906d-40e5-826f-a8fa92d9fceb",
15
+ "metadata": {
16
+ "editable": true,
17
+ "slideshow": {
18
+ "slide_type": ""
19
+ },
20
+ "tags": []
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "from raster_utils import * \n",
25
+ "import os\n",
26
+ "import sys\n",
27
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
28
+ "if base_dir not in sys.path:\n",
29
+ " sys.path.insert(0, base_dir)\n",
30
+ " \n",
31
+ "from minio_utils import * \n",
32
+ "con, _ = connect_minio()"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "d91efb56-e6e9-4cb6-8ce6-54d318e62534",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "%%time \n",
43
+ "# run for all metrics, files, and gap codes \n",
44
+ "metric = 'overlap'\n",
45
+ "label = 'gap1'\n",
46
+ "index = 0\n",
47
+ "\n",
48
+ "get_raster_stats(con, label, name, raster, metric, index)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "1da6f99d-ba7f-44ed-8100-cc3371966693",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "from IPython.lib.display import Audio\n",
59
+ "import numpy as np\n",
60
+ "\n",
61
+ "framerate = 4410\n",
62
+ "play_time_seconds = 1\n",
63
+ "\n",
64
+ "t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)\n",
65
+ "audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)\n",
66
+ "Audio(audio_data, rate=framerate, autoplay=True)"
67
+ ]
68
+ }
69
+ ],
70
+ "metadata": {
71
+ "kernelspec": {
72
+ "display_name": "Python 3 (ipykernel)",
73
+ "language": "python",
74
+ "name": "python3"
75
+ },
76
+ "language_info": {
77
+ "codemirror_mode": {
78
+ "name": "ipython",
79
+ "version": 3
80
+ },
81
+ "file_extension": ".py",
82
+ "mimetype": "text/x-python",
83
+ "name": "python",
84
+ "nbconvert_exporter": "python",
85
+ "pygments_lexer": "ipython3",
86
+ "version": "3.12.10"
87
+ }
88
+ },
89
+ "nbformat": 4,
90
+ "nbformat_minor": 5
91
+ }
preprocess/zonal_stats/raster_utils.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from exactextract import exact_extract
3
+ import ibis
4
+ from ibis import _
5
+
6
def get_raster_file(metric, index):
    """Return ``(column_name, raster_path)`` for a plant-richness metric.

    Parameters
    ----------
    metric : str
        'mean' (mean richness) or 'overlap' (fraction inside the top-80%
        richness mask).
    index : int
        0 for plant richness, 1 for rarity-weighted endemic plant richness.

    Raises
    ------
    ValueError
        If *metric* is not 'mean' or 'overlap'.  (Previously an unknown
        metric crashed with an UnboundLocalError on ``names``.)
    IndexError
        If *index* is out of range for the two available rasters.
    """
    mean_end_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Rarityweighted_endemic_plant_richness/endemicspecies_E_epsg3310.tif'
    mean_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Plant_richness/species_D.tif'
    pct_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Plant_richness/species_D_80percent_epsg3310.tif'
    pct_end_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Rarityweighted_endemic_plant_richness/endemicspecies_E_80percent_epsg3310.tif'
    if metric == 'mean':
        names = ['mean_plant_richness', 'mean_rarityweighted_endemic_plant_richness']
        rasters = [mean_plant, mean_end_plant]
    elif metric == 'overlap':
        names = ['pct_top_plant_richness', 'pct_rarityweighted_endemic_plant_richness']
        rasters = [pct_plant, pct_end_plant]
    else:
        raise ValueError(f"unknown metric: {metric!r} (expected 'mean' or 'overlap')")
    return names[index], rasters[index]
18
+
19
def get_raster_stats(con, label, name, raster, metric, index):
    """Resolve the raster/column for (metric, index) and compute zonal stats.

    Note: the ``name`` and ``raster`` arguments are ignored — they are
    immediately overwritten by :func:`get_raster_file`.  They are kept in
    the signature for backward compatibility with existing notebook calls.

    Parameters
    ----------
    con : ibis connection passed through to compute_raster_stats.
    label : str
        Subset label, e.g. 'gap1'.
    metric : str
        'mean' or 'overlap' (see get_raster_file).
    index : int
        Which raster of the metric pair to process.
    """
    name, raster = get_raster_file(metric, index)
    compute_raster_stats(con, label, name, raster, metric)
23
+
24
def compute_percentage_overlap(df, name):
    """Reduce exactextract 'frac'/'unique' lists to a single overlap fraction.

    For each row, find the fraction corresponding to raster value 1 (the
    "inside the mask" class) and store it, rounded to 6 places, in a new
    column *name*; rows with no value-1 pixels get 0.  Returns only the
    id/sub_id/metric/acres columns.
    """
    def _fraction_of_ones(row):
        uniques = row['unique']
        if 1 not in uniques:
            return 0
        return round(row['frac'][uniques.index(1)], 6)

    df[name] = df.apply(_fraction_of_ones, axis=1)
    return df[['id', 'sub_id', name, 'acres']]
32
+
33
def compute_raster_stats(con, label, name, raster, metric = 'mean'):
    """Compute zonal statistics of *raster* over one split-polygon subset.

    Reads the ``{label}_habitat_climate`` polygons, runs exactextract with
    either a 'mean' or an 'overlap' (frac/unique) reduction, and writes a
    per-sub_id stats table back to s3.

    Parameters
    ----------
    con : ibis connection (used both to read parquet and to stage the
        pandas result as a temp table).
    label : str
        Subset label ('gap1' ... 'nonconserved').
    name : str
        Output column name for the statistic.
    raster : str
        GDAL-readable raster path (e.g. /vsicurl/ URL).
    metric : str
        'mean' or 'overlap'.  NOTE(review): any other value leaves
        ``stats`` undefined and raises NameError at create_table — confirm
        callers only pass these two values.
    """
    print(label)
    print(metric)
    print(name)
    url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'
    if label in ['gap2','gap4']: #don't compute with tiny polygons (exactextract gets mad if you keep them)
        polys = con.read_parquet(url).rename(new_id = 'id').execute()
        # zero-acre (to 10 decimal places) polygons are excluded here and
        # re-appended with a 0 statistic at the end of this function
        small = polys[round(polys['acres'],10) ==0]
        # memtable is used only to extract the distinct sub_ids to exclude
        a = ibis.memtable(small, name = 'tmp')
        exclude_ids = a.select('sub_id').distinct().execute()['sub_id'].to_list()
        polys = polys[~polys['sub_id'].isin(exclude_ids)]

    else:
        polys = con.read_parquet(url).rename(new_id = 'id').execute()
    # NOTE(review): set_crs implies execute() returned a GeoDataFrame —
    # confirm the backend materializes geometry columns as geopandas
    polys = polys.set_crs('epsg:3310')

    if metric == 'mean':
        # one scalar per polygon: the mean raster value
        out = exact_extract(raster, polys, [metric], include_cols=["new_id","sub_id","acres"])
        rows = [{'id': f['properties']['new_id'], 'sub_id': f['properties']['sub_id'], name: round(f['properties'][metric], 6),
                 'acres': f['properties']['acres']} for f in out]
        stats = pd.DataFrame(rows)

    # computing the overlap of each unique parcel then computing the overlap of only 1's
    elif metric == 'overlap':
        metrics = ['frac','unique']
        out = exact_extract(raster, polys, metrics, include_cols=["new_id","sub_id","acres"])
        rows = [{'id': f['properties']['new_id'], 'sub_id': f['properties']['sub_id'], 'frac': list(f['properties']['frac']),
                 'unique': list(f['properties']['unique']),
                 'acres': f['properties']['acres']} for f in out]
        # collapse the frac/unique lists to the fraction of value-1 pixels
        stats = compute_percentage_overlap(pd.DataFrame(rows), name)
    # stage the pandas result in the backend so it can be unioned below
    out = con.create_table('tmp', stats, overwrite = True)

    ##zeroing out tiny polygons
    if label in ['gap2','gap4']:
        # re-append the excluded tiny polygons with a statistic of 0,
        # cast to float64 so the union's column types line up
        excluded = con.read_parquet(url).filter(_.sub_id.isin(exclude_ids)).mutate(**{name: ibis.literal(0)})
        excluded = excluded.cast({name:"float64"}).select('sub_id','id',name,'acres')
        out = out.union(excluded)
    save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/{name}.parquet'
    out.to_parquet(save_url)
    return
73
+
74
+
75
+
preprocess/zonal_stats/vector_stats.ipynb ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d0e4ad58-9eeb-43c4-811a-9c701321afe1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute zonal stats for protected areas with vector data "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "b756f206-651f-459e-8977-1fb99afeab27",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from vector_utils import *\n",
19
+ "import os\n",
20
+ "import sys\n",
21
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
22
+ "if base_dir not in sys.path:\n",
23
+ " sys.path.insert(0, base_dir)\n",
24
+ " \n",
25
+ "from minio_utils import * \n",
26
+ "con, _ = connect_minio()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "d1d692e2-a5bb-4c1d-b679-48af858d5853",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "%%time \n",
37
+ "# run for all metrics, files, and gap codes \n",
38
+ "metric = 'overlap'\n",
39
+ "label = 'gap1'\n",
40
+ "index = 0\n",
41
+ "\n",
42
+ "vector_stats, save_url = get_vector_stats(con, index, label, metric)\n",
43
+ "vector_stats.to_parquet(save_url)"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "fc1dedcd-6823-41fd-a8da-2d118a33ca70",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from IPython.lib.display import Audio\n",
54
+ "import numpy as np\n",
55
+ "\n",
56
+ "framerate = 4410\n",
57
+ "play_time_seconds = 1\n",
58
+ "\n",
59
+ "t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)\n",
60
+ "audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)\n",
61
+ "Audio(audio_data, rate=framerate, autoplay=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "212e9d19-caf1-429a-a42c-a5377b9e7b05",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": []
71
+ }
72
+ ],
73
+ "metadata": {
74
+ "kernelspec": {
75
+ "display_name": "Python 3 (ipykernel)",
76
+ "language": "python",
77
+ "name": "python3"
78
+ },
79
+ "language_info": {
80
+ "codemirror_mode": {
81
+ "name": "ipython",
82
+ "version": 3
83
+ },
84
+ "file_extension": ".py",
85
+ "mimetype": "text/x-python",
86
+ "name": "python",
87
+ "nbconvert_exporter": "python",
88
+ "pygments_lexer": "ipython3",
89
+ "version": "3.12.10"
90
+ }
91
+ },
92
+ "nbformat": 4,
93
+ "nbformat_minor": 5
94
+ }
preprocess/zonal_stats/vector_utils.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+ import ibis.selectors as s
4
+ import ibis.expr.datatypes as dt
5
+ import os
6
+
7
def get_url(folder, file, bucket = 'public-ca30x30', base_folder = 'CBN', method = 'write'):
    """Build a MinIO object URL for a data file.

    Parameters
    ----------
    folder : str
        Sub-folder under the bucket (and ``base_folder``, if given).
    file : str
        Object/file name, e.g. ``'layer_epsg3310.parquet'``.
    bucket : str
        Bucket name. Defaults to ``'public-ca30x30'``.
    base_folder : str or None
        Folder inserted between bucket and ``folder``; pass ``None`` to omit.
    method : str
        ``'write'`` returns an ``s3://`` URL (for writing via the S3 API);
        any other value returns the public ``https://`` read endpoint.

    Returns
    -------
    str
        The assembled URL.
    """
    prefix = 's3://' if method == 'write' else 'https://minio.carlboettiger.info/'
    # URLs always use '/' separators, so join explicitly rather than with
    # os.path.join, which would produce backslashes (broken URLs) on Windows.
    if base_folder is None:
        parts = [bucket, folder, file]
    else:
        parts = [bucket, base_folder, folder, file]
    return prefix + '/'.join(parts)
18
+
19
def get_vector_file(metric):
    """Return the data-layer catalog for one zonal-statistics metric.

    Parameters
    ----------
    metric : str
        ``'mean'``    -> layers whose value column is averaged per zone;
        ``'overlap'`` -> layers whose areal overlap fraction is computed.

    Returns
    -------
    (names, vectors, cols) : tuple of lists, all the same length
        ``names``   output column names for each statistic,
        ``vectors`` parquet URLs of the data layers,
        ``cols``    value column per layer ('mean' only; all None for 'overlap').

    Raises
    ------
    ValueError
        If ``metric`` is not 'mean' or 'overlap'.
    """
    # CBN vector data
    amph_richness = get_url('ACE_biodiversity/ACE_amphibian_richness','ACE_amphibian_richness_epsg3310.parquet')
    pct_amph_richness = get_url('ACE_biodiversity/ACE_amphibian_richness','ACE_amphibian_richness_80percent_epsg3310.parquet')

    reptile_richness = get_url('ACE_biodiversity/ACE_reptile_richness','ACE_reptile_richness_epsg3310.parquet')
    pct_reptile_richness = get_url('ACE_biodiversity/ACE_reptile_richness','ACE_reptile_richness_80percent_epsg3310.parquet')

    bird_richness = get_url('ACE_biodiversity/ACE_bird_richness','ACE_bird_richness_epsg3310.parquet')
    pct_bird_richness = get_url('ACE_biodiversity/ACE_bird_richness','ACE_bird_richness_80percent_epsg3310.parquet')

    mammal_richness = get_url('ACE_biodiversity/ACE_mammal_richness','ACE_mammal_richness_epsg3310.parquet')
    pct_mammal_richness = get_url('ACE_biodiversity/ACE_mammal_richness','ACE_mammal_richness_80percent_epsg3310.parquet')

    rare_amphibian_richness = get_url('ACE_biodiversity/ACE_rare_amphibian_richness','ACE_rare_amphibian_richness_epsg3310.parquet')
    pct_rare_amphibian_richness = get_url('ACE_biodiversity/ACE_rare_amphibian_richness','ACE_rare_amphibian_richness_95percent_epsg3310.parquet')

    rare_reptile_richness = get_url('ACE_biodiversity/ACE_rare_reptile_richness','ACE_rare_reptile_richness_epsg3310.parquet')
    pct_rare_reptile_richness = get_url('ACE_biodiversity/ACE_rare_reptile_richness','ACE_rare_reptile_richness_95percent_epsg3310.parquet')

    rare_bird_richness = get_url('ACE_biodiversity/ACE_rare_bird_richness','ACE_rare_bird_richness_epsg3310.parquet')
    pct_rare_bird_richness = get_url('ACE_biodiversity/ACE_rare_bird_richness','ACE_rare_bird_richness_95percent_epsg3310.parquet')

    rare_mammal_richness = get_url('ACE_biodiversity/ACE_rare_mammal_richness','ACE_rare_mammal_richness_epsg3310.parquet')
    pct_rare_mammal_richness = get_url('ACE_biodiversity/ACE_rare_mammal_richness','ACE_rare_mammal_richness_95percent_epsg3310.parquet')

    endemic_amphibian_richness = get_url('ACE_biodiversity/ACE_endemic_amphibian_richness','ACE_endemic_amphibian_richness_epsg3310.parquet')
    pct_endemic_amphibian_richness = get_url('ACE_biodiversity/ACE_endemic_amphibian_richness','ACE_endemic_amphibian_richness_95percent_epsg3310.parquet')

    endemic_reptile_richness = get_url('ACE_biodiversity/ACE_endemic_reptile_richness','ACE_endemic_reptile_richness_epsg3310.parquet')
    pct_endemic_reptile_richness = get_url('ACE_biodiversity/ACE_endemic_reptile_richness','ACE_endemic_reptile_richness_95percent_epsg3310.parquet')

    endemic_bird_richness = get_url('ACE_biodiversity/ACE_endemic_bird_richness','ACE_endemic_bird_richness_epsg3310.parquet')
    pct_endemic_bird_richness = get_url('ACE_biodiversity/ACE_endemic_bird_richness','ACE_endemic_bird_richness_95percent_epsg3310.parquet')

    endemic_mammal_richness = get_url('ACE_biodiversity/ACE_endemic_mammal_richness','ACE_endemic_mammal_richness_epsg3310.parquet')
    pct_endemic_mammal_richness = get_url('ACE_biodiversity/ACE_endemic_mammal_richness','ACE_endemic_mammal_richness_95percent_epsg3310.parquet')

    freshwater_richness = get_url('Freshwater_resources/Freshwater_species_richness','freshwater_species_richness_ds1197_epsg3310.parquet')
    pct_freshwater_richness = get_url('Freshwater_resources/Freshwater_species_richness','freshwater_species_richness_ds1197_80percent_epsg3310.parquet')

    wetlands = get_url('Freshwater_resources/Wetlands','CA_wetlands_epsg3310.parquet')
    fire = get_url('Climate_risks/Historical_fire_perimeters','calfire_2023_epsg3310.parquet')
    farmland = get_url('NBS_agriculture/Farmland_all/Farmland','Farmland_2018_epsg3310.parquet')
    grazing = get_url('NBS_agriculture/Farmland_all/Lands_suitable_grazing','Grazing_land_2018_epsg3310.parquet')
    DAC = get_url('Progress_data_new_protection/DAC','DAC_2022_epsg3310.parquet')
    low_income = get_url('Progress_data_new_protection/Low_income_communities','low_income_CalEnviroScreen4_epsg3310.parquet')

    pct_newly_protected = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','newly_protected_2024_union_epsg3310.parquet')
    pct_data_improvement = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','data_improvement_2024_union_epsg3310.parquet')
    pct_increased_management = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','increased_management_2024_union_epsg3310.parquet')

    if metric == 'mean':
        names = ['mean_amphibian_richness','mean_reptile_richness',
                 'mean_bird_richness','mean_mammal_richness',
                 'mean_rare_amphibian_richness','mean_rare_reptile_richness',
                 'mean_rare_bird_richness','mean_rare_mammal_richness',
                 'mean_endemic_amphibian_richness','mean_endemic_reptile_richness',
                 'mean_endemic_bird_richness','mean_endemic_mammal_richness',
                 'mean_freshwater_richness']

        vectors = [amph_richness, reptile_richness,
                   bird_richness, mammal_richness,
                   rare_amphibian_richness, rare_reptile_richness,
                   rare_bird_richness, rare_mammal_richness,
                   endemic_amphibian_richness,
                   endemic_reptile_richness,
                   endemic_bird_richness,
                   endemic_mammal_richness,
                   freshwater_richness]

        # source value column for each layer, aligned index-for-index
        cols = ['NtvAmph','NtvRept','NtvBird','NtvMamm',
                'RarAmph','RarRept','RarBird','RarMamm',
                'AmphEndem','ReptEndem','BirdEndem','MammEndem',
                'Freshwater_Species_Count']

    elif metric == 'overlap':
        names = ['pct_top_amphibian_richness','pct_top_reptile_richness',
                 'pct_top_bird_richness','pct_top_mammal_richness',
                 'pct_rare_amphibian_richness','pct_rare_reptile_richness',
                 'pct_rare_bird_richness','pct_rare_mammal_richness',
                 'pct_endemic_amphibian_richness','pct_endemic_reptile_richness',
                 'pct_endemic_bird_richness','pct_endemic_mammal_richness',
                 'pct_top_freshwater_richness',
                 'pct_wetlands','pct_fire','pct_farmland','pct_grazing',
                 'pct_disadvantaged_community','pct_low_income_community',
                 'pct_newly_protected','pct_data_improvement','pct_increased_management']

        vectors = [pct_amph_richness, pct_reptile_richness,
                   pct_bird_richness, pct_mammal_richness,
                   pct_rare_amphibian_richness, pct_rare_reptile_richness,
                   pct_rare_bird_richness, pct_rare_mammal_richness,
                   pct_endemic_amphibian_richness,
                   pct_endemic_reptile_richness,
                   pct_endemic_bird_richness,
                   pct_endemic_mammal_richness,
                   pct_freshwater_richness,
                   wetlands,
                   fire, farmland, grazing,
                   DAC, low_income,
                   pct_newly_protected,
                   pct_data_improvement,
                   pct_increased_management
                   ]
        # overlap needs no value column
        cols = [None] * len(vectors)
    else:
        # Previously an unknown metric fell through to an UnboundLocalError
        # on the return; fail with an explicit, actionable error instead.
        raise ValueError(f"Unknown metric {metric!r}; expected 'mean' or 'overlap'.")
    return names, vectors, cols
125
+
126
+
127
def get_vector_stats(con, index, label, metric):
    """Compute one zonal statistic for one data layer and one GAP label.

    Looks up the ``index``-th layer in the ``metric`` catalog, runs the
    vector-vector statistic against the split habitat/climate polygons for
    ``label``, and renames the stat column to the layer's output name.

    Returns
    -------
    (stats, stats_url)
        The ibis result expression and the destination parquet URL.
    """
    names, vectors, cols = get_vector_file(metric)
    name = names[index]
    print(label)
    print(name)
    base_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'
    stats_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/{name}.parquet'
    stats = vector_vector_stats(con, base_url, vectors[index], metric, cols[index])
    return stats.rename(**{name: metric}), stats_url
139
+
140
+
141
# usage: t.mutate(geom_valid = ST_MakeValid(t.geom))
@ibis.udf.scalar.builtin
def ST_MakeValid(geom: dt.geometry) -> dt.geometry:
    """Bind the backend's built-in ST_MakeValid, which repairs invalid
    geometries (e.g. self-intersections) instead of dropping them.

    The body is intentionally empty: ``@ibis.udf.scalar.builtin`` maps this
    Python name directly onto the database's native function of the same name.
    """
    ...
145
+
146
+
147
def vector_vector_stats(con, base, data_layer, metric, col):
    """Compute a zonal statistic of ``data_layer`` over the polygons in ``base``.

    Parameters
    ----------
    con : ibis backend connection (DuckDB with the spatial extension loaded).
    base : str
        Parquet URL of zone polygons; must contain ``id``, ``sub_id``, ``geom``.
    data_layer : str
        Parquet URL of data polygons; for ``metric='mean'`` it must also
        contain the value column named by ``col``.
    metric : str
        ``'overlap'`` -> fraction of each zone's area covered by the layer;
        ``'mean'``    -> area-weighted mean of ``col`` over each zone.
    col : str or None
        Value column in ``data_layer``; only used when ``metric='mean'``.

    Returns
    -------
    ibis table expression grouped by (id, sub_id, geom) with one stat column.

    Raises
    ------
    ValueError
        If ``metric`` is not 'overlap' or 'mean'.
    """
    print(f'metric: {metric}')
    print(f'column name: {col}')
    # Fail fast on a bad metric; previously this printed a message after
    # already reading the data and implicitly returned None.
    if metric not in ('overlap', 'mean'):
        raise ValueError(f"Select a metric: expected 'overlap' or 'mean', got {metric!r}")

    t1 = con.read_parquet(base).select(_.id, _.sub_id, _.geom)
    if metric == 'mean':
        t2 = con.read_parquet(data_layer).rename(value = col).select(_.geom, _.value)
    else:
        t2 = con.read_parquet(data_layer).select(_.geom)

    # Repair invalid geometries so intersects()/intersection() don't error out.
    t1 = t1.mutate(geom = ST_MakeValid(_.geom))
    t2 = t2.mutate(geom = ST_MakeValid(_.geom))

    # Left join keeps every zone polygon even when nothing overlaps it;
    # coalesce(0) then turns the empty aggregate into a 0 statistic.
    stats = (t1
             .left_join(t2, t1.geom.intersects(t2.geom))
             .group_by(t1.id, t1.sub_id, t1.geom)
             )
    if metric == 'overlap':
        stats = (stats.agg(overlap = (
            t1.geom.intersection(t2.geom).area() / t1.geom.area())
            .sum().coalesce(0).round(3) ))  # fraction of zone covered
    else:  # metric == 'mean'
        stats = (stats.agg(mean=(
            (t1.geom.intersection(t2.geom).area() / t1.geom.area() * t2.value)
            .sum().coalesce(0).round(3))))
    # NOTE(review): for some ACE layers non-conserved areas were absent from
    # the data, and earlier versions unioned in explicit zero rows for zones
    # with no overlap. The left join + coalesce(0) above already produces a
    # 0 statistic for those zones, so that step is no longer needed.
    return stats
191
+
192
+