cassiebuhler commited on
Commit
b0bb178
·
1 Parent(s): 70d09b9

zonal stats + splitting geometry scripts

Browse files
.gitignore CHANGED
@@ -196,3 +196,8 @@ k8s/secret-deployment.yaml
196
  duck.db
197
  query_log.csv
198
  **/*.zip
 
 
 
 
 
 
196
  duck.db
197
  query_log.csv
198
  **/*.zip
199
+ **/*.shp..xml
200
+ **/*.TablesByName..atx
201
+ **/*.shp..xml
202
+ **/*.gpkg
203
+ **/*.lyrx
preprocess/split_data/combine.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+ import ibis.expr.datatypes as dt
4
+
5
@ibis.udf.scalar.builtin
def ST_IsEmpty(geom: dt.geometry) -> dt.boolean:
    """True if *geom* is an empty geometry.

    This is a binding to the backend's built-in ``ST_IsEmpty`` SQL
    function: the Python body of a ``@ibis.udf.scalar.builtin`` function
    is never executed, so it is conventionally left as ``...``.  The
    previous body returned an f-string, which misleadingly suggested it
    ran at call time.
    """
    ...
8
+
9
def combine_habitat_and_climate(data1_url, data2_url, con):
    """Combine habitat-type and climate-zone polygon layers into one table.

    Produces one row per distinct (habitat_type, climate_zone) piece:
    the pairwise intersections, plus habitat-only remainders (climate_zone
    = "None") and climate-only remainders (habitat_type = "None").

    Parameters
    ----------
    data1_url : str
        Parquet URL of the habitat layer; must have ``habitat_type`` and
        ``geom`` columns.
    data2_url : str
        Parquet URL of the climate layer; must have ``climate_zone`` and
        ``geom`` columns.
    con : ibis connection
        Backend with spatial support (e.g. DuckDB + spatial extension).

    Returns
    -------
    ibis table expression with columns habitat_type, climate_zone, geom,
    acres.  Acres are derived from geometry area in square meters, so the
    inputs are assumed to be in a metric CRS (EPSG:3310 upstream) —
    TODO confirm against the caller.
    """
    SQM_PER_ACRE = 4046.8564224

    def _finalize(expr):
        # Shared tail of all three branches: drop invalid geometries and
        # attach the acreage column.
        return expr.filter(_.geom.is_valid()).mutate(
            acres=(_.geom.area() / SQM_PER_ACRE).round(4)
        )

    t1 = con.read_parquet(data1_url).select(_.habitat_type, _.geom)
    t2 = con.read_parquet(data2_url).select(_.climate_zone, _.geom)

    # intersection areas: where habitat and climate overlap
    # (keyword arguments to select() already name the columns, so the
    # previous redundant .name("geom") calls were dropped)
    intersected = _finalize(
        t1.inner_join(t2, t1.geom.intersects(t2.geom)).select(
            habitat_type=t1.habitat_type,
            climate_zone=t2.climate_zone,
            geom=t1.geom.intersection(t2.geom),
        )
    )

    # habitat only: subtract the union of all overlapping climate
    # geometries from each habitat polygon
    overlapping_climate = (
        t1.cross_join(t2)
        .filter(t1.geom.intersects(t2.geom))
        .select(t2.geom)
        .aggregate(union_geom=_.geom.unary_union())
    )
    habitat_only = _finalize(
        t1.cross_join(overlapping_climate).select(
            habitat_type=_.habitat_type,
            climate_zone=ibis.literal("None"),
            geom=_.geom.difference(_.union_geom),
        )
    )

    # climate only: subtract the union of all overlapping habitat
    # geometries from each climate polygon (symmetric to the above)
    overlapping_habitat = (
        t2.cross_join(t1)
        .filter(t2.geom.intersects(t1.geom))
        .select(t1.geom)
        .aggregate(union_geom=_.geom.unary_union())
    )
    climate_only = _finalize(
        t2.cross_join(overlapping_habitat).select(
            habitat_type=ibis.literal("None"),
            climate_zone=_.climate_zone,
            geom=_.geom.difference(_.union_geom),
        )
    )

    # combine all three partitions
    result = intersected.union(habitat_only).union(climate_only)
    # NOTE(review): empty (but valid) geometries are NOT filtered out —
    # the ST_IsEmpty filter below was left disabled by the author.
    # result = result.filter(~ST_IsEmpty(_.geom))
    return result
preprocess/split_data/combine_habitat_climate.ipynb ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "8b314df6-c4b9-4be5-9a34-2ab5236f072b",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Combine habitat types and climate zones\n",
9
+ "To split the protected areas into habitat types and climate zones, it's best to dissolve the habitat type/climate zone geometries into a single dataset, and then intersect on protected areas. \n",
10
+ "\n",
11
+ "To do so, we combine the data to get distinct geometries for each habitat type + climate zone combonation. "
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": null,
17
+ "id": "baa63640-33a7-4b99-a4aa-46125beb5976",
18
+ "metadata": {
19
+ "editable": true,
20
+ "slideshow": {
21
+ "slide_type": ""
22
+ },
23
+ "tags": []
24
+ },
25
+ "outputs": [],
26
+ "source": [
27
+ "from combine import * \n",
28
+ "import os\n",
29
+ "import sys\n",
30
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
31
+ "if base_dir not in sys.path:\n",
32
+ " sys.path.insert(0, base_dir)\n",
33
+ " \n",
34
+ "from minio_utils import * \n",
35
+ "con, _ = connect_minio()"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "5000eb46-1e51-49e3-bd2d-3230e829628f",
42
+ "metadata": {
43
+ "editable": true,
44
+ "slideshow": {
45
+ "slide_type": ""
46
+ },
47
+ "tags": []
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "%%time \n",
52
+ "#Wall time: 3h 48min 41s\n",
53
+ "# prior to running, I simplified each vector by 10 meters. \n",
54
+ "data1_url = 's3://public-ca30x30/CBN/Habitat/dissolved_geoms/CWHR13_dissolved_geoms_simplify10m.parquet'\n",
55
+ "data2_url = 's3://public-ca30x30/CBN/Climate_zones/dissolved_geoms/Climate_zones_dissolved_geoms_simplify10m.parquet'\n",
56
+ "result = combine_habitat_and_climate(data1_url, data2_url, con)\n",
57
+ "\n",
58
+ "new_url = 's3://public-ca30x30/CBN/Habitat/dissolved_geoms/CWHR13_climate_dissolved_geoms_simplify10m.parquet'\n",
59
+ "result.to_parquet(new_url)"
60
+ ]
61
+ }
62
+ ],
63
+ "metadata": {
64
+ "kernelspec": {
65
+ "display_name": "Python 3 (ipykernel)",
66
+ "language": "python",
67
+ "name": "python3"
68
+ },
69
+ "language_info": {
70
+ "codemirror_mode": {
71
+ "name": "ipython",
72
+ "version": 3
73
+ },
74
+ "file_extension": ".py",
75
+ "mimetype": "text/x-python",
76
+ "name": "python",
77
+ "nbconvert_exporter": "python",
78
+ "pygments_lexer": "ipython3",
79
+ "version": "3.12.10"
80
+ }
81
+ },
82
+ "nbformat": 4,
83
+ "nbformat_minor": 5
84
+ }
preprocess/split_data/split.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+
4
def get_ecoregion(index):
    """Return the name of the California ecoregion at position *index*.

    The ordering is fixed (alphabetical) so notebooks can iterate over
    subsets by integer index.  Raises IndexError for out-of-range values.
    """
    ecoregions = (
        'Central_California_Coast',
        'Central_Valley_Coast_Ranges',
        'Colorado_Desert',
        'Great_Valley_North',
        'Great_Valley_South',
        'Klamath_Mountains',
        'Modoc_Plateau',
        'Mojave_Desert',
        'Mono',
        'Northern_California_Coast',
        'Northern_California_Coast_Ranges',
        'Northern_California_Interior_Coast_Ranges',
        'Northwestern_Basin_and_Range',
        'Sierra_Nevada',
        'Sierra_Nevada_Foothills',
        'Sonoran_Desert',
        'Southeastern_Great_Basin',
        'Southern_California_Coast',
        'Southern_California_Mountains_and_Valleys',
        'Southern_Cascades',
    )
    return ecoregions[index]
27
+
28
def split_layer(data3_url, con):
    """Split each protected-area feature by habitat type and climate zone.

    Every input polygon is intersected with the pre-combined habitat x
    climate geometry layer (built by combine.py).  Each resulting piece
    keeps the parent ``id`` and gains a ``sub_id`` of the form
    ``"<id>_<habitat_letter><climate_letter>"``.  Parts of a parent
    polygon that overlap no habitat/climate geometry are emitted with
    habitat_type = climate_zone = "None" (sub_id suffix ``"_nk"``:
    'n' is the "None" habitat letter, 'k' the "None" climate letter).

    Parameters
    ----------
    data3_url : str
        Parquet URL of a protected-areas subset with the columns listed
        in the select() below, including ``geom``.
    con : ibis connection with spatial support.

    Returns
    -------
    ibis table expression: matched pieces + fully-unmatched parents +
    per-parent residual geometry, unioned into one table.  Acres come
    from geometry area in square meters, so a metric CRS (EPSG:3310
    upstream) is assumed — TODO confirm.
    """
    # pre-combined habitat x climate layer (includes "None" rows)
    overlap_url = 's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/Habitat_and_Climate_zones/CWHR13_climate_dissolved_geoms_simplify10m_includesNA.parquet'
    overlap_table = con.read_parquet(overlap_url)
    SQM_PER_ACRE = 4046.8564224  # m^2 per acre; geometry areas are in m^2
    t3 = con.read_parquet(data3_url).select("id", "name", "manager", "manager_type", "county", "gap_code",
                                            "status", "land_tenure", "ecoregion", "access_type", "geom")

    # append each id with the habitat + climate zone combo as its "sub_id";
    # single letters keep sub_id compact and unambiguous
    habitat_letter_map = {
        "Agriculture": "a",
        "Barren/Other": "b",
        "Conifer Forest": "c",
        "Conifer Woodland": "d",
        "Desert Shrub": "e",
        "Desert Woodland": "f",
        "Hardwood Forest": "g",
        "Hardwood Woodland": "h",
        "Herbaceous": "i",
        "Shrub": "j",
        "Urban": "k",
        "Water": "l",
        "Wetland": "m",
        "None": "n"
    }

    climate_letter_map = {
        "Zone 1": "a",
        "Zone 2": "b",
        "Zone 3": "c",
        "Zone 4": "d",
        "Zone 5": "e",
        "Zone 6": "f",
        "Zone 7": "g",
        "Zone 8": "h",
        "Zone 9": "i",
        "Zone 10": "j",
        "None": "k",
    }

    # lift the Python dicts into in-memory ibis tables so the letter
    # lookup happens inside the query rather than row-by-row in Python
    habitat_letter_table = ibis.memtable([{"habitat_type": k, "habitat_letter": v} for k, v in habitat_letter_map.items()])
    climate_letter_table = ibis.memtable([{"climate_zone": k, "climate_letter": v} for k, v in climate_letter_map.items()])

    # join mappings to overlap table
    overlap_labeled = (
        overlap_table
        .inner_join(habitat_letter_table, "habitat_type")
        .inner_join(climate_letter_table, "climate_zone")
    )

    # cross join and spatial intersection
    joined = t3.cross_join(overlap_labeled)
    joined = joined.mutate(intersects=t3.geom.intersects(overlap_labeled.geom))

    # keep only pairs that actually overlap
    matched = joined.filter(joined.intersects)

    # calculate sub_id and the clipped geometry for each match;
    # NOTE(review): the intersection is computed twice (geom and acres) —
    # presumably the backend dedupes the subexpression; verify if slow
    matched = matched.select(
        id=t3.id,
        sub_id=t3.id.cast("string")
        .concat("_")
        .concat(overlap_labeled.habitat_letter)
        .concat(overlap_labeled.climate_letter)
        .name("sub_id"),
        habitat_type=overlap_labeled.habitat_type,
        climate_zone=overlap_labeled.climate_zone,
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=t3.geom.intersection(overlap_labeled.geom),
        acres=(t3.geom.intersection(overlap_labeled.geom).area() / SQM_PER_ACRE).round(4)
    )

    # find unmatched records (parents with no overlap at all)
    matched_ids = matched.select("id").distinct()

    # left join + null filter acts as an anti-join
    unmatched = t3.left_join(matched_ids, "id").filter(matched_ids.id.isnull())
    unmatched = unmatched.select(
        id=t3.id,
        sub_id=t3.id.cast("string").concat("_n").concat("k").name("sub_id"),
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=t3.geom,
        acres=(t3.geom.area() / SQM_PER_ACRE).round(4)
    )

    # compute unmatched residuals by subtracting all intersected parts
    # from each parent geometry (a parent can be partially matched)
    matched_geoms = matched.group_by("id").aggregate(
        matched_union=matched.geom.unary_union()
    )
    # join to get original geom
    residuals = t3.inner_join(matched_geoms, "id").mutate(
        residual_geom=t3.geom.difference(matched_geoms.matched_union)
    )
    # only keep meaningful residual geoms, not empty geoms.
    residuals = (residuals
                 .filter((_.residual_geom.is_valid())
                         & ((_.residual_geom.area()) > 0 )
                         ))

    # residual parts get the same "None"/"None" labeling as fully
    # unmatched parents (sub_id suffix "_nk")
    residual_rows = residuals.select(
        id=t3.id,
        sub_id=t3.id.cast("string").concat("_n").concat("k").name("sub_id"),
        habitat_type=ibis.literal("None"),
        climate_zone=ibis.literal("None"),
        name=t3.name,
        manager=t3.manager,
        manager_type=t3.manager_type,
        county=t3.county,
        gap_code=t3.gap_code,
        status=t3.status,
        land_tenure=t3.land_tenure,
        ecoregion=t3.ecoregion,
        access_type=t3.access_type,
        geom=_.residual_geom,
        acres=(_.residual_geom.area() / SQM_PER_ACRE).round(4)
    )
    result = matched.union(unmatched).union(residual_rows)
    return result
164
+
165
def check_results(con, url, save_url):
    """Sanity-check a split output against its source layer.

    Reports how many parent ids were dropped by the split and how much
    total acreage was lost, then returns the list of missing ids.
    """
    source_ids = con.read_parquet(url).select('id').distinct().execute()['id']
    split_ids = con.read_parquet(save_url).select('id').distinct().execute()['id']
    missing_ids = list(set(source_ids) - set(split_ids))
    print(f'# of missing IDs: {len(missing_ids)}')

    source_acres = con.read_parquet(url).select('acres').execute()['acres'].sum()
    split_acres = con.read_parquet(save_url).select('acres').execute()['acres'].sum()
    acres_loss = source_acres - split_acres
    print(f'Acres loss: {acres_loss}\n')
    ratio = split_acres / source_acres
    print(f'Ratio: {ratio}\n')
    return missing_ids
preprocess/split_data/split_geoms.ipynb ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "96bbe4c4-a600-437b-b096-dfb4ba2cc8fe",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Split geometries into habitat types and climate zones \n",
9
+ "To assign each feature a habitat type and climate zone, we split up protected areas that span multiple"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "id": "d7150257-5f70-4419-a8fb-63bb12dd0963",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from split import * \n",
20
+ "import os\n",
21
+ "import sys\n",
22
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
23
+ "if base_dir not in sys.path:\n",
24
+ " sys.path.insert(0, base_dir)\n",
25
+ " \n",
26
+ "from minio_utils import * \n",
27
+ "con, _ = connect_minio()"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "id": "3b3c5bb2-86d1-419d-8d3d-0a99fe18f442",
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "%%time\n",
38
+ "# run for all ecoregions + gap codes. \n",
39
+ "# If you don't process the data in subsets (gap codes and ecoregions), it'll take a few days and you'll need 64GB+ of memory \n",
40
+ "eco = get_ecoregion(10)\n",
41
+ "label = 'gap2'\n",
42
+ "print(label)\n",
43
+ "print(eco)\n",
44
+ "\n",
45
+ "url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/base/{label}/{eco}_epsg3310.parquet'\n",
46
+ "result = split_layer(url, con)\n",
47
+ "save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}/{label}_{eco}_habitat_climate.parquet'\n",
48
+ "result.to_parquet(save_url)\n",
49
+ "check_results(con, url,save_url)"
50
+ ]
51
+ }
52
+ ],
53
+ "metadata": {
54
+ "kernelspec": {
55
+ "display_name": "Python 3 (ipykernel)",
56
+ "language": "python",
57
+ "name": "python3"
58
+ },
59
+ "language_info": {
60
+ "codemirror_mode": {
61
+ "name": "ipython",
62
+ "version": 3
63
+ },
64
+ "file_extension": ".py",
65
+ "mimetype": "text/x-python",
66
+ "name": "python",
67
+ "nbconvert_exporter": "python",
68
+ "pygments_lexer": "ipython3",
69
+ "version": "3.12.10"
70
+ }
71
+ },
72
+ "nbformat": 4,
73
+ "nbformat_minor": 5
74
+ }
preprocess/zonal_stats/join_zonals.ipynb ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "ef62b0fd-67b0-4ad3-bad0-bdaec434acaf",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Joining zonal data\n",
9
+ "We split up into gap codes to compute zonal stats, so we need to join them into a single table again"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "id": "e792c164-38bc-4821-823e-31274db6abec",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "import ibis \n",
20
+ "from ibis import _\n",
21
+ "import os\n",
22
+ "import sys\n",
23
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
24
+ "if base_dir not in sys.path:\n",
25
+ " sys.path.insert(0, base_dir)\n",
26
+ " \n",
27
+ "from minio_utils import * \n",
28
+ "con, _ = connect_minio()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "5c34a00b-ccea-4be8-add4-db242a6aba74",
35
+ "metadata": {
36
+ "scrolled": true
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "labels = ['gap1','gap2','gap3','gap4','nonconserved']\n",
41
+ "\n",
42
+ "for label in labels:\n",
43
+ " names = ['id',\n",
44
+ " 'pct_top_amphibian_richness','mean_amphibian_richness',\n",
45
+ " 'pct_top_reptile_richness','mean_reptile_richness',\n",
46
+ " 'pct_top_bird_richness','mean_bird_richness',\n",
47
+ " 'pct_top_mammal_richness','mean_mammal_richness',\n",
48
+ " 'pct_top_freshwater_richness','mean_top_freshwater_richness',\n",
49
+ " 'pct_wetlands','pct_fire','pct_farmland','pct_grazing',\n",
50
+ " 'pct_disadvantaged_community','pct_low_income_community',\n",
51
+ " 'mean_plant_richness','pct_top_plant_richness'\n",
52
+ " ]\n",
53
+ " \n",
54
+ " agg_dict = {\n",
55
+ " name: _[name].first() for name in names\n",
56
+ " }\n",
57
+ " stats_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/**.parquet'\n",
58
+ " a = (con.read_parquet(stats_url, union_by_name = True)\n",
59
+ " .drop('geom')\n",
60
+ " .group_by('sub_id')\n",
61
+ " .aggregate(**agg_dict)\n",
62
+ " )\n",
63
+ " \n",
64
+ " url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'\n",
65
+ " base = con.read_parquet(url)\n",
66
+ " joined = base.inner_join(a,['sub_id','id'])\n",
67
+ " save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}_habitat_climate_stats.parquet'\n",
68
+ " joined.to_parquet(save_url)"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "b6c2451f-aa6a-4d4e-b233-8b8b590fbfe0",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "cols = ['id',\n",
79
+ " 'sub_id',\n",
80
+ " 'name',\n",
81
+ " 'manager',\n",
82
+ " 'manager_type',\n",
83
+ " 'gap_code',\n",
84
+ " 'status',\n",
85
+ " 'land_tenure',\n",
86
+ " 'access_type',\n",
87
+ " 'county',\n",
88
+ " 'ecoregion',\n",
89
+ " 'habitat_type',\n",
90
+ " 'climate_zone',\n",
91
+ " 'mean_amphibian_richness',\n",
92
+ " 'mean_reptile_richness',\n",
93
+ " 'mean_bird_richness',\n",
94
+ " 'mean_mammal_richness',\n",
95
+ " 'mean_plant_richness',\n",
96
+ " 'mean_freshwater_richness',\n",
97
+ " 'pct_top_amphibian_richness',\n",
98
+ " 'pct_top_reptile_richness',\n",
99
+ " 'pct_top_bird_richness',\n",
100
+ " 'pct_top_mammal_richness',\n",
101
+ " 'pct_top_plant_richness',\n",
102
+ " 'pct_top_freshwater_richness',\n",
103
+ " 'pct_wetlands',\n",
104
+ " 'pct_fire',\n",
105
+ " 'pct_farmland',\n",
106
+ " 'pct_grazing_lands',\n",
107
+ " 'pct_disadvantaged_community',\n",
108
+ " 'pct_low_income_community',\n",
109
+ " 'acres',\n",
110
+ " 'geom']"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "id": "0c986c99-a403-4266-b031-3a882d93c1fe",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "stats_joined_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/*_habitat_climate_stats.parquet'\n",
121
+ "joined_stats = (con\n",
122
+ " .read_parquet(stats_joined_url)\n",
123
+ " .mutate(geom = _.geom.convert('epsg:3310','epsg:4326'))\n",
124
+ " .rename(pct_grazing_lands = \"pct_grazing\")\n",
125
+ " .mutate(gap_code = _.gap_code.substitute({'Non-Conservation Area':'None'}))\n",
126
+ " .mutate(name = _.name.fill_null('None'))\n",
127
+ " .mutate(manager = _.manager.fill_null('None'))\n",
128
+ " .mutate(manager_type = _.manager_type.fill_null('None'))\n",
129
+ " .mutate(gap_code = _.gap_code.fill_null('None'))\n",
130
+ " .mutate(status = _.status.fill_null('None'))\n",
131
+ " .mutate(land_tenure = _.land_tenure.fill_null('None'))\n",
132
+ " .mutate(access_type = _.access_type.fill_null('None'))\n",
133
+ " .mutate(county = _.county.fill_null('None'))\n",
134
+ " .mutate(ecoregion = _.ecoregion.fill_null('None'))\n",
135
+ " .mutate(habitat_type = _.habitat_type.fill_null('None'))\n",
136
+ " .mutate(climate_zone = _.climate_zone.fill_null('None'))\n",
137
+ ")\n",
138
+ "\n",
139
+ "url1 = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/ca30x30_habitat_climate_stats.parquet'\n",
140
+ "url2= f's3://public-ca30x30/ca30x30_cbn_v3.parquet'\n",
141
+ "data = joined_stats.select(cols).order_by('gap_code','county','name','id','sub_id')\n",
142
+ "data.to_parquet(url1)\n",
143
+ "data.to_parquet(url2)\n"
144
+ ]
145
+ }
146
+ ],
147
+ "metadata": {
148
+ "kernelspec": {
149
+ "display_name": "Python 3 (ipykernel)",
150
+ "language": "python",
151
+ "name": "python3"
152
+ },
153
+ "language_info": {
154
+ "codemirror_mode": {
155
+ "name": "ipython",
156
+ "version": 3
157
+ },
158
+ "file_extension": ".py",
159
+ "mimetype": "text/x-python",
160
+ "name": "python",
161
+ "nbconvert_exporter": "python",
162
+ "pygments_lexer": "ipython3",
163
+ "version": "3.12.10"
164
+ }
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 5
168
+ }
preprocess/zonal_stats/raster_stats.ipynb ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "68cd13a8-3eb1-461c-b5e1-4f8a356abfeb",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute zonal stats for protected areas with raster data "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "1852e008-906d-40e5-826f-a8fa92d9fceb",
15
+ "metadata": {
16
+ "editable": true,
17
+ "slideshow": {
18
+ "slide_type": ""
19
+ },
20
+ "tags": []
21
+ },
22
+ "outputs": [],
23
+ "source": [
24
+ "from raster_utils import * \n",
25
+ "import os\n",
26
+ "import sys\n",
27
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
28
+ "if base_dir not in sys.path:\n",
29
+ " sys.path.insert(0, base_dir)\n",
30
+ " \n",
31
+ "from minio_utils import * \n",
32
+ "con, _ = connect_minio()"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "d91efb56-e6e9-4cb6-8ce6-54d318e62534",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "%%time \n",
43
+ "# run for all metrics, files, and gap codes \n",
44
+ "metric = 'overlap'\n",
45
+ "label = 'gap1'\n",
46
+ "index = 0\n",
47
+ "\n",
48
+ "get_raster_stats(con, label, name, raster, metric, index)"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "1da6f99d-ba7f-44ed-8100-cc3371966693",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "from IPython.lib.display import Audio\n",
59
+ "import numpy as np\n",
60
+ "\n",
61
+ "framerate = 4410\n",
62
+ "play_time_seconds = 1\n",
63
+ "\n",
64
+ "t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)\n",
65
+ "audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)\n",
66
+ "Audio(audio_data, rate=framerate, autoplay=True)"
67
+ ]
68
+ }
69
+ ],
70
+ "metadata": {
71
+ "kernelspec": {
72
+ "display_name": "Python 3 (ipykernel)",
73
+ "language": "python",
74
+ "name": "python3"
75
+ },
76
+ "language_info": {
77
+ "codemirror_mode": {
78
+ "name": "ipython",
79
+ "version": 3
80
+ },
81
+ "file_extension": ".py",
82
+ "mimetype": "text/x-python",
83
+ "name": "python",
84
+ "nbconvert_exporter": "python",
85
+ "pygments_lexer": "ipython3",
86
+ "version": "3.12.10"
87
+ }
88
+ },
89
+ "nbformat": 4,
90
+ "nbformat_minor": 5
91
+ }
preprocess/zonal_stats/raster_utils.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from exactextract import exact_extract
3
+ import ibis
4
+ from ibis import _
5
+
6
def get_raster_file(metric, index):
    """Return ``(column_name, raster_path)`` for a plant-richness metric.

    Parameters
    ----------
    metric : str
        'mean' (mean richness) or 'overlap' (fraction inside the top-80%
        richness mask).
    index : int
        0 for plant richness, 1 for rarity-weighted endemic plant richness.

    Raises
    ------
    ValueError
        If *metric* is not 'mean' or 'overlap'.  (Previously an unknown
        metric crashed with an UnboundLocalError on ``names``.)
    IndexError
        If *index* is out of range for the two available rasters.
    """
    mean_end_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Rarityweighted_endemic_plant_richness/endemicspecies_E_epsg3310.tif'
    mean_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Plant_richness/species_D.tif'
    pct_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Plant_richness/species_D_80percent_epsg3310.tif'
    pct_end_plant = '/vsicurl/https://minio.carlboettiger.info/public-ca30x30/CBN/Biodiversity_unique/Rarityweighted_endemic_plant_richness/endemicspecies_E_80percent_epsg3310.tif'
    if metric == 'mean':
        names = ['mean_plant_richness', 'mean_rarityweighted_endemic_plant_richness']
        rasters = [mean_plant, mean_end_plant]
    elif metric == 'overlap':
        names = ['pct_top_plant_richness', 'pct_rarityweighted_endemic_plant_richness']
        rasters = [pct_plant, pct_end_plant]
    else:
        raise ValueError(f"unknown metric: {metric!r} (expected 'mean' or 'overlap')")
    return names[index], rasters[index]
18
+
19
def get_raster_stats(con, label, name, raster, metric, index):
    """Resolve the raster/column for (metric, index) and compute zonal stats.

    Note: the ``name`` and ``raster`` arguments are ignored — they are
    immediately overwritten by :func:`get_raster_file`.  They are kept in
    the signature for backward compatibility with existing notebook calls.

    Parameters
    ----------
    con : ibis connection passed through to compute_raster_stats.
    label : str
        Subset label, e.g. 'gap1'.
    metric : str
        'mean' or 'overlap' (see get_raster_file).
    index : int
        Which raster of the metric pair to process.
    """
    name, raster = get_raster_file(metric, index)
    compute_raster_stats(con, label, name, raster, metric)
23
+
24
def compute_percentage_overlap(df, name):
    """Reduce exactextract 'frac'/'unique' lists to a single overlap fraction.

    For each row, find the fraction corresponding to raster value 1 (the
    "inside the mask" class) and store it, rounded to 6 places, in a new
    column *name*; rows with no value-1 pixels get 0.  Returns only the
    id/sub_id/metric/acres columns.
    """
    def _fraction_of_ones(row):
        uniques = row['unique']
        if 1 not in uniques:
            return 0
        return round(row['frac'][uniques.index(1)], 6)

    df[name] = df.apply(_fraction_of_ones, axis=1)
    return df[['id', 'sub_id', name, 'acres']]
32
+
33
def compute_raster_stats(con, label, name, raster, metric = 'mean'):
    """Compute zonal statistics of *raster* over one split-polygon subset.

    Reads the ``{label}_habitat_climate`` polygons, runs exactextract with
    either a 'mean' or an 'overlap' (frac/unique) reduction, and writes a
    per-sub_id stats table back to s3.

    Parameters
    ----------
    con : ibis connection (used both to read parquet and to stage the
        pandas result as a temp table).
    label : str
        Subset label ('gap1' ... 'nonconserved').
    name : str
        Output column name for the statistic.
    raster : str
        GDAL-readable raster path (e.g. /vsicurl/ URL).
    metric : str
        'mean' or 'overlap'.  NOTE(review): any other value leaves
        ``stats`` undefined and raises NameError at create_table — confirm
        callers only pass these two values.
    """
    print(label)
    print(metric)
    print(name)
    url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'
    if label in ['gap2','gap4']: #don't compute with tiny polygons (exactextract gets mad if you keep them)
        polys = con.read_parquet(url).rename(new_id = 'id').execute()
        # zero-acre (to 10 decimal places) polygons are excluded here and
        # re-appended with a 0 statistic at the end of this function
        small = polys[round(polys['acres'],10) ==0]
        # memtable is used only to extract the distinct sub_ids to exclude
        a = ibis.memtable(small, name = 'tmp')
        exclude_ids = a.select('sub_id').distinct().execute()['sub_id'].to_list()
        polys = polys[~polys['sub_id'].isin(exclude_ids)]

    else:
        polys = con.read_parquet(url).rename(new_id = 'id').execute()
    # NOTE(review): set_crs implies execute() returned a GeoDataFrame —
    # confirm the backend materializes geometry columns as geopandas
    polys = polys.set_crs('epsg:3310')

    if metric == 'mean':
        # one scalar per polygon: the mean raster value
        out = exact_extract(raster, polys, [metric], include_cols=["new_id","sub_id","acres"])
        rows = [{'id': f['properties']['new_id'], 'sub_id': f['properties']['sub_id'], name: round(f['properties'][metric], 6),
                 'acres': f['properties']['acres']} for f in out]
        stats = pd.DataFrame(rows)

    # computing the overlap of each unique parcel then computing the overlap of only 1's
    elif metric == 'overlap':
        metrics = ['frac','unique']
        out = exact_extract(raster, polys, metrics, include_cols=["new_id","sub_id","acres"])
        rows = [{'id': f['properties']['new_id'], 'sub_id': f['properties']['sub_id'], 'frac': list(f['properties']['frac']),
                 'unique': list(f['properties']['unique']),
                 'acres': f['properties']['acres']} for f in out]
        # collapse the frac/unique lists to the fraction of value-1 pixels
        stats = compute_percentage_overlap(pd.DataFrame(rows), name)
    # stage the pandas result in the backend so it can be unioned below
    out = con.create_table('tmp', stats, overwrite = True)

    ##zeroing out tiny polygons
    if label in ['gap2','gap4']:
        # re-append the excluded tiny polygons with a statistic of 0,
        # cast to float64 so the union's column types line up
        excluded = con.read_parquet(url).filter(_.sub_id.isin(exclude_ids)).mutate(**{name: ibis.literal(0)})
        excluded = excluded.cast({name:"float64"}).select('sub_id','id',name,'acres')
        out = out.union(excluded)
    save_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/{name}.parquet'
    out.to_parquet(save_url)
    return
73
+
74
+
75
+
preprocess/zonal_stats/vector_stats.ipynb ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "d0e4ad58-9eeb-43c4-811a-9c701321afe1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Compute zonal stats for protected areas with vector data "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "b756f206-651f-459e-8977-1fb99afeab27",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from vector_utils import *\n",
19
+ "import os\n",
20
+ "import sys\n",
21
+ "base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
22
+ "if base_dir not in sys.path:\n",
23
+ " sys.path.insert(0, base_dir)\n",
24
+ " \n",
25
+ "from minio_utils import * \n",
26
+ "con, _ = connect_minio()"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "d1d692e2-a5bb-4c1d-b679-48af858d5853",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "%%time \n",
37
+ "# run for all metrics, files, and gap codes \n",
38
+ "metric = 'overlap'\n",
39
+ "label = 'gap1'\n",
40
+ "index = 0\n",
41
+ "\n",
42
+ "vector_stats, save_url = get_vector_stats(con, index, label, metric)\n",
43
+ "vector_stats.to_parquet(save_url)"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "id": "fc1dedcd-6823-41fd-a8da-2d118a33ca70",
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from IPython.lib.display import Audio\n",
54
+ "import numpy as np\n",
55
+ "\n",
56
+ "framerate = 4410\n",
57
+ "play_time_seconds = 1\n",
58
+ "\n",
59
+ "t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)\n",
60
+ "audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)\n",
61
+ "Audio(audio_data, rate=framerate, autoplay=True)"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "212e9d19-caf1-429a-a42c-a5377b9e7b05",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": []
71
+ }
72
+ ],
73
+ "metadata": {
74
+ "kernelspec": {
75
+ "display_name": "Python 3 (ipykernel)",
76
+ "language": "python",
77
+ "name": "python3"
78
+ },
79
+ "language_info": {
80
+ "codemirror_mode": {
81
+ "name": "ipython",
82
+ "version": 3
83
+ },
84
+ "file_extension": ".py",
85
+ "mimetype": "text/x-python",
86
+ "name": "python",
87
+ "nbconvert_exporter": "python",
88
+ "pygments_lexer": "ipython3",
89
+ "version": "3.12.10"
90
+ }
91
+ },
92
+ "nbformat": 4,
93
+ "nbformat_minor": 5
94
+ }
preprocess/zonal_stats/vector_utils.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ibis
2
+ from ibis import _
3
+ import ibis.selectors as s
4
+ import ibis.expr.datatypes as dt
5
+ import os
6
+
7
def get_url(folder, file, bucket = 'public-ca30x30', base_folder = 'CBN', method = 'write'):
    """Build a MinIO object URL for a data file.

    Parameters
    ----------
    folder : str
        Sub-folder under the bucket (and ``base_folder``, if given).
    file : str
        Object/file name, e.g. ``'layer_epsg3310.parquet'``.
    bucket : str
        Bucket name. Defaults to ``'public-ca30x30'``.
    base_folder : str or None
        Folder inserted between bucket and ``folder``; pass ``None`` to omit.
    method : str
        ``'write'`` returns an ``s3://`` URL (for writing via the S3 API);
        any other value returns the public ``https://`` read endpoint.

    Returns
    -------
    str
        The assembled URL.
    """
    prefix = 's3://' if method == 'write' else 'https://minio.carlboettiger.info/'
    # URLs always use '/' separators, so join explicitly rather than with
    # os.path.join, which would produce backslashes (broken URLs) on Windows.
    if base_folder is None:
        parts = [bucket, folder, file]
    else:
        parts = [bucket, base_folder, folder, file]
    return prefix + '/'.join(parts)
18
+
19
def get_vector_file(metric):
    """Return the data-layer catalog for one zonal-statistics metric.

    Parameters
    ----------
    metric : str
        ``'mean'``    -> layers whose value column is averaged per zone;
        ``'overlap'`` -> layers whose areal overlap fraction is computed.

    Returns
    -------
    (names, vectors, cols) : tuple of lists, all the same length
        ``names``   output column names for each statistic,
        ``vectors`` parquet URLs of the data layers,
        ``cols``    value column per layer ('mean' only; all None for 'overlap').

    Raises
    ------
    ValueError
        If ``metric`` is not 'mean' or 'overlap'.
    """
    # CBN vector data
    amph_richness = get_url('ACE_biodiversity/ACE_amphibian_richness','ACE_amphibian_richness_epsg3310.parquet')
    pct_amph_richness = get_url('ACE_biodiversity/ACE_amphibian_richness','ACE_amphibian_richness_80percent_epsg3310.parquet')

    reptile_richness = get_url('ACE_biodiversity/ACE_reptile_richness','ACE_reptile_richness_epsg3310.parquet')
    pct_reptile_richness = get_url('ACE_biodiversity/ACE_reptile_richness','ACE_reptile_richness_80percent_epsg3310.parquet')

    bird_richness = get_url('ACE_biodiversity/ACE_bird_richness','ACE_bird_richness_epsg3310.parquet')
    pct_bird_richness = get_url('ACE_biodiversity/ACE_bird_richness','ACE_bird_richness_80percent_epsg3310.parquet')

    mammal_richness = get_url('ACE_biodiversity/ACE_mammal_richness','ACE_mammal_richness_epsg3310.parquet')
    pct_mammal_richness = get_url('ACE_biodiversity/ACE_mammal_richness','ACE_mammal_richness_80percent_epsg3310.parquet')

    rare_amphibian_richness = get_url('ACE_biodiversity/ACE_rare_amphibian_richness','ACE_rare_amphibian_richness_epsg3310.parquet')
    pct_rare_amphibian_richness = get_url('ACE_biodiversity/ACE_rare_amphibian_richness','ACE_rare_amphibian_richness_95percent_epsg3310.parquet')

    rare_reptile_richness = get_url('ACE_biodiversity/ACE_rare_reptile_richness','ACE_rare_reptile_richness_epsg3310.parquet')
    pct_rare_reptile_richness = get_url('ACE_biodiversity/ACE_rare_reptile_richness','ACE_rare_reptile_richness_95percent_epsg3310.parquet')

    rare_bird_richness = get_url('ACE_biodiversity/ACE_rare_bird_richness','ACE_rare_bird_richness_epsg3310.parquet')
    pct_rare_bird_richness = get_url('ACE_biodiversity/ACE_rare_bird_richness','ACE_rare_bird_richness_95percent_epsg3310.parquet')

    rare_mammal_richness = get_url('ACE_biodiversity/ACE_rare_mammal_richness','ACE_rare_mammal_richness_epsg3310.parquet')
    pct_rare_mammal_richness = get_url('ACE_biodiversity/ACE_rare_mammal_richness','ACE_rare_mammal_richness_95percent_epsg3310.parquet')

    endemic_amphibian_richness = get_url('ACE_biodiversity/ACE_endemic_amphibian_richness','ACE_endemic_amphibian_richness_epsg3310.parquet')
    pct_endemic_amphibian_richness = get_url('ACE_biodiversity/ACE_endemic_amphibian_richness','ACE_endemic_amphibian_richness_95percent_epsg3310.parquet')

    endemic_reptile_richness = get_url('ACE_biodiversity/ACE_endemic_reptile_richness','ACE_endemic_reptile_richness_epsg3310.parquet')
    pct_endemic_reptile_richness = get_url('ACE_biodiversity/ACE_endemic_reptile_richness','ACE_endemic_reptile_richness_95percent_epsg3310.parquet')

    endemic_bird_richness = get_url('ACE_biodiversity/ACE_endemic_bird_richness','ACE_endemic_bird_richness_epsg3310.parquet')
    pct_endemic_bird_richness = get_url('ACE_biodiversity/ACE_endemic_bird_richness','ACE_endemic_bird_richness_95percent_epsg3310.parquet')

    endemic_mammal_richness = get_url('ACE_biodiversity/ACE_endemic_mammal_richness','ACE_endemic_mammal_richness_epsg3310.parquet')
    pct_endemic_mammal_richness = get_url('ACE_biodiversity/ACE_endemic_mammal_richness','ACE_endemic_mammal_richness_95percent_epsg3310.parquet')

    freshwater_richness = get_url('Freshwater_resources/Freshwater_species_richness','freshwater_species_richness_ds1197_epsg3310.parquet')
    pct_freshwater_richness = get_url('Freshwater_resources/Freshwater_species_richness','freshwater_species_richness_ds1197_80percent_epsg3310.parquet')

    wetlands = get_url('Freshwater_resources/Wetlands','CA_wetlands_epsg3310.parquet')
    fire = get_url('Climate_risks/Historical_fire_perimeters','calfire_2023_epsg3310.parquet')
    farmland = get_url('NBS_agriculture/Farmland_all/Farmland','Farmland_2018_epsg3310.parquet')
    grazing = get_url('NBS_agriculture/Farmland_all/Lands_suitable_grazing','Grazing_land_2018_epsg3310.parquet')
    DAC = get_url('Progress_data_new_protection/DAC','DAC_2022_epsg3310.parquet')
    low_income = get_url('Progress_data_new_protection/Low_income_communities','low_income_CalEnviroScreen4_epsg3310.parquet')

    pct_newly_protected = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','newly_protected_2024_union_epsg3310.parquet')
    pct_data_improvement = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','data_improvement_2024_union_epsg3310.parquet')
    pct_increased_management = get_url('Progress_data_new_protection/Newly_counted_lands/dissolved','increased_management_2024_union_epsg3310.parquet')

    if metric == 'mean':
        names = ['mean_amphibian_richness','mean_reptile_richness',
                 'mean_bird_richness','mean_mammal_richness',
                 'mean_rare_amphibian_richness','mean_rare_reptile_richness',
                 'mean_rare_bird_richness','mean_rare_mammal_richness',
                 'mean_endemic_amphibian_richness','mean_endemic_reptile_richness',
                 'mean_endemic_bird_richness','mean_endemic_mammal_richness',
                 'mean_freshwater_richness']

        vectors = [amph_richness, reptile_richness,
                   bird_richness, mammal_richness,
                   rare_amphibian_richness, rare_reptile_richness,
                   rare_bird_richness, rare_mammal_richness,
                   endemic_amphibian_richness,
                   endemic_reptile_richness,
                   endemic_bird_richness,
                   endemic_mammal_richness,
                   freshwater_richness]

        # source value column for each layer, aligned index-for-index
        cols = ['NtvAmph','NtvRept','NtvBird','NtvMamm',
                'RarAmph','RarRept','RarBird','RarMamm',
                'AmphEndem','ReptEndem','BirdEndem','MammEndem',
                'Freshwater_Species_Count']

    elif metric == 'overlap':
        names = ['pct_top_amphibian_richness','pct_top_reptile_richness',
                 'pct_top_bird_richness','pct_top_mammal_richness',
                 'pct_rare_amphibian_richness','pct_rare_reptile_richness',
                 'pct_rare_bird_richness','pct_rare_mammal_richness',
                 'pct_endemic_amphibian_richness','pct_endemic_reptile_richness',
                 'pct_endemic_bird_richness','pct_endemic_mammal_richness',
                 'pct_top_freshwater_richness',
                 'pct_wetlands','pct_fire','pct_farmland','pct_grazing',
                 'pct_disadvantaged_community','pct_low_income_community',
                 'pct_newly_protected','pct_data_improvement','pct_increased_management']

        vectors = [pct_amph_richness, pct_reptile_richness,
                   pct_bird_richness, pct_mammal_richness,
                   pct_rare_amphibian_richness, pct_rare_reptile_richness,
                   pct_rare_bird_richness, pct_rare_mammal_richness,
                   pct_endemic_amphibian_richness,
                   pct_endemic_reptile_richness,
                   pct_endemic_bird_richness,
                   pct_endemic_mammal_richness,
                   pct_freshwater_richness,
                   wetlands,
                   fire, farmland, grazing,
                   DAC, low_income,
                   pct_newly_protected,
                   pct_data_improvement,
                   pct_increased_management
                   ]
        # overlap needs no value column
        cols = [None] * len(vectors)
    else:
        # Previously an unknown metric fell through to an UnboundLocalError
        # on the return; fail with an explicit, actionable error instead.
        raise ValueError(f"Unknown metric {metric!r}; expected 'mean' or 'overlap'.")
    return names, vectors, cols
125
+
126
+
127
def get_vector_stats(con, index, label, metric):
    """Compute one zonal statistic for one data layer and one GAP label.

    Looks up the ``index``-th layer in the ``metric`` catalog, runs the
    vector-vector statistic against the split habitat/climate polygons for
    ``label``, and renames the stat column to the layer's output name.

    Returns
    -------
    (stats, stats_url)
        The ibis result expression and the destination parquet URL.
    """
    names, vectors, cols = get_vector_file(metric)
    name = names[index]
    print(label)
    print(name)
    base_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/subsets/split_habitat_climate/{label}_habitat_climate.parquet'
    stats_url = f's3://public-ca30x30/CA_Nature/2024/Preprocessing/v3/stats/{label}/{name}.parquet'
    stats = vector_vector_stats(con, base_url, vectors[index], metric, cols[index])
    return stats.rename(**{name: metric}), stats_url
139
+
140
+
141
# usage: t.mutate(geom_valid = ST_MakeValid(t.geom))
@ibis.udf.scalar.builtin
def ST_MakeValid(geom: dt.geometry) -> dt.geometry:
    """Bind the backend's built-in ST_MakeValid, which repairs invalid
    geometries (e.g. self-intersections) instead of dropping them.

    The body is intentionally empty: ``@ibis.udf.scalar.builtin`` maps this
    Python name directly onto the database's native function of the same name.
    """
    ...
145
+
146
+
147
def vector_vector_stats(con, base, data_layer, metric, col):
    """Compute a zonal statistic of ``data_layer`` over the polygons in ``base``.

    Parameters
    ----------
    con : ibis backend connection (DuckDB with the spatial extension loaded).
    base : str
        Parquet URL of zone polygons; must contain ``id``, ``sub_id``, ``geom``.
    data_layer : str
        Parquet URL of data polygons; for ``metric='mean'`` it must also
        contain the value column named by ``col``.
    metric : str
        ``'overlap'`` -> fraction of each zone's area covered by the layer;
        ``'mean'``    -> area-weighted mean of ``col`` over each zone.
    col : str or None
        Value column in ``data_layer``; only used when ``metric='mean'``.

    Returns
    -------
    ibis table expression grouped by (id, sub_id, geom) with one stat column.

    Raises
    ------
    ValueError
        If ``metric`` is not 'overlap' or 'mean'.
    """
    print(f'metric: {metric}')
    print(f'column name: {col}')
    # Fail fast on a bad metric; previously this printed a message after
    # already reading the data and implicitly returned None.
    if metric not in ('overlap', 'mean'):
        raise ValueError(f"Select a metric: expected 'overlap' or 'mean', got {metric!r}")

    t1 = con.read_parquet(base).select(_.id, _.sub_id, _.geom)
    if metric == 'mean':
        t2 = con.read_parquet(data_layer).rename(value = col).select(_.geom, _.value)
    else:
        t2 = con.read_parquet(data_layer).select(_.geom)

    # Repair invalid geometries so intersects()/intersection() don't error out.
    t1 = t1.mutate(geom = ST_MakeValid(_.geom))
    t2 = t2.mutate(geom = ST_MakeValid(_.geom))

    # Left join keeps every zone polygon even when nothing overlaps it;
    # coalesce(0) then turns the empty aggregate into a 0 statistic.
    stats = (t1
             .left_join(t2, t1.geom.intersects(t2.geom))
             .group_by(t1.id, t1.sub_id, t1.geom)
             )
    if metric == 'overlap':
        stats = (stats.agg(overlap = (
            t1.geom.intersection(t2.geom).area() / t1.geom.area())
            .sum().coalesce(0).round(3) ))  # fraction of zone covered
    else:  # metric == 'mean'
        stats = (stats.agg(mean=(
            (t1.geom.intersection(t2.geom).area() / t1.geom.area() * t2.value)
            .sum().coalesce(0).round(3))))
    # NOTE(review): for some ACE layers non-conserved areas were absent from
    # the data, and earlier versions unioned in explicit zero rows for zones
    # with no overlap. The left join + coalesce(0) above already produces a
    # 0 statistic for those zones, so that step is no longer needed.
    return stats
191
+
192
+