-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathgedi_l4a_hyrax.py
executable file
·254 lines (216 loc) · 8.63 KB
/
gedi_l4a_hyrax.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python3
import argparse
import http.cookiejar
import pathlib
import requests
import sys
import datetime as dt
import geopandas as gpd
import netCDF4 as nc
import pandas as pd
from os import path
from shapely.ops import orient
from urllib.parse import urlsplit
from requests.adapters import HTTPAdapter, Retry
import warnings
warnings.filterwarnings('ignore')
DT_FORMAT = "%Y-%m-%d"
CMR_URL="https://cmr.earthdata.nasa.gov/search/"
AUTH_HOST = "https://urs.earthdata.nasa.gov"
HEADERS = ['lat_lowestmode', 'lon_lowestmode', 'elev_lowestmode', 'shot_number']
def parse_args(args):
"""Parses command line agruments."""
parser = argparse.ArgumentParser(
description="Access GEDI L4A using NASA OPeNDAP in the Cloud",
usage="gedi_l4a_hyrax.py --doi <DOI> --date1 <start_date> --date2 <end_date> --poly <path_to_geojson_file> --beams <gedi_beams> --variables <gedi_variables> --outfile <output_csv_file> [--json]\n"
)
parser.add_argument(
"--doi",
required=True,
type=check_doi,
help="DOI e.g., 10.3334/ORNLDAAC/2056 for GEDI L4A V2.1"
)
parser.add_argument(
"--date1",
required=True,
type=check_datefmt,
help="start date in YYYY-MM-DD format"
)
parser.add_argument(
"--date2",
required=True,
type=check_datefmt,
help="end date in YYYY-MM-DD format"
)
parser.add_argument(
"--poly",
required=True,
type=argparse.FileType('r', encoding='UTF-8'),
help="path to a GeoJSON file defining area of interest"
)
parser.add_argument(
"--beams",
required=True,
type=lambda arg: arg.split(','),
help="GEDI beam names in a comma-separated format"
)
parser.add_argument(
"--variables",
required=True,
type=lambda arg: arg.split(','),
help="GEDI variable names in a comma-separated format"
)
parser.add_argument(
"--outfile",
required=True,
type=pathlib.Path,
help="output CSV file name"
)
parser.add_argument(
"--json",
default=False,
action='store_true',
help="setting this creates additional output GeoJSON subset file"
)
return parser.parse_args(args)
def check_datefmt(d: str):
"""Checks if date parameters are in correct format.
Args:
d (str): date string in YYYY-MM-DD
Returns:
datetime object
"""
try:
return dt.datetime.strptime(d, DT_FORMAT)
except ValueError:
msg = "not a valid date in YYYY-MM-DD format"
raise argparse.ArgumentTypeError(msg)
def check_doi(d: str):
"""Checks if DOI passed is valid and the dataset exists at NASA CMR
Args:
d (str): DOI
Returns:
string: DOI stripped off https://doi.org, if any
"""
try:
dpath = urlsplit(d).path.strip("/")
requests.get(CMR_URL + 'collections.json?doi=' + dpath).json()['feed']['entry'][0]['id']
return dpath
except (ValueError, IndexError):
msg = "not a valid DOI"
raise argparse.ArgumentTypeError(msg)
def get_granules_hyrax(doi: str, poly_epsg4326, temporal_str: str):
"""Get hyrax url for the granules that overlaps the temporal and
spatial bounds.
Args:
doi (str): dataset DOI
poly_epsg4326: GeoDataFrame object containing the polygon object
temporal_str (str): temporal ranges with start and end datetimes
in NASA CMR-required format
Returns:
array: array of dictionaries with the granule hyrax urls
"""
print("Searching for granules ..")
# orienting coordinates clockwise
poly_epsg4326.geometry = poly_epsg4326.geometry.apply(orient, args=(1,))
# reducing number of vertices in the polygon
# CMR has 1000000 bytes limit
grsm_epsg4326 = poly_epsg4326.simplify(0.0005)
doisearch = requests.get(CMR_URL + 'collections.json?doi=' + doi).json()['feed']['entry'][0]
concept_id = doisearch['id']
geojson = {"shapefile": ("poly.json", poly_epsg4326.geometry.to_json(), "application/geo+json")}
page_num = 1
page_size = 2000 # CMR page size limit
granule_arr = []
while True:
# defining parameters
cmr_param = {
"collection_concept_id": concept_id,
"page_size": page_size,
"page_num": page_num,
"temporal": temporal_str,
"simplify-shapefile": 'true' # this is needed to bypass 5000 coordinates limit of CMR
}
granulesearch = CMR_URL + 'granules.json'
response = requests.post(granulesearch, data=cmr_param, files=geojson)
granules = response.json()['feed']['entry']
if granules:
for g in granules:
# Get Hyrax URL
href=''
for links in g['links']:
if 'title' in links and links['title'].startswith('OPeNDAP'):
granule_arr.append({'url':links['href']})
page_num += 1
else:
break
print(f"Total granules found: {len(granule_arr)}")
return granule_arr
def main():
"""Access GEDI L4A variables from Hyrax for polygon (GeoJSON file) and start/end dates, and
saves the output as a csv file"""
parser = parse_args(sys.argv[1:])
doi = parser.doi
outfile = parser.outfile
start_date = parser.date1
end_date = parser.date2
beams = parser.beams
variables = parser.variables
fmt_json = parser.json
poly = gpd.read_file(parser.poly)
poly.crs = 'EPSG:4326'
dt_cmr = '%Y-%m-%dT%H:%M:%SZ'
temporal = start_date.strftime(dt_cmr) + ',' + end_date.strftime(dt_cmr)
# setting up maximum retries to get around Hyrax 500 error
s = requests.Session()
retries = Retry(total=3, backoff_factor=0.1, status_forcelist=[ 500, 502, 503, 504 ])
s.mount('https://', HTTPAdapter(max_retries=retries))
# appending science variables to lat, lon, elev, shot_number
for v in variables:
if v not in HEADERS:
HEADERS.append(v)
# writing header row to the output file
if not path.isfile(outfile):
with open(outfile, "w") as f:
f.write(','.join(HEADERS)+'\n')
for g in get_granules_hyrax(doi, poly, temporal):
for beam in beams:
print(f"Downloading {g['url'].rsplit('/', 1)[-1]} / {beam}")
# retrieving lat, lon coordinates for the file
hyrax_url = f"{g['url']}.dap.nc4?dap4.ce=/{beam}/lon_lowestmode;/{beam}/lat_lowestmode"
r = s.get(hyrax_url)
if (r.status_code != 400):
ds = nc.Dataset('hyrax', memory=r.content)
lat = ds[beam]['lat_lowestmode'][:]
lon = ds[beam]['lon_lowestmode'][:]
ds.close()
df = pd.DataFrame({'lat_lowestmode': lat, 'lon_lowestmode': lon}) # creating pandas dataframe
# subsetting by bounds of the area of interest
# converting to geopandas dataframe
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon_lowestmode, df.lat_lowestmode))
gdf_sub = gdf[gdf['geometry'].within(poly.geometry[0])]
if not gdf_sub.empty:
# retrieving variables of interest, agbd, agbd_t in this case.
# We are only retriving the shots within subset area.
for _, df_gr in gdf_sub.groupby((gdf_sub.index.to_series().diff() > 1).cumsum()):
i = df_gr.index.min()
j = df_gr.index.max()
for v in HEADERS[2:]:
var_s = f"/{beam}/{v}[{i}:{j}]"
hyrax_url = f"{g['url']}.dap.nc4?dap4.ce={var_s}"
r = s.get(hyrax_url)
if (r.status_code != 400):
ds = nc.Dataset('hyrax', memory=r.content)
gdf_sub.loc[i:j, (v)] = ds[beam][v][:]
ds.close()
# saving the output file
gdf_sub['shot_number'] = gdf_sub['shot_number'].astype(str)
gdf_sub.to_csv(outfile, mode='a', index=False, header=False, columns=HEADERS)
if fmt_json:
jsonf = f"{path.splitext(outfile)[0]}.json"
print (f"writing GeoJSON file {jsonf}")
df = pd.read_csv(outfile)
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.lon_lowestmode, df.lat_lowestmode))
gdf.to_file(jsonf, driver='GeoJSON', drop_id=True)
if __name__ == "__main__":
main()