How to automate GEDI download-subsetting workflow?
User Question: Is there a way to automate downloading the links returned by the web search service and running each one through the subsetter Python script, saving only the output and deleting the original .h5 file, working file by file? As an example, I have used the subsetter to filter 2 TB of data down to 30 GB. I would like the automation to download the first granule in the list, subset it to my shapefile, export a JSON, delete the original granule, then move on to the next link, and so on.
Re: How to automate GEDI download-subsetting workflow?
Below is an example of Python code that could be used to accomplish the workflow described in your question. The script calls GEDI Finder to retrieve the granules for a given product and location, then uses the links returned to download each file. From there, you can specify a shapefile or GeoJSON region of interest, clip the downloaded file to it, and export the result as GeoJSON. The source file is then deleted, and the script moves on to download the next file.
Code: Select all
# Example Python Script to Call GEDI Finder, Download files, run through GEDI Subsetter, and delete source granule
# Import necessary packages
import requests as r
from netrc import netrc
import os
import geopandas as gp
import sys
from shapely.geometry import Polygon
import h5py
import pandas as pd
import numpy as np
# Define variables
url = 'https://lpdaacsvc.cr.usgs.gov/services/gedifinder?'
product = 'GEDI02_B'
version = '001'
bbox = '[49.0,-114.46,47.65,-113.2]' # Glacier NP example
# Call GEDI Finder API to return list of intersecting GEDI Files
gediFiles = r.get(f"{url}product={product}&version={version}&bbox={bbox}").json()['data']
'''If you haven't set up a .netrc file containing your Earthdata Login Username and password, execute the script at:
https://git.earthdata.nasa.gov/projects/LPDUR/repos/daac_data_download_python/browse/EarthdataLoginSetup.py to get that set up first. This will allow you to download GEDI data directly without needing to enter a username and password every time. '''
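# For reference, a ~/.netrc entry for Earthdata Login typically looks like the following (replace with your own credentials):
#   machine urs.earthdata.nasa.gov
#       login your_earthdata_username
#       password your_earthdata_password
# The file should be readable only by you (e.g., chmod 600 ~/.netrc on Linux/macOS).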
netrcDir = os.path.expanduser("~/.netrc")
urs = 'urs.earthdata.nasa.gov' # Address to call for authentication
# Set up GEDI Subsetter variables before looping through each file:
ROI = 'POLYGON.shp' # Update with your shapefile of interest, located in the same directory
# Convert to Shapely polygon for geojson, .shp or bbox
if ROI.endswith('json') or ROI.endswith('.shp'):
    try:
        ROI = gp.GeoDataFrame.from_file(ROI)
        ROI.crs = 'EPSG:4326'
        if len(ROI) > 1:
            print('Multi-feature polygon detected. Only the first feature will be used to subset the GEDI data.')
        ROI = ROI.geometry[0]
    except:
        print('error: unable to read input geojson/shapefile or the file was not found')
        sys.exit(2)
else:
    ROI = ROI.replace("'", "")
    ROI = ROI.split(',')
    ROI = [float(r) for r in ROI]
    try:
        # Note: the shapely polygon itself does not carry a CRS; EPSG:4326 is applied when building finalClip below
        ROI = Polygon([(ROI[1], ROI[0]), (ROI[3], ROI[0]), (ROI[3], ROI[2]), (ROI[1], ROI[2])])
    except:
        print('error: unable to read input bounding box coordinates, the required format is: ul_lat,ul_lon,lr_lat,lr_lon')
        sys.exit(2)
# Keep the exact input geometry for the final clip to ROI
finalClip = gp.GeoDataFrame([1], geometry=[ROI], crs='EPSG:4326')
# Define beam subset if provided or default to all beams (defaulting to all beams here)
beamSubset = ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']
layerSubset = None # Define additional layers to subset (defaulting to no extra layers added here)
# -------------------------------------SET UP WORKSPACE------------------------------------------ #
inDir = os.getcwd() + os.sep # Set input directory as current working directory
# Create and set output directory
outDir = os.path.normpath((os.path.split(inDir)[0] + os.sep + 'output')) + os.sep
if not os.path.exists(outDir):
    os.makedirs(outDir)
# --------------------DEFINE PRESET BAND/LAYER SUBSETS ------------------------------------------ #
# Default layers to be subset and exported, see README for information on how to add additional layers
l1bSubset = [ '/geolocation/latitude_bin0', '/geolocation/longitude_bin0', '/channel', '/shot_number', '/rxwaveform','/rx_sample_count', '/stale_return_flag', '/tx_sample_count', '/txwaveform', '/geolocation/degrade', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/solar_elevation', '/geolocation/local_beam_elevation', '/noise_mean_corrected', '/geolocation/elevation_bin0', '/geolocation/elevation_lastbin', '/geolocation/surface_type']
l2aSubset = ['/lat_lowestmode', '/lon_lowestmode', '/channel', '/shot_number', '/degrade_flag', '/delta_time', '/digital_elevation_model', '/elev_lowestmode', '/quality_flag', '/rh', '/sensitivity', '/elevation_bias_flag', '/surface_flag', '/num_detectedmodes', '/selected_algorithm', '/solar_elevation']
l2bSubset = ['/geolocation/lat_lowestmode', '/geolocation/lon_lowestmode', '/channel', '/geolocation/shot_number', '/cover', '/cover_z', '/fhd_normal', '/pai', '/pai_z', '/rhov', '/rhog', '/pavd_z', '/l2a_quality_flag', '/l2b_quality_flag', '/rh100', '/sensitivity', '/stale_return_flag', '/surface_flag', '/geolocation/degrade_flag', '/geolocation/solar_elevation', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/elev_lowestmode']
l = 0
# Create a for loop to go through, download each GEDI file, apply GEDI Subsetter to export results, and remove source file
for g in gediFiles:
    saveName = g.rsplit('/', 1)[-1]
    # Create and submit request and download file
    with r.get(g.strip(), verify=False, stream=True, auth=(netrc(netrcDir).authenticators(urs)[0], netrc(netrcDir).authenticators(urs)[2])) as response:
        if response.status_code != 200:
            print(f"{saveName} not downloaded. Verify that your username and password are correct in {netrcDir}")
            continue  # Skip to the next granule if the download failed
        response.raw.decode_content = True
        content = response.raw
        # NOTE: Will save to the directory that the script is being executed in
        with open(saveName, 'wb') as d:
            while True:
                chunk = content.read(16 * 1024)
                if not chunk:
                    break
                d.write(chunk)
        print('Downloaded file: {}'.format(saveName))
        # Be patient--these very large files may take a long time to download
    # After downloading the file, now move on to opening and subsetting it using the code from GEDI_Subsetter.py
    # -------------------IMPORT GEDI FILE AS GEODATAFRAME AND CLIP TO ROI---------------------------- #
    # Open the GEDI file and export the subset shots as a point GeoJSON
    l += 1
    print(f"Processing file: {saveName} ({l}/{len(gediFiles)})")
    gedi = h5py.File(saveName, 'r')  # Open file
    gedi_objs = []
    gedi.visit(gedi_objs.append)  # Retrieve list of datasets
    # Search for relevant SDS inside data file
    gediSDS = [str(o) for o in gedi_objs if isinstance(gedi[o], h5py.Dataset)]
    # Define subset of layers based on product
    if 'GEDI01_B' in g:
        sdsSubset = l1bSubset
    elif 'GEDI02_A' in g:
        sdsSubset = l2aSubset
    else:
        sdsSubset = l2bSubset
    # Append additional datasets if provided
    if layerSubset is not None:
        [sdsSubset.append(y) for y in layerSubset]
    # Subset to the selected datasets
    gediSDS = [c for c in gediSDS if any(c.endswith(d) for d in sdsSubset)]
    # Get unique list of beams and subset to user-defined subset or default (all beams)
    beams = []
    for h in gediSDS:
        beam = h.split('/', 1)[0]
        if beam not in beams and beam in beamSubset:
            beams.append(beam)
    gediDF = pd.DataFrame()  # Create empty dataframe to store GEDI datasets
    del beam, gedi_objs, h
    # Loop through each beam and create a geodataframe with lat/lon for each shot, then clip to ROI
    for b in beams:
        beamSDS = [s for s in gediSDS if b in s]
        # Search for latitude, longitude, and shot number SDS
        lat = [x for x in beamSDS if sdsSubset[0] in x][0]
        lon = [x for x in beamSDS if sdsSubset[1] in x][0]
        shot = f'{b}/shot_number'
        # Open latitude, longitude, and shot number SDS
        shots = gedi[shot][()]
        lats = gedi[lat][()]
        lons = gedi[lon][()]
        # Append BEAM, shot number, latitude, longitude and an index to the GEDI dataframe
        geoDF = pd.DataFrame({'BEAM': len(shots) * [b], shot.split('/', 1)[-1].replace('/', '_'): shots,
                              'Latitude': lats, 'Longitude': lons, 'index': np.arange(0, len(shots), 1)})
        # Convert lat/lon coordinates to shapely points and append to geodataframe
        geoDF = gp.GeoDataFrame(geoDF, geometry=gp.points_from_xy(geoDF.Longitude, geoDF.Latitude))
        # Clip to only include points within the user-defined bounding box
        geoDF = geoDF[geoDF['geometry'].within(ROI.envelope)]
        gediDF = pd.concat([gediDF, geoDF])  # pd.concat replaces the deprecated DataFrame.append
        del geoDF
    # Convert to geodataframe and add crs
    gediDF = gp.GeoDataFrame(gediDF)
    gediDF.crs = 'EPSG:4326'
    if gediDF.shape[0] == 0:
        print(f"No intersecting shots were found between {g} and the region of interest submitted.")
        # Clean up the source granule before moving on to the next file
        gedi.close()
        os.remove(saveName)
        continue
    del lats, lons, shots
    # --------------------------------OPEN SDS AND APPEND TO GEODATAFRAME---------------------------- #
    beamsDF = pd.DataFrame()  # Create dataframe to store SDS
    j = 0
    # Loop through each beam and extract subset of defined SDS
    for b in beams:
        beamDF = pd.DataFrame()
        beamSDS = [s for s in gediSDS if b in s and not any(s.endswith(d) for d in sdsSubset[0:3])]
        shot = f'{b}/shot_number'
        try:
            # Set up indexes in order to retrieve SDS data only within the clipped subset from above
            mindex = min(gediDF[gediDF['BEAM'] == b]['index'])
            maxdex = max(gediDF[gediDF['BEAM'] == b]['index']) + 1
            shots = gedi[shot][mindex:maxdex]
        except ValueError:
            print(f"No intersecting shots found for {b}")
            continue
        # Loop through and extract each SDS subset and add to DF
        for s in beamSDS:
            j += 1
            sName = s.split('/', 1)[-1].replace('/', '_')
            # Datasets with consistent structure as shots
            if gedi[s].shape == gedi[shot].shape:
                beamDF[sName] = gedi[s][mindex:maxdex]  # Subset by index
            # Datasets with a length of one
            elif len(gedi[s][()]) == 1:
                beamDF[sName] = [gedi[s][()][0]] * len(shots)  # Create array of same single value
            # Multidimensional datasets
            elif len(gedi[s].shape) == 2 and 'surface_type' not in s:
                allData = gedi[s][()][mindex:maxdex]
                # For each additional dimension, create a new output column to store those data
                for i in range(gedi[s].shape[1]):
                    step = []
                    for a in allData:
                        step.append(a[i])
                    beamDF[f"{sName}_{i}"] = step
            # Waveforms
            elif s.endswith('waveform') or s.endswith('pgap_theta_z'):
                waveform = []
                if s.endswith('waveform'):
                    # Use sample_count and sample_start_index to identify the location of each waveform
                    start = gedi[f'{b}/{s.split("/")[-1][:2]}_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/{s.split("/")[-1][:2]}_sample_count'][mindex:maxdex]
                else:
                    # For pgap_theta_z, use the rx sample start index and count to subset
                    start = gedi[f'{b}/rx_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/rx_sample_count'][mindex:maxdex]
                wave = gedi[s][()]
                # In the dataframe, each waveform will be stored as a list of values
                for k in range(len(start)):
                    singleWF = wave[int(start[k] - 1): int(start[k] - 1 + count[k])]
                    waveform.append(','.join([str(q) for q in singleWF]))
                beamDF[sName] = waveform
            # Surface type
            elif s.endswith('surface_type'):
                surfaces = ['land', 'ocean', 'sea_ice', 'land_ice', 'inland_water']
                allData = gedi[s][()]
                for i in range(gedi[s].shape[0]):
                    beamDF[f'{surfaces[i]}'] = allData[i][mindex:maxdex]
                del allData
            else:
                print(f"SDS: {s} not found")
            print(f"Processing {j} of {len(beamSDS) * len(beams)}: {s}")
        beamsDF = pd.concat([beamsDF, beamDF])  # pd.concat replaces the deprecated DataFrame.append
    del beamDF, beamSDS, beams, gedi, gediSDS, shots, sdsSubset
    # Combine geolocation dataframe with SDS layer dataframe
    outDF = pd.merge(gediDF, beamsDF, left_on='shot_number', right_on=[sn for sn in beamsDF.columns if sn.endswith('shot_number')][0])
    outDF.index = outDF['index']
    del gediDF, beamsDF
    # Subset the output DF to the actual boundary of the input ROI
    outDF = gp.overlay(outDF, finalClip)
    del outDF[0]
    # --------------------------------EXPORT AS GEOJSON---------------------------------------------- #
    # Check for empty output dataframe
    try:
        # Export final geodataframe as GeoJSON
        outDF.to_file(f"{outDir}{saveName.replace('.h5', '.json')}", driver='GeoJSON')
        print(f"{saveName.replace('.h5', '.json')} saved at: {outDir}")
    except ValueError:
        print(f"{g} intersects the bounding box of the input ROI, but no shots intersect the final clipped ROI.")
    # Finally, remove the original GEDI granule
    os.remove(saveName)
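Once the loop completes, each granule's subset will be saved as a point GeoJSON in the output directory and the source .h5 files will have been deleted. As a quick sanity check, you could read one of the exported files back in with geopandas (the filename below is just a placeholder for one of your own outputs):
Code: Select all
import geopandas as gp
# Placeholder filename: substitute any of the GeoJSON files written to your output directory
subset = gp.read_file('output/GEDI02_B_2019170155833_O02932_T02267_02_001_01.json')
print(subset.shape)          # number of shots retained and columns exported
print(list(subset.columns))  # subset SDS layers
print(subset.total_bounds)   # bounding box, which should fall within your ROI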