Page 1 of 1

How to automate GEDI download-subsetting workflow?

Posted: Tue Aug 18, 2020 4:04 pm America/New_York
by LP DAAC - ckrehbiel
User Question: Is there a way to automate the downloading of the links from the web search service and run each link through the subsetter Python script, saving only the output and deleting the original .h5 file, doing this file by file? Example to help with understanding the question: I have used the subsetter to filter 2 TB of data down to 30 GB of data. I would like the automation to download the first granule link, subset it to my shapefile, export JSON, delete the original granule, then do the same for the next link, etc.

Re: How to automate GEDI download-subsetting workflow?

Posted: Tue Aug 18, 2020 4:14 pm America/New_York
by LP DAAC - ckrehbiel
Below is an example of Python code that could be used to accomplish the workflow described in your question. The code below will go through and call GEDI Finder to retrieve the granules for a given product/location and use the links provided to download each file. From there, you can specify a shapefile/geojson region of interest and clip the downloaded file and export as GeoJSON. The source file will then be deleted, and the script will begin to download the next file.

Code: Select all

# Example Python Script to Call GEDI Finder, Download files, run through GEDI Subsetter, and delete source granule
# NOTE(review): the posted listing was garbled during extraction (dropped try/else/break/append
# lines and stripped URL strings); the missing statements are restored below.

# Import necessary packages
import requests as r
from netrc import netrc
import os
import geopandas as gp
import sys
from shapely.geometry import Polygon
import h5py
import pandas as pd
import numpy as np

# Define variables
# GEDI Finder base endpoint (restored -- confirm against the current LP DAAC service URL)
url = 'https://lpdaacsvc.cr.usgs.gov/services/gedifinder?'
product = 'GEDI02_B'                   # GEDI product short name to search for
version = '001'                       # Product version
bbox = '[49.0,-114.46,47.65,-113.2]'  # ul_lat,ul_lon,lr_lat,lr_lon -- Glacier NP example

# Call GEDI Finder API to return list of intersecting GEDI files
gediFiles = r.get(f"{url}product={product}&version={version}&bbox={bbox}").json()['data']

'''If you haven't set up a .netrc file containing your Earthdata Login Username and password,
execute an Earthdata Login setup script (e.g. LP DAAC's EarthdataLoginSetup.py) to get that
set up first. This will allow you to download GEDI data directly without needing to enter a
username and password every time. '''
netrcDir = os.path.expanduser("~/.netrc")
urs = 'urs.earthdata.nasa.gov'    # Address to call for authentication

# Set up GEDI Subsetter variables before looping through each file:
ROI = 'POLYGON.shp'  # Update with your shapefile of interest, located in the same directory

# Convert to Shapely polygon for geojson, .shp or bbox
if ROI.endswith('json') or ROI.endswith('.shp'):
    try:
        ROI = gp.GeoDataFrame.from_file(ROI)
        ROI.crs = 'EPSG:4326'
        if len(ROI) > 1:
            print('Multi-feature polygon detected. Only the first feature will be used to subset the GEDI data.')
        ROI = ROI.geometry[0]
    except Exception:
        print('error: unable to read input geojson file or the file was not found')
        sys.exit(2)
else:
    # Otherwise treat ROI as a comma-separated bounding box string: ul_lat,ul_lon,lr_lat,lr_lon
    ROI = ROI.replace("'", "")
    ROI = ROI.split(',')
    ROI = [float(r) for r in ROI]
    try:
        ROI = Polygon([(ROI[1], ROI[0]), (ROI[3], ROI[0]), (ROI[3], ROI[2]), (ROI[1], ROI[2])])
    except Exception:
        print('error: unable to read input bounding box coordinates, the required format is: ul_lat,ul_lon,lr_lat,lr_lon')
        sys.exit(2)

# Keep the exact input geometry for the final clip to ROI
finalClip = gp.GeoDataFrame([1], geometry=[ROI], crs='EPSG:4326')

# Define beam subset if provided or default to all beams (defaulting to all beams here)
beamSubset = ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']
layerSubset = None  # Define additional layers to subset (defaulting to no extra layers added here)

# -------------------------------------SET UP WORKSPACE------------------------------------------ #
inDir = os.getcwd() + os.sep  # Set input directory as current working directory

# Create and set output directory
outDir = os.path.normpath((os.path.split(inDir)[0] + os.sep + 'output')) + os.sep
if not os.path.exists(outDir):
    os.makedirs(outDir)

# --------------------DEFINE PRESET BAND/LAYER SUBSETS ------------------------------------------ #
# Default layers to be subset and exported, see README for information on how to add additional layers
l1bSubset = [ '/geolocation/latitude_bin0', '/geolocation/longitude_bin0', '/channel', '/shot_number', '/rxwaveform','/rx_sample_count', '/stale_return_flag', '/tx_sample_count', '/txwaveform', '/geolocation/degrade', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/solar_elevation',  '/geolocation/local_beam_elevation',  '/noise_mean_corrected',  '/geolocation/elevation_bin0', '/geolocation/elevation_lastbin', '/geolocation/surface_type']
l2aSubset = ['/lat_lowestmode', '/lon_lowestmode', '/channel', '/shot_number', '/degrade_flag', '/delta_time',  '/digital_elevation_model', '/elev_lowestmode', '/quality_flag', '/rh', '/sensitivity', '/elevation_bias_flag', '/surface_flag',  '/num_detectedmodes',  '/selected_algorithm',  '/solar_elevation']
l2bSubset = ['/geolocation/lat_lowestmode', '/geolocation/lon_lowestmode', '/channel', '/geolocation/shot_number', '/cover', '/cover_z', '/fhd_normal', '/pai', '/pai_z',  '/rhov',  '/rhog',  '/pavd_z', '/l2a_quality_flag', '/l2b_quality_flag', '/rh100', '/sensitivity', '/stale_return_flag', '/surface_flag', '/geolocation/degrade_flag',  '/geolocation/solar_elevation', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/elev_lowestmode']
l = 0  # Counter used only for progress reporting

# Create a for loop to go through, download each GEDI file, apply GEDI Subsetter to export results, and remove source file
for g in gediFiles:
    saveName = g.rsplit('/', 1)[-1]
    # Create and submit request and download file
    with r.get(g.strip(), verify=False, stream=True, auth=(netrc(netrcDir).authenticators(urs)[0], netrc(netrcDir).authenticators(urs)[2])) as response:
        if response.status_code != 200:
            print(f"{saveName} not downloaded. Verify that your username and password are correct in {netrcDir}")
            continue  # Nothing was written to disk, so skip the subsetting step for this granule
        else:
            response.raw.decode_content = True
            content = response.raw
            # NOTE: Will save to the directory that the script is being executed in
            with open(saveName, 'wb') as d:
                while True:
                    chunk = content.read(16 * 1024)  # Stream in 16 KB chunks to keep memory use low
                    if not chunk:
                        break
                    d.write(chunk)
            print('Downloaded file: {}'.format(saveName))
            # Be patient--these very large files may take a long time to download
    # After downloading the file, now move on to opening and subsetting it using the GEDI Subsetter code

# -------------------IMPORT GEDI FILES AS GEODATAFRAMES AND CLIP TO ROI-------------------------- #
# Loop through each GEDI file and export as a point geojson
    l += 1
    print(f"Processing file: {saveName} ({l}/{len(gediFiles)})")
    gedi = h5py.File(saveName, 'r')      # Open file
    gedi_objs = []
    gedi.visit(gedi_objs.append)  # Retrieve list of datasets

    # Search for relevant SDS inside data file
    gediSDS = [str(o) for o in gedi_objs if isinstance(gedi[o], h5py.Dataset)]

    # Define subset of layers based on product
    if 'GEDI01_B' in g:
        sdsSubset = l1bSubset
    elif 'GEDI02_A' in g:
        sdsSubset = l2aSubset
    else:
        sdsSubset = l2bSubset

    # Append additional datasets if provided
    if layerSubset is not None:
        for y in layerSubset:
            sdsSubset.append(y)

    # Subset to the selected datasets
    gediSDS = [c for c in gediSDS if any(c.endswith(d) for d in sdsSubset)]

    # Get unique list of beams and subset to user-defined subset or default (all beams)
    beams = []
    for h in gediSDS:
        beam = h.split('/', 1)[0]
        if beam not in beams and beam in beamSubset:
            beams.append(beam)

    gediDF = pd.DataFrame()  # Create empty dataframe to store GEDI datasets
    del beam, gedi_objs, h
    # Loop through each beam and create a geodataframe with lat/lon for each shot, then clip to ROI
    for b in beams:
        beamSDS = [s for s in gediSDS if b in s]

        # Search for latitude, longitude, and shot number SDS
        # (sdsSubset[0] and [1] are the lat/lon layer names for the product)
        lat = [x for x in beamSDS if sdsSubset[0] in x][0]
        lon = [x for x in beamSDS if sdsSubset[1] in x][0]
        shot = f'{b}/shot_number'

        # Open latitude, longitude, and shot number SDS
        shots = gedi[shot][()]
        lats = gedi[lat][()]
        lons = gedi[lon][()]

        # Append BEAM, shot number, latitude, longitude and an index to the GEDI dataframe
        geoDF = pd.DataFrame({'BEAM': len(shots) * [b], shot.split('/', 1)[-1].replace('/', '_'): shots,
                              'Latitude':lats, 'Longitude':lons, 'index': np.arange(0, len(shots), 1)})

        # Convert lat/lon coordinates to shapely points and append to geodataframe
        geoDF = gp.GeoDataFrame(geoDF, geometry=gp.points_from_xy(geoDF.Longitude, geoDF.Latitude))

        # Clip to only include points within the user-defined bounding box
        # (envelope first -- the exact geometry clip happens at the end via finalClip)
        geoDF = geoDF[geoDF['geometry'].within(ROI.envelope)]
        gediDF = pd.concat([gediDF, geoDF])  # pd.concat: DataFrame.append was removed in pandas 2.0
        del geoDF

    # Convert to geodataframe and add crs
    gediDF = gp.GeoDataFrame(gediDF)
    gediDF.crs = 'EPSG:4326'

    if gediDF.shape[0] == 0:
        print(f"No intersecting shots were found between {g} and the region of interest submitted.")
        gedi.close()
        os.remove(saveName)  # Still delete the granule so the disk does not fill with misses
        continue
    del lats, lons, shots

# --------------------------------OPEN SDS AND APPEND TO GEODATAFRAME---------------------------- #
    beamsDF = pd.DataFrame()  # Create dataframe to store SDS
    j = 0

    # Loop through each beam and extract subset of defined SDS
    for b in beams:
        beamDF = pd.DataFrame()
        # Skip the first three subset layers (lat, lon, channel) -- already captured above
        beamSDS = [s for s in gediSDS if b in s and not any(s.endswith(d) for d in sdsSubset[0:3])]
        shot = f'{b}/shot_number'

        try:
            # set up indexes in order to retrieve SDS data only within the clipped subset from above
            mindex = min(gediDF[gediDF['BEAM'] == b]['index'])
            maxdex = max(gediDF[gediDF['BEAM'] == b]['index']) + 1
            shots = gedi[shot][mindex:maxdex]
        except ValueError:
            # min()/max() raise ValueError on an empty selection -- no shots for this beam
            print(f"No intersecting shots found for {b}")
            continue
        # Loop through and extract each SDS subset and add to DF
        for s in beamSDS:
            j += 1
            sName = s.split('/', 1)[-1].replace('/', '_')

            # Datasets with consistent structure as shots
            if gedi[s].shape == gedi[shot].shape:
                beamDF[sName] = gedi[s][mindex:maxdex]  # Subset by index
            # Datasets with a length of one
            elif len(gedi[s][()]) == 1:
                beamDF[sName] = [gedi[s][()][0]] * len(shots)  # create array of same single value

            # Multidimensional datasets
            elif len(gedi[s].shape) == 2 and 'surface_type' not in s:
                allData = gedi[s][()][mindex:maxdex]
                # For each additional dimension, create a new output column to store those data
                for i in range(gedi[s].shape[1]):
                    step = []
                    for a in allData:
                        step.append(a[i])
                    beamDF[f"{sName}_{i}"] = step

            # Waveforms
            elif s.endswith('waveform') or s.endswith('pgap_theta_z'):
                waveform = []

                if s.endswith('waveform'):
                    # Use sample_count and sample_start_index to identify the location of each waveform
                    start = gedi[f'{b}/{s.split("/")[-1][:2]}_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/{s.split("/")[-1][:2]}_sample_count'][mindex:maxdex]
                else:
                    # for pgap_theta_z, use rx sample start index and count to subset
                    start = gedi[f'{b}/rx_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/rx_sample_count'][mindex:maxdex]
                wave = gedi[s][()]
                # in the dataframe, each waveform will be stored as a list of values
                for k in range(len(start)):
                    # start indexes are 1-based in the GEDI products, hence the - 1
                    singleWF = wave[int(start[k] - 1): int(start[k] - 1 + count[k])]
                    waveform.append(','.join([str(q) for q in singleWF]))
                beamDF[sName] = waveform

            # Surface type
            elif s.endswith('surface_type'):
                surfaces = ['land', 'ocean', 'sea_ice', 'land_ice', 'inland_water']
                allData = gedi[s][()]
                for i in range(gedi[s].shape[0]):
                    beamDF[f'{surfaces[i]}'] = allData[i][mindex:maxdex]
                del allData
            else:
                print(f"SDS: {s} not found")
            print(f"Processing {j} of {len(beamSDS) * len(beams)}: {s}")

        beamsDF = pd.concat([beamsDF, beamDF])  # pd.concat: DataFrame.append was removed in pandas 2.0
    gedi.close()  # Close the HDF5 file before deleting it (required on Windows)
    del beamDF, beamSDS, beams, gedi, gediSDS, shots, sdsSubset
    # Combine geolocation dataframe with SDS layer dataframe
    outDF = pd.merge(gediDF, beamsDF, left_on='shot_number', right_on=[sn for sn in beamsDF.columns if sn.endswith('shot_number')][0])
    outDF.index = outDF['index']
    del gediDF, beamsDF

    # Subset the output DF to the actual boundary of the input ROI
    outDF = gp.overlay(outDF, finalClip)
    del outDF[0]  # Drop the placeholder column (named 0) carried over from finalClip

# --------------------------------EXPORT AS GEOJSON---------------------------------------------- #
    # Check for empty output dataframe
    try:
        # Export final geodataframe as Geojson
        outDF.to_file(f"{outDir}{saveName.replace('.h5', '.json')}", driver='GeoJSON')
        print(f"{saveName.replace('.h5', '.json')} saved at: {outDir}")
    except ValueError:
        print(f"{g} intersects the bounding box of the input ROI, but no shots intersect final clipped ROI.")

    # Finally, remove the original GEDI granule
    os.remove(saveName)