How to automate GEDI download-subsetting workflow?

Please enter here to ask a question about any NASA Science related topics!
Post Reply
LP DAAC - ckrehbiel
Subject Matter Expert
Subject Matter Expert
Posts: 59
Joined: Tue Nov 12, 2019 3:29 pm America/New_York

How to automate GEDI download-subsetting workflow?

by LP DAAC - ckrehbiel » Tue Aug 18, 2020 4:04 pm America/New_York

User Question: Is there a way to automate the downloading of the links from the web search service and run each link through the subsetter Python script, saving only the output and deleting the original .h5 file, doing this file by file? Example to help with understanding the question: I have used the subsetter to filter 2 TB of data down to 30 GB of data. I would like the automation to download the first granule in the link, subset it to my shapefile, export JSON, delete the original granule, then do the same for the next link, etc.

Tags:

LP DAAC - ckrehbiel
Subject Matter Expert
Subject Matter Expert
Posts: 59
Joined: Tue Nov 12, 2019 3:29 pm America/New_York

Re: How to automate GEDI download-subsetting workflow?

by LP DAAC - ckrehbiel » Tue Aug 18, 2020 4:14 pm America/New_York

Below is an example of Python code that could be used to accomplish the workflow described in your question. The code below will go through and call GEDI Finder to retrieve the granules for a given product/location and use the links provided to download each file. From there, you can specify a shapefile/geojson region of interest and clip the downloaded file and export as GeoJSON. The source file will then be deleted, and the script will begin to download the next file.

Code: Select all

# Example Python Script to Call GEDI Finder, Download files, run through GEDI Subsetter, and delete source granule 
# Import necessary packages 
import requests as r 
from netrc import netrc 
import os 
import geopandas as gp 
import sys 
from shapely.geometry import Polygon 
import h5py 
import pandas as pd 
import numpy as np 

# ----------------------------------USER-DEFINED INPUTS------------------------------------------ #
# GEDI Finder endpoint plus the product, version, and bounding box used for the granule search.
# NOTE(review): this legacy lpdaacsvc GEDI Finder endpoint may have been superseded by NASA's
# CMR search API -- confirm the service is still live before relying on it.
url = 'https://lpdaacsvc.cr.usgs.gov/services/gedifinder?'
product = 'GEDI02_B'  # GEDI product short name; the loop below also handles GEDI01_B and GEDI02_A
version = '001'       # product version string appended to the query
bbox = '[49.0,-114.46,47.65,-113.2]'  # Glacier NP example; presumably [ul_lat,ul_lon,lr_lat,lr_lon] -- verify against the service docs

# Call GEDI Finder API to return list of intersecting GEDI Files
# (the service responds with JSON whose 'data' key holds the list of granule download URLs)
gediFiles = r.get(f"{url}product={product}&version={version}&bbox={bbox}").json()['data']

'''If you haven't set up a .netrc file containing your Earthdata Login Username and password, execute the script at: 

    https://git.earthdata.nasa.gov/projects/LPDUR/repos/daac_data_download_python/browse/EarthdataLoginSetup.py to get that set up first. This will allow you to download GEDI data directly without needing to enter a username and password every time. ''' 
netrcDir = os.path.expanduser("~/.netrc")  # path to the .netrc holding the Earthdata Login credentials
urs = 'urs.earthdata.nasa.gov'    # Address to call for authentication 

# ---------------------DEFINE REGION OF INTEREST FOR SUBSETTING---------------------------------- #
# ROI may be a geojson/shapefile located in the working directory, or a bounding-box string in
# the format 'ul_lat,ul_lon,lr_lat,lr_lon'. Either way it ends up as a single Shapely geometry.
ROI = 'POLYGON.shp'  # Update with your shapefile of interest, located in the same directory

# Convert to Shapely polygon for geojson, .shp or bbox
if ROI.endswith('json') or ROI.endswith('.shp'):
    try:
        ROI = gp.GeoDataFrame.from_file(ROI)
        ROI.crs = 'EPSG:4326'
        if len(ROI) > 1:
            print('Multi-feature polygon detected. Only the first feature will be used to subset the GEDI data.')
        ROI = ROI.geometry[0]
    except Exception:  # narrowed from a bare except so SystemExit/KeyboardInterrupt are not swallowed
        print('error: unable to read input geojson file or the file was not found')
        sys.exit(2)
else:
    # Parse 'ul_lat,ul_lon,lr_lat,lr_lon' and build the rectangle (Shapely expects lon, lat order)
    ROI = ROI.replace("'", "")
    ROI = ROI.split(',')
    try:
        # float() conversion moved inside the try: a malformed coordinate now produces the
        # helpful message below instead of an uncaught ValueError
        ROI = [float(coord) for coord in ROI]
        ROI = Polygon([(ROI[1], ROI[0]), (ROI[3], ROI[0]), (ROI[3], ROI[2]), (ROI[1], ROI[2])])
        # NOTE: no .crs attribute is set on the bare Shapely geometry -- Shapely 2+ raises
        # AttributeError for that, and the CRS is attached when the geometry is wrapped in a
        # GeoDataFrame below anyway.
    except Exception:
        print('error: unable to read input bounding box coordinates, the required format is: ul_lat,ul_lon,lr_lat,lr_lon')
        sys.exit(2)

# Keep the exact input geometry for the final clip to ROI (the loop first clips cheaply to the
# envelope, then uses this GeoDataFrame for the precise polygon overlay)
finalClip = gp.GeoDataFrame([1], geometry=[ROI], crs='EPSG:4326')

# Define beam subset if provided or default to all beams (defaulting to all beams here)
beamSubset = ['BEAM0000', 'BEAM0001', 'BEAM0010', 'BEAM0011', 'BEAM0101', 'BEAM0110', 'BEAM1000', 'BEAM1011']
layerSubset = None  # Define additional layers to subset (defaulting to no extra layers added here)

# -------------------------------------SET UP WORKSPACE------------------------------------------ #
inDir = os.getcwd() + os.sep  # Set input directory as current working directory

# Create and set output directory ('output' alongside the working directory)
outDir = os.path.normpath(os.path.split(inDir)[0] + os.sep + 'output') + os.sep
os.makedirs(outDir, exist_ok=True)  # exist_ok avoids the race between an exists() check and creation

# --------------------DEFINE PRESET BAND/LAYER SUBSETS ------------------------------------------ #
# Default layers to be subset and exported, see README for information on how to add additional layers.
# The first three entries of each list (lat, lon, channel) are treated specially by the loop below.
l1bSubset = [ '/geolocation/latitude_bin0', '/geolocation/longitude_bin0', '/channel', '/shot_number', '/rxwaveform','/rx_sample_count', '/stale_return_flag', '/tx_sample_count', '/txwaveform', '/geolocation/degrade', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/solar_elevation',  '/geolocation/local_beam_elevation',  '/noise_mean_corrected',  '/geolocation/elevation_bin0', '/geolocation/elevation_lastbin', '/geolocation/surface_type']
l2aSubset = ['/lat_lowestmode', '/lon_lowestmode', '/channel', '/shot_number', '/degrade_flag', '/delta_time',  '/digital_elevation_model', '/elev_lowestmode', '/quality_flag', '/rh', '/sensitivity', '/elevation_bias_flag', '/surface_flag',  '/num_detectedmodes',  '/selected_algorithm',  '/solar_elevation']
l2bSubset = ['/geolocation/lat_lowestmode', '/geolocation/lon_lowestmode', '/channel', '/geolocation/shot_number', '/cover', '/cover_z', '/fhd_normal', '/pai', '/pai_z',  '/rhov',  '/rhog',  '/pavd_z', '/l2a_quality_flag', '/l2b_quality_flag', '/rh100', '/sensitivity', '/stale_return_flag', '/surface_flag', '/geolocation/degrade_flag',  '/geolocation/solar_elevation', '/geolocation/delta_time', '/geolocation/digital_elevation_model', '/geolocation/elev_lowestmode']
l = 0  # counter: number of granules processed so far (used only for progress printing)

# Main loop: download each GEDI granule, apply the GEDI Subsetter logic, export the clipped
# shots as GeoJSON, and remove the source .h5 so only one granule is on disk at a time.
# Parse the Earthdata Login credentials from the .netrc once, not twice per granule.
credentials = netrc(netrcDir).authenticators(urs)

for g in gediFiles:
    saveName = g.rsplit('/', 1)[-1]
    # Create and submit request and stream the file to disk (granules are multi-GB; never hold
    # them in memory). NOTE: TLS verification is left enabled -- the original verify=False
    # exposed an authenticated download to man-in-the-middle attacks.
    with r.get(g.strip(), stream=True, auth=(credentials[0], credentials[2])) as response:
        if response.status_code != 200:
            print(f"{saveName} not downloaded. Verify that your username and password are correct in {netrcDir}")
            continue  # nothing was written to disk; skipping avoids crashing in h5py.File below
        response.raw.decode_content = True
        content = response.raw
        # NOTE: Will save to the directory that the script is being executed in
        with open(saveName, 'wb') as d:
            while True:
                chunk = content.read(16 * 1024)
                if not chunk:
                    break
                d.write(chunk)
        print('Downloaded file: {}'.format(saveName))
        # Be patient--these very large files may take a long time to download

    # After downloading the file, now move on to opening and subsetting it using the code from GEDI_Subsetter.py

# -------------------IMPORT GEDI FILES AS GEODATAFRAMES AND CLIP TO ROI-------------------------- #
    l += 1
    print(f"Processing file: {saveName} ({l}/{len(gediFiles)})")
    gedi = h5py.File(saveName, 'r')      # Open file
    gedi_objs = []
    gedi.visit(gedi_objs.append)  # Retrieve list of every object path in the file

    # Keep only the dataset (SDS) paths, dropping groups
    gediSDS = [str(o) for o in gedi_objs if isinstance(gedi[o], h5py.Dataset)]

    # Define subset of layers based on product (inferred from the granule URL/file name)
    if 'GEDI01_B' in g:
        sdsSubset = l1bSubset
    elif 'GEDI02_A' in g:
        sdsSubset = l2aSubset
    else:
        sdsSubset = l2bSubset

    # Append additional user-requested datasets, if provided. Build a NEW list rather than
    # appending in place: sdsSubset aliases the shared l1b/l2a/l2b preset, and mutating it
    # would grow the preset on every loop iteration, duplicating layers.
    if layerSubset is not None:
        sdsSubset = sdsSubset + layerSubset

    # Subset to the selected datasets
    gediSDS = [c for c in gediSDS if any(c.endswith(d) for d in sdsSubset)]

    # Get unique list of beams and subset to user-defined subset or default (all beams)
    beams = []
    for h in gediSDS:
        beam = h.split('/', 1)[0]
        if beam not in beams and beam in beamSubset:
            beams.append(beam)

    gediDF = pd.DataFrame()  # Create empty dataframe to store GEDI datasets
    # Only the dataset listing is deleted here; deleting the loop variables ('beam', 'h') would
    # raise NameError whenever no SDS matched the subset.
    del gedi_objs

    # Loop through each beam and create a geodataframe with lat/lon for each shot, then clip to ROI
    for b in beams:
        beamSDS = [s for s in gediSDS if b in s]

        # Locate the latitude/longitude SDS -- by convention the first two entries of the preset
        latSDS = [s for s in beamSDS if sdsSubset[0] in s][0]
        lonSDS = [s for s in beamSDS if sdsSubset[1] in s][0]
        shot = f'{b}/shot_number'

        # Read latitude, longitude, and shot number for the whole beam
        shots = gedi[shot][()]
        lats = gedi[latSDS][()]
        lons = gedi[lonSDS][()]

        # Append BEAM, shot number, latitude, longitude and an index to the GEDI dataframe
        geoDF = pd.DataFrame({'BEAM': len(shots) * [b], shot.split('/', 1)[-1].replace('/', '_'): shots,
                              'Latitude': lats, 'Longitude': lons, 'index': np.arange(0, len(shots), 1)})

        # Convert lat/lon coordinates to shapely points and append to geodataframe
        geoDF = gp.GeoDataFrame(geoDF, geometry=gp.points_from_xy(geoDF.Longitude, geoDF.Latitude))

        # Cheap clip to the ROI's bounding envelope; the exact polygon overlay happens at the end
        geoDF = geoDF[geoDF['geometry'].within(ROI.envelope)]
        gediDF = pd.concat([gediDF, geoDF])  # DataFrame.append was removed in pandas 2.0
        del geoDF

    # Convert to geodataframe and add crs
    gediDF = gp.GeoDataFrame(gediDF)
    gediDF.crs = 'EPSG:4326'

    if gediDF.shape[0] == 0:
        print(f"No intersecting shots were found between {g} and the region of interest submitted.")
        # Still clean up: the original left non-intersecting granules on disk, defeating the
        # stated goal of only ever keeping the subset output. Close the handle first so the
        # delete also works on Windows.
        gedi.close()
        os.remove(saveName)
        continue
    del lats, lons, shots

# --------------------------------OPEN SDS AND APPEND TO GEODATAFRAME---------------------------- #
    beamsDF = pd.DataFrame()  # Create dataframe to store SDS
    j = 0  # progress counter across all beams/datasets

    # Loop through each beam and extract subset of defined SDS
    for b in beams:
        beamDF = pd.DataFrame()
        # Everything except lat/lon/channel (sdsSubset[0:3]), which are already captured above
        beamSDS = [s for s in gediSDS if b in s and not any(s.endswith(d) for d in sdsSubset[0:3])]
        shot = f'{b}/shot_number'

        try:
            # set up indexes in order to retrieve SDS data only within the clipped subset from above
            mindex = min(gediDF[gediDF['BEAM'] == b]['index'])
            maxdex = max(gediDF[gediDF['BEAM'] == b]['index']) + 1
            shots = gedi[shot][mindex:maxdex]
        except ValueError:  # min()/max() of an empty selection: no clipped shots for this beam
            print(f"No intersecting shots found for {b}")
            continue
        # Loop through and extract each SDS subset and add to DF
        for s in beamSDS:
            j += 1
            sName = s.split('/', 1)[-1].replace('/', '_')

            # Datasets with the same per-shot structure as shot_number
            if gedi[s].shape == gedi[shot].shape:
                beamDF[sName] = gedi[s][mindex:maxdex]  # Subset by index

            # Datasets with a length of one: broadcast the single value across all shots
            elif len(gedi[s][()]) == 1:
                beamDF[sName] = [gedi[s][()][0]] * len(shots)

            # Multidimensional datasets: one output column per second-dimension element
            elif len(gedi[s].shape) == 2 and 'surface_type' not in s:
                allData = gedi[s][()][mindex:maxdex]
                for i in range(gedi[s].shape[1]):
                    beamDF[f"{sName}_{i}"] = [row[i] for row in allData]

            # Waveforms: variable length per shot, located via *_sample_start_index/*_sample_count
            elif s.endswith('waveform') or s.endswith('pgap_theta_z'):
                waveform = []

                if s.endswith('waveform'):
                    # 'rx'/'tx' prefix of the waveform name selects the matching start/count SDS
                    prefix = s.split('/')[-1][:2]
                    start = gedi[f'{b}/{prefix}_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/{prefix}_sample_count'][mindex:maxdex]
                else:
                    # pgap_theta_z shares the rx waveform's start index and count
                    start = gedi[f'{b}/rx_sample_start_index'][mindex:maxdex]
                    count = gedi[f'{b}/rx_sample_count'][mindex:maxdex]
                wave = gedi[s][()]

                # in the dataframe, each waveform is stored as a comma-separated string of values
                for k in range(len(start)):
                    singleWF = wave[int(start[k] - 1): int(start[k] - 1 + count[k])]  # start index is 1-based
                    waveform.append(','.join(str(q) for q in singleWF))
                beamDF[sName] = waveform

            # Surface type: fixed first dimension of five surface classes
            elif s.endswith('surface_type'):
                surfaces = ['land', 'ocean', 'sea_ice', 'land_ice', 'inland_water']
                allData = gedi[s][()]
                for i in range(gedi[s].shape[0]):
                    beamDF[f'{surfaces[i]}'] = allData[i][mindex:maxdex]
                del allData
            else:
                print(f"SDS: {s} not found")
            print(f"Processing {j} of {len(beamSDS) * len(beams)}: {s}")

        beamsDF = pd.concat([beamsDF, beamDF])  # DataFrame.append was removed in pandas 2.0
    gedi.close()  # close the HDF5 handle so the source granule can be removed (required on Windows)
    del beamDF, beamSDS, beams, gedi, gediSDS, sdsSubset

    # Combine geolocation dataframe with SDS layer dataframe on shot number
    shotCol = [sn for sn in beamsDF.columns if sn.endswith('shot_number')][0]
    outDF = pd.merge(gediDF, beamsDF, left_on='shot_number', right_on=shotCol)
    outDF.index = outDF['index']
    del gediDF, beamsDF

    # Subset the output DF to the actual boundary of the input ROI
    outDF = gp.overlay(outDF, finalClip)
    del outDF[0]  # drop the dummy column (named 0) contributed by finalClip
# --------------------------------EXPORT AS GEOJSON---------------------------------------------- #
    try:
        # Export final geodataframe as GeoJSON; ValueError is raised when the overlay left no rows
        outDF.to_file(f"{outDir}{saveName.replace('.h5', '.json')}", driver='GeoJSON')
        print(f"{saveName.replace('.h5', '.json')} saved at: {outDir}")
    except ValueError:
        print(f"{g} intersects the bounding box of the input ROI, but no shots intersect final clipped ROI.")

    # Finally, remove the original GEDI granule
    os.remove(saveName)

Post Reply