Publishing a CMIP6 Kerchunk Reference to STAC
Run this notebook
You can launch this notebook in VEDA JupyterHub by clicking the link below.
Launch in VEDA JupyterHub (requires access)
Learn more
Inside the Hub
This notebook was written on a VEDA JupyterHub instance. See [VEDA Analytics JupyterHub Access](https://nasa-impact.github.io/veda-docs/veda-jh-access.html) for information about how to gain access.
Outside the Hub
You are welcome to run this notebook anywhere you like (for example on https://daskhub.veda.smce.nasa.gov/, MAAP, or locally); just make sure that the data is accessible, or get in contact with the VEDA team to enable access.
Approach
This notebook creates STAC collection metadata for a CMIP6 Kerchunk Reference File which has already been generated and stored in S3.
This notebook serves as documentation for the publication of the CMIP6 Kerchunk reference. It is not expected to generalize to arbitrary Zarr datasets, but it may be a helpful example. It was run on the VEDA JupyterHub, and since veda-data-store-staging is a protected bucket it is not expected to work in an environment without access to that bucket.
Step 1: Install and import necessary libraries

```python
#!pip install xstac
import pystac
import requests
import s3fs
import xstac
import fsspec
import xarray as xr
```
Step 2: Open the dataset with xarray
```python
dataset_url = 's3://veda-data-store-staging/cmip6-GISS-E2-1-G-tas-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json'

xr_open_args = {
    "engine": "zarr",
    "decode_coords": "all",
    "consolidated": False
}

fs = fsspec.filesystem(
    "reference",
    fo=dataset_url,
    remote_options={"anon": True},
)
src_path = fs.get_mapper("")

ds = xr.open_dataset(src_path, **xr_open_args)
```
```
/tmp/ipykernel_5419/732403854.py:16: UserWarning: Variable(s) referenced in cell_measures not in variables: ['areacella']
  ds = xr.open_dataset(src_path, **xr_open_args)
```
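As a quick sanity check (an optional addition, not part of the original notebook), you can confirm the reference opened correctly; the dataset should expose the daily tas (near-surface air temperature) variable named in the reference file:

```python
# Optional sanity check: inspect the dataset opened from the Kerchunk reference
print(ds)                # summary of dimensions, coordinates, and variables
print(ds["tas"].dims)    # typically ('time', 'lat', 'lon') for CMIP6 daily tas
```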
Step 3: Generate STAC metadata
The spatial extent is taken from the xarray metadata. The temporal extent will be added by the xstac library.
```python
spatial_extent_values = [ds.lon[0].values, ds.lat[0].values, ds.lon[-1].values, ds.lat[-1].values]
spatial_extent = list(map(int, spatial_extent_values))
_id = 'combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk_TEST'
zarr_asset = pystac.Asset(
    title='zarr',
    href=dataset_url,
    media_type='application/vnd+zarr',
    roles=['data'],
)
extent = pystac.Extent(
    spatial=pystac.SpatialExtent(bboxes=[spatial_extent]),
    temporal=pystac.TemporalExtent([[None, None]])
)
```
Add the VEDA provider.
```python
providers = [
    pystac.Provider(
        name="VEDA",
        roles=[pystac.ProviderRole.PRODUCER, pystac.ProviderRole.PROCESSOR, pystac.ProviderRole.HOST],
        url="https://github.com/nasa-impact/veda-data-pipelines",
    )
]
```
Put it all together to initialize a pystac.Collection instance.
```python
collection = pystac.Collection(
    id=_id,
    extent=extent,
    assets={'zarr': zarr_asset},
    description='for zarr testing',
    providers=providers,
    stac_extensions=['https://stac-extensions.github.io/datacube/v2.0.0/schema.json'],
    license="CC0-1.0"
)
```
That collection instance is used by xstac to generate additional metadata, such as the temporal extent and the datacube extension information.
```python
collection_template = collection.to_dict()
collection = xstac.xarray_to_stac(
    ds,
    collection_template,
    temporal_dimension="time",
    x_dimension="lon",
    y_dimension="lat",
    # TODO: get this from attributes if possible
    reference_system="4326",
    validate=False
)

# It should validate, yay!
collection.validate()
```
```
['https://schemas.stacspec.org/v1.0.0/collection-spec/json-schema/collection.json',
 'https://stac-extensions.github.io/datacube/v2.0.0/schema.json']
```
Final Step - Publish the collection
Finally, we will publish the collection using the VEDA STAC Ingestor API. If you are trying to publish to the VEDA STAC API but don’t have credentials for the STAC ingestor, this is a good time to ask for help and take a break. If you are not trying to publish to the VEDA STAC API but you are using pgSTAC, you should be able to write the collection to a JSON file and load it into your static catalog or pgSTAC database using pypgstac, as sketched below.
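If you are taking the pgSTAC route instead, a minimal sketch might look like the following (the collection.json filename and the database DSN are placeholder assumptions, and pypgstac must be installed separately):

```python
import json

# Write the collection metadata to a local JSON file (filename is a placeholder)
with open("collection.json", "w") as f:
    json.dump(collection.to_dict(), f, indent=2)

# Then load it into a pgSTAC database, for example with the pypgstac CLI
# (the DSN below is a placeholder):
#   pypgstac load collections collection.json --dsn postgresql://user:pass@host:5432/postgis
```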
```python
# The VEDA STAC ingestor requires a few more fields
dataset = collection.to_dict()
dataset['data_type'] = 'zarr'
dataset['collection'] = _id
dataset['title'] = 'CMIP6 Daily GISS-E2-1-G TAS Kerchunk (DEMO)'
dataset['dashboard:is_periodic'] = True
dataset['dashboard:time_density'] = 'day'
```
```python
# You may need to install cognito client
from cognito_client import CognitoClient

STAC_INGESTOR_API = "https://6r8ht9b123.execute-api.us-west-2.amazonaws.com/dev/"
client = CognitoClient(
    client_id="CHANGE ME",
    user_pool_id="CHANGE ME",
    identity_pool_id="CHANGE ME",
)
_ = client.login()
TOKEN = client.access_token
```
= f"Bearer {TOKEN}"
auth_header = {
headers "Authorization": auth_header,
"content-type": "application/json",
"accept": "application/json",
}= requests.post((STAC_INGESTOR_API + "api/ingest/collections"), json=dataset, headers=headers)
response
print(response.text)
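Optionally (an addition beyond the original notebook), fail loudly if the ingestor rejected the request rather than relying on reading the printed response body:

```python
# Raise an HTTPError on a non-2xx response so a failed ingest is not missed
response.raise_for_status()
print(f"Ingest succeeded with status {response.status_code}")
```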