{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# From xarray to pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Python packages" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "xr.set_options(display_style='html')\n", "import intake\n", "import cftime\n", "import matplotlib.pyplot as plt\n", "import cartopy.crs as ccrs\n", "import pandas as pd\n", "import dask\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Open the CMIP6 online catalog" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "

pangeo-cmip6 catalog with 7632 dataset(s) from 517667 asset(s):

\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
unique
activity_id18
institution_id36
source_id88
experiment_id170
member_id657
table_id37
variable_id709
grid_label10
zstore517667
dcpp_init_year60
version715
\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "cat_url = \"https://storage.googleapis.com/cmip6/pangeo-cmip6.json\"\n", "col = intake.open_esm_datastore(cat_url)\n", "col" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Search corresponding data " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
activity_idinstitution_idsource_idexperiment_idmember_idtable_idvariable_idgrid_labelzstoredcpp_init_yearversion
0CMIPNCARCESM2-WACCMhistoricalr1i1p1f1AERmonso2gngs://cmip6/CMIP6/CMIP/NCAR/CESM2-WACCM/histori...NaN20190227
\n", "
" ], "text/plain": [ " activity_id institution_id source_id experiment_id member_id table_id \\\n", "0 CMIP NCAR CESM2-WACCM historical r1i1p1f1 AERmon \n", "\n", " variable_id grid_label zstore \\\n", "0 so2 gn gs://cmip6/CMIP6/CMIP/NCAR/CESM2-WACCM/histori... \n", "\n", " dcpp_init_year version \n", "0 NaN 20190227 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat = col.search(source_id=['CESM2-WACCM'], experiment_id=['historical'], table_id=['AERmon'], variable_id=['so2'], member_id=['r1i1p1f1'])\n", "cat.df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create dictionary from the list of datasets we found\n", "- This step may take several minutes so be patient!" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "--> The keys in the returned dictionary of datasets are constructed as follows:\n", "\t'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " 100.00% [1/1 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dset_dict = cat.to_dataset_dict(zarr_kwargs={'use_cftime':True})" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['CMIP.NCAR.CESM2-WACCM.historical.AERmon.gn']\n" ] } ], "source": [ "lconf = list(dset_dict.keys())\n", "print(lconf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Open dataset\n", "\n", "- Use the `xarray` Python package to analyze the netCDF dataset.\n", "- `open_dataset` lets you read all the metadata without loading the data into memory.\n", "- With `xarray`, we only load into memory what is needed." ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "dset = dset_dict[lconf[0]]\n", "dset = dset.squeeze()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get metadata corresponding to the whole dataset" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.Dataset>\n",
       "Dimensions:    (lat: 192, lev: 70, lon: 288, nbnd: 2, time: 1980)\n",
       "Coordinates:\n",
       "  * lat        (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n",
       "    lat_bnds   (lat, nbnd) float32 dask.array<chunksize=(192, 2), meta=np.ndarray>\n",
       "  * lev        (lev) float64 -5.96e-06 -9.827e-06 -1.62e-05 ... -976.3 -992.6\n",
       "    lev_bnds   (lev, nbnd) float32 dask.array<chunksize=(70, 2), meta=np.ndarray>\n",
       "  * lon        (lon) float64 0.0 1.25 2.5 3.75 5.0 ... 355.0 356.2 357.5 358.8\n",
       "    lon_bnds   (lon, nbnd) float32 dask.array<chunksize=(288, 2), meta=np.ndarray>\n",
       "  * time       (time) object 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n",
       "    time_bnds  (time, nbnd) object dask.array<chunksize=(1980, 2), meta=np.ndarray>\n",
       "    member_id  <U8 'r1i1p1f1'\n",
       "Dimensions without coordinates: nbnd\n",
       "Data variables:\n",
       "    so2        (time, lev, lat, lon) float32 dask.array<chunksize=(5, 70, 192, 288), meta=np.ndarray>\n",
       "Attributes: (12/48)\n",
       "    Conventions:             CF-1.7 CMIP-6.2\n",
       "    activity_id:             CMIP\n",
       "    branch_method:           standard\n",
       "    branch_time_in_child:    674885.0\n",
       "    branch_time_in_parent:   20075.0\n",
       "    case_id:                 4\n",
       "    ...                      ...\n",
       "    variable_id:             so2\n",
       "    variant_info:            CMIP6 CESM2 hindcast (1850-2014) with high-top a...\n",
       "    variant_label:           r1i1p1f1\n",
       "    status:                  2019-11-05;created;by nhn2@columbia.edu\n",
       "    intake_esm_varname:      ['so2']\n",
       "    intake_esm_dataset_key:  CMIP.NCAR.CESM2-WACCM.historical.AERmon.gn
" ], "text/plain": [ "\n", "Dimensions: (lat: 192, lev: 70, lon: 288, nbnd: 2, time: 1980)\n", "Coordinates:\n", " * lat (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n", " lat_bnds (lat, nbnd) float32 dask.array\n", " * lev (lev) float64 -5.96e-06 -9.827e-06 -1.62e-05 ... -976.3 -992.6\n", " lev_bnds (lev, nbnd) float32 dask.array\n", " * lon (lon) float64 0.0 1.25 2.5 3.75 5.0 ... 355.0 356.2 357.5 358.8\n", " lon_bnds (lon, nbnd) float32 dask.array\n", " * time (time) object 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n", " time_bnds (time, nbnd) object dask.array\n", " member_id \n", "Attributes: (12/48)\n", " Conventions: CF-1.7 CMIP-6.2\n", " activity_id: CMIP\n", " branch_method: standard\n", " branch_time_in_child: 674885.0\n", " branch_time_in_parent: 20075.0\n", " case_id: 4\n", " ... ...\n", " variable_id: so2\n", " variant_info: CMIP6 CESM2 hindcast (1850-2014) with high-top a...\n", " variant_label: r1i1p1f1\n", " status: 2019-11-05;created;by nhn2@columbia.edu\n", " intake_esm_varname: ['so2']\n", " intake_esm_dataset_key: CMIP.NCAR.CESM2-WACCM.historical.AERmon.gn" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get metadata corresponding to SO2" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "dask.array\n", "Coordinates:\n", " * lat (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n", " * lev (lev) float64 -5.96e-06 -9.827e-06 -1.62e-05 ... -976.3 -992.6\n", " * lon (lon) float64 0.0 1.25 2.5 3.75 5.0 ... 355.0 356.2 357.5 358.8\n", " * time (time) object 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n", " member_id \n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'so2' (time: 1980, lat: 192)>\n",
       "array([[4.4148924e-11, 4.3374932e-11, 4.1744469e-11, ..., 2.0261100e-12,\n",
       "        1.8938171e-12, 1.8695366e-12],\n",
       "       [3.1989539e-11, 3.1808257e-11, 3.1800659e-11, ..., 1.2741638e-12,\n",
       "        1.2158472e-12, 1.1790553e-12],\n",
       "       [2.9416582e-12, 2.6269848e-12, 2.5257598e-12, ..., 6.3785300e-13,\n",
       "        6.2758875e-13, 6.3872253e-13],\n",
       "       ...,\n",
       "       [8.3957434e-13, 8.9930792e-13, 8.8156679e-13, ..., 4.1710988e-12,\n",
       "        4.0890802e-12, 5.4941021e-12],\n",
       "       [7.7786545e-12, 7.8954681e-12, 8.0431321e-12, ..., 2.4857640e-11,\n",
       "        2.7402766e-11, 3.3165942e-11],\n",
       "       [1.2275407e-11, 1.2175271e-11, 1.2504974e-11, ..., 4.6298299e-10,\n",
       "        4.6481125e-10, 4.7047843e-10]], dtype=float32)\n",
       "Coordinates:\n",
       "  * lat        (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n",
       "    lev        float64 -992.6\n",
       "  * time       (time) object 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n",
       "    member_id  <U8 'r1i1p1f1'
" ], "text/plain": [ "\n", "array([[4.4148924e-11, 4.3374932e-11, 4.1744469e-11, ..., 2.0261100e-12,\n", " 1.8938171e-12, 1.8695366e-12],\n", " [3.1989539e-11, 3.1808257e-11, 3.1800659e-11, ..., 1.2741638e-12,\n", " 1.2158472e-12, 1.1790553e-12],\n", " [2.9416582e-12, 2.6269848e-12, 2.5257598e-12, ..., 6.3785300e-13,\n", " 6.2758875e-13, 6.3872253e-13],\n", " ...,\n", " [8.3957434e-13, 8.9930792e-13, 8.8156679e-13, ..., 4.1710988e-12,\n", " 4.0890802e-12, 5.4941021e-12],\n", " [7.7786545e-12, 7.8954681e-12, 8.0431321e-12, ..., 2.4857640e-11,\n", " 2.7402766e-11, 3.3165942e-11],\n", " [1.2275407e-11, 1.2175271e-11, 1.2504974e-11, ..., 4.6298299e-10,\n", " 4.6481125e-10, 4.7047843e-10]], dtype=float32)\n", "Coordinates:\n", " * lat (lat) float64 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n", " lev float64 -992.6\n", " * time (time) object 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n", " member_id ]" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dset_selection.sel(time=cftime.DatetimeNoLeap(2003, 10, 15), method=\"nearest\").plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert to pandas dataframe" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 56.4 ms, sys: 2.27 ms, total: 58.7 ms\n", "Wall time: 85.7 ms\n" ] } ], "source": [ "%%time\n", "pdf = dset_selection.to_dataframe()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
levmember_idso2
timelat
1850-01-15 12:00:00-90.000000-992.556095r1i1p1f14.414892e-11
-89.057592-992.556095r1i1p1f14.337493e-11
-88.115183-992.556095r1i1p1f14.174447e-11
-87.172775-992.556095r1i1p1f14.043559e-11
-86.230366-992.556095r1i1p1f14.044334e-11
\n", "
" ], "text/plain": [ " lev member_id so2\n", "time lat \n", "1850-01-15 12:00:00 -90.000000 -992.556095 r1i1p1f1 4.414892e-11\n", " -89.057592 -992.556095 r1i1p1f1 4.337493e-11\n", " -88.115183 -992.556095 r1i1p1f1 4.174447e-11\n", " -87.172775 -992.556095 r1i1p1f1 4.043559e-11\n", " -86.230366 -992.556095 r1i1p1f1 4.044334e-11" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop a column" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "pdf.drop('member_id', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
levso2
timelat
1850-01-15 12:00:00-90.000000-992.5560954.414892e-11
-89.057592-992.5560954.337493e-11
-88.115183-992.5560954.174447e-11
-87.172775-992.5560954.043559e-11
-86.230366-992.5560954.044334e-11
\n", "
" ], "text/plain": [ " lev so2\n", "time lat \n", "1850-01-15 12:00:00 -90.000000 -992.556095 4.414892e-11\n", " -89.057592 -992.556095 4.337493e-11\n", " -88.115183 -992.556095 4.174447e-11\n", " -87.172775 -992.556095 4.043559e-11\n", " -86.230366 -992.556095 4.044334e-11" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdf.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save to local file" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "pdf.to_csv(\"CMIP_NCAR_CESM2-WACCM_historical_AERmon_zonal_mean.csv\", sep='\\t')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save your results to Remote private object storage\n", "- your credentials are in `$HOME/.aws/credentials` \n", "- check with your instructor to get the secret access key (replace XXX by the right key)\n", "\n", "```\n", "[default]\n", "aws_access_key_id=forces2021-work\n", "aws_secret_access_key=XXXXXXXXXXXX\n", "aws_endpoint_url=https://forces2021.uiogeo-apps.sigma2.no/\n", "```\n", "
\n", " It is important to save your results somewhere they will persist for longer than a few days or weeks!\n", "
" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "import s3fs" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "fsg = s3fs.S3FileSystem(anon=False,\n", " client_kwargs={\n", " 'endpoint_url': 'https://forces2021.uiogeo-apps.sigma2.no/'\n", " })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Upload local file to remote storage" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "s3://work/annefou/CMIP_NCAR_CESM2-WACCM_historical_AERmon_zonal_mean.csv\n" ] } ], "source": [ "s3_path = \"s3://work/annefou/CMIP_NCAR_CESM2-WACCM_historical_AERmon_zonal_mean.csv\"\n", "print(s3_path)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "fsg.put('CMIP_NCAR_CESM2-WACCM_historical_AERmon_zonal_mean.csv', s3_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 4 }