diff --git a/.gitignore b/.gitignore index bd2cef2..5f6fdb5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,12 @@ __pycache__ .coverage htmlcov dist +build .direnv .mypy_cache .pytest_cache .ruff_cache *.egg-info/ .venv +*.tar +*.zarr diff --git a/tests/createZarrAndTar_Stores.ipynb b/tests/createZarrAndTar_Stores.ipynb new file mode 100644 index 0000000..081bf1c --- /dev/null +++ b/tests/createZarrAndTar_Stores.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 12, + "id": "d3a17686-0c0f-42a9-81f5-e09a115f0ed0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import tarfile\n", + "\n", + "import numpy as np\n", + "import xarray as xr\n", + "import zarr" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "86fbe34a-1870-4f0a-83df-baab853048a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.1.5.dev2734+g596fd7f91\n", + "2025.12.0\n", + "2.4.0\n" + ] + } + ], + "source": [ + "print(zarr.__version__)\n", + "print(xr.__version__)\n", + "print(np.__version__)" + ] + }, + { + "cell_type": "markdown", + "id": "de4e8067-95a7-47d5-98e1-17ffa9fe642f", + "metadata": {}, + "source": [ + "# Testing zarr stores with sample data" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "41b5467f-5c9c-4a91-90ff-b5d80334553b", + "metadata": {}, + "outputs": [], + "source": [ + "times = np.arange('2023-01-01', '2023-01-03', dtype='datetime64[D]')\n", + "lats = [34, 35, 36]\n", + "lons = [-118, -117, -116]\n", + "variableDict= {\n", + " 'temperature' : {\n", + " 'values' : np.random.rand(2, 3, 3) * 30 + 273.15, # (time, lat, lon)\n", + " 'units' : \"K\"\n", + " },\n", + " 'precipitation' : {\n", + " 'values' : np.random.rand(2, 3, 3), # (time, lat, lon)\n", + " 'units' : \"mm/day\"\n", + " }\n", + "}\n", + "zarrStorePrefix='testStore'\n", + "zarrStoreList = []" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ed4f5dbd-5bb1-4b91-a450-ef5999fe464b", + "metadata": {}, + "outputs": [], + "source": [ + "def createZarrStore( varName, index ):\n", + " # Create the Dataset\n", + " ds = xr.Dataset(\n", + " data_vars={\n", + " f\"{varName}\": (\n", + " (\"time\", \"lat\", \"lon\"),\n", + " variableDict[varName]['values'],\n", + " {\"units\": variableDict[varName]['units']}),\n", + " },\n", + " coords={\n", + " \"time\": times,\n", + " \"lat\": lats,\n", + " \"lon\": lons\n", + " },\n", + " attrs={\"description\": \"Sample weather data\"}\n", + " )\n", + " #Convert and save to zarr store with 'index' in the name.\n", + " zarrStoreName = f'{zarrStorePrefix}{index}Dev.zarr'\n", + " ds.to_zarr( f'{zarrStoreName}', mode='w', zarr_format=3, consolidated = True )\n", + " zarrStoreList.append(zarrStoreName)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3eac65ce-8880-4e40-a777-dca8bd3bdfb3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/work/bk1414/k204247/staczarrDev/tools/myRepo/zarr-python/src/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.\n", + " warnings.warn(\n", + "/work/bk1414/k204247/staczarrDev/tools/myRepo/zarr-python/src/zarr/api/asynchronous.py:247: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "count = 1\n", + "for variable in variableDict.keys():\n", + " createZarrStore( variable, count )\n", + " count += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8885aaa4-ed8a-4863-a9fc-3671ceef2426", + "metadata": {}, + "outputs": [], + "source": [ + "def scan_dir( dirName ):\n", + " with os.scandir( dirName ) as it:\n", + " for entry in it:\n", + " if entry.is_file():\n", + " list_of_vars.append( entry.path )\n", + " elif entry.is_dir():\n", + " scan_dir( entry.path )" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "13992e7b-c04d-4665-979f-536c383700b8", + "metadata": {}, + "outputs": [], + "source": [ + "def createTarStoreFromZarrStore(zarrStoreName):\n", + " zarrBaseName=zarrStoreName.split('.')[0]\n", + " tarFileName=f'{zarrBaseName}.tar'\n", + "\n", + " scan_dir( zarrStoreName )\n", + " print(list_of_vars)\n", + "\n", + " try:\n", + " with tarfile.open(tarFileName, \"w\") as tar:\n", + " for name in list_of_vars:\n", + " print(f\"Adding file {name} to {tarFileName}\\n\")\n", + " tar.add( name, arcname=name.replace( zarrStoreName + os.path.sep, '' ) )\n", + " tar.close()\n", + " except Exception as e:\n", + " print(f\"Exception occured: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "cbbb89a6-3e32-4139-bc6b-72efcc1851e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['testStore1Dev.zarr/time/zarr.json', 'testStore1Dev.zarr/time/c/0', 'testStore1Dev.zarr/temperature/zarr.json', 'testStore1Dev.zarr/temperature/c/0/0/0', 'testStore1Dev.zarr/zarr.json', 'testStore1Dev.zarr/lon/zarr.json', 'testStore1Dev.zarr/lon/c/0', 'testStore1Dev.zarr/lat/zarr.json', 'testStore1Dev.zarr/lat/c/0']\n", + "Adding file testStore1Dev.zarr/time/zarr.json to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/time/c/0 to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/temperature/zarr.json to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/temperature/c/0/0/0 to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/zarr.json to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/lon/zarr.json to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/lon/c/0 to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/lat/zarr.json to testStore1Dev.tar\n", + "\n", + "Adding file testStore1Dev.zarr/lat/c/0 to testStore1Dev.tar\n", + "\n", + "['testStore2Dev.zarr/precipitation/zarr.json', 'testStore2Dev.zarr/precipitation/c/0/0/0', 'testStore2Dev.zarr/time/zarr.json', 'testStore2Dev.zarr/time/c/0', 'testStore2Dev.zarr/zarr.json', 'testStore2Dev.zarr/lon/zarr.json', 'testStore2Dev.zarr/lon/c/0', 'testStore2Dev.zarr/lat/zarr.json', 'testStore2Dev.zarr/lat/c/0']\n", + "Adding file testStore2Dev.zarr/precipitation/zarr.json to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/precipitation/c/0/0/0 to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/time/zarr.json to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/time/c/0 to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/zarr.json to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/lon/zarr.json to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/lon/c/0 to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/lat/zarr.json to testStore2Dev.tar\n", + "\n", + "Adding file testStore2Dev.zarr/lat/c/0 to testStore2Dev.tar\n", + "\n" + ] + } + ], + "source": [ + "for zarrStore in zarrStoreList:\n", + " list_of_vars=[]\n", + " createTarStoreFromZarrStore( zarrStore )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e65d3a09-65f1-4cb5-a068-ceb3f8a2730c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pystacdevenv", + "language": "python", + "name": "pystacdevenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/multiTarStoreRead_From_xpystac.ipynb b/tests/multiTarStoreRead_From_xpystac.ipynb new file mode 100644 index 0000000..801ee2b --- /dev/null +++ b/tests/multiTarStoreRead_From_xpystac.ipynb @@ -0,0 +1,3190 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "09bce265-1424-453b-8e93-171399c02873", + "metadata": {}, + "outputs": [], + "source": [ + "import pystac\n", + "import xarray as xr\n", + "import zarr\n", + "\n", + "import xpystac" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5bfd1c29-a702-444c-a368-eb792306653c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.1.5.dev2734+g596fd7f91\n", + "2025.12.0\n", + "0.1.dev46+gcba5d10d8.d20251224\n", + "1.14.2\n" + ] + } + ], + "source": [ + "print(zarr.__version__)\n", + "print(xr.__version__)\n", + "print(xpystac.__version__)\n", + "print(pystac.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a33b1c55-9cc2-450a-98d9-2f3df239075b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['scipy', 'stac', 'store', 'zarr'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xr.backends.list_engines().keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df4cf650-0652-40ae-bc60-2e06914141e4", + "metadata": {}, + "outputs": [], + "source": [ + "tarStorePathList = [ 'testStore1Dev.tar', 'testStore2Dev.tar' ]" + ] + }, + { + "cell_type": "markdown", + "id": "eb2ba29b-af50-4d19-8c33-1ca6dcb8f539", + "metadata": {}, + "source": [ + "# Single Tar store read with zarr 'TarStore'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c5bfc475-4a0c-48b5-bdaf-70122597f94f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 208B\n",
+       "Dimensions:      (time: 2, lat: 3, lon: 3)\n",
+       "Coordinates:\n",
+       "  * time         (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n",
+       "  * lat          (lat) int64 24B 34 35 36\n",
+       "  * lon          (lon) int64 24B -118 -117 -116\n",
+       "Data variables:\n",
+       "    temperature  (time, lat, lon) float64 144B 280.7 294.3 281.7 ... 278.1 279.1\n",
+       "Attributes:\n",
+       "    description:  Sample weather data
" + ], + "text/plain": [ + " Size: 208B\n", + "Dimensions: (time: 2, lat: 3, lon: 3)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n", + " * lat (lat) int64 24B 34 35 36\n", + " * lon (lon) int64 24B -118 -117 -116\n", + "Data variables:\n", + " temperature (time, lat, lon) float64 144B 280.7 294.3 281.7 ... 278.1 279.1\n", + "Attributes:\n", + " description: Sample weather data" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with zarr.storage.TarStore( tarStorePathList[0], mode='r' ) as store:\n", + " ds = xr.open_zarr(store).compute()\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "c8d39d6e-395b-4e64-81d5-8fb37f7eb185", + "metadata": {}, + "source": [ + "# Multiple Tar stores with zarr 'TarStore'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bd3bbfaf-3507-4faf-a53b-86452e240364", + "metadata": {}, + "outputs": [], + "source": [ + "tarStoreList = [ zarr.storage.TarStore( storePath, mode='r')\n", + " for storePath in tarStorePathList ]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "728d635f-2489-45a1-8520-01cea3db9096", + "metadata": {}, + "outputs": [], + "source": [ + "dsFull = xr.open_mfdataset( tarStoreList, engine = 'zarr' )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "73169b0f-f6bd-4e57-894a-fad468f3606a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 352B\n",
+       "Dimensions:        (time: 2, lat: 3, lon: 3)\n",
+       "Coordinates:\n",
+       "  * time           (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n",
+       "  * lat            (lat) int64 24B 34 35 36\n",
+       "  * lon            (lon) int64 24B -118 -117 -116\n",
+       "Data variables:\n",
+       "    temperature    (time, lat, lon) float64 144B dask.array<chunksize=(2, 3, 3), meta=np.ndarray>\n",
+       "    precipitation  (time, lat, lon) float64 144B dask.array<chunksize=(2, 3, 3), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    description:  Sample weather data
" + ], + "text/plain": [ + " Size: 352B\n", + "Dimensions: (time: 2, lat: 3, lon: 3)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n", + " * lat (lat) int64 24B 34 35 36\n", + " * lon (lon) int64 24B -118 -117 -116\n", + "Data variables:\n", + " temperature (time, lat, lon) float64 144B dask.array\n", + " precipitation (time, lat, lon) float64 144B dask.array\n", + "Attributes:\n", + " description: Sample weather data" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dsFull" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "62f97379-09d6-47e9-ad48-42ced6b45d3d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 352B\n",
+       "Dimensions:        (time: 2, lat: 3, lon: 3)\n",
+       "Coordinates:\n",
+       "  * time           (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n",
+       "  * lat            (lat) int64 24B 34 35 36\n",
+       "  * lon            (lon) int64 24B -118 -117 -116\n",
+       "Data variables:\n",
+       "    temperature    (time, lat, lon) float64 144B 280.7 294.3 ... 278.1 279.1\n",
+       "    precipitation  (time, lat, lon) float64 144B 0.7805 0.3212 ... 0.4721\n",
+       "Attributes:\n",
+       "    description:  Sample weather data
" + ], + "text/plain": [ + " Size: 352B\n", + "Dimensions: (time: 2, lat: 3, lon: 3)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n", + " * lat (lat) int64 24B 34 35 36\n", + " * lon (lon) int64 24B -118 -117 -116\n", + "Data variables:\n", + " temperature (time, lat, lon) float64 144B 280.7 294.3 ... 278.1 279.1\n", + " precipitation (time, lat, lon) float64 144B 0.7805 0.3212 ... 0.4721\n", + "Attributes:\n", + " description: Sample weather data" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dsFull.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "9eb41307-5e0f-45fb-ad24-51afd812d923", + "metadata": {}, + "source": [ + "# Accessing assets from STAC catalog and creating 'archiveextension' based assets" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2ff4b56c-e1ee-4926-802a-ddfdbb57c6e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n" + ] + } + ], + "source": [ + "ngc4008_collection=pystac.Collection.from_file(\"https://wwestac.cloud.dkrz.de/stac-fastapi-es/collections/ngc4008\")\n", + "ngc4008_collection.to_dict()\n", + "items=pystac.ItemCollection.from_file(\"https://wwestac.cloud.dkrz.de/stac-fastapi-es/collections/ngc4008/items\")\n", + "print(len(items))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8ecd5a31-fae1-447e-be4d-282a6c96db28", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'disk': }" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item = items[1]\n", + "item.id\n", + "item.to_dict()\n", + "item.assets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d7ed2036-2dca-4b63-b1b6-aaaba13084ca", + "metadata": {}, + "outputs": [], + "source": [ + "item.assets['tape1'] = {'href': tarStorePathList[0],\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': tarStorePathList[0],\n", + " 'archive:type': 'application/vnd+zarr'}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4e25b9aa-7316-4642-a255-e52b3e0c358f", + "metadata": {}, + "outputs": [], + "source": [ + "item.assets['tape2'] = {'href': tarStorePathList[1],\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': tarStorePathList[1],\n", + " 'archive:type': 'application/vnd+zarr'}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e71a502-268b-42af-81a3-094d284bd508", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'disk': ,\n", + " 'tape1': {'href': 'testStore1Dev.tar',\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': 'testStore1Dev.tar',\n", + " 'archive:type': 'application/vnd+zarr'},\n", + " 'tape2': {'href': 'testStore2Dev.tar',\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': 'testStore2Dev.tar',\n", + " 'archive:type': 'application/vnd+zarr'}}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item.assets" + ] + }, + { + "cell_type": "markdown", + "id": "85643d58-3068-45a3-b249-33ac9a3a8530", + "metadata": {}, + "source": [ + "# Accessing single STAC asset with 'archiveextension' using xpystac and reading with zarr 'TarStore'" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "46abcfb8-2eee-4fe7-9879-5c178938bf14", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'href': 'testStore1Dev.tar',\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': 'testStore1Dev.tar',\n", + " 'archive:type': 'application/vnd+zarr'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tape_asset=item.assets['tape1']\n", + "tape_asset" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a47a94cc-a279-4600-b317-913cee48b592", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Asset as input!\n", + "Opening tarstore : testStore1Dev.tar\n" + ] + } + ], + "source": [ + "ds_tar=xr.open_dataset(pystac.asset.Asset.from_dict(tape_asset),engine=\"stac\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "2ff022dd-c66d-4021-92f1-c4414abb58a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 208B\n",
+       "Dimensions:      (time: 2, lat: 3, lon: 3)\n",
+       "Coordinates:\n",
+       "  * time         (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n",
+       "  * lat          (lat) int64 24B 34 35 36\n",
+       "  * lon          (lon) int64 24B -118 -117 -116\n",
+       "Data variables:\n",
+       "    temperature  (time, lat, lon) float64 144B ...\n",
+       "Attributes:\n",
+       "    description:  Sample weather data
" + ], + "text/plain": [ + " Size: 208B\n", + "Dimensions: (time: 2, lat: 3, lon: 3)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n", + " * lat (lat) int64 24B 34 35 36\n", + " * lon (lon) int64 24B -118 -117 -116\n", + "Data variables:\n", + " temperature (time, lat, lon) float64 144B ...\n", + "Attributes:\n", + " description: Sample weather data" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_tar" + ] + }, + { + "cell_type": "markdown", + "id": "b81f51a7-c4ac-48ce-bac7-ddbe62db943f", + "metadata": {}, + "source": [ + "# Accessing multiple STAC assets with 'archiveextension' using xpystac and reading with zarr 'TarStore'" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "62b03e74-729c-43c9-9376-b1e0e18b6f6f", + "metadata": {}, + "outputs": [], + "source": [ + "tape_assets=[ item.assets['tape1'], item.assets['tape2'] ]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bfad1ff9-42a0-4306-9159-35cf7b910ca6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'href': 'testStore1Dev.tar',\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': 'testStore1Dev.tar',\n", + " 'archive:type': 'application/vnd+zarr'},\n", + " {'href': 'testStore2Dev.tar',\n", + " 'type': 'application/x-tar',\n", + " 'archive:format': 'application/x-tar',\n", + " 'archive:href': 'testStore2Dev.tar',\n", + " 'archive:type': 'application/vnd+zarr'}]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tape_assets" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "25cbea66-54b7-4963-96e5-0a1ba75ac539", + "metadata": {}, + "outputs": [], + "source": [ + "tape_assetList = [pystac.asset.Asset.from_dict(tape_asset)\n", + " for tape_asset in tape_assets]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "4641f38a-09c0-4563-a461-ec8734f30a13", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "List of Assets as input!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 2/2 [00:00<00:00, 20763.88it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Opening tarstore : testStore1Dev.tar\n", + "Opening tarstore : testStore2Dev.tar\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "ds_tarList=xr.open_dataset( tape_assetList, engine=\"stac\" )" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b31b5ffd-f10a-4f4a-b9ed-398bd0117c5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 352B\n",
+       "Dimensions:        (time: 2, lat: 3, lon: 3)\n",
+       "Coordinates:\n",
+       "  * time           (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n",
+       "  * lat            (lat) int64 24B 34 35 36\n",
+       "  * lon            (lon) int64 24B -118 -117 -116\n",
+       "Data variables:\n",
+       "    temperature    (time, lat, lon) float64 144B ...\n",
+       "    precipitation  (time, lat, lon) float64 144B ...\n",
+       "Attributes:\n",
+       "    description:  Sample weather data
" + ], + "text/plain": [ + " Size: 352B\n", + "Dimensions: (time: 2, lat: 3, lon: 3)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 16B 2023-01-01 2023-01-02\n", + " * lat (lat) int64 24B 34 35 36\n", + " * lon (lon) int64 24B -118 -117 -116\n", + "Data variables:\n", + " temperature (time, lat, lon) float64 144B ...\n", + " precipitation (time, lat, lon) float64 144B ...\n", + "Attributes:\n", + " description: Sample weather data" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_tarList" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8065412e-9483-483f-b07a-b30a39dd03d1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pystacdevenv", + "language": "python", + "name": "pystacdevenv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/test_archiveExtension.py b/tests/test_archiveExtension.py new file mode 100644 index 0000000..145b3c8 --- /dev/null +++ b/tests/test_archiveExtension.py @@ -0,0 +1,163 @@ +# Standard modules +import os +import sys + +#Non-standard modules + +try: + import xarray as xr +except Exception: + print( sys.exc_info() ) + print( f"Module 'xarray' import error in {__file__}" ) + +try: + import zarr +except Exception: + print( sys.exc_info() ) + print( f"Module 'zarr' import error in {__file__}" ) + +try: + pass +except Exception: + print( sys.exc_info() ) + print( f"Module 'dask' import error in {__file__}" ) + + +try: + import xpystac +except Exception: + print( sys.exc_info() ) + print( f"Module 'xpystac' import error in {__file__}" ) + +try: + import pystac +except Exception: + print( sys.exc_info() ) + print( f"Module 'pystac' import error in {__file__}" ) + + +try: + from pystac.extensions.archive import ArchiveExtension +except Exception: + print( sys.exc_info() ) + print( f"Module 'pystac.extensions.archive.ArchiveExtension' \ + import error in {__file__}" ) + + +try: + pass +except Exception: + print( sys.exc_info() ) + print( f"Module 'utils_archive_extension' import error in {__file__}" ) + +try: + from utils_archive_extension import ( + createTarStoreFromZarrStore, + createZarrStoreWithSingleVariable, + variableDict, + zarrStoreList, + ) + +except Exception: + print( sys.exc_info() ) + print( f"Module 'utils_archive_extension.(variableDict, zarrStoreList, \ + createZarrStoreWithSingleVariable, createTarStoreFromZarrStore' \ + import error in {__file__}" ) + +print(zarr.__version__) +print(xr.__version__) +print(pystac.__version__) +print(xpystac.__version__) + +engineList = xr.backends.list_engines().keys() +assert 'stac' in engineList + +# Create Zarr store +count = 1 +for variable in variableDict.keys(): + createZarrStoreWithSingleVariable( variable, count ) + count += 1 + +# Convert zarr store to Tarstore +for zarrStore in zarrStoreList: + createTarStoreFromZarrStore( zarrStore ) + + +tarStorePathList = [ './data/testStore1Dev.tar', './data/testStore2Dev.tar' ] + +for testStore in tarStorePathList: + assert os.path.exists(testStore) + +# # Single Tar store read with zarr 'TarStore' + + +with zarr.storage.TarStore( tarStorePathList[0], mode='r' ) as store: + ds = xr.open_zarr(store).compute() +assert ds + + +# # Multiple Tar stores with zarr 'TarStore' + +tarStoreList = [ zarr.storage.TarStore( storePath, mode='r') + for storePath in tarStorePathList ] +dsFull = xr.open_mfdataset( tarStoreList, engine = 'zarr' ) +assert dsFull +#dsFull.compute() + + +# # Accessing assets from STAC catalog and creating 'archiveextension' based assets + +ngc4008_collection=pystac.Collection.from_file("https://wwestac.cloud.dkrz.de/stac-fastapi-es/collections/ngc4008") +ngc4008_collection.to_dict() +items=pystac.ItemCollection.from_file("https://wwestac.cloud.dkrz.de/stac-fastapi-es/collections/ngc4008/items") +item_count = len(items) + +assert item_count > 0 + +item = items[1] +item.id +item.to_dict() +assert item.assets is not None + +item.assets['tape1'] = {'href': tarStorePathList[0], + 'type': 'application/x-tar', + 'archive:format': 'application/x-tar', + 'archive:href': tarStorePathList[0], + 'archive:type': 'application/vnd+zarr'} + + +item.assets['tape2'] = {'href': tarStorePathList[1], + 'type': 'application/x-tar', + 'archive:format': 'application/x-tar', + 'archive:href': tarStorePathList[1], + 'archive:type': 'application/vnd+zarr'} + +item.assets + +ArchiveExtension.add_to(item) + +assert ArchiveExtension.get_schema_uri() in item.stac_extensions + + +# # Accessing single STAC asset with 'archiveextension' using xpystac and +# reading with zarr 'TarStore' +tape_asset=item.assets['tape1'] +tape_asset +ds_tar=xr.open_dataset(pystac.asset.Asset.from_dict(tape_asset),engine="stac") +assert ds_tar + +# # Accessing multiple STAC assets with 'archiveextension' using xpystac and +# reading with zarr 'TarStore' + +tape_assets=[ item.assets['tape1'], item.assets['tape2'] ] +tape_assets + +tape_assetList = [pystac.asset.Asset.from_dict(tape_asset) + for tape_asset in tape_assets] +ds_tarList=xr.open_dataset( tape_assetList, engine="stac" ) +assert ds_tarList + +assert len ( ds_tarList.variables.keys() ) > 0 + +for var in variableDict.keys(): + assert var in ds_tarList.variables.keys() diff --git a/tests/utils_archive_extension.py b/tests/utils_archive_extension.py new file mode 100644 index 0000000..6362e38 --- /dev/null +++ b/tests/utils_archive_extension.py @@ -0,0 +1,88 @@ +import os +import tarfile + +import numpy as np +import xarray as xr +import zarr + +print(zarr.__version__) +print(xr.__version__) +print(np.__version__) + +# # Testing zarr stores with sample data + +times = np.arange('2023-01-01', '2023-01-03', dtype='datetime64[D]') +lats = [34, 35, 36] +lons = [-118, -117, -116] +variableDict= { + 'temperature' : { + 'values' : np.random.rand(2, 3, 3) * 30 + 273.15, # (time, lat, lon) + 'units' : "K" + }, + 'precipitation' : { + 'values' : np.random.rand(2, 3, 3), # (time, lat, lon) + 'units' : "mm/day" + } +} +zarrStorePrefix='testStore' +zarrStoreList = [] +tarStoreList = [] +list_of_vars=[] + +def createZarrStoreWithSingleVariable( varName, index ): + # Create the Dataset + ds = xr.Dataset( + data_vars={ + f"{varName}": ( + ("time", "lat", "lon"), + variableDict[varName]['values'], + {"units": variableDict[varName]['units']}), + }, + coords={ + "time": times, + "lat": lats, + "lon": lons + }, + attrs={"description": "Sample weather data"} + ) + #Convert and save to zarr store with 'index' in the name. + zarrStoreName = f'./data/{zarrStorePrefix}{index}Dev.zarr' + ds.to_zarr( f'{zarrStoreName}', mode='w', zarr_format=3 ) + zarrStoreList.append(zarrStoreName) + + +def scan_dir( dirName ): + with os.scandir( dirName ) as it: + for entry in it: + if entry.is_file(): + list_of_vars.append( entry.path ) + elif entry.is_dir(): + scan_dir( entry.path ) + + +def createTarStoreFromZarrStore(zarrStoreName): + zarrBaseName=os.path.basename(zarrStoreName).split('.')[0] + tarFileName=f'./data/{zarrBaseName}.tar' + list_of_vars.clear() + + scan_dir( zarrStoreName ) + + try: + with tarfile.open(tarFileName, "w") as tar: + for name in list_of_vars: + #print(f"Adding file {name} to {tarFileName}\n") + tar.add( name, arcname=name.replace( zarrStoreName + os.path.sep, '' ) ) + tar.close() + tarStoreList.append(tarFileName) + except Exception as e: + print(f"Exception occured: {e}") + +# Create Zarr store +count = 1 +for variable in variableDict.keys(): + createZarrStoreWithSingleVariable( variable, count ) + count += 1 + +# Convert zarr store to Tarstore +for zarrStore in zarrStoreList: + createTarStoreFromZarrStore( zarrStore ) diff --git a/xpystac/core.py b/xpystac/core.py index 6dc2f49..59c05c0 100644 --- a/xpystac/core.py +++ b/xpystac/core.py @@ -78,7 +78,7 @@ def _( return xarray.open_dataset(mapper, **{**default_kwargs, **kwargs}) -@to_xarray.register +@to_xarray.register( pystac.Asset ) def _( obj: pystac.Asset, patch_url: None | Callable[[str], str] = None, @@ -87,47 +87,136 @@ def _( ) -> xarray.Dataset: open_kwargs = obj.extra_fields.get("xarray:open_kwargs", {}) - storage_options = obj.extra_fields.get("xarray:storage_options", None) - if storage_options: - open_kwargs["storage_options"] = storage_options + # MKM 18 Oct 2024 + #TODO : Check if the obj is list instance or pystac.Asset instance and + # accordingly if just one asset pass it through xarray, + # else, collect the assets and pass to xarray as open_mfdataset. + # In case of tar balls, extract each of them and the store these paths + # send the list of these zarr stores to xarray.open_mfdataset() + # It should work. + + default_kwargs = {} + + # Check the type of the 'obj' + if isinstance(obj, pystac.Asset): + print("Asset as input!",flush=True) + #open_kwargs = obj.extra_fields.get("xarray:open_kwargs", {}) + + storage_options = obj.extra_fields.get("xarray:storage_options", None) + if storage_options: + open_kwargs["storage_options"] = storage_options + + if ( + allow_kerchunk + and obj.media_type == pystac.MediaType.JSON + and {"index", "references"}.intersection(set(obj.roles) + if obj.roles else set()) + ): + requests = _import_optional_dependency("requests") + r = requests.get(obj.href) + r.raise_for_status() + + refs = r.json() + if patch_url is not None: + refs = patch_url(refs) + + default_kwargs = { + "engine": "kerchunk", + } + return xarray.open_dataset(refs, **{**default_kwargs, + **open_kwargs, **kwargs}) + + + if obj.media_type == pystac.MediaType.COG: + _import_optional_dependency("rioxarray") + default_kwargs = {**default_kwargs, "engine": "rasterio"} + elif obj.media_type in ["application/vnd+zarr", "application/vnd.zarr"]: + _import_optional_dependency("zarr") + zarr_kwargs = {} + if "zarr:consolidated" in obj.extra_fields: + zarr_kwargs["consolidated"] = obj.extra_fields["zarr:consolidated"] + if "zarr:zarr_format" in obj.extra_fields: + zarr_kwargs["zarr_format"] = obj.extra_fields["zarr:zarr_format"] + default_kwargs = {**zarr_kwargs, "engine": "zarr"} + elif obj.media_type == "application/vnd.zarr+icechunk": + from xpystac._icechunk import read_icechunk + + return read_icechunk(obj) + + # Handling the 'archive' extension, + # as of now only plain '*.tar' files are handled. + elif obj.media_type == "application/x-tar": + zarr = _import_optional_dependency("zarr") + if 'TarStore' not in zarr.storage.__all__: + raise ImportError("zarr.storage.TarStore not found! " \ + "Please update 'zarr' to the latest version.") + else: + print(f"Opening tarstore : {obj.href}") + # MKM With new tarstore implementation in zarr-python + with zarr.storage.TarStore(obj.href, mode = 'r') as tar_store: + return xarray.open_zarr(tar_store, **kwargs) - if ( - allow_kerchunk - and obj.media_type == pystac.MediaType.JSON - and {"index", "references"}.intersection(set(obj.roles) if obj.roles else set()) - ): - requests = _import_optional_dependency("requests") - r = requests.get(obj.href) - r.raise_for_status() - refs = r.json() + href = obj.href if patch_url is not None: - refs = patch_url(refs) - - default_kwargs = { - "engine": "kerchunk", - } - return xarray.open_dataset(refs, **{**default_kwargs, **open_kwargs, **kwargs}) - - if obj.media_type == pystac.MediaType.COG: - _import_optional_dependency("rioxarray") - default_kwargs = {"engine": "rasterio"} - elif obj.media_type in ["application/vnd+zarr", "application/vnd.zarr"]: - _import_optional_dependency("zarr") - zarr_kwargs = {} - if "zarr:consolidated" in obj.extra_fields: - zarr_kwargs["consolidated"] = obj.extra_fields["zarr:consolidated"] - if "zarr:zarr_format" in obj.extra_fields: - zarr_kwargs["zarr_format"] = obj.extra_fields["zarr:zarr_format"] - default_kwargs = {**zarr_kwargs, "engine": "zarr"} - elif obj.media_type == "application/vnd.zarr+icechunk": - from xpystac._icechunk import read_icechunk - - return read_icechunk(obj) - - href = obj.href - if patch_url is not None: - href = patch_url(href) - - ds = xarray.open_dataset(href, **{**default_kwargs, **open_kwargs, **kwargs}) - return ds + href = patch_url(href) + ds = xarray.open_dataset(href, **{**default_kwargs, + **open_kwargs, **kwargs}) + return ds + + +@to_xarray.register( list ) +def _( + obj: list[pystac.Asset], + patch_url: None | Callable[[str], str] = None, + allow_kerchunk: bool = True, + **kwargs, +) -> xarray.Dataset: + + if not isinstance( obj, list ): + raise TypeError('Input is not a list of assets!') + + if not isinstance( obj[0], pystac.Asset ): + raise TypeError('Input is not a list of assets!') + + open_kwargs = obj[0].extra_fields.get("xarray:open_kwargs", {}) + + + + print("List of Assets as input!",flush=True) + # Creates a list of assets from the list of items. + # Concates all the zarr stores from each tar ball and + # creates the xarray Dataset,with engine as 'zarr'. + # Returns the xarray Dataset created above, + # ( for this particular use case ) + + open_kwargs = obj[0].extra_fields.get("xarray:open_kwargs", {}) + + storage_options = obj[0].extra_fields.get("xarray:storage_options", None) + if storage_options: + open_kwargs["storage_options"] = storage_options + + ref_media_type = obj[0].media_type + zarr_store_list = [] + tqdm = _import_optional_dependency("tqdm") + for i in tqdm.tqdm(obj): + # Check the type of the assets -- for homogenity ( all are tar balls ) + if i.media_type != ref_media_type: + print(f"Encountered {i.to_dict()} which differs with {ref_media_type}!") + # Empty Dataset + return xarray.Dataset(data_vars=None, coords=None, attrs=None) + + if ref_media_type == "application/x-tar": + print(f"Opening tarstore : {i.href}") + # To be opened with new tarstore implementation in zarr-python + zarr_store_list.append(i.href) + + zarr = _import_optional_dependency("zarr") + if 'TarStore' not in zarr.storage.__all__: + raise ImportError("zarr.storage.TarStore not found! " \ + "Please update 'zarr' to the latest version.") + else: + # TODO: To fix the concat_dims etc. for hierarchical datasets. + tarStoreList = [ zarr.storage.TarStore( storePath, mode='r') + for storePath in zarr_store_list ] + return xarray.open_mfdataset( tarStoreList, engine = 'zarr' )