Skip to content

Commit 749520b

Browse files
Authored by Philippe Miron (philippemiron), with co-authors Shane Elipot (selipot) and Kevin Santana (kevinsantana11)
Switch to row instead of trajectory (#376)
* Utilize `rows` for naming the legacy `traj` dimension, which is mostly relevant in oceanographic datasets, while `rows` is more generalized.
* Remove the coord-dim map and map coords to dim aliases.
* Map dim aliases to library-required dims.

Co-authored-by: Philippe Miron <philippe.miron@dtn.com>
Co-authored-by: Shane Elipot <selipot@miami.edu>
Co-authored-by: Kevin Santana <kevinsantana11@gmail.com>
1 parent a370d68 commit 749520b

File tree

18 files changed

+348
-215
lines changed

18 files changed

+348
-215
lines changed

clouddrift/adapters/andro.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
2-
This module defines functions used to adapt the ANDRO: An Argo-based
3-
deep displacement dataset as a ragged-arrays dataset.
2+
This module defines functions used to adapt the ANDRO: An Argo-based
3+
deep displacement dataset as a ragged-arrays dataset.
44
55
The dataset is hosted at https://www.seanoe.org/data/00360/47077/ and the user manual
66
is available at https://archimer.ifremer.fr/doc/00360/47126/.
@@ -12,8 +12,8 @@
1212
1313
Reference
1414
---------
15-
Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
16-
Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
15+
Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
16+
Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
1717
SEANOE. https://doi.org/10.17882/47077
1818
"""
1919

clouddrift/adapters/gdp.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@
1414
from clouddrift.adapters.utils import download_with_progress
1515
from clouddrift.raggedarray import DimNames
1616

17-
GDP_COORDS: list[tuple[str, DimNames]] = [
18-
("id", "traj"),
19-
("time", "obs"),
17+
GDP_DIMS: dict[str, DimNames] = {"traj": "rows", "obs": "obs"}
18+
19+
GDP_COORDS = [
20+
"id",
21+
"time",
2022
]
2123

2224
GDP_METADATA = [

clouddrift/adapters/gdp1h.py

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@
2323
GDP_VERSION = "2.01"
2424

2525

26-
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01/"
26+
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01"
2727
GDP_DATA_URL_EXPERIMENTAL = (
28-
"https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
28+
"https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental"
2929
)
3030

3131

@@ -113,7 +113,7 @@ def download(
113113
gdp_metadata = gdp.get_gdp_metadata()
114114

115115
return gdp.order_by_date(
116-
gdp_metadata, [int(f.split("_")[-1][:-3]) for f in filelist]
116+
gdp_metadata, [int(f.split("_")[-1].removesuffix(".nc")) for f in filelist]
117117
)
118118

119119

@@ -215,35 +215,47 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
215215
[False if ds.get("location_type") == "Argos" else True],
216216
) # 0 for Argos, 1 for GPS
217217
ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
218-
ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
219-
ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
220-
ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
218+
ds["DeploymentStatus"] = (
219+
("traj"),
220+
gdp.cut_str(ds.DeploymentStatus, 20),
221+
)
222+
ds["BuoyTypeManufacturer"] = (
223+
("traj"),
224+
gdp.cut_str(ds.BuoyTypeManufacturer, 20),
225+
)
226+
ds["BuoyTypeSensorArray"] = (
227+
("traj"),
228+
gdp.cut_str(ds.BuoyTypeSensorArray, 20),
229+
)
221230
ds["CurrentProgram"] = (
222231
("traj"),
223232
np.array([gdp.str_to_float(ds.CurrentProgram, -1)], dtype=np.int32),
224233
)
225-
ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
234+
ds["PurchaserFunding"] = (
235+
("traj"),
236+
gdp.cut_str(ds.PurchaserFunding, 20),
237+
)
226238
ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
227239
ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
228-
ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
229-
ds["DeploymentComments"] = (
240+
ds["DeployingCountry"] = (
230241
("traj"),
231-
gdp.cut_str(
232-
ds.DeploymentComments.encode("ascii", "ignore").decode("ascii"), 20
233-
),
234-
) # remove non ascii char
235-
ds["ManufactureYear"] = (
242+
gdp.cut_str(ds.DeployingCountry, 20),
243+
)
244+
ds["DeploymentComments"] = (
236245
("traj"),
237246
np.array([gdp.str_to_float(ds.ManufactureYear, -1)], dtype=np.int16),
238247
)
239248
ds["ManufactureMonth"] = (
240249
("traj"),
241250
np.array([gdp.str_to_float(ds.ManufactureMonth, -1)], dtype=np.int16),
242251
)
243-
ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
252+
ds["ManufactureSensorType"] = (
253+
("traj"),
254+
gdp.cut_str(ds.ManufactureSensorType, 20),
255+
)
244256
ds["ManufactureVoltage"] = (
245257
("traj"),
246-
np.array([gdp.str_to_float(ds.ManufactureVoltage[:-6], -1)], dtype=np.int16),
258+
np.array([gdp.str_to_float(ds.ManufactureVoltage[:-2], -1)], dtype=np.int16),
247259
) # e.g. 56 V
248260
ds["FloatDiameter"] = (
249261
("traj"),
@@ -270,12 +282,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
270282
("traj"),
271283
[gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
272284
) # e.g. 416.6 m^2
273-
ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)]) # e.g. 39.08
285+
ds["DragAreaRatio"] = (
286+
("traj"),
287+
[gdp.str_to_float(ds.DragAreaRatio)],
288+
) # e.g. 39.08
274289
ds["DrogueCenterDepth"] = (
275290
("traj"),
276291
[gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
277292
) # e.g. 20.0 m
278-
ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
293+
ds["DrogueDetectSensor"] = (
294+
("traj"),
295+
gdp.cut_str(ds.DrogueDetectSensor, 20),
296+
)
279297

280298
# vars attributes
281299
vars_attrs = {
@@ -581,21 +599,22 @@ def to_raggedarray(
581599
ra = RaggedArray.from_files(
582600
indices=ids,
583601
preprocess_func=preprocess,
584-
coord_dim_map=gdp.GDP_COORDS,
602+
name_coords=gdp.GDP_COORDS,
585603
name_meta=gdp.GDP_METADATA,
586604
name_data=GDP_DATA,
605+
name_dims=gdp.GDP_DIMS,
587606
rowsize_func=gdp.rowsize,
588607
filename_pattern=filename_pattern,
589608
tmp_path=tmp_path,
590609
)
591610

592611
# set dynamic global attributes
593612
if ra.attrs_global:
594-
ra.attrs_global[
595-
"time_coverage_start"
596-
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
597-
ra.attrs_global[
598-
"time_coverage_end"
599-
] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
613+
ra.attrs_global["time_coverage_start"] = (
614+
f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
615+
)
616+
ra.attrs_global["time_coverage_end"] = (
617+
f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
618+
)
600619

601620
return ra

clouddrift/adapters/gdp6h.py

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
GDP_VERSION = "September 2023"
2323

24-
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h/"
24+
GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h"
2525
GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp6h")
2626
GDP_DATA = [
2727
"lon",
@@ -82,7 +82,7 @@ def download(
8282
string = urlpath.read().decode("utf-8")
8383
filelist = list(set(re.compile(pattern).findall(string)))
8484
for f in filelist:
85-
did = int(f[:-3].split("_")[2])
85+
did = int(f.split("_")[2].removesuffix(".nc"))
8686
if (drifter_ids is None or did in drifter_ids) and did not in added:
8787
drifter_urls.append(f"{url}/{dir}/{f}")
8888
added.add(did)
@@ -187,7 +187,10 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
187187
warnings.warn(f"Variable {var} not found in upstream data; skipping.")
188188

189189
# new variables
190-
ds["ids"] = (["traj", "obs"], [np.repeat(ds.ID.values, ds.sizes["obs"])])
190+
ds["ids"] = (
191+
["traj", "obs"],
192+
[np.repeat(ds.ID.values, ds.sizes["obs"])],
193+
)
191194
ds["drogue_status"] = (
192195
["traj", "obs"],
193196
[gdp.drogue_presence(ds.drogue_lost_date.data, ds.time.data[0])],
@@ -199,17 +202,32 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
199202
[False if ds.get("location_type") == "Argos" else True],
200203
) # 0 for Argos, 1 for GPS
201204
ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
202-
ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
203-
ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
204-
ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
205+
ds["DeploymentStatus"] = (
206+
("traj"),
207+
gdp.cut_str(ds.DeploymentStatus, 20),
208+
)
209+
ds["BuoyTypeManufacturer"] = (
210+
("traj"),
211+
gdp.cut_str(ds.BuoyTypeManufacturer, 20),
212+
)
213+
ds["BuoyTypeSensorArray"] = (
214+
("traj"),
215+
gdp.cut_str(ds.BuoyTypeSensorArray, 20),
216+
)
205217
ds["CurrentProgram"] = (
206218
("traj"),
207219
[np.int32(gdp.str_to_float(ds.CurrentProgram, -1))],
208220
)
209-
ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
221+
ds["PurchaserFunding"] = (
222+
("traj"),
223+
gdp.cut_str(ds.PurchaserFunding, 20),
224+
)
210225
ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
211226
ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
212-
ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
227+
ds["DeployingCountry"] = (
228+
("traj"),
229+
gdp.cut_str(ds.DeployingCountry, 20),
230+
)
213231
ds["DeploymentComments"] = (
214232
("traj"),
215233
gdp.cut_str(
@@ -224,10 +242,13 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
224242
("traj"),
225243
[np.int16(gdp.str_to_float(ds.ManufactureMonth, -1))],
226244
)
227-
ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
245+
ds["ManufactureSensorType"] = (
246+
("traj"),
247+
gdp.cut_str(ds.ManufactureSensorType, 20),
248+
)
228249
ds["ManufactureVoltage"] = (
229250
("traj"),
230-
[np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-6], -1))],
251+
[np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-2], -1))],
231252
) # e.g. 56 V
232253
ds["FloatDiameter"] = (
233254
("traj"),
@@ -254,12 +275,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
254275
("traj"),
255276
[gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
256277
) # e.g. 416.6 m^2
257-
ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)]) # e.g. 39.08
278+
ds["DragAreaRatio"] = (
279+
("traj"),
280+
[gdp.str_to_float(ds.DragAreaRatio)],
281+
) # e.g. 39.08
258282
ds["DrogueCenterDepth"] = (
259283
("traj"),
260284
[gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
261285
) # e.g. 20.0 m
262-
ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
286+
ds["DrogueDetectSensor"] = (
287+
("traj"),
288+
gdp.cut_str(ds.DrogueDetectSensor, 20),
289+
)
263290

264291
# vars attributes
265292
vars_attrs = {
@@ -481,20 +508,21 @@ def to_raggedarray(
481508
ra = RaggedArray.from_files(
482509
indices=ids,
483510
preprocess_func=preprocess,
484-
coord_dim_map=gdp.GDP_COORDS,
511+
name_coords=gdp.GDP_COORDS,
485512
name_meta=gdp.GDP_METADATA,
486513
name_data=GDP_DATA,
514+
name_dims=gdp.GDP_DIMS,
487515
rowsize_func=gdp.rowsize,
488516
filename_pattern="drifter_6h_{id}.nc",
489517
tmp_path=tmp_path,
490518
)
491519

492520
# update dynamic global attributes
493-
ra.attrs_global[
494-
"time_coverage_start"
495-
] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
496-
ra.attrs_global[
497-
"time_coverage_end"
498-
] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
521+
ra.attrs_global["time_coverage_start"] = (
522+
f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
523+
)
524+
ra.attrs_global["time_coverage_end"] = (
525+
f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
526+
)
499527

500528
return ra

clouddrift/adapters/glad.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
---------
1414
Özgökmen, Tamay. 2013. GLAD experiment CODE-style drifter trajectories (low-pass filtered, 15 minute interval records), northern Gulf of Mexico near DeSoto Canyon, July-October 2012. Distributed by: Gulf of Mexico Research Initiative Information and Data Cooperative (GRIIDC), Harte Research Institute, Texas A&M University–Corpus Christi. doi:10.7266/N7VD6WC8
1515
"""
16+
1617
from io import BytesIO
1718

1819
import numpy as np

clouddrift/adapters/mosaic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
>>> from clouddrift.adapters import mosaic
1919
>>> ds = mosaic.to_xarray()
2020
"""
21+
2122
import xml.etree.ElementTree as ET
2223
from datetime import datetime
2324
from io import BytesIO

clouddrift/adapters/subsurface_floats.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
2-
This module defines functions to adapt as a ragged-array dataset a collection of data
3-
from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
2+
This module defines functions to adapt as a ragged-array dataset a collection of data
3+
from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
44
across the world between 1989 and 2015.
55
66
The dataset is hosted at https://www.aoml.noaa.gov/phod/float_traj/index.php

clouddrift/adapters/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ def download_with_progress(
4545
retry_protocol = custom_retry_protocol # type: ignore
4646

4747
executor = concurrent.futures.ThreadPoolExecutor()
48-
futures: dict[
49-
concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]
50-
] = dict()
48+
futures: dict[concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]] = (
49+
dict()
50+
)
5151
bar = None
5252

5353
for src, dst, exp_size in download_map:

clouddrift/adapters/yomaha.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
2-
This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
3-
from trajectories of Argo floats at parking level and at the sea surface as
4-
a ragged-arrays dataset.
2+
This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
3+
from trajectories of Argo floats at parking level and at the sea surface as
4+
a ragged-arrays dataset.
55
66
The dataset is hosted at http://apdrc.soest.hawaii.edu/projects/yomaha/ and the user manual
77
is available at http://apdrc.soest.hawaii.edu/projects/yomaha/yomaha07/YoMaHa070612.pdf.
@@ -52,7 +52,7 @@ def download(tmp_path: str):
5252
download_with_progress(download_requests)
5353

5454
filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
55-
filename = filename_gz[:-3]
55+
filename = filename_gz.removesuffix(".gz")
5656

5757
buffer = BytesIO()
5858
download_with_progress([(YOMAHA_URLS[-1], buffer, None)])
@@ -153,7 +153,8 @@ def to_xarray(tmp_path: Union[str, None] = None):
153153
)
154154

155155
# open with pandas
156-
filename = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1][:-3]}"
156+
filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
157+
filename = filename_gz.removesuffix(".gz")
157158
df = pd.read_csv(
158159
filename, names=col_names, sep=r"\s+", header=None, na_values=na_col
159160
)

clouddrift/datasets.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""
2-
This module provides functions to easily access ragged array datasets. If the datasets are
2+
This module provides functions to easily access ragged array datasets. If the datasets are
33
not accessed via cloud storage platforms or are not found on the local filesystem,
4-
they will be downloaded from their upstream repositories and stored for later access
4+
they will be downloaded from their upstream repositories and stored for later access
55
(~/.clouddrift for UNIX-based systems).
66
"""
7+
78
import os
89
import platform
910
from io import BytesIO

0 commit comments

Comments (0)