Skip to content

Commit 3898938

Browse files
authored
Rename rowsize to count (#212)
* Rename rowsize -> count
* Bump minor version
* Warn if rowsize not found upstream
1 parent dc4acc4 commit 3898938

File tree

8 files changed

+171
-157
lines changed

8 files changed

+171
-157
lines changed

clouddrift/adapters/gdp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
GDP_METADATA = [
2222
"ID",
23-
"rowsize",
23+
"count",
2424
"WMO",
2525
"expno",
2626
"deploy_date",
@@ -270,7 +270,7 @@ def drogue_presence(lost_time, time) -> bool:
270270
return time < lost_time
271271

272272

273-
def rowsize(index: int, **kwargs) -> int:
273+
def count(index: int, **kwargs) -> int:
274274
try:
275275
return xr.open_dataset(
276276
os.path.join(

clouddrift/adapters/gdp1h.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,13 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
486486
# rename variables
487487
ds = ds.rename_vars({"longitude": "lon", "latitude": "lat"})
488488

489+
if "rowsize" in ds.variables:
490+
ds = ds.rename_vars({"rowsize": "count"})
491+
else:
492+
warnings.warn(
493+
"Variable rowsize not found in upstream GDP data; has it been renamed?"
494+
)
495+
489496
return ds
490497

491498

@@ -566,7 +573,7 @@ def to_raggedarray(
566573
name_coords=gdp.GDP_COORDS,
567574
name_meta=gdp.GDP_METADATA,
568575
name_data=GDP_DATA,
569-
rowsize_func=gdp.rowsize,
576+
count_func=gdp.count,
570577
filename_pattern=filename_pattern,
571578
tmp_path=GDP_TMP_PATH,
572579
)

clouddrift/adapters/gdp6h.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
273273
"long_name": "Global Drifter Program Buoy ID repeated along observations",
274274
"units": "-",
275275
},
276-
"rowsize": {
276+
"count": {
277277
"long_name": "Number of observations per trajectory",
278278
"sample_dimension": "obs",
279279
"units": "-",
@@ -418,6 +418,13 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
418418
# rename variables
419419
ds = ds.rename_vars({"longitude": "lon", "latitude": "lat"})
420420

421+
if "rowsize" in ds.variables:
422+
ds = ds.rename_vars({"rowsize": "count"})
423+
else:
424+
warnings.warn(
425+
"Variable rowsize not found in upstream GDP data; has it been renamed?"
426+
)
427+
421428
return ds
422429

423430

@@ -481,7 +488,7 @@ def to_raggedarray(
481488
name_coords=gdp.GDP_COORDS,
482489
name_meta=gdp.GDP_METADATA,
483490
name_data=GDP_DATA,
484-
rowsize_func=gdp.rowsize,
491+
count_func=gdp.count,
485492
filename_pattern="drifter_{id}.nc",
486493
tmp_path=GDP_TMP_PATH,
487494
)

clouddrift/analysis.py

Lines changed: 56 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@
1616
def apply_ragged(
1717
func: callable,
1818
arrays: list[np.ndarray],
19-
rowsize: list[int],
19+
count: list[int],
2020
*args: tuple,
2121
executor: futures.Executor = futures.ThreadPoolExecutor(max_workers=None),
2222
**kwargs: dict,
2323
) -> Union[tuple[np.ndarray], np.ndarray]:
2424
"""Apply a function to a ragged array.
2525
2626
The function ``func`` will be applied to each contiguous row of ``arrays`` as
27-
indicated by row sizes ``rowsize``. The output of ``func`` will be
27+
indicated by row sizes ``count``. The output of ``func`` will be
2828
concatenated into a single ragged array.
2929
3030
By default this function uses ``concurrent.futures.ThreadPoolExecutor`` to
@@ -41,7 +41,7 @@ def apply_ragged(
4141
Function to apply to each row of each ragged array in ``arrays``.
4242
arrays : list[np.ndarray] or np.ndarray
4343
An array or a list of arrays to apply ``func`` to.
44-
rowsize : list
44+
count : list
4545
List of integers specifying the number of data points in each row.
4646
*args : tuple
4747
Additional arguments to pass to ``func``.
@@ -64,31 +64,31 @@ def apply_ragged(
6464
multiple particles, the coordinates of which are found in the ragged arrays x, y, and t
6565
that share row sizes 2, 3, and 4:
6666
67-
>>> rowsize = [2, 3, 4]
67+
>>> count = [2, 3, 4]
6868
>>> x = np.array([1, 2, 10, 12, 14, 30, 33, 36, 39])
6969
>>> y = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
7070
>>> t = np.array([1, 2, 1, 2, 3, 1, 2, 3, 4])
71-
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], rowsize, coord_system="cartesian")
71+
>>> u1, v1 = apply_ragged(velocity_from_position, [x, y, t], count, coord_system="cartesian")
7272
array([1., 1., 2., 2., 2., 3., 3., 3., 3.]),
7373
array([1., 1., 1., 1., 1., 1., 1., 1., 1.]))
7474
7575
Raises
7676
------
7777
ValueError
78-
If the sum of ``rowsize`` does not equal the length of ``arrays``.
78+
If the sum of ``count`` does not equal the length of ``arrays``.
7979
IndexError
8080
If empty ``arrays``.
8181
"""
8282
# make sure the arrays is iterable
8383
if type(arrays) not in [list, tuple]:
8484
arrays = [arrays]
85-
# validate rowsize
85+
# validate count
8686
for arr in arrays:
87-
if not sum(rowsize) == len(arr):
88-
raise ValueError("The sum of rowsize must equal the length of arr.")
87+
if not sum(count) == len(arr):
88+
raise ValueError("The sum of count must equal the length of arr.")
8989

9090
# split the array(s) into trajectories
91-
arrays = [unpack_ragged(arr, rowsize) for arr in arrays]
91+
arrays = [unpack_ragged(arr, count) for arr in arrays]
9292
iter = [[arrays[i][j] for i in range(len(arrays))] for j in range(len(arrays[0]))]
9393

9494
# parallel execution
@@ -180,8 +180,8 @@ def chunk(
180180
notice that you must pass the array to chunk as an array-like, not a list:
181181
182182
>>> x = np.array([1, 2, 3, 4, 5])
183-
>>> rowsize = [2, 1, 2]
184-
>>> apply_ragged(chunk, x, rowsize, 2)
183+
>>> count = [2, 1, 2]
184+
>>> apply_ragged(chunk, x, count, 2)
185185
array([[1, 2],
186186
[4, 5]])
187187
@@ -217,18 +217,18 @@ def chunk(
217217

218218
def prune(
219219
ragged: Union[list, np.ndarray, pd.Series, xr.DataArray],
220-
rowsize: Union[list, np.ndarray, pd.Series, xr.DataArray],
221-
min_rowsize: float,
220+
count: Union[list, np.ndarray, pd.Series, xr.DataArray],
221+
min_count: float,
222222
) -> Tuple[np.ndarray, np.ndarray]:
223223
"""Within a ragged array, removes arrays less than a specified row size.
224224
225225
Parameters
226226
----------
227227
ragged : np.ndarray or pd.Series or xr.DataArray
228228
A ragged array.
229-
rowsize : list or np.ndarray[int] or pd.Series or xr.DataArray[int]
229+
count : list or np.ndarray[int] or pd.Series or xr.DataArray[int]
230230
The size of each row in the input ragged array.
231-
min_rowsize :
231+
min_count :
232232
The minimum row size that will be kept.
233233
234234
Returns
@@ -244,7 +244,7 @@ def prune(
244244
Raises
245245
------
246246
ValueError
247-
If the sum of ``rowsize`` does not equal the length of ``arrays``.
247+
If the sum of ``count`` does not equal the length of ``arrays``.
248248
IndexError
249249
If empty ``ragged``.
250250
@@ -256,17 +256,17 @@ def prune(
256256
ragged = apply_ragged(
257257
lambda x, min_len: x if len(x) >= min_len else np.empty(0, dtype=x.dtype),
258258
np.array(ragged),
259-
rowsize,
260-
min_len=min_rowsize,
259+
count,
260+
min_len=min_count,
261261
)
262-
rowsize = apply_ragged(
262+
count = apply_ragged(
263263
lambda x, min_len: x if x >= min_len else np.empty(0, dtype=x.dtype),
264-
np.array(rowsize),
265-
np.ones_like(rowsize),
266-
min_len=min_rowsize,
264+
np.array(count),
265+
np.ones_like(count),
266+
min_len=min_count,
267267
)
268268

269-
return ragged, rowsize
269+
return ragged, count
270270

271271

272272
def regular_to_ragged(
@@ -313,14 +313,14 @@ def regular_to_ragged(
313313

314314
def ragged_to_regular(
315315
ragged: Union[np.ndarray, pd.Series, xr.DataArray],
316-
rowsize: Union[list, np.ndarray, pd.Series, xr.DataArray],
316+
count: Union[list, np.ndarray, pd.Series, xr.DataArray],
317317
fill_value: float = np.nan,
318318
) -> np.ndarray:
319319
"""Convert a ragged array to a two-dimensional array such that each contiguous segment
320320
of a ragged array is a row in the two-dimensional array. Each row of the two-dimensional
321321
array is padded with NaNs as needed. The length of the first dimension of the output
322-
array is the length of ``rowsize``. The length of the second dimension is the maximum
323-
element of ``rowsize``.
322+
array is the length of ``count``. The length of the second dimension is the maximum
323+
element of ``count``.
324324
325325
Note: Although this function accepts parameters of type ``xarray.DataArray``,
326326
passing NumPy arrays is recommended for performance reasons.
@@ -329,7 +329,7 @@ def ragged_to_regular(
329329
----------
330330
ragged : np.ndarray or pd.Series or xr.DataArray
331331
A ragged array.
332-
rowsize : list or np.ndarray[int] or pd.Series or xr.DataArray[int]
332+
count : list or np.ndarray[int] or pd.Series or xr.DataArray[int]
333333
The size of each row in the ragged array.
334334
fill_value : float, optional
335335
Fill value to use for the trailing elements of each row of the resulting
@@ -359,17 +359,17 @@ def ragged_to_regular(
359359
--------
360360
:func:`regular_to_ragged`
361361
"""
362-
res = fill_value * np.ones((len(rowsize), int(max(rowsize))), dtype=ragged.dtype)
363-
unpacked = unpack_ragged(ragged, rowsize)
364-
for n in range(len(rowsize)):
365-
res[n, : int(rowsize[n])] = unpacked[n]
362+
res = fill_value * np.ones((len(count), int(max(count))), dtype=ragged.dtype)
363+
unpacked = unpack_ragged(ragged, count)
364+
for n in range(len(count)):
365+
res[n, : int(count[n])] = unpacked[n]
366366
return res
367367

368368

369369
def segment(
370370
x: np.ndarray,
371371
tolerance: Union[float, np.timedelta64, timedelta, pd.Timedelta],
372-
rowsize: np.ndarray[int] = None,
372+
count: np.ndarray[int] = None,
373373
) -> np.ndarray[int]:
374374
"""Divide an array into segments based on a tolerance value.
375375
@@ -380,7 +380,7 @@ def segment(
380380
tolerance : float, np.timedelta64, timedelta, pd.Timedelta
381381
The maximum signed difference between consecutive points in a segment.
382382
The array x will be segmented wherever differences exceed the tolerance.
383-
rowsize : np.ndarray[int], optional
383+
count : np.ndarray[int], optional
384384
The size of rows if x is originally a ragged array. If present, x will be
385385
divided both by gaps that exceed the tolerance, and by the original rows
386386
of the ragged array.
@@ -401,12 +401,12 @@ def segment(
401401
array([1, 3, 2, 4, 1])
402402
403403
If the array is already previously segmented (e.g. multiple rows in
404-
a ragged array), then the ``rowsize`` argument can be used to preserve
404+
a ragged array), then the ``count`` argument can be used to preserve
405405
the original segments:
406406
407407
>>> x = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4]
408-
>>> rowsize = [3, 2, 6]
409-
>>> segment(x, 0.5, rowsize)
408+
>>> count = [3, 2, 6]
409+
>>> segment(x, 0.5, count)
410410
array([1, 2, 1, 1, 1, 4, 1])
411411
412412
The tolerance can also be negative. In this case, the input array is
@@ -419,11 +419,11 @@ def segment(
419419
420420
To segment an array for both positive and negative gaps, invoke the function
421421
twice, once for a positive tolerance and once for a negative tolerance.
422-
The result of the first invocation can be passed as the ``rowsize`` argument
422+
The result of the first invocation can be passed as the ``count`` argument
423423
to the first ``segment`` invocation:
424424
425425
>>> x = [1, 1, 2, 2, 1, 1, 2, 2]
426-
>>> segment(x, 0.5, rowsize=segment(x, -0.5))
426+
>>> segment(x, 0.5, count=segment(x, -0.5))
427427
array([2, 2, 2, 2])
428428
429429
If the input array contains time objects, the tolerance must be a time interval:
@@ -444,7 +444,7 @@ def segment(
444444
else:
445445
positive_tol = tolerance >= 0
446446

447-
if rowsize is None:
447+
if count is None:
448448
if positive_tol:
449449
exceeds_tolerance = np.diff(x) > tolerance
450450
else:
@@ -453,11 +453,11 @@ def segment(
453453
segment_sizes = np.append(segment_sizes, len(x) - np.sum(segment_sizes))
454454
return segment_sizes
455455
else:
456-
if not sum(rowsize) == len(x):
457-
raise ValueError("The sum of rowsize must equal the length of x.")
456+
if not sum(count) == len(x):
457+
raise ValueError("The sum of count must equal the length of x.")
458458
segment_sizes = []
459459
start = 0
460-
for r in rowsize:
460+
for r in count:
461461
end = start + int(r)
462462
segment_sizes.append(segment(x[start:end], tolerance))
463463
start = end
@@ -990,7 +990,7 @@ def subset(ds: xr.Dataset, criteria: dict) -> xr.Dataset:
990990
raise ValueError(f"Unknown variable '{key}'.")
991991

992992
# remove data when trajectories are filtered
993-
traj_idx = np.insert(np.cumsum(ds["rowsize"].values), 0, 0)
993+
traj_idx = np.insert(np.cumsum(ds["count"].values), 0, 0)
994994
for i in np.where(~mask_traj)[0]:
995995
mask_obs[slice(traj_idx[i], traj_idx[i + 1])] = False
996996

@@ -1005,16 +1005,14 @@ def subset(ds: xr.Dataset, criteria: dict) -> xr.Dataset:
10051005
else:
10061006
# apply the filtering for both dimensions
10071007
ds_sub = ds.isel({"traj": mask_traj, "obs": mask_obs})
1008-
# update the rowsize
1009-
ds_sub["rowsize"].values = segment(
1010-
ds_sub.ids, 0.5, rowsize=segment(ds_sub.ids, -0.5)
1008+
# update the count
1009+
ds_sub["count"].values = segment(
1010+
ds_sub.ids, 0.5, count=segment(ds_sub.ids, -0.5)
10111011
)
10121012
return ds_sub
10131013

10141014

1015-
def unpack_ragged(
1016-
ragged_array: np.ndarray, rowsize: np.ndarray[int]
1017-
) -> list[np.ndarray]:
1015+
def unpack_ragged(ragged_array: np.ndarray, count: np.ndarray[int]) -> list[np.ndarray]:
10181016
"""Unpack a ragged array into a list of regular arrays.
10191017
10201018
Unpacking a ``np.ndarray`` ragged array is about 2 orders of magnitude
@@ -1025,15 +1023,15 @@ def unpack_ragged(
10251023
----------
10261024
ragged_array : array-like
10271025
A ragged_array to unpack
1028-
rowsize : array-like
1026+
count : array-like
10291027
An array of integers whose values is the size of each row in the ragged
10301028
array
10311029
10321030
Returns
10331031
-------
10341032
list
10351033
A list of array-likes with sizes that correspond to the values in
1036-
rowsize, and types that correspond to the type of ragged_array
1034+
count, and types that correspond to the type of ragged_array
10371035
10381036
Examples
10391037
--------
@@ -1042,20 +1040,20 @@ def unpack_ragged(
10421040
10431041
.. code-block:: python
10441042
1045-
lon = unpack_ragged(ds.lon, ds.rowsize) # return a list[xr.DataArray] (slower)
1046-
lon = unpack_ragged(ds.lon.values, ds.rowsize) # return a list[np.ndarray] (faster)
1043+
lon = unpack_ragged(ds.lon, ds["count"]) # return a list[xr.DataArray] (slower)
1044+
lon = unpack_ragged(ds.lon.values, ds["count"]) # return a list[np.ndarray] (faster)
10471045
10481046
Looping over trajectories in a ragged Xarray Dataset to compute velocities
10491047
for each:
10501048
10511049
.. code-block:: python
10521050
10531051
for lon, lat, time in list(zip(
1054-
unpack_ragged(ds.lon.values, ds.rowsize),
1055-
unpack_ragged(ds.lat.values, ds.rowsize),
1056-
unpack_ragged(ds.time.values, ds.rowsize)
1052+
unpack_ragged(ds.lon.values, ds["count"]),
1053+
unpack_ragged(ds.lat.values, ds["count"]),
1054+
unpack_ragged(ds.time.values, ds["count"])
10571055
)):
10581056
u, v = velocity_from_position(lon, lat, time)
10591057
"""
1060-
indices = np.insert(np.cumsum(np.array(rowsize)), 0, 0)
1058+
indices = np.insert(np.cumsum(np.array(count)), 0, 0)
10611059
return [ragged_array[indices[n] : indices[n + 1]] for n in range(indices.size - 1)]

0 commit comments

Comments
 (0)