apply ragged (#128)

philippemiron · Philippe Miron · milancurcic · web-flow · commit 3c902e8f2a14 · 2023-03-01T12:30:08.000-05:00
* initial commit

* initial commit

* lint

* lint

* apply changes

* fix merge in the comment

* missing spaces

* lint

* added max_workers

* forgot max_workers in the func..

* Docstring and type hints

* Expand tests

* Reorder arguments

* Test with additional args

* Test for args and kwargs each

* Fix and test for passing arrs as a scalar DataArray

* Rename arrs -> arrays (too similar to args)

---------

Co-authored-by: Philippe Miron <philippe.miron@dtn.com>
Co-authored-by: milancurcic <caomaco@gmail.com>
diff --git a/clouddrift/analysis.py b/clouddrift/analysis.py
@@ -1,7 +1,93 @@
 import numpy as np
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 import xarray as xr
+from concurrent import futures
 from clouddrift.haversine import distance, bearing
+from clouddrift.dataformat import unpack_ragged
+
+
+def apply_ragged(
+    func: callable,
+    arrays: list[np.ndarray],
+    rowsize: list[int],
+    *args: tuple,
+    max_workers: Optional[int] = None,
+    **kwargs: dict,
+) -> Union[tuple[np.ndarray], np.ndarray]:
+    """Apply a function to each row of one or more ragged arrays.
+
+    ``func`` is called once per row as ``func(*rows, *args, **kwargs)``, where
+    ``rows`` holds the matching contiguous row of every array in ``arrays`` as
+    delimited by ``rowsize``. The per-row outputs of ``func`` are concatenated
+    into ragged array(s) in row order.
+
+    The calls are dispatched through ``concurrent.futures.ThreadPoolExecutor``;
+    ``max_workers`` is passed down to it unchanged.
+
+    Parameters
+    ----------
+    func : callable
+        Function to apply to each row of each ragged array in ``arrays``.
+    arrays : list[np.ndarray] or np.ndarray
+        An array or a list of arrays to apply ``func`` to.
+    rowsize : list
+        List of integers specifying the number of data points in each row.
+    *args : tuple
+        Additional positional arguments passed to ``func`` after the rows.
+    max_workers : int, optional
+        Number of threads to use. If None, the number of threads will be equal
+        to the ``max_workers`` default value of ``concurrent.futures.ThreadPoolExecutor``.
+    **kwargs : dict
+        Additional keyword arguments to pass to ``func``.
+
+    Returns
+    -------
+    out : tuple[np.ndarray] or np.ndarray
+        Concatenated output of ``func``: a tuple of arrays if ``func`` returns
+        a tuple, otherwise a single array.
+
+    Raises
+    ------
+    ValueError
+        If the sum of ``rowsize`` differs from the length of an array in ``arrays``.
+
+    Examples
+    --------
+    >>> def func(x, y):
+    ...     return x + y
+    >>> x = np.arange(10)
+    >>> y = np.arange(10, 20)
+    >>> apply_ragged(func, [x, y], [5, 5])
+    array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28])
+
+    Extra positional arguments are forwarded to ``func`` after the rows:
+
+    >>> apply_ragged(lambda x, a, b: a * x + b, np.arange(4), [2, 2], 2, 1)
+    array([1, 3, 5, 7])
+    """
+    # a lone array is treated as a one-element list of arrays
+    if not isinstance(arrays, (list, tuple)):
+        arrays = [arrays]
+    # validate rowsize against every input array
+    for arr in arrays:
+        if sum(rowsize) != len(arr):
+            raise ValueError("The sum of rowsize must equal the length of arr.")
+
+    # split the array(s) into rows and group the matching row of every array
+    unpacked = [unpack_ragged(arr, rowsize) for arr in arrays]
+    # each call gets its rows followed by all extra positional arguments
+    call_args = [[*rows, *args] for rows in zip(*unpacked)]
+
+    # parallel execution; results are collected in submission (row) order
+    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        res = list(executor.map(lambda x: func(*x, **kwargs), call_args))
+
+    # concatenate the outputs
+    if isinstance(res[0], tuple):  # func returns more than one array
+        n_outputs = len(res[0])
+        return tuple(np.concatenate([r[i] for r in res]) for i in range(n_outputs))
+    # single output per row
+    return np.concatenate(res)
 
 
 def segment(
@@ -60,6 +146,7 @@ def segment(
     >>> segment(x, 0.5, rowsize=segment(x, -0.5))
     array([2, 2, 2, 2])
     """
+
     if rowsize is None:
         if tolerance >= 0:
             exceeds_tolerance = np.diff(x) > tolerance
diff --git a/tests/analysis_tests.py b/tests/analysis_tests.py
@@ -1,4 +1,4 @@
-from clouddrift.analysis import segment, velocity_from_position
+from clouddrift.analysis import segment, velocity_from_position, apply_ragged
 from clouddrift.haversine import EARTH_RADIUS_METERS
 import unittest
 import numpy as np
@@ -132,3 +132,67 @@ def test_time_axis(self):
         self.assertTrue(np.all(vf == expected_vf))
         self.assertTrue(np.all(uf.shape == expected_uf.shape))
         self.assertTrue(np.all(vf.shape == expected_vf.shape))
+
+
+class apply_ragged_tests(unittest.TestCase):
+    def setUp(self):
+        # three ragged arrays that share the same row sizes (2, 3 and 4 points)
+        self.rowsize = [2, 3, 4]
+        self.x = np.array([1, 2, 10, 12, 14, 30, 33, 36, 39])
+        self.y = np.arange(0, len(self.x))
+        self.t = np.array([1, 2, 1, 2, 3, 1, 2, 3, 4])
+
+    def test_simple(self):
+        """A single ndarray may be passed without wrapping it in a list."""
+        y = apply_ragged(lambda x: x**2, np.array([1, 2, 3, 4]), [2, 2])
+        self.assertTrue(np.all(y == np.array([1, 4, 9, 16])))
+
+    def test_simple_dataarray(self):
+        """A single DataArray may be passed without wrapping it in a list."""
+        y = apply_ragged(
+            lambda x: x**2,
+            xr.DataArray(data=[1, 2, 3, 4], coords={"obs": [1, 2, 3, 4]}),
+            [2, 2],
+        )
+        self.assertTrue(np.all(y == np.array([1, 4, 9, 16])))
+
+    def test_simple_with_args(self):
+        """An extra positional argument is forwarded to the function."""
+        y = apply_ragged(lambda x, p: x**p, np.array([1, 2, 3, 4]), [2, 2], 2)
+        self.assertTrue(np.all(y == np.array([1, 4, 9, 16])))
+
+    def test_simple_with_kwargs(self):
+        """An extra keyword argument is forwarded to the function."""
+        y = apply_ragged(lambda x, p: x**p, np.array([1, 2, 3, 4]), [2, 2], p=2)
+        self.assertTrue(np.all(y == np.array([1, 4, 9, 16])))
+
+    def test_velocity_ndarray(self):
+        """Velocity from a list of ndarrays in Cartesian coordinates."""
+        u, v = apply_ragged(
+            velocity_from_position,
+            [self.x, self.y, self.t],
+            self.rowsize,
+            coord_system="cartesian",
+        )
+        np.testing.assert_allclose(u, [1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0])
+        np.testing.assert_allclose(v, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
+
+    def test_velocity_dataarray(self):
+        """Velocity from a list of DataArrays, with rowsize given as a DataArray."""
+        u, v = apply_ragged(
+            velocity_from_position,
+            [
+                xr.DataArray(data=self.x),
+                xr.DataArray(data=self.y),
+                xr.DataArray(data=self.t),
+            ],
+            xr.DataArray(data=self.rowsize),
+            coord_system="cartesian",
+        )
+        np.testing.assert_allclose(u, [1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0])
+        np.testing.assert_allclose(v, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
+
+    def test_bad_rowsize_raises(self):
+        """A rowsize whose sum differs from the array length raises ValueError."""
+        with self.assertRaises(ValueError):
+            apply_ragged(lambda x: x**2, np.array([1, 2, 3, 4]), [2])