subset modified original dataset (#203)

philippemiron · Philippe Miron · web-flow · commit 9d3fbb5f837d · 2023-06-26T09:51:41.000-04:00
* subset modified original dataset

* remove list comprehension

* added subset examples

---------

Co-authored-by: Philippe Miron &lt;philippe.miron@dtn.com&gt;
diff --git a/clouddrift/analysis.py b/clouddrift/analysis.py
@@ -949,18 +949,28 @@ def subset(ds: xr.Dataset, criteria: dict) -> xr.Dataset:
 
     Examples
     --------
-    Criteria are combined on any data or metadata variables part of the Dataset.
+    Criteria are combined on any data or metadata variables that are part of the Dataset. The following examples are based on the GDP dataset.
 
-    To subset between a range of values:
-    >>> subset(ds, {"lon": (min_lon, max_lon), "lat": (min_lat, max_lat)})
-    >>> subset(ds, {"time": (min_time, max_time)})
+    Retrieve a region, like the Gulf of Mexico, using ranges of latitude and longitude:
+    >>> subset(ds, {"lat": (21, 31), "lon": (-98, -78)})
 
-    To select multiples values:
-    >>> subset(ds, {"ID": [1, 2, 3]})
-
-    To select a specific value:
+    Retrieve drogued trajectory segments:
     >>> subset(ds, {"drogue_status": True})
 
+    Retrieve trajectory segments with temperature higher than 30°C (303.15 K):
+    >>> subset(ds, {"sst": (303.15, np.inf)})
+
+    Retrieve specific drifters from their IDs:
+    >>> subset(ds, {"ID": [2578, 2582, 2583]})
+
+    Retrieve a specific time period:
+    >>> subset(ds, {"time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))})
+
+    Note: To subset a time variable, the range must be defined with the same type as the variable. By default, `xarray` uses `np.datetime64` to represent datetime data. If the datetime data is a `datetime.datetime` or `pd.Timestamp`, the range has to be defined accordingly.
+
+    Those criteria can also be combined:
+    >>> subset(ds, {"lat": (21, 31), "lon": (-98, -78), "drogue_status": True, "sst": (303.15, np.inf), "time": (np.datetime64("2000-01-01"), np.datetime64("2020-01-31"))})
+
     Raises
     ------
     ValueError
@@ -992,11 +1002,12 @@ def subset(ds: xr.Dataset, criteria: dict) -> xr.Dataset:
         warnings.warn("No data matches the criteria; returning an empty dataset.")
         return xr.Dataset()
     else:
-        # update rowsize
-        id_count = np.bincount(ds.ids[mask_obs])
-        ds["rowsize"].values[mask_traj] = [id_count[i] for i in ds.ID[mask_traj]]
         # apply the filtering for both dimensions
-        return ds.isel({"traj": mask_traj, "obs": mask_obs})
+        ds_sub = ds.isel({"traj": mask_traj, "obs": mask_obs})
+        # update the rowsize
+        id_count = np.bincount(ds_sub.ids)
+        ds_sub["rowsize"].values = np.take(id_count, ds_sub.ID)
+        return ds_sub
 
 
 def unpack_ragged(
diff --git a/tests/analysis_tests.py b/tests/analysis_tests.py
@@ -712,6 +712,11 @@ class subset_tests(unittest.TestCase):
     def setUp(self):
         self.ds = sample_ragged_array().to_xarray()
 
+    def test_ds_unmodified(self):
+        ds_original = self.ds.copy(deep=True)
+        ds_sub = subset(self.ds, {"test": True})
+        xr.testing.assert_equal(ds_original, self.ds)
+
     def test_equal(self):
         ds_sub = subset(self.ds, {"test": True})
         self.assertEqual(len(ds_sub.ID), 2)