Commit 517e548

it wasn't rendering correctly because of WHITESPACES
1 parent df4dfa7 commit 517e548

42 files changed

Lines changed: 303 additions & 110 deletions


docs/source/FAQ.rst

Lines changed: 1 addition & 1 deletion
@@ -204,7 +204,7 @@ How to save/serialize a modular optimizer?
 ============================================
 Please refer to pytorch docs https://pytorch.org/tutorials/beginner/saving_loading_models.html.
 
-Like pytorch optimizers, torchzero modular optimizers and modules support :code:`opt.state_dict()` and :code:`opt.load_state_dict()`, which saves and loads state dicts of all modules, including nested ones.
+Like pytorch optimizers, torchzero modular optimizers support :code:`opt.state_dict()` and :code:`opt.load_state_dict()`, which saves and loads state dicts of all modules, including nested ones.
 
 So you can use the standard code for saving and loading:
 
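For reference, the "standard code" the FAQ points to is the usual PyTorch state-dict pattern. A minimal sketch (the model, module configuration, and file name below are illustrative, not part of the FAQ):

import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)
opt = tz.Modular(model.parameters(), tz.m.Adam(), tz.m.LR(1e-2))

# saving: state_dict() gathers the state of every module, including nested ones
torch.save(opt.state_dict(), "optimizer.pt")

# loading: rebuild the same optimizer, then restore its state
opt.load_state_dict(torch.load("optimizer.pt"))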

torchzero/modules/clipping/clipping.py

Lines changed: 19 additions & 0 deletions
@@ -152,8 +152,11 @@ class ClipValue(Transform):
         target (str): refer to :ref:`target argument` in documentation.
 
     Examples:
+
         Gradient clipping:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.ClipValue(1),
@@ -162,13 +165,16 @@ class ClipValue(Transform):
             )
 
         Update clipping:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.Adam(),
                 tz.m.ClipValue(1),
                 tz.m.LR(1e-2),
             )
+
     """
     def __init__(self, value: float, target: Target = 'update'):
         defaults = dict(value=value)
@@ -198,8 +204,11 @@ class ClipNorm(Transform):
             what this affects.
 
     Examples:
+
         Gradient norm clipping:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.ClipNorm(1),
@@ -208,7 +217,9 @@ class ClipNorm(Transform):
             )
 
         Update norm clipping:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.Adam(),
@@ -263,8 +274,11 @@ class Normalize(Transform):
             what this affects.
 
     Examples:
+
         Gradient normalization:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.Normalize(1),
@@ -273,7 +287,9 @@ class Normalize(Transform):
             )
 
         Update normalization:
+
         .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.Adam(),
@@ -363,8 +379,11 @@ class Centralize(Transform):
             minimal size of a dimension to normalize along it. Defaults to 1.
 
     Examples:
+
         Standard gradient centralization:
+
        .. code-block:: python
+
             opt = tz.Modular(
                 model.parameters(),
                 tz.m.Centralize(dim=0),
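The ordering in the examples above is the whole distinction: a clipping or normalization module placed before tz.m.Adam() acts on raw gradients, while one placed after it acts on Adam's update. A minimal training-step sketch, assuming tz.Modular follows the standard torch.optim step()/zero_grad() interface (the model and data are illustrative):

import torch
import torchzero as tz

model = torch.nn.Linear(10, 1)

# update norm clipping: Adam's update is clipped before the learning rate is applied
opt = tz.Modular(
    model.parameters(),
    tz.m.Adam(),
    tz.m.ClipNorm(1),
    tz.m.LR(1e-2),
)

inputs, targets = torch.randn(32, 10), torch.randn(32, 1)
loss = torch.nn.functional.mse_loss(model(inputs), targets)
loss.backward()
opt.step()       # assumed standard optimizer interface
opt.zero_grad()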

torchzero/modules/experimental/absoap.py

Lines changed: 4 additions & 1 deletion
@@ -24,7 +24,10 @@ def update_absoap_covariances_(
 
 Source=Literal['p','g','s','y', 'gy', 'sy', 'sn', 'yn', 'gys', 'sys']
 class ABSOAP(Transform):
-    """SOAP but with some extra options for testing. Please note that this is experimental and isn't guaranteed to work.
+    """SOAP but with some extra options for testing.
+
+    .. warning::
+        This module is just for testing my stupid ideas.
 
     Args:
         scale_by_s - whether to scale y by s

torchzero/modules/experimental/adadam.py

Lines changed: 5 additions & 1 deletion
@@ -50,7 +50,11 @@ def adadam_(
     return None
 
 class Adadam(Module):
-    """Adam with a diagonally preconditioned preconditioner. Please note that this is experimental and isn't guaranteed to work."""
+    """Adam with a diagonally preconditioned preconditioner.
+
+    .. warning::
+        Experimental.
+    """
     def __init__(
         self,
         beta1: float = 0.9,

torchzero/modules/experimental/adamY.py

Lines changed: 5 additions & 1 deletion
@@ -62,7 +62,11 @@ def adamy_(
     return None
 
 class AdamY(Module):
-    """Adam but uses scaled gradient differences for second momentum. Please note that this is experimental and isn't guaranteed to work."""
+    """Adam but uses scaled gradient differences for second momentum.
+
+    .. warning::
+        Experimental.
+    """
     def __init__(
         self,
         beta1: float = 0.9,

torchzero/modules/experimental/adasoap.py

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,10 @@ def update_adasoap_covariances_(
 
 
 class AdaSOAP(Transform):
-    """SOAP with diagonally preconditioned GG^Ts. Please note that this is experimental and isn't guaranteed to work.
+    """SOAP with diagonally preconditioned GG^Ts.
+
+    .. warning::
+        Experimental.
 
     precond_beta - beta for GG^T squares
     """

torchzero/modules/experimental/eigendescent.py

Lines changed: 4 additions & 1 deletion
@@ -23,7 +23,10 @@ def _cosine_similarity(x, y):
 
 class EigenDescent(Module):
     """
-    Uses eigenvectors corresponding to certain eigenvalues. Please note that this is experimental and isn't guaranteed to work.
+    Uses eigenvectors corresponding to certain eigenvalues.
+
+    .. warning::
+        Experimental.
 
     Args:
         mode (str, optional):

torchzero/modules/experimental/etf.py

Lines changed: 16 additions & 3 deletions
@@ -8,7 +8,11 @@
 
 
 class ExponentialTrajectoryFit(Module):
-    """A method. Please note that this is experimental and isn't guaranteed to work."""
+    """A method.
+
+    .. warning::
+        Experimental.
+    """
     def __init__(self, step_size=1e-3):
         defaults = dict(step_size = step_size)
         super().__init__(defaults)
@@ -67,7 +71,12 @@ def step(self, var):
 
 
 class ExponentialTrajectoryFitV2(Module):
-    """Should be better than one above, except it isn't. Please note that this is experimental and isn't guaranteed to work."""
+    """Should be better than one above, except it isn't.
+
+    .. warning::
+        Experimental.
+
+    """
     def __init__(self, step_size=1e-3, num_steps: int= 4):
         defaults = dict(step_size = step_size, num_steps=num_steps)
         super().__init__(defaults)
@@ -132,7 +141,11 @@ def _fit_exponential(y0, y1, y2):
     return A, B, r
 
 class PointwiseExponential(Module):
-    """A stupid method (for my youtube channel). Please note that this is experimental and isn't guaranteed to work."""
+    """A stupid method (for my youtube channel).
+
+    .. warning::
+        Experimental.
+    """
     def __init__(self, step_size: float = 1e-3, reg: float = 1e-2, steps = 10000):
         defaults = dict(reg=reg, steps=steps, step_size=step_size)
         super().__init__(defaults)

torchzero/modules/experimental/higher_order_adagrad.py

Lines changed: 4 additions & 4 deletions
@@ -20,13 +20,13 @@
 
 class HigherOrderAdagrad(Module):
     """
-    .. note::
-        Conceptual.
+    .. warning::
+        Experimental.
 
-    .. note::
+    .. warning::
         Extremely expensive.
 
-    .. note::
+    .. warning::
         Doesn't work.
     """
     def __init__(

torchzero/modules/experimental/reduce_outward_lr.py

Lines changed: 3 additions & 4 deletions
@@ -4,13 +4,12 @@
 from ...utils import TensorList, unpack_states, unpack_dicts
 
 class ReduceOutwardLR(Transform):
-    """
-    When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
+    """When update sign matches weight sign, the learning rate for that weight is multiplied by `mul`.
 
     This means updates that move weights towards zero have higher learning rates.
 
-    .. note::
-        this sounded good, but it sucks.
+    .. warning::
+        This sounded good but after testing turns out it sucks.
     """
     def __init__(self, mul = 0.5, use_grad=False, invert=False, target: Target = 'update'):
         defaults = dict(mul=mul, use_grad=use_grad, invert=invert)
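For intuition, the rule this docstring describes can be sketched as an elementwise operation (an illustration of the stated rule only, not the module's implementation; the use_grad, invert, and target options are ignored):

import torch

def reduce_outward_lr(update: torch.Tensor, weight: torch.Tensor, mul: float = 0.5) -> torch.Tensor:
    # wherever the update's sign matches the weight's sign, that entry's
    # effective learning rate is multiplied by `mul`; other entries keep full size
    same_sign = torch.sign(update) == torch.sign(weight)
    return torch.where(same_sign, update * mul, update)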
