From 3a36de65ee1b6a6bd8bd80e90bfe3e9232776078 Mon Sep 17 00:00:00 2001
From: chensy <chensy@v3.cluster.local>
Date: Tue, 1 Apr 2025 19:43:48 -0400
Subject: [PATCH 1/8] matryoshka

---
 aq_engine.py |  51 ++++++++++++++++++++++----
 quantize.sh  |  22 +++++++++++
 run.sh       |  26 +++++++++++++
 src/aq.py    |  16 ++++++--
 src/utils.py | 101 +++++++++++++++++++++++++++++++++++++++++----------
 5 files changed, 187 insertions(+), 29 deletions(-)
 create mode 100644 quantize.sh
 create mode 100644 run.sh

diff --git a/aq_engine.py b/aq_engine.py
index ea14eec..5d770f8 100644
--- a/aq_engine.py
+++ b/aq_engine.py
@@ -105,6 +105,9 @@ def quantize(self, *, args: Namespace, verbose: bool = True) -> QuantizedWeight:
             )
         return self.quantized_weight
 
+
+
+
     def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
         """
         Compute the activation MSE error = ||X @ quantized_weight - X @ reference_weight||^2
@@ -114,21 +117,55 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
             The indices / slices must correspond to output channels (if out_group_size==1) or groups (if > 1).
             Formally, the indices must be in range [ 0 , self.out_features // self.out_group_size )
         """
-        assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize"
-        quantized_weight = self.quantized_weight(selection)
-
+        # assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize"
+        # quantized_weight = self.quantized_weight(selection)
+
+        # if isinstance(selection, ellipsis):
+        #     reference_weight = self.layer.weight.detach().to(quantized_weight.dtype)
+        # else:
+        #     assert isinstance(selection, slice)
+        #     out_channel_selection = slice(
+        #         selection.start * self.quantized_weight.out_group_size,
+        #         selection.stop * self.quantized_weight.out_group_size,
+        #     )
+
+        #     reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype)
+        # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype)
+        # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
+
+        assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用"
+    
+    # 获取参考权重
         if isinstance(selection, ellipsis):
-            reference_weight = self.layer.weight.detach().to(quantized_weight.dtype)
+            reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype)
         else:
             assert isinstance(selection, slice)
             out_channel_selection = slice(
                 selection.start * self.quantized_weight.out_group_size,
                 selection.stop * self.quantized_weight.out_group_size,
             )
+            reference_weight = self.layer.weight.detach()[out_channel_selection].to(self.quantized_weight.codebooks.dtype)
+        
+        # 计算总的 codebook 数量
+        total_codebooks = self.quantized_weight.num_codebooks
+        total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype)
+        
+        # 对每个渐进式阶段计算 MSE
+        for i in range(1, total_codebooks + 1):
+            # 获取使用前 i 个 codebook 的量化权重
+            quantized_weight_i = self.quantized_weight(selection, num_codebooks=i)
+            
+            # 计算当前阶段的 MSE
+            delta_weight = (quantized_weight_i - reference_weight).to(self.XTX.dtype)
+            mse_i = (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
+            
+            # 添加到总损失
+            total_loss = total_loss + mse_i
+        
+        return total_loss
+
+
 
-            reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype)
-        delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype)
-        return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
 
     def _replace_and_compute_mse(self, params_to_replace: nn.ParameterDict, selection: slice) -> torch.Tensor:
         """Utility for parallelism: replace the specified parameters of self.quantized_weight, then compute MSE"""
diff --git a/quantize.sh b/quantize.sh
new file mode 100644
index 0000000..0b1f204
--- /dev/null
+++ b/quantize.sh
@@ -0,0 +1,22 @@
+export CUDA_VISIBLE_DEVICES=0,1   # or e.g. 0,1,2,3
+export MODEL_PATH=/model-weights/gemma-2b     # /scratch/ssd004/scratch/chensy/hf_home/models--meta-llama--Llama-2-7b-hf/blobs/2ef41cbc275000b29afe157ba487f0530b8c26dc
+export DATASET_PATH=pajama
+export SAVE_PATH=/scratch/ssd004/scratch/chensy/AQLM-x/
+# export WANDB_PROJECT=MY_AQ_EXPS
+# export WANDB_NAME=COOL_EXP_NAME
+
+python main.py $MODEL_PATH $DATASET_PATH \
+ --nsamples=1024 \
+ --val_size=32 \
+ --num_codebooks=2 \
+ --nbits_per_codebook=8 \
+ --in_group_size=8 \
+ --relative_mse_tolerance=0.005 \
+ --finetune_batch_size=32 \
+ --finetune_max_epochs=10 \
+ --finetune_early_stop=3 \
+ --finetune_keep_best \
+ --local_batch_size=1 \
+ --offload_activations \
+ --resume \
+ --save $SAVE_PATH
\ No newline at end of file
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..b3e1237
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#SBATCH --job-name=eval
+#SBATCH --gres=gpu:rtx6000:1
+#SBATCH --qos=normal
+#SBATCH --time=10:00:00
+#SBATCH -c 30
+#SBATCH --mem=60G
+#SBATCH --output=slurm-%j.out
+#SBATCH --error=slurm-%j.err
+
+
+export CUDA_HOME=/pkgs/cuda-12.4
+export PATH=$CUDA_HOME/bin:$PATH
+export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+# module load cuda-12.4
+
+# export CUDA_HOME=/pkgs/cuda-12.4
+# export PATH=$CUDA_HOME/bin:$PATH
+# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+. /scratch/ssd004/scratch/chensy/envs/dora_llama
+# export CUDA_HOME=/pkgs/cuda-12.1
+# export PATH=$CUDA_HOME/bin:$PATH
+# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+lsmod | grep -i nvidia
+sh quantize.sh
\ No newline at end of file
diff --git a/src/aq.py b/src/aq.py
index df2f681..a1755ae 100644
--- a/src/aq.py
+++ b/src/aq.py
@@ -197,16 +197,26 @@ def get_scales(self) -> torch.Tensor:
     def shape(self) -> Tuple[int, int]:
         return self.out_features, self.in_features
 
-    def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ...):
+    def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ..., num_codebooks: Optional[int] = None):
         """
         Differentably reconstruct the weight (or parts thereof) from compressed components
         :param selection: By default, reconstruct the entire weight. If selection is specified, this method will instead
             reconstruct a portion of weight for the corresponding output dimensions (used for parallelism).
             The indices / slices must correspond to output channels (if out_group_size==1) or groups (if > 1).
             Formally, the indices must be in range [ 0 , self.out_features // self.out_group_size )
-
+        :param num_codebooks: Number of codebooks to use for reconstruction. If None, all codebooks are used.
+            If specified, only the first `num_codebooks` will be used.
         """
-        weight = _dequantize_weight(self.get_codes()[selection], self.get_codebooks(), self.get_scales()[selection])
+        # 检查 num_codebooks 参数
+        if num_codebooks is not None:
+            num_codebooks = min(num_codebooks, self.num_codebooks)
+        
+        weight = _dequantize_weight(
+            self.get_codes()[selection], 
+            self.get_codebooks(), 
+            self.get_scales()[selection],
+            num_codebooks
+        )
         return weight
 
     @torch.no_grad()
diff --git a/src/utils.py b/src/utils.py
index 9dd77ed..e3ee3f4 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -61,35 +61,98 @@ def maybe_script(fn: callable) -> callable:
     return torch.jit.script(fn) if should_script else fn
 
 
+# @maybe_script
+# def _dequantize_weight(
+#     codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None
+# ) -> torch.Tensor:
+#     """
+#     Decode float weights from quantization codes. Differentiable.
+#     :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks]
+#     :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size]
+#     :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
+#     :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size]
+#     """
+#     num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
+#     num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape
+#     out_features = num_out_groups * out_group_size
+#     in_features = num_in_groups * in_group_size
+#     codebook_offsets = torch.arange(
+#         0, num_codebooks * codebook_size, codebook_size, device=codes.device
+#     )  # shape: [num_codebooks]
+#     reconstructed_weight_flat = F.embedding_bag(
+#         codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum"
+#     )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size]
+
+#     reconstructed_weight_groupwise = reconstructed_weight_flat.view(
+#         list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size]
+#     )
+#     if scales is not None:
+#         reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales)
+#     return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
+
 @maybe_script
 def _dequantize_weight(
-    codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None
+    codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None, num_codebooks: Optional[int] = None
 ) -> torch.Tensor:
     """
     Decode float weights from quantization codes. Differentiable.
     :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks]
     :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size]
     :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
+    :param num_codebooks: Number of codebooks to use. If None, all available codebooks are used.
     :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size]
     """
-    num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
-    num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape
-    out_features = num_out_groups * out_group_size
-    in_features = num_in_groups * in_group_size
-    codebook_offsets = torch.arange(
-        0, num_codebooks * codebook_size, codebook_size, device=codes.device
-    )  # shape: [num_codebooks]
-    reconstructed_weight_flat = F.embedding_bag(
-        codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum"
-    )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size]
-
-    reconstructed_weight_groupwise = reconstructed_weight_flat.view(
-        list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size]
-    )
-    if scales is not None:
-        reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales)
-    return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
-
+    # 使用所有可用的 codebook 或仅使用指定数量的 codebook
+    available_codebooks = codebooks.shape[0]
+    effective_num_codebooks = available_codebooks if num_codebooks is None else min(num_codebooks, available_codebooks)
+    
+    # 如果要使用所有 codebook，使用原始方法
+    if effective_num_codebooks == available_codebooks:
+        num_out_groups, num_in_groups, _ = codes.shape[-3:]
+        _, codebook_size, out_group_size, in_group_size = codebooks.shape
+        out_features = num_out_groups * out_group_size
+        in_features = num_in_groups * in_group_size
+        codebook_offsets = torch.arange(
+            0, available_codebooks * codebook_size, codebook_size, device=codes.device
+        )  # shape: [num_codebooks]
+        reconstructed_weight_flat = F.embedding_bag(
+            codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum"
+        )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size]
+
+        reconstructed_weight_groupwise = reconstructed_weight_flat.view(
+            list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size]
+        )
+        if scales is not None:
+            reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales)
+        return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
+    
+    # 使用部分 codebook
+    else:
+        num_out_groups, num_in_groups, _ = codes.shape[-3:]
+        _, codebook_size, out_group_size, in_group_size = codebooks.shape
+        out_features = num_out_groups * out_group_size
+        in_features = num_in_groups * in_group_size
+        
+        # 只使用前 effective_num_codebooks 个 codebook
+        selected_codes = codes[..., :effective_num_codebooks]
+        selected_codebooks = codebooks[:effective_num_codebooks]
+        
+        codebook_offsets = torch.arange(
+            0, effective_num_codebooks * codebook_size, codebook_size, device=codes.device
+        )  # shape: [effective_num_codebooks]
+        
+        reconstructed_weight_flat = F.embedding_bag(
+            selected_codes.flatten(0, -2) + codebook_offsets, 
+            selected_codebooks.flatten(0, 1).flatten(-2, -1), 
+            mode="sum"
+        )  # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size]
+
+        reconstructed_weight_groupwise = reconstructed_weight_flat.view(
+            list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size]
+        )
+        if scales is not None:
+            reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales)
+        return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
 
 @contextlib.contextmanager
 def using_tf32(enabled: bool):

From 8baa524d842fead4d78143091a4956bf43475d55 Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:31:33 -0400
Subject: [PATCH 2/8] Update aq_engine.py

Updated _compute_mse with a Matryoshka-inspired loss and tuned codebook_weights
---
 aq_engine.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/aq_engine.py b/aq_engine.py
index 5d770f8..f6e6585 100644
--- a/aq_engine.py
+++ b/aq_engine.py
@@ -105,9 +105,7 @@ def quantize(self, *, args: Namespace, verbose: bool = True) -> QuantizedWeight:
             )
         return self.quantized_weight
 
-
-
-
+    ### Modified _compute_mse for Drop-by-Drop: weighted sum of the MSEs obtained with the first i codebooks
     def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
         """
         Compute the activation MSE error = ||X @ quantized_weight - X @ reference_weight||^2
@@ -132,10 +130,9 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
         #     reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype)
         # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype)
         # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
-
+        
         assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用"
-    
-    # 获取参考权重
+
         if isinstance(selection, ellipsis):
             reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype)
         else:
@@ -146,25 +143,29 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
             )
             reference_weight = self.layer.weight.detach()[out_channel_selection].to(self.quantized_weight.codebooks.dtype)
         
-        # 计算总的 codebook 数量
         total_codebooks = self.quantized_weight.num_codebooks
-        total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype)
         
-        # 对每个渐进式阶段计算 MSE
+        # EXAMPLE ("35" weighting): only the 3- and 5-codebook prefixes contribute (0.5 each); assumes num_codebooks == 5
+        codebook_weights = torch.tensor([0,0, 0.5,  0, 0.5], device=self.device, dtype=self.XTX.dtype)
+
+        total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype)
+
+        # Inspired by Matryoshka Representation Learning
         for i in range(1, total_codebooks + 1):
-            # 获取使用前 i 个 codebook 的量化权重
+            
             quantized_weight_i = self.quantized_weight(selection, num_codebooks=i)
             
-            # 计算当前阶段的 MSE
             delta_weight = (quantized_weight_i - reference_weight).to(self.XTX.dtype)
             mse_i = (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
             
-            # 添加到总损失
-            total_loss = total_loss + mse_i
-        
-        return total_loss
-
+            # Ensure all tensors are on the same device before computation
+            mse_i = mse_i.to(self.device)
+            codebook_weight = codebook_weights[i-1].to(self.device)
+            
+            #total_loss = total_loss + mse_i
+            total_loss = total_loss + codebook_weight* mse_i 
 
+        return total_loss
 
 
     def _replace_and_compute_mse(self, params_to_replace: nn.ParameterDict, selection: slice) -> torch.Tensor:

From 745320f2c4f6ac6ba1fc2803195a22b976883b77 Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:34:58 -0400
Subject: [PATCH 3/8] Update aq.py

added note for dropping codebooks during inference
---
 src/aq.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/aq.py b/src/aq.py
index a1755ae..1585f3d 100644
--- a/src/aq.py
+++ b/src/aq.py
@@ -210,12 +210,14 @@ def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ..., num_cod
         # 检查 num_codebooks 参数
         if num_codebooks is not None:
             num_codebooks = min(num_codebooks, self.num_codebooks)
-        
+
+        # For Drop-by-Drop inference, override num_codebooks here (e.g. num_codebooks = 3) to simulate "dropping" codebooks
+        # without any additional retraining or finetuning. Just load the quantized model through $SAVE_PATH in the shell script.
         weight = _dequantize_weight(
             self.get_codes()[selection], 
             self.get_codebooks(), 
             self.get_scales()[selection],
-            num_codebooks
+            num_codebooks                
         )
         return weight
 

From 656967ee72eceb4f3a157272616a50a74410e862 Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:39:51 -0400
Subject: [PATCH 4/8] Update quantize.sh

---
 quantize.sh | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/quantize.sh b/quantize.sh
index 0b1f204..8a16346 100644
--- a/quantize.sh
+++ b/quantize.sh
@@ -1,17 +1,20 @@
-export CUDA_VISIBLE_DEVICES=0,1   # or e.g. 0,1,2,3
-export MODEL_PATH=/model-weights/gemma-2b     # /scratch/ssd004/scratch/chensy/hf_home/models--meta-llama--Llama-2-7b-hf/blobs/2ef41cbc275000b29afe157ba487f0530b8c26dc
-export DATASET_PATH=pajama
-export SAVE_PATH=/scratch/ssd004/scratch/chensy/AQLM-x/
-# export WANDB_PROJECT=MY_AQ_EXPS
-# export WANDB_NAME=COOL_EXP_NAME
+#!/bin/bash
+#SBATCH --job-name=quantize
+#SBATCH --output=slurm-%j-MAT35-gemma2b.out
+#SBATCH --error=slurm-%j-MAT35-gemma2b.err
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export MODEL_PATH=/model-weights/gemma-2b/    
+export DATASET_PATH=wikitext2
+export SAVE_PATH=/project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b
 
 python main.py $MODEL_PATH $DATASET_PATH \
  --nsamples=1024 \
  --val_size=32 \
- --num_codebooks=2 \
+ --num_codebooks=5 \
  --nbits_per_codebook=8 \
  --in_group_size=8 \
- --relative_mse_tolerance=0.005 \
+ --relative_mse_tolerance=0.01 \
  --finetune_batch_size=32 \
  --finetune_max_epochs=10 \
  --finetune_early_stop=3 \
@@ -19,4 +22,19 @@ python main.py $MODEL_PATH $DATASET_PATH \
  --local_batch_size=1 \
  --offload_activations \
  --resume \
- --save $SAVE_PATH
\ No newline at end of file
+ --save $SAVE_PATH
+
+# python main.py $MODEL_PATH $DATASET_PATH \
+#  --nsamples=1024 \
+#  --val_size=32 \
+#  --num_codebooks=5 \
+#  --nbits_per_codebook=8 \
+#  --in_group_size=8 \
+#  --relative_mse_tolerance=0.01 \
+#  --finetune_batch_size=32 \
+#  --finetune_max_epochs=10 \
+#  --finetune_early_stop=3 \
+#  --finetune_keep_best \
+#  --local_batch_size=1 \
+#  --offload_activations \
+#  --load /project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b

From b0976759a2dfbad5aecefbf970cc9cd067b8158e Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:57:02 -0400
Subject: [PATCH 5/8] Update run.sh

---
 run.sh | 57 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/run.sh b/run.sh
index b3e1237..afd61e0 100644
--- a/run.sh
+++ b/run.sh
@@ -1,26 +1,41 @@
-#!/bin/sh
-#SBATCH --job-name=eval
-#SBATCH --gres=gpu:rtx6000:1
-#SBATCH --qos=normal
-#SBATCH --time=10:00:00
-#SBATCH -c 30
-#SBATCH --mem=60G
-#SBATCH --output=slurm-%j.out
-#SBATCH --error=slurm-%j.err
-
-
-export CUDA_HOME=/pkgs/cuda-12.4
-export PATH=$CUDA_HOME/bin:$PATH
-export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
-# module load cuda-12.4
+# run the following in terminal:
+sbatch \
+  --account=aip-khisti \
+  --nodes=1 \
+  --gres=gpu:l40s:4\
+  --ntasks-per-node=1 \
+  --mem=120G \
+  --cpus-per-task=4 \
+  --time=40:00:00 \
+  quantize.sh
+
+
+
+
+#no longer needed in the new Killarney setup.
+# #!/bin/sh
+# #SBATCH --job-name=eval
+# #SBATCH --gres=gpu:rtx6000:1
+# #SBATCH --qos=normal
+# #SBATCH --time=10:00:00
+# #SBATCH -c 30
+# #SBATCH --mem=60G
+# #SBATCH --output=slurm-%j.out
+# #SBATCH --error=slurm-%j.err
+
 
 # export CUDA_HOME=/pkgs/cuda-12.4
 # export PATH=$CUDA_HOME/bin:$PATH
 # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
-. /scratch/ssd004/scratch/chensy/envs/dora_llama
-# export CUDA_HOME=/pkgs/cuda-12.1
-# export PATH=$CUDA_HOME/bin:$PATH
-# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+# # module load cuda-12.4
+
+# # export CUDA_HOME=/pkgs/cuda-12.4
+# # export PATH=$CUDA_HOME/bin:$PATH
+# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+# . /scratch/ssd004/scratch/chensy/envs/dora_llama
+# # export CUDA_HOME=/pkgs/cuda-12.1
+# # export PATH=$CUDA_HOME/bin:$PATH
+# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
 
-lsmod | grep -i nvidia
-sh quantize.sh
\ No newline at end of file
+# lsmod | grep -i nvidia
+# sh quantize.sh

From d101f962bafb7c7059d4890e8d18ca0d06227967 Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:30:31 -0400
Subject: [PATCH 6/8] Change SAVE_PATH and load path to ANONYMOUS

Updated SAVE_PATH and load path to ANONYMOUS.
---
 quantize.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/quantize.sh b/quantize.sh
index 8a16346..8e8eb35 100644
--- a/quantize.sh
+++ b/quantize.sh
@@ -6,7 +6,7 @@
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 export MODEL_PATH=/model-weights/gemma-2b/    
 export DATASET_PATH=wikitext2
-export SAVE_PATH=/project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b
+export SAVE_PATH=ANONYMOUS
 
 python main.py $MODEL_PATH $DATASET_PATH \
  --nsamples=1024 \
@@ -37,4 +37,4 @@ python main.py $MODEL_PATH $DATASET_PATH \
 #  --finetune_keep_best \
 #  --local_batch_size=1 \
 #  --offload_activations \
-#  --load /project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b
+#  --load ANONYMOUS

From c739d7a61f8a99cd9ba94bf31213605148b23a4d Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:31:59 -0400
Subject: [PATCH 7/8] Modified for new environment

Updated account information and removed obsolete comments.
---
 run.sh | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/run.sh b/run.sh
index afd61e0..3c95fdd 100644
--- a/run.sh
+++ b/run.sh
@@ -1,6 +1,6 @@
 # run the following in terminal:
 sbatch \
-  --account=aip-khisti \
+  --account=ANONYMOUS \
   --nodes=1 \
   --gres=gpu:l40s:4\
   --ntasks-per-node=1 \
@@ -8,34 +8,3 @@ sbatch \
   --cpus-per-task=4 \
   --time=40:00:00 \
   quantize.sh
-
-
-
-
-#no longer needed in the new Killarney setup.
-# #!/bin/sh
-# #SBATCH --job-name=eval
-# #SBATCH --gres=gpu:rtx6000:1
-# #SBATCH --qos=normal
-# #SBATCH --time=10:00:00
-# #SBATCH -c 30
-# #SBATCH --mem=60G
-# #SBATCH --output=slurm-%j.out
-# #SBATCH --error=slurm-%j.err
-
-
-# export CUDA_HOME=/pkgs/cuda-12.4
-# export PATH=$CUDA_HOME/bin:$PATH
-# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
-# # module load cuda-12.4
-
-# # export CUDA_HOME=/pkgs/cuda-12.4
-# # export PATH=$CUDA_HOME/bin:$PATH
-# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
-# . /scratch/ssd004/scratch/chensy/envs/dora_llama
-# # export CUDA_HOME=/pkgs/cuda-12.1
-# # export PATH=$CUDA_HOME/bin:$PATH
-# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
-
-# lsmod | grep -i nvidia
-# sh quantize.sh

From 4e980eaf5d327f12fa19cc035190a6eb4415cfd6 Mon Sep 17 00:00:00 2001
From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:35:26 -0400
Subject: [PATCH 8/8] minor

---
 aq_engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aq_engine.py b/aq_engine.py
index f6e6585..00c6467 100644
--- a/aq_engine.py
+++ b/aq_engine.py
@@ -131,7 +131,7 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor:
         # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype)
         # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features
         
-        assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用"
+        assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize"
 
         if isinstance(selection, ellipsis):
             reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype)