From 3a36de65ee1b6a6bd8bd80e90bfe3e9232776078 Mon Sep 17 00:00:00 2001 From: chensy Date: Tue, 1 Apr 2025 19:43:48 -0400 Subject: [PATCH 1/8] matryoshka --- aq_engine.py | 51 ++++++++++++++++++++++---- quantize.sh | 22 +++++++++++ run.sh | 26 +++++++++++++ src/aq.py | 16 ++++++-- src/utils.py | 101 +++++++++++++++++++++++++++++++++++++++++---------- 5 files changed, 187 insertions(+), 29 deletions(-) create mode 100644 quantize.sh create mode 100644 run.sh diff --git a/aq_engine.py b/aq_engine.py index ea14eec..5d770f8 100644 --- a/aq_engine.py +++ b/aq_engine.py @@ -105,6 +105,9 @@ def quantize(self, *, args: Namespace, verbose: bool = True) -> QuantizedWeight: ) return self.quantized_weight + + + def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor: """ Compute the activation MSE error = ||X @ quantized_weight - X @ reference_weight||^2 @@ -114,21 +117,55 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor: The indices / slices must correspond to output channels (if out_group_size==1) or groups (if > 1). 
Formally, the indices must be in range [ 0 , self.out_features // self.out_group_size ) """ - assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize" - quantized_weight = self.quantized_weight(selection) - + # assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize" + # quantized_weight = self.quantized_weight(selection) + + # if isinstance(selection, ellipsis): + # reference_weight = self.layer.weight.detach().to(quantized_weight.dtype) + # else: + # assert isinstance(selection, slice) + # out_channel_selection = slice( + # selection.start * self.quantized_weight.out_group_size, + # selection.stop * self.quantized_weight.out_group_size, + # ) + + # reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype) + # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype) + # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features + + assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用" + + # 获取参考权重 if isinstance(selection, ellipsis): - reference_weight = self.layer.weight.detach().to(quantized_weight.dtype) + reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype) else: assert isinstance(selection, slice) out_channel_selection = slice( selection.start * self.quantized_weight.out_group_size, selection.stop * self.quantized_weight.out_group_size, ) + reference_weight = self.layer.weight.detach()[out_channel_selection].to(self.quantized_weight.codebooks.dtype) + + # 计算总的 codebook 数量 + total_codebooks = self.quantized_weight.num_codebooks + total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype) + + # 对每个渐进式阶段计算 MSE + for i in range(1, total_codebooks + 1): + # 获取使用前 i 个 codebook 的量化权重 + quantized_weight_i = self.quantized_weight(selection, num_codebooks=i) + + # 计算当前阶段的 MSE + delta_weight = (quantized_weight_i - 
reference_weight).to(self.XTX.dtype) + mse_i = (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features + + # 添加到总损失 + total_loss = total_loss + mse_i + + return total_loss + + - reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype) - delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype) - return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features def _replace_and_compute_mse(self, params_to_replace: nn.ParameterDict, selection: slice) -> torch.Tensor: """Utility for parallelism: replace the specified parameters of self.quantized_weight, then compute MSE""" diff --git a/quantize.sh b/quantize.sh new file mode 100644 index 0000000..0b1f204 --- /dev/null +++ b/quantize.sh @@ -0,0 +1,22 @@ +export CUDA_VISIBLE_DEVICES=0,1 # or e.g. 0,1,2,3 +export MODEL_PATH=/model-weights/gemma-2b # /scratch/ssd004/scratch/chensy/hf_home/models--meta-llama--Llama-2-7b-hf/blobs/2ef41cbc275000b29afe157ba487f0530b8c26dc +export DATASET_PATH=pajama +export SAVE_PATH=/scratch/ssd004/scratch/chensy/AQLM-x/ +# export WANDB_PROJECT=MY_AQ_EXPS +# export WANDB_NAME=COOL_EXP_NAME + +python main.py $MODEL_PATH $DATASET_PATH \ + --nsamples=1024 \ + --val_size=32 \ + --num_codebooks=2 \ + --nbits_per_codebook=8 \ + --in_group_size=8 \ + --relative_mse_tolerance=0.005 \ + --finetune_batch_size=32 \ + --finetune_max_epochs=10 \ + --finetune_early_stop=3 \ + --finetune_keep_best \ + --local_batch_size=1 \ + --offload_activations \ + --resume \ + --save $SAVE_PATH \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..b3e1237 --- /dev/null +++ b/run.sh @@ -0,0 +1,26 @@ +#!/bin/sh +#SBATCH --job-name=eval +#SBATCH --gres=gpu:rtx6000:1 +#SBATCH --qos=normal +#SBATCH --time=10:00:00 +#SBATCH -c 30 +#SBATCH --mem=60G +#SBATCH --output=slurm-%j.out +#SBATCH --error=slurm-%j.err + + +export CUDA_HOME=/pkgs/cuda-12.4 +export 
PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +# module load cuda-12.4 + +# export CUDA_HOME=/pkgs/cuda-12.4 +# export PATH=$CUDA_HOME/bin:$PATH +# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +. /scratch/ssd004/scratch/chensy/envs/dora_llama +# export CUDA_HOME=/pkgs/cuda-12.1 +# export PATH=$CUDA_HOME/bin:$PATH +# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH + +lsmod | grep -i nvidia +sh quantize.sh \ No newline at end of file diff --git a/src/aq.py b/src/aq.py index df2f681..a1755ae 100644 --- a/src/aq.py +++ b/src/aq.py @@ -197,16 +197,26 @@ def get_scales(self) -> torch.Tensor: def shape(self) -> Tuple[int, int]: return self.out_features, self.in_features - def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ...): + def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ..., num_codebooks: Optional[int] = None): """ Differentably reconstruct the weight (or parts thereof) from compressed components :param selection: By default, reconstruct the entire weight. If selection is specified, this method will instead reconstruct a portion of weight for the corresponding output dimensions (used for parallelism). The indices / slices must correspond to output channels (if out_group_size==1) or groups (if > 1). Formally, the indices must be in range [ 0 , self.out_features // self.out_group_size ) - + :param num_codebooks: Number of codebooks to use for reconstruction. If None, all codebooks are used. + If specified, only the first `num_codebooks` will be used. 
""" - weight = _dequantize_weight(self.get_codes()[selection], self.get_codebooks(), self.get_scales()[selection]) + # 检查 num_codebooks 参数 + if num_codebooks is not None: + num_codebooks = min(num_codebooks, self.num_codebooks) + + weight = _dequantize_weight( + self.get_codes()[selection], + self.get_codebooks(), + self.get_scales()[selection], + num_codebooks + ) return weight @torch.no_grad() diff --git a/src/utils.py b/src/utils.py index 9dd77ed..e3ee3f4 100644 --- a/src/utils.py +++ b/src/utils.py @@ -61,35 +61,98 @@ def maybe_script(fn: callable) -> callable: return torch.jit.script(fn) if should_script else fn +# @maybe_script +# def _dequantize_weight( +# codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None +# ) -> torch.Tensor: +# """ +# Decode float weights from quantization codes. Differentiable. +# :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] +# :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] +# :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] +# :return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] +# """ +# num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] +# num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape +# out_features = num_out_groups * out_group_size +# in_features = num_in_groups * in_group_size +# codebook_offsets = torch.arange( +# 0, num_codebooks * codebook_size, codebook_size, device=codes.device +# ) # shape: [num_codebooks] +# reconstructed_weight_flat = F.embedding_bag( +# codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum" +# ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + +# reconstructed_weight_groupwise = 
reconstructed_weight_flat.view( +# list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size] +# ) +# if scales is not None: +# reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales) +# return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) + @maybe_script def _dequantize_weight( - codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None + codes: torch.Tensor, codebooks: torch.Tensor, scales: Optional[torch.Tensor] = None, num_codebooks: Optional[int] = None ) -> torch.Tensor: """ Decode float weights from quantization codes. Differentiable. :param codes: tensor of integer quantization codes, shape [*dims, num_out_groups, num_in_groups, num_codebooks] :param codebooks: tensor of vectors for each quantization code, [num_codebooks, codebook_size, out_group_size, in_group_size] :param scales: weight will be multiplied by this factor, must be broadcastble with [*dims, out_groups, num_in_groups, out_group_size, in_group_size] + :param num_codebooks: Number of codebooks to use. If None, all available codebooks are used. 
:return: reconstructed weight tensor of shape [*dims, num_in_groups*group_size] """ - num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:] - num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape - out_features = num_out_groups * out_group_size - in_features = num_in_groups * in_group_size - codebook_offsets = torch.arange( - 0, num_codebooks * codebook_size, codebook_size, device=codes.device - ) # shape: [num_codebooks] - reconstructed_weight_flat = F.embedding_bag( - codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum" - ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] - - reconstructed_weight_groupwise = reconstructed_weight_flat.view( - list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size] - ) - if scales is not None: - reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales) - return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) - + # 使用所有可用的 codebook 或仅使用指定数量的 codebook + available_codebooks = codebooks.shape[0] + effective_num_codebooks = available_codebooks if num_codebooks is None else min(num_codebooks, available_codebooks) + + # 如果要使用所有 codebook,使用原始方法 + if effective_num_codebooks == available_codebooks: + num_out_groups, num_in_groups, _ = codes.shape[-3:] + _, codebook_size, out_group_size, in_group_size = codebooks.shape + out_features = num_out_groups * out_group_size + in_features = num_in_groups * in_group_size + codebook_offsets = torch.arange( + 0, available_codebooks * codebook_size, codebook_size, device=codes.device + ) # shape: [num_codebooks] + reconstructed_weight_flat = F.embedding_bag( + codes.flatten(0, -2) + codebook_offsets, codebooks.flatten(0, 1).flatten(-2, -1), mode="sum" + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + + reconstructed_weight_groupwise = 
reconstructed_weight_flat.view( + list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size] + ) + if scales is not None: + reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales) + return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) + + # 使用部分 codebook + else: + num_out_groups, num_in_groups, _ = codes.shape[-3:] + _, codebook_size, out_group_size, in_group_size = codebooks.shape + out_features = num_out_groups * out_group_size + in_features = num_in_groups * in_group_size + + # 只使用前 effective_num_codebooks 个 codebook + selected_codes = codes[..., :effective_num_codebooks] + selected_codebooks = codebooks[:effective_num_codebooks] + + codebook_offsets = torch.arange( + 0, effective_num_codebooks * codebook_size, codebook_size, device=codes.device + ) # shape: [effective_num_codebooks] + + reconstructed_weight_flat = F.embedding_bag( + selected_codes.flatten(0, -2) + codebook_offsets, + selected_codebooks.flatten(0, 1).flatten(-2, -1), + mode="sum" + ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size * in_group_size] + + reconstructed_weight_groupwise = reconstructed_weight_flat.view( + list(codes.shape[:-3]) + [num_out_groups, num_in_groups, out_group_size, in_group_size] + ) + if scales is not None: + reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(scales) + return reconstructed_weight_groupwise.swapaxes(-3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features]) @contextlib.contextmanager def using_tf32(enabled: bool): From 8baa524d842fead4d78143091a4956bf43475d55 Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:31:33 -0400 Subject: [PATCH 2/8] Update aq_engine.py Updated _compute_mse with Matryoshka inspired loss and tuned codebook_weights --- aq_engine.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16
deletions(-) diff --git a/aq_engine.py b/aq_engine.py index 5d770f8..f6e6585 100644 --- a/aq_engine.py +++ b/aq_engine.py @@ -105,9 +105,7 @@ def quantize(self, *, args: Namespace, verbose: bool = True) -> QuantizedWeight: ) return self.quantized_weight - - - + ### modified _compute_mse for DropbyDrop def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor: """ Compute the activation MSE error = ||X @ quantized_weight - X @ reference_weight||^2 @@ -132,10 +130,9 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) -> torch.Tensor: # reference_weight = self.layer.weight.detach()[out_channel_selection].to(quantized_weight.dtype) # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype) # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features - + assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用" - - # 获取参考权重 + if isinstance(selection, ellipsis): reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype) else: @@ -146,25 +143,29 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) 
-> torch.Tensor: ) reference_weight = self.layer.weight.detach()[out_channel_selection].to(self.quantized_weight.codebooks.dtype) - # 计算总的 codebook 数量 total_codebooks = self.quantized_weight.num_codebooks - total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype) - # 对每个渐进式阶段计算 MSE + # EXAMPLE - 35W + codebook_weights = torch.tensor([0,0, 0.5, 0, 0.5], device=self.device, dtype=self.XTX.dtype) + + total_loss = torch.tensor(0.0, device=self.device, dtype=self.XTX.dtype) + + # Inspired by Matryoshka Representation Learning for i in range(1, total_codebooks + 1): - # 获取使用前 i 个 codebook 的量化权重 + quantized_weight_i = self.quantized_weight(selection, num_codebooks=i) - # 计算当前阶段的 MSE delta_weight = (quantized_weight_i - reference_weight).to(self.XTX.dtype) mse_i = (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features - # 添加到总损失 - total_loss = total_loss + mse_i - - return total_loss - + # Ensure all tensors are on the same device before computation + mse_i = mse_i.to(self.device) + codebook_weight = codebook_weights[i-1].to(self.device) + + #total_loss = total_loss + mse_i + total_loss = total_loss + codebook_weight* mse_i + return total_loss def _replace_and_compute_mse(self, params_to_replace: nn.ParameterDict, selection: slice) -> torch.Tensor: From 745320f2c4f6ac6ba1fc2803195a22b976883b77 Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:34:58 -0400 Subject: [PATCH 3/8] Update aq.py added note for dropping codebooks during inference --- src/aq.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/aq.py b/src/aq.py index a1755ae..1585f3d 100644 --- a/src/aq.py +++ b/src/aq.py @@ -210,12 +210,14 @@ def forward(self, selection: Union[slice, ellipsis, torch.Tensor] = ..., num_cod # 检查 num_codebooks 参数 if num_codebooks is not None: num_codebooks = min(num_codebooks, self.num_codebooks) - + + # FOR DROP-BY-DROP's 
INFERENCE, modify num_codebooks (e.g. num_codebooks = 3) to simulate "dropping" of codebooks without any + # additional retraining or finetuning. Just load the quantized model through $SAVE_PATH in the shell script. weight = _dequantize_weight( self.get_codes()[selection], self.get_codebooks(), self.get_scales()[selection], - num_codebooks + num_codebooks ) return weight From 656967ee72eceb4f3a157272616a50a74410e862 Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:39:51 -0400 Subject: [PATCH 4/8] Update quantize.sh --- quantize.sh | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/quantize.sh b/quantize.sh index 0b1f204..8a16346 100644 --- a/quantize.sh +++ b/quantize.sh @@ -1,17 +1,20 @@ -export CUDA_VISIBLE_DEVICES=0,1 # or e.g. 0,1,2,3 -export MODEL_PATH=/model-weights/gemma-2b # /scratch/ssd004/scratch/chensy/hf_home/models--meta-llama--Llama-2-7b-hf/blobs/2ef41cbc275000b29afe157ba487f0530b8c26dc -export DATASET_PATH=pajama -export SAVE_PATH=/scratch/ssd004/scratch/chensy/AQLM-x/ -# export WANDB_PROJECT=MY_AQ_EXPS -# export WANDB_NAME=COOL_EXP_NAME +#!/bin/bash +#SBATCH --job-name=quantize +#SBATCH --output=slurm-%j-MAT35-gemma2b.out +#SBATCH --error=slurm-%j-MAT35-gemma2b.err + +export CUDA_VISIBLE_DEVICES=0,1,2,3 +export MODEL_PATH=/model-weights/gemma-2b/ +export DATASET_PATH=wikitext2 +export SAVE_PATH=/project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b python main.py $MODEL_PATH $DATASET_PATH \ --nsamples=1024 \ --val_size=32 \ - --num_codebooks=2 \ + --num_codebooks=5 \ --nbits_per_codebook=8 \ --in_group_size=8 \ - --relative_mse_tolerance=0.005 \ + --relative_mse_tolerance=0.01 \ --finetune_batch_size=32 \ --finetune_max_epochs=10 \ --finetune_early_stop=3 \ @@ -19,4 +22,19 @@ python main.py $MODEL_PATH $DATASET_PATH \ --local_batch_size=1 \ --offload_activations \ --resume \ - --save $SAVE_PATH \ No newline at end of file + 
--save $SAVE_PATH + +# python main.py $MODEL_PATH $DATASET_PATH \ +# --nsamples=1024 \ +# --val_size=32 \ +# --num_codebooks=5 \ +# --nbits_per_codebook=8 \ +# --in_group_size=8 \ +# --relative_mse_tolerance=0.01 \ +# --finetune_batch_size=32 \ +# --finetune_max_epochs=10 \ +# --finetune_early_stop=3 \ +# --finetune_keep_best \ +# --local_batch_size=1 \ +# --offload_activations \ +# --load /project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b From b0976759a2dfbad5aecefbf970cc9cd067b8158e Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:57:02 -0400 Subject: [PATCH 5/8] Update run.sh --- run.sh | 57 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/run.sh b/run.sh index b3e1237..afd61e0 100644 --- a/run.sh +++ b/run.sh @@ -1,26 +1,41 @@ -#!/bin/sh -#SBATCH --job-name=eval -#SBATCH --gres=gpu:rtx6000:1 -#SBATCH --qos=normal -#SBATCH --time=10:00:00 -#SBATCH -c 30 -#SBATCH --mem=60G -#SBATCH --output=slurm-%j.out -#SBATCH --error=slurm-%j.err - - -export CUDA_HOME=/pkgs/cuda-12.4 -export PATH=$CUDA_HOME/bin:$PATH -export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH -# module load cuda-12.4 +# run the following in terminal: +sbatch \ + --account=aip-khisti \ + --nodes=1 \ + --gres=gpu:l40s:4\ + --ntasks-per-node=1 \ + --mem=120G \ + --cpus-per-task=4 \ + --time=40:00:00 \ + quantize.sh + + + + +#no longer needed in the new Killarney setup. +# #!/bin/sh +# #SBATCH --job-name=eval +# #SBATCH --gres=gpu:rtx6000:1 +# #SBATCH --qos=normal +# #SBATCH --time=10:00:00 +# #SBATCH -c 30 +# #SBATCH --mem=60G +# #SBATCH --output=slurm-%j.out +# #SBATCH --error=slurm-%j.err + # export CUDA_HOME=/pkgs/cuda-12.4 # export PATH=$CUDA_HOME/bin:$PATH # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH -. 
/scratch/ssd004/scratch/chensy/envs/dora_llama -# export CUDA_HOME=/pkgs/cuda-12.1 -# export PATH=$CUDA_HOME/bin:$PATH -# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +# # module load cuda-12.4 + +# # export CUDA_HOME=/pkgs/cuda-12.4 +# # export PATH=$CUDA_HOME/bin:$PATH +# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH +# . /scratch/ssd004/scratch/chensy/envs/dora_llama +# # export CUDA_HOME=/pkgs/cuda-12.1 +# # export PATH=$CUDA_HOME/bin:$PATH +# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH -lsmod | grep -i nvidia -sh quantize.sh \ No newline at end of file +# lsmod | grep -i nvidia +# sh quantize.sh From d101f962bafb7c7059d4890e8d18ca0d06227967 Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:30:31 -0400 Subject: [PATCH 6/8] Change SAVE_PATH and load path to ANONYMOUS Updated SAVE_PATH and load path to ANONYMOUS. --- quantize.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/quantize.sh b/quantize.sh index 8a16346..8e8eb35 100644 --- a/quantize.sh +++ b/quantize.sh @@ -6,7 +6,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3 export MODEL_PATH=/model-weights/gemma-2b/ export DATASET_PATH=wikitext2 -export SAVE_PATH=/project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b +export SAVE_PATH=ANONYMOUS python main.py $MODEL_PATH $DATASET_PATH \ --nsamples=1024 \ @@ -37,4 +37,4 @@ python main.py $MODEL_PATH $DATASET_PATH \ # --finetune_keep_best \ # --local_batch_size=1 \ # --offload_activations \ -# --load /project/aip-khisti/babaogl4/vaughan/5x8-MAT35-gemma2b +# --load ANONYMOUS From c739d7a61f8a99cd9ba94bf31213605148b23a4d Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:31:59 -0400 Subject: [PATCH 7/8] Modified for new environment Updated account information and removed obsolete comments. 
--- run.sh | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/run.sh b/run.sh index afd61e0..3c95fdd 100644 --- a/run.sh +++ b/run.sh @@ -1,6 +1,6 @@ # run the following in terminal: sbatch \ - --account=aip-khisti \ + --account=ANONYMOUS \ --nodes=1 \ --gres=gpu:l40s:4\ --ntasks-per-node=1 \ @@ -8,34 +8,3 @@ sbatch \ --cpus-per-task=4 \ --time=40:00:00 \ quantize.sh - - - - -#no longer needed in the new Killarney setup. -# #!/bin/sh -# #SBATCH --job-name=eval -# #SBATCH --gres=gpu:rtx6000:1 -# #SBATCH --qos=normal -# #SBATCH --time=10:00:00 -# #SBATCH -c 30 -# #SBATCH --mem=60G -# #SBATCH --output=slurm-%j.out -# #SBATCH --error=slurm-%j.err - - -# export CUDA_HOME=/pkgs/cuda-12.4 -# export PATH=$CUDA_HOME/bin:$PATH -# export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH -# # module load cuda-12.4 - -# # export CUDA_HOME=/pkgs/cuda-12.4 -# # export PATH=$CUDA_HOME/bin:$PATH -# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH -# . /scratch/ssd004/scratch/chensy/envs/dora_llama -# # export CUDA_HOME=/pkgs/cuda-12.1 -# # export PATH=$CUDA_HOME/bin:$PATH -# # export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH - -# lsmod | grep -i nvidia -# sh quantize.sh From 4e980eaf5d327f12fa19cc035190a6eb4415cfd6 Mon Sep 17 00:00:00 2001 From: liza-babaoglu <67078190+liza-babaoglu@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:35:26 -0400 Subject: [PATCH 8/8] minor --- aq_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aq_engine.py b/aq_engine.py index f6e6585..00c6467 100644 --- a/aq_engine.py +++ b/aq_engine.py @@ -131,7 +131,7 @@ def _compute_mse(self, selection: Union[slice, ellipsis] = ...) 
-> torch.Tensor: # delta_weight = (quantized_weight - reference_weight).to(self.XTX.dtype) # return (delta_weight @ self.XTX).flatten() @ delta_weight.flatten() / self.quantized_weight.out_features - assert self.quantized_weight is not None, "必须在 AQUtil.quantize 内部/之后调用" + assert self.quantized_weight is not None, "must be called inside / after AQUtil.quantize" if isinstance(selection, ellipsis): reference_weight = self.layer.weight.detach().to(self.quantized_weight.codebooks.dtype)