diff --git a/models/__init__.py b/models/__init__.py
index 5ab9af8..db37f54 100644
--- a/models/__init__.py
+++ b/models/__init__.py
@@ -8,7 +8,7 @@
 from utils.utils import get_label
 from utils.minus_utils import model_layer_switch, lora_to_linear, lora_to_distill, lora_to_prunelora, linear_to_lora
 from trainer.model_arch import get_ffn1, get_mha_proj
-from utils.alpaca_utils import smart_tokenizer_and_embedding_resize
+# from utils.alpaca_utils import smart_tokenizer_and_embedding_resize
 from .modeling_bert import CoFiBertForSequenceClassification, AdaPBertForQuestionAnswering
 # from .modeling_roberta_backup import CoFiRobertaForSequenceClassification, NewRobertaForQuestionAnswering
 from .modeling_roberta import CoFiRobertaForSequenceClassification, NewRobertaForQuestionAnswering
diff --git a/models/modeling_mt5.py b/models/modeling_mt5.py
index 4c80629..d84f598 100644
--- a/models/modeling_mt5.py
+++ b/models/modeling_mt5.py
@@ -7,7 +7,10 @@
 from dataclasses import dataclass
 from typing import Union
 
-from transformers.file_utils import ModelOutput
+try:
+    from transformers.utils import ModelOutput
+except ImportError:
+    from transformers.file_utils import ModelOutput
 
 from transformers.models.mt5.modeling_mt5 import (
     MT5ForConditionalGeneration,
@@ -1763,4 +1766,4 @@ def custom_forward(*inputs):
             hidden_states=all_hidden_states,
             attentions=all_attentions,
             cross_attentions=all_cross_attentions,
-        )
\ No newline at end of file
+        )
diff --git a/models/modeling_outputs.py b/models/modeling_outputs.py
index 4c8f096..70a00e4 100644
--- a/models/modeling_outputs.py
+++ b/models/modeling_outputs.py
@@ -3,7 +3,10 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
-from transformers.file_utils import ModelOutput
+try:
+    from transformers.utils import ModelOutput
+except ImportError:
+    from transformers.file_utils import ModelOutput
 from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, BaseModelOutputWithPastAndCrossAttentions
 
 
@@ -40,4 +43,4 @@ class NewSequenceClassifierOutput(SequenceClassifierOutput):
     masked_loss: Optional[torch.FloatTensor] = None
     
 class AdaPBaseModelOutputWithPastAndCrossAttentions(BaseModelOutputWithPastAndCrossAttentions):
-    masked_hidden_states: torch.FloatTensor = None
\ No newline at end of file
+    masked_hidden_states: torch.FloatTensor = None
diff --git a/requirements.txt b/requirements.txt
index 36f0633..ebe0a51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,18 @@
-datasets==2.10.0
-deepspeed==0.8.0
-matplotlib==3.7.1
-numpy==1.24.3
-ortools==9.6.2534
-pandas==1.5.2
-scikit_learn==1.1.3
-scipy==1.10.1
-seaborn==0.12.2
-tqdm==4.65.0
-transformers==4.28.1
-nltk==3.8.1
-rouge-score==0.1.2
-torch==1.10.2+cu113
---extra-index-url https://download.pytorch.org/whl/cu113
\ No newline at end of file
+torch>=2.2
+transformers==4.45.0
+accelerate>=1.0
+datasets>=3.0
+evaluate>=0.4.3
+deepspeed>=0.16
+tokenizers>=0.20
+numpy>=1.26
+pandas>=2.2
+scikit-learn>=1.5
+scipy>=1.13
+matplotlib>=3.9
+seaborn>=0.13
+tqdm>=4.66
+nltk>=3.9
+rouge-score>=0.1.2
+sacrebleu>=2.4  # backend package required by evaluate.load("sacrebleu") in run_minus_seq2seq_training.py
+ortools>=9.10
diff --git a/run_minus_seq2seq_training.py b/run_minus_seq2seq_training.py
index cba2f6a..a9b84c8 100644
--- a/run_minus_seq2seq_training.py
+++ b/run_minus_seq2seq_training.py
@@ -15,7 +15,7 @@
 from transformers import (HfArgumentParser, EvalPrediction, DataCollatorForSeq2Seq, set_seed)
 from torch.nn.utils.rnn import pad_sequence
 from deepspeed.profiling.flops_profiler import get_model_profile
-from datasets import load_metric
+import evaluate
 from models.model_args import ModelArguments
 from utils.utils import *
 from utils.minus_utils import efficiency_testing, input_constructor, compare_parameters
@@ -109,7 +109,7 @@ def main():
         model.hidden_mask = model.hidden_mask.to(training_args.device)
 
     if 'wmt' in task_name:
-        metric = load_metric("sacrebleu")
+        metric = evaluate.load("sacrebleu")
         gen_prefix = "eval"
 
         def postprocess_text(preds, labels):
@@ -140,7 +140,7 @@ def compute_metrics(eval_preds):
             result = {k: round(v, 4) for k, v in result.items()}
             return result
     else:
-        metric = load_metric("rouge")
+        metric = evaluate.load("rouge")
         def compute_metrics(eval_pred):
             predictions, labels = eval_pred
             decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
@@ -269,4 +269,4 @@ def compute_metrics(eval_pred):
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/run_minus_squad_training.py b/run_minus_squad_training.py
index 192cb98..7826a7a 100644
--- a/run_minus_squad_training.py
+++ b/run_minus_squad_training.py
@@ -12,7 +12,7 @@
 
 from transformers import (HfArgumentParser, EvalPrediction, default_data_collator, set_seed)
 from deepspeed.profiling.flops_profiler import get_model_profile
-from datasets import load_metric
+import evaluate
 from transformers import SquadDataTrainingArguments
 from models.model_args import ModelArguments
 from utils.utils import *
@@ -153,9 +153,9 @@ def post_processing_function(examples, features, predictions):
 
     # Get the metric function
     if IS_SQUAD_V2:
-        metric = load_metric("squad_v2")
+        metric = evaluate.load("squad_v2")
     else:
-        metric = load_metric("squad")
+        metric = evaluate.load("squad")
 
     def compute_metrics(p: EvalPrediction):
         return metric.compute(predictions=p.predictions, references=p.label_ids)
@@ -248,4 +248,4 @@ def compute_metrics(p: EvalPrediction):
         logger.info(f"Parameter variables not changed after pruning: {same_vars}")
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/run_minus_training.py b/run_minus_training.py
index 7082c10..6b2f3ed 100644
--- a/run_minus_training.py
+++ b/run_minus_training.py
@@ -1,3 +1,4 @@
+import torch
 import os
 import json
 os.environ["WANDB_DISABLED"] = "true"
@@ -9,11 +10,15 @@
 torch.backends.cudnn.allow_tf32 = True
 import time
 transformers.logging.set_verbosity_error()
-
-
+import evaluate
+from sklearn.metrics import f1_score, matthews_corrcoef
+from scipy.stats import pearsonr, spearmanr
 from transformers import (HfArgumentParser, EvalPrediction, default_data_collator, DataCollatorWithPadding, set_seed)
-from deepspeed.profiling.flops_profiler import get_model_profile
-from datasets import load_metric
+try:
+    from deepspeed.profiling.flops_profiler import get_model_profile
+except ImportError:
+    get_model_profile = None
+# from datasets import load_metric
 from args import DataTrainingArguments
 from models.model_args import ModelArguments
 from utils.utils import *
@@ -95,9 +100,9 @@ def main():
 
     # Get the metric function
     if data_args.task_name is not None:
-        metric = load_metric("glue", data_args.task_name, experiment_id='elastictuning' + data_args.task_name + str(time.time()))
+        metric = evaluate.load("glue", data_args.task_name, experiment_id='elastictuning' + data_args.task_name + str(time.time()))
     else:
-        metric = load_metric("accuracy", experiment_id='elastictuning' + data_args.task_name + str(time.time()))
+        metric = evaluate.load("accuracy", experiment_id='elastictuning' + str(time.time()))  # task_name is None on this branch, so it must not be concatenated into the id
     # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
     # predictions and label_ids field) and has to return a dictionary string to float.
     def compute_metrics(p: EvalPrediction):
@@ -115,7 +120,31 @@ def compute_metrics(p: EvalPrediction):
             preds = list(map(lambda x: label2id[tuple(x)] if tuple(x) in label2id else -1, preds.tolist()))
             labels = list(map(lambda x: label2id[tuple(x)], labels.tolist()))
         if data_args.task_name is not None:
-            result = metric.compute(predictions=preds, references=labels)
+            try:
+                result = metric.compute(predictions=preds, references=labels)
+            except ValueError as e:
+                # Only the array-copy failure is handled here (presumably the NumPy 2
+                # `copy=False` incompatibility - TODO confirm); anything else propagates.
+                if "Unable to avoid copy while creating an array as requested" not in str(e):
+                    raise
+                # Fallback: recompute the task's GLUE metric locally with numpy/sklearn/scipy,
+                # using the same result keys as the GLUE metric.
+                preds_np = np.asarray(preds)
+                labels_np = np.asarray(labels)
+                if data_args.task_name in {"sst2", "mnli", "mnli_matched", "mnli_mismatched", "qnli", "rte", "wnli", "hans"}:
+                    result = {"accuracy": float((preds_np == labels_np).mean())}
+                elif data_args.task_name in {"mrpc", "qqp"}:
+                    result = {"accuracy": float((preds_np == labels_np).mean()), "f1": float(f1_score(labels_np, preds_np))}
+                elif data_args.task_name == "cola":
+                    result = {"matthews_correlation": float(matthews_corrcoef(labels_np, preds_np))}
+                elif data_args.task_name == "stsb":
+                    # Distinct names so the `p` (EvalPrediction) argument is not shadowed.
+                    pred_scores = np.asarray(preds_np, dtype=np.float64)
+                    ref_scores = np.asarray(labels_np, dtype=np.float64)
+                    result = {"pearson": float(pearsonr(pred_scores, ref_scores)[0]), "spearmanr": float(spearmanr(pred_scores, ref_scores)[0])}
+                else:
+                    # No local implementation for this task: re-raise the original ValueError.
+                    raise
             if len(result) > 1:
                 result["combined_score"] = np.mean(list(result.values())).item()
             return result
@@ -162,14 +191,17 @@ def compute_metrics(p: EvalPrediction):
     if getattr(model, 'hidden_mask', None) is not None:
         model.hidden_mask = model.hidden_mask.to(training_args.device)
     
-    flops, macs, params = get_model_profile(
-        model,
-        kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()},
-        print_profile=True,
-        detailed=True,
-        output_file=os.path.join(training_args.output_dir, 'pretrain_deepspeed_profile.txt'),
-    )
-    torch.cuda.reset_peak_memory_stats()
+    if get_model_profile is not None:
+        flops, macs, params = get_model_profile(
+            model,
+            kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()},
+            print_profile=True,
+            detailed=True,
+            output_file=os.path.join(training_args.output_dir, 'pretrain_deepspeed_profile.txt'),
+        )
+        torch.cuda.reset_peak_memory_stats()
+    else:
+        logger.warning("deepspeed is not installed; skipping pre-training FLOPs/MACs profiling.")
     
     seq_len = 170 if IS_SQUAD else avg_seq_length(data_args.task_name)
     training_args.task_name = data_args.task_name
@@ -226,14 +258,15 @@ def compute_metrics(p: EvalPrediction):
         if isinstance(module, LoRALayer):
             module.eval()
 
-    flops, macs, params = get_model_profile(
-        model,
-        kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()},        print_profile=True,
-        detailed=True,
-        output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'),
-    )
-    efficiency_results['model_flops'] = flops
-    efficiency_results['model_macs'] = macs
+    if get_model_profile is not None:
+        flops, macs, params = get_model_profile(
+            model,
+            kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()},        print_profile=True,
+            detailed=True,
+            output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'),
+        )
+        efficiency_results['model_flops'] = flops
+        efficiency_results['model_macs'] = macs
     
     json.dump(efficiency_results, open(os.path.join(training_args.output_dir, 'efficiency_results.json'), 'w'), indent=4, sort_keys=True)
     run_report = gen_run_report(training_args.output_dir)
@@ -253,4 +286,4 @@ def compute_metrics(p: EvalPrediction):
         logger.info(f"Parameter variables not changed after pruning: {same_vars}")
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git a/run_pruning.py b/run_pruning.py
index 5d1cbf5..005884b 100644
--- a/run_pruning.py
+++ b/run_pruning.py
@@ -4,7 +4,7 @@
 import torch
 
 from tqdm import tqdm
-from datasets import load_metric
+import evaluate
 from utils import avg_seq_length
 from transformers import HfArgumentParser, TrainingArguments, DataCollatorWithPadding
 from torch.utils.data import DataLoader, RandomSampler
@@ -81,9 +81,9 @@ def test(model, eval_dataloader, head_mask, intermediate_mask, metric, data_args
 
     # Get the metric function
     if data_args.task_name is not None:
-        metric = load_metric("glue", data_args.task_name)
+        metric = evaluate.load("glue", data_args.task_name)
     else:
-        metric = load_metric("accuracy")
+        metric = evaluate.load("accuracy")
 
     
     if prune_mode == 'random':
@@ -170,4 +170,4 @@ def test(model, eval_dataloader, head_mask, intermediate_mask, metric, data_args
     # pre-pruning eval_results: {'accuracy': 0.84625}
     # cofi eval_results: {'accuracy': 0.8052}
         
-    json.dump(accuracy_by_ratios, open(os.path.join(training_args.output_dir, '%s_mask_%s_%s_nolora.json' % (prune_mode, mask_mode, test_mode)), 'w'))
\ No newline at end of file
+    json.dump(accuracy_by_ratios, open(os.path.join(training_args.output_dir, '%s_mask_%s_%s_nolora.json' % (prune_mode, mask_mode, test_mode)), 'w'))
diff --git a/scripts/adaptpruning/roberta_base_mrpc_momentum.sh b/scripts/adaptpruning/roberta_base_mrpc_momentum.sh
index 0930f29..e335f32 100644
--- a/scripts/adaptpruning/roberta_base_mrpc_momentum.sh
+++ b/scripts/adaptpruning/roberta_base_mrpc_momentum.sh
@@ -55,7 +55,7 @@ pre_pruning_tuning_steps=200
 sparsity_warmup_epochs=4
 
 learning_rate=2e-4
-training_batch_size=32
+training_batch_size=16
 num_train_epochs=120
 warmup_param_tuning_config=q:0-11,v:0-11,i:0-11
 teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
diff --git a/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh b/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh
new file mode 100644
index 0000000..55ef5fa
--- /dev/null
+++ b/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=32G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=300:00:00             # Walltime (hh:mm:ss)
+
+# Positional args (all eight or none): mac_constraint lora_r pruning_start
+#   pruning_scheduler pruner_type param_allocation_strategy
+#   distillation_type distill_mapping_strategy [gpu_id]
+# Any other argument count is rejected instead of running with unset values.
+if [ "$#" -eq 0 ]; then
+    mac_constraint=0.4
+    lora_r=8
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+    distillation_type=self_momentum
+    distill_mapping_strategy=dynamic_block_teacher_dynamic_student
+elif [ "$#" -eq 8 ] || [ "$#" -eq 9 ]; then
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    distillation_type=$7
+    distill_mapping_strategy=$8
+    if [ "$#" -eq 9 ]; then
+        gpu_id=$9
+        export CUDA_VISIBLE_DEVICES="$gpu_id"
+    fi
+else
+    echo "usage: $0 [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy distillation_type distill_mapping_strategy [gpu_id]]" >&2
+    exit 1
+fi
+
+model_name=roberta-base  # fixed (non-CLI) experiment settings for this run follow
+param_resizing_strategy=tophalf_limited
+task_name=mrpc
+adapter_type=lora  # NOTE(review): not referenced anywhere else in this script
+pruning_start=-1  # NOTE(review): overwrites the pruning_start chosen by the argument parsing above - confirm this is intended
+pruning_stop=24
+distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated (NOTE(review): looks stale - value is -1 and it is not passed to the command below)
+distill_epoch=96  # only used in the output directory name
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=4
+
+learning_rate=2e-4
+training_batch_size=16  # used for both the train and eval per-device batch size
+num_train_epochs=120
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # only used in the output directory name
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # LoRA alpha is fixed at twice the rank
+
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning/mac${mac_constraint}/epoch${num_train_epochs}/nodistill_epoch${distill_epoch}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/distill_${distillation_type}/distill_mapping_${distill_mapping_strategy}"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 128 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --tf32 True \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0.1\
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --do_virtual_prune \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r $(($lora_r * 8)) \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh
new file mode 100644
index 0000000..88d4b82
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=64G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
+
+# Positional args (all six or none): mac_constraint lora_r pruning_start
+#   pruning_scheduler pruner_type param_allocation_strategy [gpu_id]
+if [ "$#" -eq 0 ]; then
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    if [ "$#" -eq 7 ]; then
+        gpu_id=$7
+        export CUDA_VISIBLE_DEVICES="$gpu_id"
+    fi
+else
+    echo "usage: $0 [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]" >&2
+    exit 1
+fi
+
+model_name=roberta-base  # fixed (non-CLI) experiment settings for this run follow
+param_resizing_strategy=tophalf_limited
+task_name=cola
+adapter_type=lora  # NOTE(review): not referenced anywhere else in this script
+pruning_start=-1  # NOTE(review): overwrites the pruning_start chosen by the argument parsing above - confirm this is intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4
+training_batch_size=16  # used for both the train and eval per-device batch size
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # NOTE(review): not referenced anywhere else in this script
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # LoRA alpha is fixed at twice the rank
+
+output_dir="output/${model_name}/${task_name}"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 512 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0 \
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh
new file mode 100644
index 0000000..ccd6950
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=64G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
+
+# Positional args (all six or none): mac_constraint lora_r pruning_start
+#   pruning_scheduler pruner_type param_allocation_strategy [gpu_id]
+if [ "$#" -eq 0 ]; then
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    if [ "$#" -eq 7 ]; then
+        gpu_id=$7
+        export CUDA_VISIBLE_DEVICES="$gpu_id"
+    fi
+else
+    echo "usage: $0 [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]" >&2
+    exit 1
+fi
+
+model_name=roberta-base  # fixed (non-CLI) experiment settings for this run follow
+param_resizing_strategy=tophalf_limited
+task_name=mrpc
+adapter_type=lora  # NOTE(review): not referenced anywhere else in this script
+pruning_start=-1  # NOTE(review): overwrites the pruning_start chosen by the argument parsing above - confirm this is intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4 # TODO: check 1e-3
+training_batch_size=16  # used for both the train and eval per-device batch size
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # only used in the output directory name
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # LoRA alpha is fixed at twice the rank
+
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 512 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0 \
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh
new file mode 100644
index 0000000..c8fd79f
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=64G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
+
+# Positional args (all six or none): mac_constraint lora_r pruning_start
+#   pruning_scheduler pruner_type param_allocation_strategy [gpu_id]
+if [ "$#" -eq 0 ]; then
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ] || [ "$#" -eq 7 ]; then
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    if [ "$#" -eq 7 ]; then
+        gpu_id=$7
+        export CUDA_VISIBLE_DEVICES="$gpu_id"
+    fi
+else
+    echo "usage: $0 [mac_constraint lora_r pruning_start pruning_scheduler pruner_type param_allocation_strategy [gpu_id]]" >&2
+    exit 1
+fi
+
+model_name=roberta-base  # fixed (non-CLI) experiment settings for this run follow
+param_resizing_strategy=tophalf_limited
+task_name=rte
+adapter_type=lora  # NOTE(review): not referenced anywhere else in this script
+pruning_start=-1  # NOTE(review): overwrites the pruning_start chosen by the argument parsing above - confirm this is intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4
+training_batch_size=16  # used for both the train and eval per-device batch size
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # NOTE(review): not referenced anywhere else in this script
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # LoRA alpha is fixed at twice the rank
+
+output_dir="output/${model_name}/${task_name}"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 512 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0 \
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh
new file mode 100644
index 0000000..3b64734
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=64G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
+
+if [ "$#" -eq 0 ]; then  # no arguments: fall back to the built-in defaults
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ]; then  # six positional arguments replace the defaults
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+elif [ "$#" -eq 7 ]; then  # same six arguments, plus a GPU id as the seventh
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    gpu_id=$7  # seventh argument; $8 and $9 are empty when exactly 7 are given
+    export CUDA_VISIBLE_DEVICES="$gpu_id"
+fi
+
+model_name=roberta-base
+param_resizing_strategy=tophalf_limited
+task_name=sst2
+adapter_type=lora  # NOTE(review): not referenced anywhere later in this script
+pruning_start=-1  # NOTE(review): unconditionally replaces the pruning_start taken from the positional arguments above; confirm this is intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4
+training_batch_size=16  # passed as both the per-device train and eval batch size
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # only used to build the output_dir path; not passed to the training command
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # alpha is always twice the LoRA rank
+
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 512 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0 \
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh
new file mode 100644
index 0000000..99a34c6
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1                  # Number of nodes
+#SBATCH --ntasks-per-node=1        # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8          # Number of CPU cores per task
+#SBATCH --mem=64G                 # Memory per node (total memory)
+#SBATCH --gres=gpu:1               # Number of GPUs requested
+#SBATCH --time=400:00:00             # Walltime (hh:mm:ss)
+
+if [ "$#" -eq 0 ]; then  # no arguments: fall back to the built-in defaults
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ]; then  # six positional arguments replace the defaults
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+elif [ "$#" -eq 7 ]; then  # same six arguments, plus a GPU id as the seventh
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    gpu_id=$7  # seventh argument; $8 and $9 are empty when exactly 7 are given
+    export CUDA_VISIBLE_DEVICES="$gpu_id"
+fi
+
+model_name=roberta-base
+param_resizing_strategy=tophalf_limited
+task_name=stsb
+adapter_type=lora  # NOTE(review): not referenced anywhere later in this script
+pruning_start=-1  # NOTE(review): unconditionally replaces the pruning_start taken from the positional arguments above; confirm this is intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4
+training_batch_size=16  # passed as both the per-device train and eval batch size
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11  # only used to build the output_dir path; not passed to the training command
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2))  # alpha is always twice the LoRA rank
+
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/"
+echo $output_dir
+mkdir -p $output_dir
+
+python run_minus_training.py \
+    --output_dir ${output_dir}\
+    --task_name ${task_name} \
+    --model_name_or_path ${model_name} \
+    --do_train \
+    --do_eval \
+    --save_strategy no \
+    --evaluation_strategy steps \
+    --logging_strategy steps \
+    --logging_steps 100 \
+    --log_level info \
+    --log_level_replica info \
+    --eval_steps 500 \
+    --max_seq_length 512 \
+    --num_train_epochs ${num_train_epochs} \
+    --per_device_train_batch_size ${training_batch_size} \
+    --per_device_eval_batch_size ${training_batch_size} \
+    --lr_scheduler_type linear\
+    --warmup_ratio 0.06\
+    --learning_rate ${learning_rate}\
+    --weight_decay 0 \
+    --seed 128 \
+    --apply_lora \
+    --lora_alpha ${lora_alpha} \
+    --lora_r ${lora_r} \
+    --report_to none \
+    --pruning_batches ${pruning_batches} \
+    --pruning_batch_size ${pruning_batch_size} \
+    --mac_constraint ${mac_constraint} \
+    --pruning_scheduler ${pruning_scheduler} \
+    --sparsity_warmup_epochs ${sparsity_warmup_epochs} \
+    --param_allocation_strategy ${param_allocation_strategy} \
+    --teacher_param_tuning_config ${teacher_param_tuning_config} \
+    --student_param_tuning_config ${student_param_tuning_config} \
+    --head_scorer_type gradient_l1 \
+    --intermediate_scorer_type gradient_l1 \
+    --pruner_type ${pruner_type} \
+    --pruning_start ${pruning_start} \
+    --pruning_stop ${pruning_stop} \
+    --num_prunings ${num_prunings} \
+    --pruning_scheduler_strategy saliency \
+    --collect_salience \
+    --salience_collecting_start 200 \
+    --salience_collecting_end -1 \
+    --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/trainer/allocation_strategy.py b/trainer/allocation_strategy.py
index 37daa85..c2adc0e 100644
--- a/trainer/allocation_strategy.py
+++ b/trainer/allocation_strategy.py
@@ -2,7 +2,7 @@
 
 from typing import Dict, List
 from prune.fisher import collect_grads_by_suffix
-from ortools.algorithms import pywrapknapsack_solver
+# from ortools.algorithms import pywrapknapsack_solver
 
 def binary_knapsack_search(values_tensor: torch.Tensor, weights_tensor: torch.Tensor, capacities: List[int]) -> torch.Tensor:
     sorted_values, sorted_indices = torch.sort(values_tensor, descending=True)
diff --git a/trainer/model_arch.py b/trainer/model_arch.py
index 13ae305..ab9cdd8 100644
--- a/trainer/model_arch.py
+++ b/trainer/model_arch.py
@@ -1,5 +1,5 @@
 import torch
-from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
+from transformers import PreTrainedModel, PretrainedConfig
 
 NAME2TEMPLATE = {
     'query': '.encoder.layer.%d.attention.self.query',
@@ -369,4 +369,4 @@ def get_layer(self, i, k):
         k_attr = NAME2ATTR[k][self.model_category]
         parent_layer = self.get_parent_layer(i, k)
         layer = getattr(parent_layer, k_attr)
-        return layer
\ No newline at end of file
+        return layer
diff --git a/trainer/trainer_minus.py b/trainer/trainer_minus.py
index c4d4d9a..796007d 100644
--- a/trainer/trainer_minus.py
+++ b/trainer/trainer_minus.py
@@ -19,23 +19,30 @@
 from args import MinusTrainingArguments
 from transformers import __version__
 from transformers import Trainer
-from transformers.trainer import unwrap_model, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+try:
+    from transformers.trainer import unwrap_model, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+except ImportError:
+    from transformers.modeling_utils import unwrap_model
+    from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from transformers.optimization import get_scheduler
 from torch.utils.data import DataLoader, Subset, IterableDataset
 from torch.utils.data.dataset import Dataset
 from torch.utils.data.distributed import DistributedSampler
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR
-from transformers.modeling_utils import PreTrainedModel
+from transformers import PreTrainedModel
 from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Callable
 from torch.utils.data.dataset import Dataset
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.data.data_collator import DataCollator
 from transformers.trainer_utils import EvalPrediction, TrainOutput, set_seed, get_last_checkpoint, speed_metrics, EvalLoopOutput, denumpify_detensorize
 from transformers.trainer_pt_utils import nested_concat, nested_numpify, nested_truncate, IterableDatasetShard, find_batch_size, nested_detach
-from transformers.file_utils import is_torch_tpu_available, WEIGHTS_NAME, CONFIG_NAME
+try:
+    from transformers.utils import WEIGHTS_NAME, CONFIG_NAME
+except ImportError:
+    from transformers.file_utils import WEIGHTS_NAME, CONFIG_NAME
 from transformers.trainer_callback import TrainerState
-from transformers.configuration_utils import PretrainedConfig
+from transformers import PretrainedConfig
 from prune import AdapterPruner, build_scorer, build_pruner
 from utils.minus_utils import count_params, prune_layer, to_cpu_recursive, lora_to_prunelora
 from utils.fisher_utils.efficiency.param import *
@@ -391,9 +398,11 @@ def __init__(
                 if args.half_precision_backend == "cuda_amp":
                     self.use_cuda_amp = True
                     self.amp_dtype = torch.float16 if args.fp16 else torch.bfloat16
+                    self.use_cpu_amp = False
                 elif args.half_precision_backend == "cpu_amp":
                     self.use_cpu_amp = True
                     self.amp_dtype = torch.bfloat16
+                    self.use_cuda_amp = False
                     
         logger.info("Half precision backend: " + args.half_precision_backend)
         logger.info("Half precision dtype: " + str(getattr(self, 'amp_dtype', None)))
@@ -964,7 +973,7 @@ def train(
         # Calculate the total model parameters before any pruning conducted
         self.n_params, self.n_param_vars = count_params(self.model, mode='main') # Exclude LoRA layers when counting parameters for later pruning usage
         # Logging mixed-precision status
-        logger.info("Use cpu_amp %s; use cuda_amp %s; mixed-precision dtype: %s" % (self.use_cpu_amp, self.use_cuda_amp, getattr(self, 'amp_dtype', None)))
+        # logger.info("Use cpu_amp %s; use cuda_amp %s; mixed-precision dtype: %s" % (self.use_cpu_amp, self.use_cuda_amp, getattr(self, 'amp_dtype', None)))
         
         if self.model.config.apply_lora:
             logger.info("Using PEFT with LoRA. Disabling grad for all non-teacher-learning layers.")
@@ -1473,7 +1482,7 @@ def train(
 
                 if (
                     args.logging_nan_inf_filter
-                    and not is_torch_tpu_available()
+                    # and not is_torch_tpu_available()
                     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                 ):
                     # if loss is nan or inf simply add the average of previous logged losses
@@ -2320,4 +2329,4 @@ def calculate_distillation_loss(self, teacher_outputs, student_outputs):
             if distill_loss is not None:
                 loss += self.args.distill_loss_alpha * distill_loss
 
-            return distill_loss, ce_distill_loss, loss
\ No newline at end of file
+            return distill_loss, ce_distill_loss, loss
diff --git a/utils/__init__.py b/utils/__init__.py
index 01a7a68..a412e61 100644
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -3,10 +3,12 @@
 from trainer.trainer_seq2seq_minus import MinusSeq2SeqTrainer
 from args import Seq2SeqDataTrainingArguments
 from utils.utils import *
-from datasets import load_metric
+# from datasets import load_metric
 from torch.utils.data import DataLoader
 from utils.qa_utils import postprocess_qa_predictions
 from dataclasses import dataclass
+import evaluate
+
 
 IGNORE_INDEX = -100
 GLUE_TASKS = set(["cola", "sst2", "mrpc", "stsb", "qqp", "mnli", "qnli", "rte"])
@@ -70,7 +72,7 @@ def build_trainer(data_args, training_args, model, tokenizer, train_dataset=None
     elif data_args.task_name is not None and data_args.task_name in GLUE_TASKS:
         label2id = model.label2id if hasattr(model, 'label2id') else None
         # Get the metric function
-        metric = load_metric("glue", data_args.task_name)
+        metric = evaluate.load("glue", data_args.task_name)
         # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
         # predictions and label_ids field) and has to return a dictionary string to float.
         def compute_metrics(p: EvalPrediction):
diff --git a/utils/fisher_utils/linalg.py b/utils/fisher_utils/linalg.py
index 4d1197a..febdff9 100644
--- a/utils/fisher_utils/linalg.py
+++ b/utils/fisher_utils/linalg.py
@@ -1,6 +1,14 @@
 import torch
-import cupy
-from cupyx.scipy.sparse.linalg import lsmr
+import numpy as np
+try:
+    import cupy
+    from cupyx.scipy.sparse.linalg import lsmr as cupy_lsmr
+    HAS_CUPY = True
+except ImportError:
+    cupy = None
+    cupy_lsmr = None
+    HAS_CUPY = False
+from scipy.sparse.linalg import lsmr as scipy_lsmr
 
 
 @torch.no_grad()
@@ -18,11 +26,18 @@ def lsmr_cupy_solver(A, B):
     B = B - A.sum(dim=1)
     if B.shape[0] == 1:
         X = B / A[0, 0]
+        solution = (None, 0)
     else:
-        CU_A = cupy.asarray(A.cpu().numpy())
-        CU_B = cupy.asarray(B.cpu().numpy())
-        solution = lsmr(CU_A, CU_B, damp=1)
-        X = cupy.asnumpy(solution[0])
+        np_A = A.cpu().numpy()
+        np_B = B.cpu().numpy()
+        if HAS_CUPY:
+            CU_A = cupy.asarray(np_A)
+            CU_B = cupy.asarray(np_B)
+            solution = cupy_lsmr(CU_A, CU_B, damp=1)
+            X = cupy.asnumpy(solution[0])
+        else:
+            solution = scipy_lsmr(np_A, np_B, damp=1)
+            X = np.asarray(solution[0])
         X = torch.from_numpy(X).to(A.device)
     X = X + 1
     return X, solution[1] < 3
diff --git a/utils/minus_utils.py b/utils/minus_utils.py
index 91cac44..397a30a 100644
--- a/utils/minus_utils.py
+++ b/utils/minus_utils.py
@@ -1176,5 +1176,6 @@ def kurtosis(a: torch.Tensor, axis: int = 0, fisher: bool = True, bias: bool = T
 
     # Compute kurtosis
     kurt = torch.mean(zscores.pow(4), axis, keepdim=True).squeeze()
+    print(f"KURTOSIS: {kurt.size()}")
 
     return kurt - 3 if fisher else kurt
\ No newline at end of file
diff --git a/utils/utils.py b/utils/utils.py
index aa6a726..1535f01 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -7,7 +7,7 @@
 from transformers import PretrainedConfig
 from datasets import load_dataset, DatasetDict
 from typing import Sequence, Dict
-from utils import alpaca_utils
+# from utils import alpaca_utils
 from dataclasses import dataclass, field
 import logging
 import transformers