diff --git a/models/__init__.py b/models/__init__.py index 5ab9af8..db37f54 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -8,7 +8,7 @@ from utils.utils import get_label from utils.minus_utils import model_layer_switch, lora_to_linear, lora_to_distill, lora_to_prunelora, linear_to_lora from trainer.model_arch import get_ffn1, get_mha_proj -from utils.alpaca_utils import smart_tokenizer_and_embedding_resize +# from utils.alpaca_utils import smart_tokenizer_and_embedding_resize from .modeling_bert import CoFiBertForSequenceClassification, AdaPBertForQuestionAnswering # from .modeling_roberta_backup import CoFiRobertaForSequenceClassification, NewRobertaForQuestionAnswering from .modeling_roberta import CoFiRobertaForSequenceClassification, NewRobertaForQuestionAnswering diff --git a/models/modeling_mt5.py b/models/modeling_mt5.py index 4c80629..d84f598 100644 --- a/models/modeling_mt5.py +++ b/models/modeling_mt5.py @@ -7,7 +7,10 @@ from dataclasses import dataclass from typing import Union -from transformers.file_utils import ModelOutput +try: + from transformers.utils import ModelOutput +except ImportError: + from transformers.file_utils import ModelOutput from transformers.models.mt5.modeling_mt5 import ( MT5ForConditionalGeneration, @@ -1763,4 +1766,4 @@ def custom_forward(*inputs): hidden_states=all_hidden_states, attentions=all_attentions, cross_attentions=all_cross_attentions, - ) \ No newline at end of file + ) diff --git a/models/modeling_outputs.py b/models/modeling_outputs.py index 4c8f096..70a00e4 100644 --- a/models/modeling_outputs.py +++ b/models/modeling_outputs.py @@ -3,7 +3,10 @@ from dataclasses import dataclass from typing import Optional, Tuple -from transformers.file_utils import ModelOutput +try: + from transformers.utils import ModelOutput +except ImportError: + from transformers.file_utils import ModelOutput from transformers.modeling_outputs import SequenceClassifierOutput, QuestionAnsweringModelOutput, 
BaseModelOutputWithPastAndCrossAttentions @@ -40,4 +43,4 @@ class NewSequenceClassifierOutput(SequenceClassifierOutput): masked_loss: Optional[torch.FloatTensor] = None class AdaPBaseModelOutputWithPastAndCrossAttentions(BaseModelOutputWithPastAndCrossAttentions): - masked_hidden_states: torch.FloatTensor = None \ No newline at end of file + masked_hidden_states: torch.FloatTensor = None diff --git a/requirements.txt b/requirements.txt index 36f0633..ebe0a51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ -datasets==2.10.0 -deepspeed==0.8.0 -matplotlib==3.7.1 -numpy==1.24.3 -ortools==9.6.2534 -pandas==1.5.2 -scikit_learn==1.1.3 -scipy==1.10.1 -seaborn==0.12.2 -tqdm==4.65.0 -transformers==4.28.1 -nltk==3.8.1 -rouge-score==0.1.2 -torch==1.10.2+cu113 ---extra-index-url https://download.pytorch.org/whl/cu113 \ No newline at end of file +torch>=2.2 +transformers==4.45.0 +accelerate>=1.0 +datasets>=3.0 +evaluate>=0.4.3 +deepspeed>=0.16 +tokenizers>=0.20 +numpy>=1.26 +pandas>=2.2 +scikit-learn>=1.5 +scipy>=1.13 +matplotlib>=3.9 +seaborn>=0.13 +tqdm>=4.66 +nltk>=3.9 +rouge-score>=0.1.2 +ortools>=9.10 diff --git a/run_minus_seq2seq_training.py b/run_minus_seq2seq_training.py index cba2f6a..a9b84c8 100644 --- a/run_minus_seq2seq_training.py +++ b/run_minus_seq2seq_training.py @@ -15,7 +15,7 @@ from transformers import (HfArgumentParser, EvalPrediction, DataCollatorForSeq2Seq, set_seed) from torch.nn.utils.rnn import pad_sequence from deepspeed.profiling.flops_profiler import get_model_profile -from datasets import load_metric +import evaluate from models.model_args import ModelArguments from utils.utils import * from utils.minus_utils import efficiency_testing, input_constructor, compare_parameters @@ -109,7 +109,7 @@ def main(): model.hidden_mask = model.hidden_mask.to(training_args.device) if 'wmt' in task_name: - metric = load_metric("sacrebleu") + metric = evaluate.load("sacrebleu") gen_prefix = "eval" def postprocess_text(preds, labels): @@ -140,7 
+140,7 @@ def compute_metrics(eval_preds): result = {k: round(v, 4) for k, v in result.items()} return result else: - metric = load_metric("rouge") + metric = evaluate.load("rouge") def compute_metrics(eval_pred): predictions, labels = eval_pred decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) @@ -269,4 +269,4 @@ def compute_metrics(eval_pred): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/run_minus_squad_training.py b/run_minus_squad_training.py index 192cb98..7826a7a 100644 --- a/run_minus_squad_training.py +++ b/run_minus_squad_training.py @@ -12,7 +12,7 @@ from transformers import (HfArgumentParser, EvalPrediction, default_data_collator, set_seed) from deepspeed.profiling.flops_profiler import get_model_profile -from datasets import load_metric +import evaluate from transformers import SquadDataTrainingArguments from models.model_args import ModelArguments from utils.utils import * @@ -153,9 +153,9 @@ def post_processing_function(examples, features, predictions): # Get the metric function if IS_SQUAD_V2: - metric = load_metric("squad_v2") + metric = evaluate.load("squad_v2") else: - metric = load_metric("squad") + metric = evaluate.load("squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) @@ -248,4 +248,4 @@ def compute_metrics(p: EvalPrediction): logger.info(f"Parameter variables not changed after pruning: {same_vars}") if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/run_minus_training.py b/run_minus_training.py index 7082c10..6b2f3ed 100644 --- a/run_minus_training.py +++ b/run_minus_training.py @@ -1,3 +1,4 @@ +import torch import os import json os.environ["WANDB_DISABLED"] = "true" @@ -9,11 +10,15 @@ torch.backends.cudnn.allow_tf32 = True import time transformers.logging.set_verbosity_error() - - +import evaluate +from sklearn.metrics import f1_score, matthews_corrcoef +from 
scipy.stats import pearsonr, spearmanr from transformers import (HfArgumentParser, EvalPrediction, default_data_collator, DataCollatorWithPadding, set_seed) -from deepspeed.profiling.flops_profiler import get_model_profile -from datasets import load_metric +try: + from deepspeed.profiling.flops_profiler import get_model_profile +except ImportError: + get_model_profile = None +# from datasets import load_metric from args import DataTrainingArguments from models.model_args import ModelArguments from utils.utils import * @@ -95,9 +100,9 @@ def main(): # Get the metric function if data_args.task_name is not None: - metric = load_metric("glue", data_args.task_name, experiment_id='elastictuning' + data_args.task_name + str(time.time())) + metric = evaluate.load("glue", data_args.task_name, experiment_id='elastictuning' + data_args.task_name + str(time.time())) else: - metric = load_metric("accuracy", experiment_id='elastictuning' + data_args.task_name + str(time.time())) + metric = evaluate.load("accuracy", experiment_id='elastictuning' + (data_args.task_name or '') + str(time.time())) # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. 
def compute_metrics(p: EvalPrediction): @@ -115,7 +120,31 @@ def compute_metrics(p: EvalPrediction): preds = list(map(lambda x: label2id[tuple(x)] if tuple(x) in label2id else -1, preds.tolist())) labels = list(map(lambda x: label2id[tuple(x)], labels.tolist())) if data_args.task_name is not None: - result = metric.compute(predictions=preds, references=labels) + try: + result = metric.compute(predictions=preds, references=labels) + except ValueError as e: + if "Unable to avoid copy while creating an array as requested" not in str(e): + raise + preds_np = np.asarray(preds) + labels_np = np.asarray(labels) + if data_args.task_name in {"sst2", "mnli", "qnli", "rte"}: + result = {"accuracy": float((preds_np == labels_np).mean())} + elif data_args.task_name in {"mrpc", "qqp"}: + result = { + "accuracy": float((preds_np == labels_np).mean()), + "f1": float(f1_score(labels_np, preds_np)), + } + elif data_args.task_name == "cola": + result = {"matthews_correlation": float(matthews_corrcoef(labels_np, preds_np))} + elif data_args.task_name == "stsb": + p = np.asarray(preds_np, dtype=np.float64) + y = np.asarray(labels_np, dtype=np.float64) + result = { + "pearson": float(pearsonr(p, y)[0]), + "spearmanr": float(spearmanr(p, y)[0]), + } + else: + raise if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result @@ -162,14 +191,17 @@ def compute_metrics(p: EvalPrediction): if getattr(model, 'hidden_mask', None) is not None: model.hidden_mask = model.hidden_mask.to(training_args.device) - flops, macs, params = get_model_profile( - model, - kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()}, - print_profile=True, - detailed=True, - 
output_file=os.path.join(training_args.output_dir, 'pretrain_deepspeed_profile.txt'), - ) - torch.cuda.reset_peak_memory_stats() + if get_model_profile is not None: + flops, macs, params = get_model_profile( + model, + kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()}, + print_profile=True, + detailed=True, + output_file=os.path.join(training_args.output_dir, 'pretrain_deepspeed_profile.txt'), + ) + torch.cuda.reset_peak_memory_stats() + else: + logger.warning("deepspeed is not installed; skipping pre-training FLOPs/MACs profiling.") seq_len = 170 if IS_SQUAD else avg_seq_length(data_args.task_name) training_args.task_name = data_args.task_name @@ -226,14 +258,15 @@ def compute_metrics(p: EvalPrediction): if isinstance(module, LoRALayer): module.eval() - flops, macs, params = get_model_profile( - model, - kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()}, print_profile=True, - detailed=True, - output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'), - ) - efficiency_results['model_flops'] = flops - efficiency_results['model_macs'] = macs + if get_model_profile is not None: + flops, macs, params = get_model_profile( + model, + kwargs={k: v.to(model.device) for k, v in input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer, output_seq_len=2).items()} if MODEL_GENERATIVE else {k: v.to(model.device) for k, v in 
input_constructor(training_args.per_device_eval_batch_size, data_args.max_seq_length, tokenizer).items()}, print_profile=True, + detailed=True, + output_file=os.path.join(training_args.output_dir, 'deepspeed_profile.txt'), + ) + efficiency_results['model_flops'] = flops + efficiency_results['model_macs'] = macs json.dump(efficiency_results, open(os.path.join(training_args.output_dir, 'efficiency_results.json'), 'w'), indent=4, sort_keys=True) run_report = gen_run_report(training_args.output_dir) @@ -253,4 +286,4 @@ def compute_metrics(p: EvalPrediction): logger.info(f"Parameter variables not changed after pruning: {same_vars}") if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/run_pruning.py b/run_pruning.py index 5d1cbf5..005884b 100644 --- a/run_pruning.py +++ b/run_pruning.py @@ -4,7 +4,7 @@ import torch from tqdm import tqdm -from datasets import load_metric +import evaluate from utils import avg_seq_length from transformers import HfArgumentParser, TrainingArguments, DataCollatorWithPadding from torch.utils.data import DataLoader, RandomSampler @@ -81,9 +81,9 @@ def test(model, eval_dataloader, head_mask, intermediate_mask, metric, data_args # Get the metric function if data_args.task_name is not None: - metric = load_metric("glue", data_args.task_name) + metric = evaluate.load("glue", data_args.task_name) else: - metric = load_metric("accuracy") + metric = evaluate.load("accuracy") if prune_mode == 'random': @@ -170,4 +170,4 @@ def test(model, eval_dataloader, head_mask, intermediate_mask, metric, data_args # pre-pruning eval_results: {'accuracy': 0.84625} # cofi eval_results: {'accuracy': 0.8052} - json.dump(accuracy_by_ratios, open(os.path.join(training_args.output_dir, '%s_mask_%s_%s_nolora.json' % (prune_mode, mask_mode, test_mode)), 'w')) \ No newline at end of file + json.dump(accuracy_by_ratios, open(os.path.join(training_args.output_dir, '%s_mask_%s_%s_nolora.json' % (prune_mode, mask_mode, test_mode)), 'w')) diff 
--git a/scripts/adaptpruning/roberta_base_mrpc_momentum.sh b/scripts/adaptpruning/roberta_base_mrpc_momentum.sh index 0930f29..e335f32 100644 --- a/scripts/adaptpruning/roberta_base_mrpc_momentum.sh +++ b/scripts/adaptpruning/roberta_base_mrpc_momentum.sh @@ -55,7 +55,7 @@ pre_pruning_tuning_steps=200 sparsity_warmup_epochs=4 learning_rate=2e-4 -training_batch_size=32 +training_batch_size=16 num_train_epochs=120 warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 diff --git a/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh b/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh new file mode 100644 index 0000000..55ef5fa --- /dev/null +++ b/scripts/adaptpruning/roberta_base_mrpc_momentum_no_distill.sh @@ -0,0 +1,121 @@ +#!/bin/bash +#SBATCH -p gpu-rtx6k +#SBATCH -A h2lab +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --mem=32G # Memory per node (total memory) +#SBATCH --gres=gpu:1 # Number of GPUs requested +#SBATCH --time=300:00:00 # Walltime (hh:mm:ss) + +if [ "$#" -eq 0 ]; then + mac_constraint=0.4 + lora_r=8 + pruning_start=-1 + pruning_scheduler=cubic_gradual + pruner_type=running_fisher + param_allocation_strategy=running_fisher + distillation_type=self_momentum + distill_mapping_strategy=dynamic_block_teacher_dynamic_student +elif [ "$#" -eq 8 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + distillation_type=$7 + distill_mapping_strategy=$8 +elif [ "$#" -eq 9 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + distillation_type=$7 + distill_mapping_strategy=$8 + gpu_id=$9 + export CUDA_VISIBLE_DEVICES=$gpu_id +fi + +model_name=roberta-base +param_resizing_strategy=tophalf_limited +task_name=mrpc 
+adapter_type=lora +pruning_start=-1 +pruning_stop=24 +distill_start=-1 # about 60%, between 3.4 and 3.8, but after 3.6 where the teacher is updated +distill_epoch=96 +pruning_batches=64 +num_prunings=8 +pruning_batch_size=4 +# pre_pruning_tuning_epochs=1 +pre_pruning_tuning_steps=200 +sparsity_warmup_epochs=4 + +learning_rate=2e-4 +training_batch_size=16 +num_train_epochs=120 +warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 +teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 +student_param_tuning_config=q:0-11,v:0-11,i:0-11 +lora_alpha=$(($lora_r * 2)) + +output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning/mac${mac_constraint}/epoch${num_train_epochs}/nodistill_epoch${distill_epoch}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/distill_${distillation_type}/distill_mapping_${distill_mapping_strategy}" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 128 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --tf32 True \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0.1\ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler 
${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type ${pruner_type} \ + --do_virtual_prune \ + --pruning_start ${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ + --mask_lr 0.01 \ + --grafting_top_k -1 \ + --param_resizing_strategy ${param_resizing_strategy} \ + --tuning_expanding_ratio 4.0 \ + --max_lora_r $(($lora_r * 8)) \ + | tee ${output_dir}/log.txt \ No newline at end of file diff --git a/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh new file mode 100644 index 0000000..88d4b82 --- /dev/null +++ b/scripts/adaptpruning_nodistill/roberta_base_cola_jonathan.sh @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -p gpu-rtx6k +#SBATCH -A h2lab +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --mem=64G # Memory per node (total memory) +#SBATCH --gres=gpu:1 # Number of GPUs requested +#SBATCH --time=400:00:00 # Walltime (hh:mm:ss) + +if [ "$#" -eq 0 ]; then + mac_constraint=0.4 + lora_r=2 + pruning_start=-1 + pruning_scheduler=cubic_gradual + pruner_type=running_fisher + param_allocation_strategy=running_fisher +elif [ "$#" -eq 6 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 +elif [ "$#" -eq 7 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + 
pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + gpu_id=$7 # GPU id is the 7th argument; $9 is unset in the 7-argument form + export CUDA_VISIBLE_DEVICES=$gpu_id +fi + +model_name=roberta-base +param_resizing_strategy=tophalf_limited +task_name=cola +adapter_type=lora +pruning_start=-1 +pruning_stop=20 +pruning_batches=64 +num_prunings=8 +pruning_batch_size=4 +# pre_pruning_tuning_epochs=1 +pre_pruning_tuning_steps=200 +sparsity_warmup_epochs=1 + +learning_rate=2e-4 +training_batch_size=16 +num_train_epochs=50 +warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 +teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 +student_param_tuning_config=q:0-11,v:0-11,i:0-11 +lora_alpha=$(($lora_r * 2)) + +output_dir="output/${model_name}/${task_name}" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 512 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0 \ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler ${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type 
${pruner_type} \ + --pruning_start ${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ + --mask_lr 0.01 \ + --grafting_top_k -1 \ + --param_resizing_strategy ${param_resizing_strategy} \ + --tuning_expanding_ratio 4.0 \ + --max_lora_r 4 \ + | tee ${output_dir}/log.txt \ No newline at end of file diff --git a/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh new file mode 100644 index 0000000..ccd6950 --- /dev/null +++ b/scripts/adaptpruning_nodistill/roberta_base_mrpc_jonathan.sh @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -p gpu-rtx6k +#SBATCH -A h2lab +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --mem=64G # Memory per node (total memory) +#SBATCH --gres=gpu:1 # Number of GPUs requested +#SBATCH --time=400:00:00 # Walltime (hh:mm:ss) + +if [ "$#" -eq 0 ]; then + mac_constraint=0.4 + lora_r=2 + pruning_start=-1 + pruning_scheduler=cubic_gradual + pruner_type=running_fisher + param_allocation_strategy=running_fisher +elif [ "$#" -eq 6 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 +elif [ "$#" -eq 7 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + gpu_id=$7 # GPU id is the 7th argument; $9 is unset in the 7-argument form + export CUDA_VISIBLE_DEVICES=$gpu_id +fi + +model_name=roberta-base +param_resizing_strategy=tophalf_limited +task_name=mrpc +adapter_type=lora +pruning_start=-1 +pruning_stop=20 +pruning_batches=64 +num_prunings=8 +pruning_batch_size=4 +# pre_pruning_tuning_epochs=1 +pre_pruning_tuning_steps=200 +sparsity_warmup_epochs=1 + 
+learning_rate=2e-4 #TODO: check 1e-3 +training_batch_size=16 +num_train_epochs=50 +warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 +teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 +student_param_tuning_config=q:0-11,v:0-11,i:0-11 +lora_alpha=$(($lora_r * 2)) + +output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 512 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0 \ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler ${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type ${pruner_type} \ + --pruning_start 
${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ + --mask_lr 0.01 \ + --grafting_top_k -1 \ + --param_resizing_strategy ${param_resizing_strategy} \ + --tuning_expanding_ratio 4.0 \ + --max_lora_r 4 \ + | tee ${output_dir}/log.txt \ No newline at end of file diff --git a/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh new file mode 100644 index 0000000..c8fd79f --- /dev/null +++ b/scripts/adaptpruning_nodistill/roberta_base_rte_jonathan.sh @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -p gpu-rtx6k +#SBATCH -A h2lab +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --mem=64G # Memory per node (total memory) +#SBATCH --gres=gpu:1 # Number of GPUs requested +#SBATCH --time=400:00:00 # Walltime (hh:mm:ss) + +if [ "$#" -eq 0 ]; then + mac_constraint=0.4 + lora_r=2 + pruning_start=-1 + pruning_scheduler=cubic_gradual + pruner_type=running_fisher + param_allocation_strategy=running_fisher +elif [ "$#" -eq 6 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 +elif [ "$#" -eq 7 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + gpu_id=$7 # GPU id is the 7th argument; $9 is unset in the 7-argument form + export CUDA_VISIBLE_DEVICES=$gpu_id +fi + +model_name=roberta-base +param_resizing_strategy=tophalf_limited +task_name=rte +adapter_type=lora +pruning_start=-1 +pruning_stop=20 +pruning_batches=64 +num_prunings=8 +pruning_batch_size=4 +# pre_pruning_tuning_epochs=1 +pre_pruning_tuning_steps=200 +sparsity_warmup_epochs=1 + +learning_rate=2e-4 
+training_batch_size=16 +num_train_epochs=50 +warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 +teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 +student_param_tuning_config=q:0-11,v:0-11,i:0-11 +lora_alpha=$(($lora_r * 2)) + +output_dir="output/${model_name}/${task_name}" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 512 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0 \ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler ${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type ${pruner_type} \ + --pruning_start ${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps ${pre_pruning_tuning_steps} \ + --mask_lr 0.01 \ + --grafting_top_k -1 \ + --param_resizing_strategy ${param_resizing_strategy} \ + 
--tuning_expanding_ratio 4.0 \ + --max_lora_r 4 \ + | tee ${output_dir}/log.txt \ No newline at end of file diff --git a/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh new file mode 100644 index 0000000..3b64734 --- /dev/null +++ b/scripts/adaptpruning_nodistill/roberta_base_sst2_jonathan.sh @@ -0,0 +1,111 @@ +#!/bin/bash +#SBATCH -p gpu-rtx6k +#SBATCH -A h2lab +#SBATCH --nodes=1 # Number of nodes +#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case) +#SBATCH --cpus-per-task=8 # Number of CPU cores per task +#SBATCH --mem=64G # Memory per node (total memory) +#SBATCH --gres=gpu:1 # Number of GPUs requested +#SBATCH --time=400:00:00 # Walltime (hh:mm:ss) + +if [ "$#" -eq 0 ]; then + mac_constraint=0.4 + lora_r=2 + pruning_start=-1 + pruning_scheduler=cubic_gradual + pruner_type=running_fisher + param_allocation_strategy=running_fisher +elif [ "$#" -eq 6 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 +elif [ "$#" -eq 7 ]; then + mac_constraint=$1 + lora_r=$2 + pruning_start=$3 + pruning_scheduler=$4 + pruner_type=$5 + param_allocation_strategy=$6 + gpu_id=$7 # GPU id is the 7th argument; $9 is unset in the 7-argument form + export CUDA_VISIBLE_DEVICES=$gpu_id +fi + +model_name=roberta-base +param_resizing_strategy=tophalf_limited +task_name=sst2 +adapter_type=lora +pruning_start=-1 +pruning_stop=20 +pruning_batches=64 +num_prunings=8 +pruning_batch_size=4 +# pre_pruning_tuning_epochs=1 +pre_pruning_tuning_steps=200 +sparsity_warmup_epochs=1 + +learning_rate=2e-4 +training_batch_size=16 +num_train_epochs=50 +warmup_param_tuning_config=q:0-11,v:0-11,i:0-11 +teacher_param_tuning_config=q:0-11,v:0-11,i:0-11 +student_param_tuning_config=q:0-11,v:0-11,i:0-11 +lora_alpha=$(($lora_r * 2)) + 
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 512 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0 \ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler ${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type ${pruner_type} \ + --pruning_start ${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps 
${pre_pruning_tuning_steps} \
+    --mask_lr 0.01 \
+    --grafting_top_k -1 \
+    --param_resizing_strategy ${param_resizing_strategy} \
+    --tuning_expanding_ratio 4.0 \
+    --max_lora_r 4 \
+    | tee ${output_dir}/log.txt
\ No newline at end of file
diff --git a/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh b/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh
new file mode 100644
index 0000000..99a34c6
--- /dev/null
+++ b/scripts/adaptpruning_nodistill/roberta_base_stsb_jonathan.sh
@@ -0,0 +1,111 @@
+#!/bin/bash
+#SBATCH -p gpu-rtx6k
+#SBATCH -A h2lab
+#SBATCH --nodes=1 # Number of nodes
+#SBATCH --ntasks-per-node=1 # Number of tasks per node (1 in this case)
+#SBATCH --cpus-per-task=8 # Number of CPU cores per task
+#SBATCH --mem=64G # Memory per node (total memory)
+#SBATCH --gres=gpu:1 # Number of GPUs requested
+#SBATCH --time=400:00:00 # Walltime (hh:mm:ss)
+
+if [ "$#" -eq 0 ]; then # no arguments: fall back to the built-in defaults below
+    mac_constraint=0.4
+    lora_r=2
+    pruning_start=-1
+    pruning_scheduler=cubic_gradual
+    pruner_type=running_fisher
+    param_allocation_strategy=running_fisher
+elif [ "$#" -eq 6 ]; then # six positional arguments replace the defaults
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+elif [ "$#" -eq 7 ]; then # same six arguments plus a GPU index as the seventh
+    mac_constraint=$1
+    lora_r=$2
+    pruning_start=$3
+    pruning_scheduler=$4
+    pruner_type=$5
+    param_allocation_strategy=$6
+    gpu_id=$7 # seventh argument; $9 is unset here and would export an empty device list
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+fi
+
+model_name=roberta-base
+param_resizing_strategy=tophalf_limited
+task_name=stsb
+adapter_type=lora
+pruning_start=-1 # NOTE(review): unconditionally overrides the pruning_start taken from $3 above - confirm intended
+pruning_stop=20
+pruning_batches=64
+num_prunings=8
+pruning_batch_size=4
+# pre_pruning_tuning_epochs=1
+pre_pruning_tuning_steps=200
+sparsity_warmup_epochs=1
+
+learning_rate=2e-4
+training_batch_size=16
+num_train_epochs=50
+warmup_param_tuning_config=q:0-11,v:0-11,i:0-11
+teacher_param_tuning_config=q:0-11,v:0-11,i:0-11
+student_param_tuning_config=q:0-11,v:0-11,i:0-11
+lora_alpha=$(($lora_r * 2)) # LoRA alpha is fixed at twice the rank
+
+output_dir="output/${model_name}/${task_name}/bz${training_batch_size}/elastictuning_virtualprune_nodistill/mac${mac_constraint}/epoch${num_train_epochs}/numprune${num_prunings}/sparsity_warmup${sparsity_warmup_epochs}/pruning_start${pruning_start}/pruning_stop${pruning_stop}/lora_r${lora_r}/lora_alpha${lora_alpha}/warmup_param${warmup_param_tuning_config}/teacher_param${teacher_param_tuning_config}/" +echo $output_dir +mkdir -p $output_dir + +python run_minus_training.py \ + --output_dir ${output_dir}\ + --task_name ${task_name} \ + --model_name_or_path ${model_name} \ + --do_train \ + --do_eval \ + --save_strategy no \ + --evaluation_strategy steps \ + --logging_strategy steps \ + --logging_steps 100 \ + --log_level info \ + --log_level_replica info \ + --eval_steps 500 \ + --max_seq_length 512 \ + --num_train_epochs ${num_train_epochs} \ + --per_device_train_batch_size ${training_batch_size} \ + --per_device_eval_batch_size ${training_batch_size} \ + --lr_scheduler_type linear\ + --warmup_ratio 0.06\ + --learning_rate ${learning_rate}\ + --weight_decay 0 \ + --seed 128 \ + --apply_lora \ + --lora_alpha ${lora_alpha} \ + --lora_r ${lora_r} \ + --report_to none \ + --pruning_batches ${pruning_batches} \ + --pruning_batch_size ${pruning_batch_size} \ + --mac_constraint ${mac_constraint} \ + --pruning_scheduler ${pruning_scheduler} \ + --sparsity_warmup_epochs ${sparsity_warmup_epochs} \ + --param_allocation_strategy ${param_allocation_strategy} \ + --teacher_param_tuning_config ${teacher_param_tuning_config} \ + --student_param_tuning_config ${student_param_tuning_config} \ + --head_scorer_type gradient_l1 \ + --intermediate_scorer_type gradient_l1 \ + --pruner_type ${pruner_type} \ + --pruning_start ${pruning_start} \ + --pruning_stop ${pruning_stop} \ + --num_prunings ${num_prunings} \ + --pruning_scheduler_strategy saliency \ + --collect_salience \ + --salience_collecting_start 200 \ + --salience_collecting_end -1 \ + --pre_pruning_tuning_steps 
${pre_pruning_tuning_steps} \ + --mask_lr 0.01 \ + --grafting_top_k -1 \ + --param_resizing_strategy ${param_resizing_strategy} \ + --tuning_expanding_ratio 4.0 \ + --max_lora_r 4 \ + | tee ${output_dir}/log.txt \ No newline at end of file diff --git a/trainer/allocation_strategy.py b/trainer/allocation_strategy.py index 37daa85..c2adc0e 100644 --- a/trainer/allocation_strategy.py +++ b/trainer/allocation_strategy.py @@ -2,7 +2,7 @@ from typing import Dict, List from prune.fisher import collect_grads_by_suffix -from ortools.algorithms import pywrapknapsack_solver +# from ortools.algorithms import pywrapknapsack_solver def binary_knapsack_search(values_tensor: torch.Tensor, weights_tensor: torch.Tensor, capacities: List[int]) -> torch.Tensor: sorted_values, sorted_indices = torch.sort(values_tensor, descending=True) diff --git a/trainer/model_arch.py b/trainer/model_arch.py index 13ae305..ab9cdd8 100644 --- a/trainer/model_arch.py +++ b/trainer/model_arch.py @@ -1,5 +1,5 @@ import torch -from transformers.modeling_utils import PreTrainedModel, PretrainedConfig +from transformers import PreTrainedModel, PretrainedConfig NAME2TEMPLATE = { 'query': '.encoder.layer.%d.attention.self.query', @@ -369,4 +369,4 @@ def get_layer(self, i, k): k_attr = NAME2ATTR[k][self.model_category] parent_layer = self.get_parent_layer(i, k) layer = getattr(parent_layer, k_attr) - return layer \ No newline at end of file + return layer diff --git a/trainer/trainer_minus.py b/trainer/trainer_minus.py index c4d4d9a..796007d 100644 --- a/trainer/trainer_minus.py +++ b/trainer/trainer_minus.py @@ -19,23 +19,30 @@ from args import MinusTrainingArguments from transformers import __version__ from transformers import Trainer -from transformers.trainer import unwrap_model, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +try: + from transformers.trainer import unwrap_model, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +except ImportError: + from transformers.modeling_utils import unwrap_model + from 
transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.optimization import get_scheduler from torch.utils.data import DataLoader, Subset, IterableDataset from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR -from transformers.modeling_utils import PreTrainedModel +from transformers import PreTrainedModel from typing import Any, Callable, Dict, List, Optional, Union, Tuple, Callable from torch.utils.data.dataset import Dataset from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.data.data_collator import DataCollator from transformers.trainer_utils import EvalPrediction, TrainOutput, set_seed, get_last_checkpoint, speed_metrics, EvalLoopOutput, denumpify_detensorize from transformers.trainer_pt_utils import nested_concat, nested_numpify, nested_truncate, IterableDatasetShard, find_batch_size, nested_detach -from transformers.file_utils import is_torch_tpu_available, WEIGHTS_NAME, CONFIG_NAME +try: + from transformers.utils import WEIGHTS_NAME, CONFIG_NAME +except ImportError: + from transformers.file_utils import WEIGHTS_NAME, CONFIG_NAME from transformers.trainer_callback import TrainerState -from transformers.configuration_utils import PretrainedConfig +from transformers import PretrainedConfig from prune import AdapterPruner, build_scorer, build_pruner from utils.minus_utils import count_params, prune_layer, to_cpu_recursive, lora_to_prunelora from utils.fisher_utils.efficiency.param import * @@ -391,9 +398,11 @@ def __init__( if args.half_precision_backend == "cuda_amp": self.use_cuda_amp = True self.amp_dtype = torch.float16 if args.fp16 else torch.bfloat16 + self.use_cpu_amp = False elif args.half_precision_backend == "cpu_amp": self.use_cpu_amp = True self.amp_dtype = torch.bfloat16 + self.use_cuda_amp = False logger.info("Half precision backend: " + 
args.half_precision_backend) logger.info("Half precision dtype: " + str(getattr(self, 'amp_dtype', None))) @@ -964,7 +973,7 @@ def train( # Calculate the total model parameters before any pruning conducted self.n_params, self.n_param_vars = count_params(self.model, mode='main') # Exclude LoRA layers when counting parameters for later pruning usage # Logging mixed-precision status - logger.info("Use cpu_amp %s; use cuda_amp %s; mixed-precision dtype: %s" % (self.use_cpu_amp, self.use_cuda_amp, getattr(self, 'amp_dtype', None))) + # logger.info("Use cpu_amp %s; use cuda_amp %s; mixed-precision dtype: %s" % (self.use_cpu_amp, self.use_cuda_amp, getattr(self, 'amp_dtype', None))) if self.model.config.apply_lora: logger.info("Using PEFT with LoRA. Disabling grad for all non-teacher-learning layers.") @@ -1473,7 +1482,7 @@ def train( if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + # and not is_torch_tpu_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses @@ -2320,4 +2329,4 @@ def calculate_distillation_loss(self, teacher_outputs, student_outputs): if distill_loss is not None: loss += self.args.distill_loss_alpha * distill_loss - return distill_loss, ce_distill_loss, loss \ No newline at end of file + return distill_loss, ce_distill_loss, loss diff --git a/utils/__init__.py b/utils/__init__.py index 01a7a68..a412e61 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -3,10 +3,12 @@ from trainer.trainer_seq2seq_minus import MinusSeq2SeqTrainer from args import Seq2SeqDataTrainingArguments from utils.utils import * -from datasets import load_metric +# from datasets import load_metric from torch.utils.data import DataLoader from utils.qa_utils import postprocess_qa_predictions from dataclasses import dataclass +import evaluate + IGNORE_INDEX = -100 GLUE_TASKS = set(["cola", "sst2", "mrpc", "stsb", "qqp", "mnli", "qnli", "rte"]) @@ -70,7 +72,7 @@ def 
build_trainer(data_args, training_args, model, tokenizer, train_dataset=None elif data_args.task_name is not None and data_args.task_name in GLUE_TASKS: label2id = model.label2id if hasattr(model, 'label2id') else None # Get the metric function - metric = load_metric("glue", data_args.task_name) + metric = evaluate.load("glue", data_args.task_name) # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): diff --git a/utils/fisher_utils/linalg.py b/utils/fisher_utils/linalg.py index 4d1197a..febdff9 100644 --- a/utils/fisher_utils/linalg.py +++ b/utils/fisher_utils/linalg.py @@ -1,6 +1,14 @@ import torch -import cupy -from cupyx.scipy.sparse.linalg import lsmr +import numpy as np +try: + import cupy + from cupyx.scipy.sparse.linalg import lsmr as cupy_lsmr + HAS_CUPY = True +except ImportError: + cupy = None + cupy_lsmr = None + HAS_CUPY = False +from scipy.sparse.linalg import lsmr as scipy_lsmr @torch.no_grad() @@ -18,11 +26,18 @@ def lsmr_cupy_solver(A, B): B = B - A.sum(dim=1) if B.shape[0] == 1: X = B / A[0, 0] + solution = (None, 0) else: - CU_A = cupy.asarray(A.cpu().numpy()) - CU_B = cupy.asarray(B.cpu().numpy()) - solution = lsmr(CU_A, CU_B, damp=1) - X = cupy.asnumpy(solution[0]) + np_A = A.cpu().numpy() + np_B = B.cpu().numpy() + if HAS_CUPY: + CU_A = cupy.asarray(np_A) + CU_B = cupy.asarray(np_B) + solution = cupy_lsmr(CU_A, CU_B, damp=1) + X = cupy.asnumpy(solution[0]) + else: + solution = scipy_lsmr(np_A, np_B, damp=1) + X = np.asarray(solution[0]) X = torch.from_numpy(X).to(A.device) X = X + 1 return X, solution[1] < 3 diff --git a/utils/minus_utils.py b/utils/minus_utils.py index 91cac44..397a30a 100644 --- a/utils/minus_utils.py +++ b/utils/minus_utils.py @@ -1176,5 +1176,6 @@ def kurtosis(a: torch.Tensor, axis: int = 0, fisher: bool = True, bias: bool = T # Compute kurtosis 
kurt = torch.mean(zscores.pow(4), axis, keepdim=True).squeeze() + print(f"KURTOSIS: {kurt.size()}") return kurt - 3 if fisher else kurt \ No newline at end of file diff --git a/utils/utils.py b/utils/utils.py index aa6a726..1535f01 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -7,7 +7,7 @@ from transformers import PretrainedConfig from datasets import load_dataset, DatasetDict from typing import Sequence, Dict -from utils import alpaca_utils +# from utils import alpaca_utils from dataclasses import dataclass, field import logging import transformers