From a3ab795763de6b7ae728a3f103d3558739081770 Mon Sep 17 00:00:00 2001 From: Qamar Date: Sat, 25 May 2024 15:50:26 +0200 Subject: [PATCH] Noisy dataset evaluation --- script/gsm/flan_gsm.py | 16 ++++++---------- script/gsm/gsm_preprocess.py | 2 +- script/gsm/mistral_gsm.py | 12 ++++++------ script/gsm/mistral_instruct_gsm.py | 12 ++++++------ script/gsm/mistral_math_gsm.py | 12 ++++++------ script/gsm/script_run.py | 8 ++++---- script/gsm/utils.py | 18 ++++++++++++++++++ 7 files changed, 47 insertions(+), 33 deletions(-) diff --git a/script/gsm/flan_gsm.py b/script/gsm/flan_gsm.py index b4c4893..437062b 100644 --- a/script/gsm/flan_gsm.py +++ b/script/gsm/flan_gsm.py @@ -2,24 +2,20 @@ from transformers import T5Tokenizer, T5ForConditionalGeneration import csv -import zipfile import pandas as pd import re -import random -import sys, os, json from jsonformer import Jsonformer from config import access_token -from utils import get_questions_and_answer_from_dataset +from utils import get_noisy_questions_and_answer_from_dataset tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl") model = T5ForConditionalGeneration.from_pretrained( "google/flan-t5-xl", device_map={"": 0}, max_length=512 ) -DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" - +#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" json_schema1 = { "type": "object", @@ -28,12 +24,12 @@ }, } -csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv" -questions, ground_truths = get_questions_and_answer_from_dataset(csv_file) +csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv" +questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file) output_file = ( - f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/flan/flan_gsm_response.csv" + f"../../data/gsm/flan/flan_gsm_response.csv" ) counter = 0 with open(output_file, "w", newline="", encoding="utf-8") as csvfile: @@ -151,7 +147,7 @@ def safe_convert_to_int(value): # Save the DataFrame to a new CSV file accuracy_df.to_csv( - "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv", + "../../data/gsm/accuracy.csv", mode="a", header=False, index=False, diff --git a/script/gsm/gsm_preprocess.py b/script/gsm/gsm_preprocess.py index 56fea25..7805231 100644 --- a/script/gsm/gsm_preprocess.py +++ b/script/gsm/gsm_preprocess.py @@ -2,7 +2,7 @@ import re # Load the dataset -df = pd.read_json("/home/stud/abedinz1/localDisk/nlplab/data/train.json") +df = pd.read_json("../../data/train.json") def extract_numeric_answer(answer): diff --git a/script/gsm/mistral_gsm.py b/script/gsm/mistral_gsm.py index 27afcd0..0f6e212 100644 --- a/script/gsm/mistral_gsm.py +++ b/script/gsm/mistral_gsm.py @@ -15,10 +15,10 @@ from jsonformer import Jsonformer from config import access_token -from utils import get_questions_and_answer_from_dataset +from utils import get_noisy_questions_and_answer_from_dataset -DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" +#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" access_token = access_token model_name = "mistralai/Mistral-7B-v0.1" tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token) @@ -47,12 +47,12 @@ }, } -csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv" -questions, ground_truths = get_questions_and_answer_from_dataset(csv_file) +csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv" +questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file) output_file = ( - f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral/mistral_gsm_response.csv" + f"../../data/gsm/mistral/mistral_gsm_response.csv" ) counter = 0 with open(output_file, "w", newline="", encoding="utf-8") as csvfile: @@ -170,7 +170,7 @@ def safe_convert_to_int(value): # Save the DataFrame to a new CSV file accuracy_df.to_csv( - "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv", + "../../data/gsm/accuracy.csv", mode="a", header=False, index=False, diff --git a/script/gsm/mistral_instruct_gsm.py b/script/gsm/mistral_instruct_gsm.py index ed39f54..f048013 100644 --- a/script/gsm/mistral_instruct_gsm.py +++ b/script/gsm/mistral_instruct_gsm.py @@ -15,10 +15,10 @@ from jsonformer import Jsonformer from config import access_token -from utils import get_questions_and_answer_from_dataset +from utils import get_noisy_questions_and_answer_from_dataset -DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" +#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" access_token = access_token model_name = "mistralai/Mistral-7B-Instruct-v0.1" tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token) @@ -47,11 +47,11 @@ }, } -csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv" -questions, ground_truths = get_questions_and_answer_from_dataset(csv_file) +csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv" +questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file) -output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv" +output_file = f"../../data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv" counter = 0 with open(output_file, "w", newline="", encoding="utf-8") as csvfile: fieldnames = [ @@ -169,7 +169,7 @@ def safe_convert_to_int(value): # Save the DataFrame to a new CSV file accuracy_df.to_csv( - "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv", + "../../data/gsm/accuracy.csv", mode="a", header=False, index=False, diff --git a/script/gsm/mistral_math_gsm.py b/script/gsm/mistral_math_gsm.py index 735efb5..897cfa6 100644 --- a/script/gsm/mistral_math_gsm.py +++ b/script/gsm/mistral_math_gsm.py @@ -15,10 +15,10 @@ from jsonformer import Jsonformer from config import access_token -from utils import get_questions_and_answer_from_dataset +from utils import get_noisy_questions_and_answer_from_dataset -DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" +#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab" access_token = access_token model_name = "meta-math/MetaMath-Mistral-7B" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -46,11 +46,11 @@ }, } -csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv" -questions, ground_truths = get_questions_and_answer_from_dataset(csv_file) +csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv" +questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file) # TODO: Change to relative path -output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_math/mistral_math_gsm_response.csv" +output_file = f"../../data/gsm/mistral_math/mistral_math_gsm_response.csv" counter = 0 with open(output_file, "w", newline="", encoding="utf-8") as csvfile: fieldnames = [ @@ -170,7 +170,7 @@ def safe_convert_to_int(value): # Save the DataFrame to a new CSV file accuracy_df.to_csv( - "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv", + "../../data/gsm/accuracy.csv", mode="a", header=False, index=False, diff --git a/script/gsm/script_run.py b/script/gsm/script_run.py index 5033cb7..944f8c3 100644 --- a/script/gsm/script_run.py +++ b/script/gsm/script_run.py @@ -2,10 +2,10 @@ # List of scripts to run sequentially scripts = [ - "python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_gsm.py", - "python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_instruct_gsm.py", - "python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_math_gsm.py", - "python /home/stud/abedinz1/localDisk/nlplab/script/gsm/flan_gsm.py" + "python mistral_gsm.py", + "python mistral_instruct_gsm.py", + "python mistral_math_gsm.py", + "python flan_gsm.py" ] # Run each script sequentially diff --git a/script/gsm/utils.py b/script/gsm/utils.py index 279bdea..2b10d11 100644 --- a/script/gsm/utils.py +++ b/script/gsm/utils.py @@ -11,3 +11,21 @@ def get_questions_and_answer_from_dataset(csv_file_path): groundTruths = data["numeric_answer"].tolist() return questions, groundTruths + + +def get_noisy_questions_and_answer_from_dataset(csv_file_path, aug=1): + # Load the specific CSV file + data = pd.read_csv(csv_file_path) + + # Extract the question column + if aug == 1: + questions = [eval(sublist)[0] for sublist in data['noisy_questions_aug_1']] + else: + questions = data[f"noisy_questions_aug_{aug}"].apply(eval).tolist() + groundTruths = data["numeric_answer"].tolist() + + return questions, groundTruths + +#Testing +#questions, _ = get_noisy_questions_and_answer_from_dataset('../../data/noisy_datasets/gsm8k_noisy_punct_10.csv',aug=4) +#print(questions)