Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 6 additions & 10 deletions script/gsm/flan_gsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,20 @@
from transformers import T5Tokenizer, T5ForConditionalGeneration

import csv
import zipfile
import pandas as pd
import re
import random
import sys, os, json

from jsonformer import Jsonformer

from config import access_token
from utils import get_questions_and_answer_from_dataset
from utils import get_noisy_questions_and_answer_from_dataset

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained(
"google/flan-t5-xl", device_map={"": 0}, max_length=512
)

DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"

#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"

json_schema1 = {
"type": "object",
Expand All @@ -28,12 +24,12 @@
},
}

csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)


output_file = (
f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/flan/flan_gsm_response.csv"
f"../../data/gsm/flan/flan_gsm_response.csv"
)
counter = 0
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
Expand Down Expand Up @@ -151,7 +147,7 @@ def safe_convert_to_int(value):

# Save the DataFrame to a new CSV file
accuracy_df.to_csv(
"/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
"../../data/gsm/accuracy.csv",
mode="a",
header=False,
index=False,
Expand Down
2 changes: 1 addition & 1 deletion script/gsm/gsm_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

# Load the dataset
df = pd.read_json("/home/stud/abedinz1/localDisk/nlplab/data/train.json")
df = pd.read_json("../../data/train.json")


def extract_numeric_answer(answer):
Expand Down
12 changes: 6 additions & 6 deletions script/gsm/mistral_gsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from jsonformer import Jsonformer

from config import access_token
from utils import get_questions_and_answer_from_dataset
from utils import get_noisy_questions_and_answer_from_dataset


DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
access_token = access_token
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
Expand Down Expand Up @@ -47,12 +47,12 @@
},
}

csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)


output_file = (
f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral/mistral_gsm_response.csv"
f"../../data/gsm/mistral/mistral_gsm_response.csv"
)
counter = 0
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
Expand Down Expand Up @@ -170,7 +170,7 @@ def safe_convert_to_int(value):

# Save the DataFrame to a new CSV file
accuracy_df.to_csv(
"/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
"../../data/gsm/accuracy.csv",
mode="a",
header=False,
index=False,
Expand Down
12 changes: 6 additions & 6 deletions script/gsm/mistral_instruct_gsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from jsonformer import Jsonformer

from config import access_token
from utils import get_questions_and_answer_from_dataset
from utils import get_noisy_questions_and_answer_from_dataset


DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
access_token = access_token
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
Expand Down Expand Up @@ -47,11 +47,11 @@
},
}

csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)


output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv"
output_file = f"../../data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv"
counter = 0
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
fieldnames = [
Expand Down Expand Up @@ -169,7 +169,7 @@ def safe_convert_to_int(value):

# Save the DataFrame to a new CSV file
accuracy_df.to_csv(
"/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
"../../data/gsm/accuracy.csv",
mode="a",
header=False,
index=False,
Expand Down
12 changes: 6 additions & 6 deletions script/gsm/mistral_math_gsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
from jsonformer import Jsonformer

from config import access_token
from utils import get_questions_and_answer_from_dataset
from utils import get_noisy_questions_and_answer_from_dataset


DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
access_token = access_token
model_name = "meta-math/MetaMath-Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
Expand Down Expand Up @@ -46,11 +46,11 @@
},
}

csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)

# TODO: Change to relative path
output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_math/mistral_math_gsm_response.csv"
output_file = f"../../data/gsm/mistral_math/mistral_math_gsm_response.csv"
counter = 0
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
fieldnames = [
Expand Down Expand Up @@ -170,7 +170,7 @@ def safe_convert_to_int(value):

# Save the DataFrame to a new CSV file
accuracy_df.to_csv(
"/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
"../../data/gsm/accuracy.csv",
mode="a",
header=False,
index=False,
Expand Down
8 changes: 4 additions & 4 deletions script/gsm/script_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

# List of scripts to run sequentially
scripts = [
"python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_gsm.py",
"python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_instruct_gsm.py",
"python /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_math_gsm.py",
"python /home/stud/abedinz1/localDisk/nlplab/script/gsm/flan_gsm.py"
"python mistral_gsm.py",
"python mistral_instruct_gsm.py",
"python mistral_math_gsm.py",
"python flan_gsm.py"
]

# Run each script sequentially
Expand Down
18 changes: 18 additions & 0 deletions script/gsm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,21 @@ def get_questions_and_answer_from_dataset(csv_file_path):
groundTruths = data["numeric_answer"].tolist()

return questions, groundTruths


def get_noisy_questions_and_answer_from_dataset(csv_file_path, aug=1):
    """Load noisy questions and their numeric answers from a CSV file.

    The CSV must contain a ``noisy_questions_aug_<aug>`` column and a
    ``numeric_answer`` column. Each cell of the question column is parsed
    as a Python literal (presumably a list of question strings -- confirm
    against the dataset generator).

    Args:
        csv_file_path: Path to the noisy dataset CSV.
        aug: Augmentation index selecting the ``noisy_questions_aug_<aug>``
            column. Defaults to 1.

    Returns:
        A ``(questions, groundTruths)`` tuple. When ``aug == 1``,
        ``questions`` holds only the first element of each parsed cell;
        for any other ``aug`` it holds each parsed cell in full.
        ``groundTruths`` is the ``numeric_answer`` column as a list.

    Raises:
        KeyError: If the requested column is missing from the CSV.
        ValueError, SyntaxError: If a cell is not a valid Python literal.
    """
    # Imported locally so this function stays self-contained regardless of
    # what the module already imports at the top of the file.
    import ast

    # Load the specific CSV file
    data = pd.read_csv(csv_file_path)

    # Parse each cell with ast.literal_eval rather than eval: the cells are
    # plain literals, and eval would execute any expression that happened to
    # be stored in the data file.
    parsed_cells = [
        ast.literal_eval(cell) for cell in data[f"noisy_questions_aug_{aug}"]
    ]

    if aug == 1:
        # Only the first noisy variant of every question is used for aug 1.
        questions = [variants[0] for variants in parsed_cells]
    else:
        questions = parsed_cells
    groundTruths = data["numeric_answer"].tolist()

    return questions, groundTruths

#Testing
#questions, _ = get_noisy_questions_and_answer_from_dataset('../../data/noisy_datasets/gsm8k_noisy_punct_10.csv',aug=4)
#print(questions)