From a3ab795763de6b7ae728a3f103d3558739081770 Mon Sep 17 00:00:00 2001
From: Qamar <qamar@intern.int.fhg.de>
Date: Sat, 25 May 2024 15:50:26 +0200
Subject: [PATCH] Noisy dataset evaluation

---
 script/gsm/flan_gsm.py             | 16 ++++++----------
 script/gsm/gsm_preprocess.py       |  2 +-
 script/gsm/mistral_gsm.py          | 12 ++++++------
 script/gsm/mistral_instruct_gsm.py | 12 ++++++------
 script/gsm/mistral_math_gsm.py     | 12 ++++++------
 script/gsm/script_run.py           |  8 ++++----
 script/gsm/utils.py                | 18 ++++++++++++++++++
 7 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/script/gsm/flan_gsm.py b/script/gsm/flan_gsm.py
index b4c4893..437062b 100644
--- a/script/gsm/flan_gsm.py
+++ b/script/gsm/flan_gsm.py
@@ -2,24 +2,20 @@
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 
 import csv
-import zipfile
 import pandas as pd
 import re
-import random
-import sys, os, json
 
 from jsonformer import Jsonformer
 
 from config import access_token
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
 model = T5ForConditionalGeneration.from_pretrained(
     "google/flan-t5-xl", device_map={"": 0}, max_length=512
 )
 
-DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
-
+#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
 
 json_schema1 = {
     "type": "object",
@@ -28,12 +24,12 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
 output_file = (
-    f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/flan/flan_gsm_response.csv"
+    f"../../data/gsm/flan/flan_gsm_response.csv"
 )
 counter = 0
 with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
@@ -151,7 +147,7 @@ def safe_convert_to_int(value):
 
 # Save the DataFrame to a new CSV file
 accuracy_df.to_csv(
-    "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
+    "../../data/gsm/accuracy.csv",
     mode="a",
     header=False,
     index=False,
diff --git a/script/gsm/gsm_preprocess.py b/script/gsm/gsm_preprocess.py
index 56fea25..7805231 100644
--- a/script/gsm/gsm_preprocess.py
+++ b/script/gsm/gsm_preprocess.py
@@ -2,7 +2,7 @@
 import re
 
 # Load the dataset
-df = pd.read_json("/home/stud/abedinz1/localDisk/nlplab/data/train.json")
+df = pd.read_json("../../data/train.json")
 
 
 def extract_numeric_answer(answer):
diff --git a/script/gsm/mistral_gsm.py b/script/gsm/mistral_gsm.py
index 27afcd0..0f6e212 100644
--- a/script/gsm/mistral_gsm.py
+++ b/script/gsm/mistral_gsm.py
@@ -15,10 +15,10 @@
 from jsonformer import Jsonformer
 
 from config import access_token
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 
-DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
+#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
 access_token = access_token
 model_name = "mistralai/Mistral-7B-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
@@ -47,12 +47,12 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
 output_file = (
-    f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral/mistral_gsm_response.csv"
+    f"../../data/gsm/mistral/mistral_gsm_response.csv"
 )
 counter = 0
 with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
@@ -170,7 +170,7 @@ def safe_convert_to_int(value):
 
 # Save the DataFrame to a new CSV file
 accuracy_df.to_csv(
-    "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
+    "../../data/gsm/accuracy.csv",
     mode="a",
     header=False,
     index=False,
diff --git a/script/gsm/mistral_instruct_gsm.py b/script/gsm/mistral_instruct_gsm.py
index ed39f54..f048013 100644
--- a/script/gsm/mistral_instruct_gsm.py
+++ b/script/gsm/mistral_instruct_gsm.py
@@ -15,10 +15,10 @@
 from jsonformer import Jsonformer
 
 from config import access_token
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 
-DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
+#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
 access_token = access_token
 model_name = "mistralai/Mistral-7B-Instruct-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
@@ -47,11 +47,11 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 
-output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv"
+output_file = f"../../data/gsm/mistral_instruct/mistral_instruct_gsm_response.csv"
 counter = 0
 with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
     fieldnames = [
@@ -169,7 +169,7 @@ def safe_convert_to_int(value):
 
 # Save the DataFrame to a new CSV file
 accuracy_df.to_csv(
-    "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
+    "../../data/gsm/accuracy.csv",
     mode="a",
     header=False,
     index=False,
diff --git a/script/gsm/mistral_math_gsm.py b/script/gsm/mistral_math_gsm.py
index 735efb5..897cfa6 100644
--- a/script/gsm/mistral_math_gsm.py
+++ b/script/gsm/mistral_math_gsm.py
@@ -15,10 +15,10 @@
 from jsonformer import Jsonformer
 
 from config import access_token
-from utils import get_questions_and_answer_from_dataset
+from utils import get_noisy_questions_and_answer_from_dataset
 
 
-DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
+#DIR_PATH = "/home/stud/abedinz1/localDisk/nlplab"
 access_token = access_token
 model_name = "meta-math/MetaMath-Mistral-7B"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -46,11 +46,11 @@
     },
 }
 
-csv_file = f"{DIR_PATH}/data/gsm/train_preprocessed.csv"
-questions, ground_truths = get_questions_and_answer_from_dataset(csv_file)
+csv_file = f"../../data/noisy_datasets/gsm8k_noisy_punct_10.csv"
+questions, ground_truths = get_noisy_questions_and_answer_from_dataset(csv_file)
 
 # TODO: Change to relative path
-output_file = f"/home/stud/abedinz1/localDisk/nlplab/data/gsm/mistral_math/mistral_math_gsm_response.csv"
+output_file = f"../../data/gsm/mistral_math/mistral_math_gsm_response.csv"
 counter = 0
 with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
     fieldnames = [
@@ -170,7 +170,7 @@ def safe_convert_to_int(value):
 
 # Save the DataFrame to a new CSV file
 accuracy_df.to_csv(
-    "/home/stud/abedinz1/localDisk/nlplab/data/gsm/accuracy.csv",
+    "../../data/gsm/accuracy.csv",
     mode="a",
     header=False,
     index=False,
diff --git a/script/gsm/script_run.py b/script/gsm/script_run.py
index 5033cb7..944f8c3 100644
--- a/script/gsm/script_run.py
+++ b/script/gsm/script_run.py
@@ -2,10 +2,10 @@
 
 # List of scripts to run sequentially
 scripts = [
-    "python  /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_gsm.py",
-    "python  /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_instruct_gsm.py",
-    "python  /home/stud/abedinz1/localDisk/nlplab/script/gsm/mistral_math_gsm.py",
-    "python  /home/stud/abedinz1/localDisk/nlplab/script/gsm/flan_gsm.py"
+    "python  mistral_gsm.py",
+    "python  mistral_instruct_gsm.py",
+    "python  mistral_math_gsm.py",
+    "python  flan_gsm.py"
 ]
 
 # Run each script sequentially
diff --git a/script/gsm/utils.py b/script/gsm/utils.py
index 279bdea..2b10d11 100644
--- a/script/gsm/utils.py
+++ b/script/gsm/utils.py
@@ -11,3 +11,21 @@ def get_questions_and_answer_from_dataset(csv_file_path):
     groundTruths = data["numeric_answer"].tolist()
 
     return questions, groundTruths
+
+
+def get_noisy_questions_and_answer_from_dataset(csv_file_path, aug=1):
+    """Load noisy questions and numeric answers from ``csv_file_path``.
+
+    ``aug`` selects column ``noisy_questions_aug_<aug>``; for ``aug == 1`` only the
+    first entry of each parsed cell is returned, otherwise the whole parsed cell.
+    """
+    from ast import literal_eval  # safe replacement for eval(): literals only
+    data = pd.read_csv(csv_file_path)
+    column = "noisy_questions_aug_1" if aug == 1 else f"noisy_questions_aug_{aug}"
+    parsed = data[column].apply(literal_eval).tolist()
+    questions = [cell[0] for cell in parsed] if aug == 1 else parsed
+    return questions, data["numeric_answer"].tolist()
+
+#Testing
+#questions, _ = get_noisy_questions_and_answer_from_dataset('../../data/noisy_datasets/gsm8k_noisy_punct_10.csv',aug=4)
+#print(questions)