From 8de1a75366983e6b925a43a48eb8df9cf9566247 Mon Sep 17 00:00:00 2001
From: Mauro Marques Filho <eu@resolvicomai.app>
Date: Wed, 20 May 2026 19:14:42 -0300
Subject: [PATCH] Handle missing IFBench eval responses

---
 evaluation_lib.py    | 36 +++++++++++++++++++++++++++---------
 instructions_test.py | 40 ++++++++++++++++++++++++++++++++++++++++
 run_eval.py          |  1 +
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/evaluation_lib.py b/evaluation_lib.py
index ea99ca9..c16c064 100644
--- a/evaluation_lib.py
+++ b/evaluation_lib.py
@@ -40,6 +40,17 @@ class OutputExample:
   follow_instruction_list: list[bool]
 
 
+def empty_output(inp):
+  """Returns an all-failed OutputExample for an input without a response."""
+  instruction_ids = inp.instruction_id_list
+  return OutputExample(
+      prompt=inp.prompt,
+      response="",
+      instruction_id_list=instruction_ids,
+      follow_instruction_list=[False] * len(instruction_ids),
+      follow_all_instructions=False)
+
+
 def read_prompt_list(input_jsonl_filename):
   """Read inputs from jsonl."""
   inputs = []
@@ -77,7 +88,10 @@ def test_instruction_following_strict(
     prompt_to_response,
 ):
   """Tests response to see if instrutions are followed."""
-  response = prompt_to_response[inp.prompt]
+  response = get_response_for_prompt(inp, prompt_to_response)
+  if response is None:
+    return empty_output(inp)
+
   instruction_list = inp.instruction_id_list
   is_following_list = []
 
@@ -109,15 +123,9 @@ def test_instruction_following_loose(
     prompt_to_response,
 ):
   """Tests response for an upper bound for following instructions."""
-  response = prompt_to_response[inp.prompt]
+  response = get_response_for_prompt(inp, prompt_to_response)
   if response is None:
-      return OutputExample(
-          instruction_id_list=inp.instruction_id_list,
-          prompt=inp.prompt,
-          response="",
-          follow_all_instructions=False,
-          follow_instruction_list=[False] * len(inp.instruction_id_list),
-      )
+    return empty_output(inp)
 
   r = response.split("\n")
   response_remove_first = "\n".join(r[1:]).strip()
@@ -173,9 +181,19 @@ def read_prompt_to_response_dict(input_jsonl_filename):
     for l in f:
       example = json.loads(l)
       return_dict[example["prompt"]] = example["response"]
+      stripped_prompt = example["prompt"].strip()
+      # Alias only: never replace a response stored under that exact prompt.
+      return_dict.setdefault(stripped_prompt, example["response"])
   return return_dict
 
 
+def get_response_for_prompt(inp, prompt_to_response):
+  """Looks up inp.prompt exactly, then stripped; None if neither is a key."""
+  prompt = inp.prompt
+  lookup_key = prompt if prompt in prompt_to_response else prompt.strip()
+  return prompt_to_response.get(lookup_key)
+
+
 def print_report(outputs):
   """Prints a report on accuracy scores."""
 
diff --git a/instructions_test.py b/instructions_test.py
index 197161d..97d1214 100644
--- a/instructions_test.py
+++ b/instructions_test.py
@@ -15,8 +15,13 @@
 
 """Tests for instructions.py."""
 
+import json
+import os
+import tempfile
+
 from absl.testing import absltest
 from absl.testing import parameterized
+import evaluation_lib
 import instructions
 
 # pylint:disable=g-complex-comprehension
@@ -1185,5 +1190,40 @@ def test_stop_word_percentage__returns_false_when_response_empty(self):
         self.assertFalse(instruction.check_following(""), "expected False for empty response")
         self.assertFalse(instruction.check_following("   "), "expected False for whitespace-only response")
 
+    def test_evaluation_lookup__tolerates_trailing_prompt_space(self):
+        """A response keyed by a prompt with a trailing space is still found."""
+        record = {"prompt": "Say hello. ", "response": "hello there."}
+        with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
+            tmp.write(json.dumps(record) + "\n")
+        response_path = tmp.name
+
+        try:
+            lookup = evaluation_lib.read_prompt_to_response_dict(response_path)
+        finally:
+            os.unlink(response_path)
+
+        example = evaluation_lib.InputExample(
+            key=1,
+            instruction_id_list=["sentence:keyword"],
+            prompt="Say hello.",
+            kwargs=[{"word": "hello", "N": 1}],
+        )
+        result = evaluation_lib.test_instruction_following_strict(example, lookup)
+        self.assertTrue(result.follow_all_instructions)
+
+    def test_evaluation_lookup__missing_response_fails_without_crashing(self):
+        """An empty response table yields a failed output rather than an error."""
+        instruction_ids = ["sentence:keyword"]
+        example = evaluation_lib.InputExample(
+            key=1,
+            instruction_id_list=instruction_ids,
+            prompt="Say hello.",
+            kwargs=[{"word": "hello", "N": 1}],
+        )
+        result = evaluation_lib.test_instruction_following_strict(example, {})
+        self.assertFalse(result.follow_all_instructions)
+        self.assertEqual(result.follow_instruction_list, [False])
+        self.assertEqual(result.response, "")
+
 if __name__ == '__main__':
     absltest.main()
diff --git a/run_eval.py b/run_eval.py
index 0a7a5bc..965d00a 100644
--- a/run_eval.py
+++ b/run_eval.py
@@ -47,6 +47,7 @@ def main(argv):
   inputs = evaluation_lib.read_prompt_list(_INPUT_DATA.value)
   prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
       _INPUT_RESPONSE_DATA.value)
+  os.makedirs(_OUTPUT_DIR.value, exist_ok=True)
 
   # get instruction following results
   for func, output_file_name in [