From 8de1a75366983e6b925a43a48eb8df9cf9566247 Mon Sep 17 00:00:00 2001 From: Mauro Marques Filho Date: Wed, 20 May 2026 19:14:42 -0300 Subject: [PATCH] Handle missing IFBench eval responses --- evaluation_lib.py | 36 +++++++++++++++++++++++++++--------- instructions_test.py | 40 ++++++++++++++++++++++++++++++++++++++++ run_eval.py | 1 + 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/evaluation_lib.py b/evaluation_lib.py index ea99ca9..c16c064 100644 --- a/evaluation_lib.py +++ b/evaluation_lib.py @@ -40,6 +40,17 @@ class OutputExample: follow_instruction_list: list[bool] +def empty_output(inp): + """Builds a failed output for prompts with no generated response.""" + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response="", + follow_all_instructions=False, + follow_instruction_list=[False] * len(inp.instruction_id_list), + ) + + def read_prompt_list(input_jsonl_filename): """Read inputs from jsonl.""" inputs = [] @@ -77,7 +88,10 @@ def test_instruction_following_strict( prompt_to_response, ): """Tests response to see if instrutions are followed.""" - response = prompt_to_response[inp.prompt] + response = get_response_for_prompt(inp, prompt_to_response) + if response is None: + return empty_output(inp) + instruction_list = inp.instruction_id_list is_following_list = [] @@ -109,15 +123,9 @@ def test_instruction_following_loose( prompt_to_response, ): """Tests response for an upper bound for following instructions.""" - response = prompt_to_response[inp.prompt] + response = get_response_for_prompt(inp, prompt_to_response) if response is None: - return OutputExample( - instruction_id_list=inp.instruction_id_list, - prompt=inp.prompt, - response="", - follow_all_instructions=False, - follow_instruction_list=[False] * len(inp.instruction_id_list), - ) + return empty_output(inp) r = response.split("\n") response_remove_first = "\n".join(r[1:]).strip() @@ -173,9 +181,19 @@ def read_prompt_to_response_dict(input_jsonl_filename): for l in f: example = json.loads(l) return_dict[example["prompt"]] = example["response"] + stripped_prompt = example["prompt"].strip() + if stripped_prompt != example["prompt"]: + return_dict[stripped_prompt] = example["response"] return return_dict +def get_response_for_prompt(inp, prompt_to_response): + """Returns a prompt response, tolerating harmless leading/trailing spaces.""" + if inp.prompt in prompt_to_response: + return prompt_to_response[inp.prompt] + return prompt_to_response.get(inp.prompt.strip()) + + def print_report(outputs): """Prints a report on accuracy scores.""" diff --git a/instructions_test.py b/instructions_test.py index 197161d..97d1214 100644 --- a/instructions_test.py +++ b/instructions_test.py @@ -15,8 +15,13 @@ """Tests for instructions.py.""" +import json +import os +import tempfile + from absl.testing import absltest from absl.testing import parameterized +import evaluation_lib import instructions # pylint:disable=g-complex-comprehension @@ -1185,5 +1190,40 @@ def test_stop_word_percentage__returns_false_when_response_empty(self): self.assertFalse(instruction.check_following(""), "expected False for empty response") self.assertFalse(instruction.check_following(" "), "expected False for whitespace-only response") + def test_evaluation_lookup__tolerates_trailing_prompt_space(self): + """Test eval lookup when response files have harmless prompt whitespace drift.""" + inp = evaluation_lib.InputExample( + key=1, + instruction_id_list=["sentence:keyword"], + prompt="Say hello.", + kwargs=[{"word": "hello", "N": 1}], + ) + with tempfile.NamedTemporaryFile("w", delete=False) as f: + f.write(json.dumps({"prompt": "Say hello. ", "response": "hello there."})) + f.write("\n") + response_file = f.name + + try: + prompt_to_response = evaluation_lib.read_prompt_to_response_dict(response_file) + finally: + os.unlink(response_file) + + output = evaluation_lib.test_instruction_following_strict(inp, prompt_to_response) + self.assertTrue(output.follow_all_instructions) + + def test_evaluation_lookup__missing_response_fails_without_crashing(self): + """Test missing model responses are scored as failed instead of raising.""" + inp = evaluation_lib.InputExample( + key=1, + instruction_id_list=["sentence:keyword"], + prompt="Say hello.", + kwargs=[{"word": "hello", "N": 1}], + ) + + output = evaluation_lib.test_instruction_following_strict(inp, {}) + self.assertFalse(output.follow_all_instructions) + self.assertEqual(output.follow_instruction_list, [False]) + self.assertEqual(output.response, "") + if __name__ == '__main__': absltest.main() diff --git a/run_eval.py b/run_eval.py index 0a7a5bc..965d00a 100644 --- a/run_eval.py +++ b/run_eval.py @@ -47,6 +47,7 @@ def main(argv): inputs = evaluation_lib.read_prompt_list(_INPUT_DATA.value) prompt_to_response = evaluation_lib.read_prompt_to_response_dict( _INPUT_RESPONSE_DATA.value) + os.makedirs(_OUTPUT_DIR.value, exist_ok=True) # get instruction following results for func, output_file_name in [