Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions evaluation_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ class OutputExample:
follow_instruction_list: list[bool]


def empty_output(inp):
  """Returns an all-failed OutputExample for a prompt without a response.

  The result echoes the input's instruction ids and prompt, carries an empty
  response string, and marks every instruction as not followed.
  """
  instruction_ids = inp.instruction_id_list
  # One False entry per instruction id, mirroring the shape of a scored output.
  not_followed = [False] * len(instruction_ids)
  return OutputExample(
      instruction_id_list=instruction_ids,
      prompt=inp.prompt,
      response="",
      follow_all_instructions=False,
      follow_instruction_list=not_followed,
  )


def read_prompt_list(input_jsonl_filename):
"""Read inputs from jsonl."""
inputs = []
Expand Down Expand Up @@ -77,7 +88,10 @@ def test_instruction_following_strict(
prompt_to_response,
):
"""Tests response to see if instrutions are followed."""
response = prompt_to_response[inp.prompt]
response = get_response_for_prompt(inp, prompt_to_response)
if response is None:
return empty_output(inp)

instruction_list = inp.instruction_id_list
is_following_list = []

Expand Down Expand Up @@ -109,15 +123,9 @@ def test_instruction_following_loose(
prompt_to_response,
):
"""Tests response for an upper bound for following instructions."""
response = prompt_to_response[inp.prompt]
response = get_response_for_prompt(inp, prompt_to_response)
if response is None:
return OutputExample(
instruction_id_list=inp.instruction_id_list,
prompt=inp.prompt,
response="",
follow_all_instructions=False,
follow_instruction_list=[False] * len(inp.instruction_id_list),
)
return empty_output(inp)

r = response.split("\n")
response_remove_first = "\n".join(r[1:]).strip()
Expand Down Expand Up @@ -173,9 +181,19 @@ def read_prompt_to_response_dict(input_jsonl_filename):
for l in f:
example = json.loads(l)
return_dict[example["prompt"]] = example["response"]
stripped_prompt = example["prompt"].strip()
if stripped_prompt != example["prompt"]:
return_dict[stripped_prompt] = example["response"]
return return_dict


def get_response_for_prompt(inp, prompt_to_response):
  """Looks up the response recorded for an input example's prompt.

  The exact prompt text is tried first. If it is not a key of the mapping, the
  prompt with leading/trailing whitespace removed is tried instead, so that
  whitespace drift between the prompt file and the response file does not
  lose the response.

  Args:
    inp: Input example; only its `prompt` string attribute is read.
    prompt_to_response: Mapping from prompt text to response text.

  Returns:
    The stored response, or None when neither the exact prompt nor the
    stripped prompt is a key of the mapping.
  """
  try:
    # One lookup on the common path instead of `in` followed by indexing.
    return prompt_to_response[inp.prompt]
  except KeyError:
    # Exact text is absent: fall back to the whitespace-stripped prompt.
    return prompt_to_response.get(inp.prompt.strip())


def print_report(outputs):
"""Prints a report on accuracy scores."""

Expand Down
40 changes: 40 additions & 0 deletions instructions_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@

"""Tests for instructions.py."""

import json
import os
import tempfile

from absl.testing import absltest
from absl.testing import parameterized
import evaluation_lib
import instructions

# pylint:disable=g-complex-comprehension
Expand Down Expand Up @@ -1185,5 +1190,40 @@ def test_stop_word_percentage__returns_false_when_response_empty(self):
self.assertFalse(instruction.check_following(""), "expected False for empty response")
self.assertFalse(instruction.check_following(" "), "expected False for whitespace-only response")

def test_evaluation_lookup__tolerates_trailing_prompt_space(self):
  """Checks strict eval finds a response whose recorded prompt has a trailing space."""
  example = evaluation_lib.InputExample(
      key=1,
      instruction_id_list=["sentence:keyword"],
      prompt="Say hello.",
      kwargs=[{"word": "hello", "N": 1}],
  )
  # The recorded prompt differs from the input prompt by one trailing space.
  record = json.dumps({"prompt": "Say hello. ", "response": "hello there."})
  with tempfile.NamedTemporaryFile("w", delete=False) as tmp:
    tmp.write(record)
    tmp.write("\n")
    jsonl_path = tmp.name

  try:
    responses = evaluation_lib.read_prompt_to_response_dict(jsonl_path)
  finally:
    # The file was created with delete=False, so remove it by hand.
    os.unlink(jsonl_path)

  result = evaluation_lib.test_instruction_following_strict(example, responses)
  self.assertTrue(result.follow_all_instructions)

def test_evaluation_lookup__missing_response_fails_without_crashing(self):
  """Checks strict eval scores a prompt with no recorded response as failed."""
  example = evaluation_lib.InputExample(
      key=1,
      instruction_id_list=["sentence:keyword"],
      prompt="Say hello.",
      kwargs=[{"word": "hello", "N": 1}],
  )
  # An empty mapping means no response exists for any prompt.
  no_responses = {}

  result = evaluation_lib.test_instruction_following_strict(example, no_responses)

  self.assertFalse(result.follow_all_instructions)
  self.assertEqual(result.follow_instruction_list, [False])
  self.assertEqual(result.response, "")

if __name__ == '__main__':
absltest.main()
1 change: 1 addition & 0 deletions run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def main(argv):
inputs = evaluation_lib.read_prompt_list(_INPUT_DATA.value)
prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
_INPUT_RESPONSE_DATA.value)
os.makedirs(_OUTPUT_DIR.value, exist_ok=True)

# get instruction following results
for func, output_file_name in [
Expand Down