diff --git a/README.md b/README.md index 2810d4f..ec4eb90 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,36 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/ The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG +You can also use the IFBench verifiers directly as a reward function in +training or local smoke tests: + +```python +import evaluation_lib +import reward_lib + +examples = evaluation_lib.read_prompt_list("data/IFBench_test.jsonl") +reward_fn = reward_lib.make_reward_fn(examples, mode="loose") + +rewards = reward_fn( + [examples[0].prompt], + ["A model response that attempts to satisfy the prompt constraints."], +) +``` + +For debugging reward shaping, `reward_lib.score_response(...)` returns both the +binary prompt-level reward and the fractional instruction-level reward. + +To run a reproducible local reward smoke test against prompt/response jsonl +files, use: + +``` +python3 -m run_reward \ + --input_data=data/IFBench_test.jsonl \ + --input_response_data=data/sample_output.jsonl \ + --mode=loose \ + --limit=5 +``` + ## 📊 Model Performance Leaderboard | Rank | Model | IFBench Score | IFEval Score | diff --git a/reward_lib.py b/reward_lib.py new file mode 100644 index 0000000..01a7eae --- /dev/null +++ b/reward_lib.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2026 Allen Institute for AI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Reward helpers for using IFBench verifiers in RLVR training loops."""
+
+import copy
+import dataclasses
+from typing import Callable, Iterable, Mapping, Sequence
+
+import evaluation_lib
+
+
+@dataclasses.dataclass(frozen=True)
+class RewardResult:
+    """Structured reward output for a single prompt/response pair.
+
+    Attributes:
+        instruction_id_list: Instruction ids copied from the verifier output.
+        prompt: Prompt text copied from the verifier output.
+        response: Response text copied from the verifier output.
+        mode: Scoring mode that produced this result ("loose" or "strict").
+        prompt_reward: 1.0 when the verifier output reports that all
+            instructions were followed, otherwise 0.0.
+        instruction_reward: Fraction of instructions followed; 0.0 when the
+            verifier output lists no instructions.
+        follow_instruction_list: Per-instruction pass/fail flags copied from
+            the verifier output.
+    """
+
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    mode: str
+    prompt_reward: float
+    instruction_reward: float
+    follow_instruction_list: list[bool]
+
+
+def _as_input_example(example):
+    """Converts a json-like example into an InputExample if needed.
+
+    Args:
+        example: An evaluation_lib.InputExample, or a mapping with the keys
+            "key", "instruction_id_list", "prompt" and "kwargs".
+
+    Returns:
+        The same object when it already is an InputExample, otherwise a new
+        InputExample built from the mapping.
+
+    Raises:
+        KeyError: If a mapping input lacks one of the required keys.
+    """
+    if isinstance(example, evaluation_lib.InputExample):
+        return example
+    return evaluation_lib.InputExample(
+        key=example["key"],
+        instruction_id_list=example["instruction_id_list"],
+        prompt=example["prompt"],
+        kwargs=example["kwargs"],
+    )
+
+
+def _copy_input_example(example):
+    """Copies an input example and normalizes nullable kwargs before scoring.
+
+    The copy keeps the caller's example untouched: the instruction id list is
+    copied, kwargs are deep-copied, and kwargs entries whose value is None are
+    dropped from every per-instruction dict.
+    """
+    example = _as_input_example(example)
+    return evaluation_lib.InputExample(
+        key=example.key,
+        instruction_id_list=list(example.instruction_id_list),
+        prompt=example.prompt,
+        kwargs=[
+            {key: value for key, value in kwargs.items() if value is not None}
+            for kwargs in copy.deepcopy(example.kwargs)
+        ],
+    )
+
+
+def _scoring_function(mode: str):
+    """Returns the evaluation_lib scorer for `mode`; ValueError if unknown."""
+    # Looked up at call time (not import time) so the evaluation_lib
+    # attributes are resolved exactly as the original inline table did.
+    scoring_functions = {
+        "loose": evaluation_lib.test_instruction_following_loose,
+        "strict": evaluation_lib.test_instruction_following_strict,
+    }
+    if mode not in scoring_functions:
+        raise ValueError(f"mode must be one of {sorted(scoring_functions)}")
+    return scoring_functions[mode]
+
+
+def _check_same_length(prompts: Sequence[str], responses: Sequence[str]) -> None:
+    """Raises ValueError unless the prompt and response batches align."""
+    if len(prompts) != len(responses):
+        raise ValueError("prompts and responses must have the same length")
+
+
+def score_response(example, response: str, mode: str = "loose") -> RewardResult:
+    """Scores one response with IFBench verifiers.
+
+    Args:
+        example: An evaluation_lib.InputExample or json-like dict from IFBench
+            data.
+        response: Model response to score.
+        mode: Either "loose" or "strict", matching IFBench evaluation modes.
+
+    Returns:
+        RewardResult with a binary prompt_reward and fractional
+        instruction_reward.
+
+    Raises:
+        ValueError: If mode is neither "loose" nor "strict".
+    """
+    # Validate the mode first so a bad mode fails before any copying happens.
+    scorer = _scoring_function(mode)
+
+    inp = _copy_input_example(example)
+    output = scorer(inp, {inp.prompt: response})
+    followed_count = sum(output.follow_instruction_list)
+    instruction_count = len(output.follow_instruction_list)
+    # Guard the division: an example without instructions scores 0.0 here.
+    instruction_reward = (
+        followed_count / instruction_count if instruction_count else 0.0
+    )
+
+    return RewardResult(
+        instruction_id_list=list(output.instruction_id_list),
+        prompt=output.prompt,
+        response=output.response,
+        mode=mode,
+        prompt_reward=1.0 if output.follow_all_instructions else 0.0,
+        instruction_reward=instruction_reward,
+        follow_instruction_list=list(output.follow_instruction_list),
+    )
+
+
+def build_prompt_index(examples: Iterable) -> dict[str, evaluation_lib.InputExample]:
+    """Builds a prompt-to-example index for reward functions.
+
+    Examples may be InputExample objects or json-like dicts. When several
+    examples share the same prompt text, the last one wins.
+    """
+    return {example.prompt: example for example in map(_as_input_example, examples)}
+
+
+def make_reward_fn(
+    examples: Iterable,
+    mode: str = "loose",
+    *,
+    missing_prompt_reward: float = 0.0,
+) -> Callable[[Sequence[str], Sequence[str]], list[float]]:
+    """Creates a batch reward function for RLVR trainers.
+
+    The returned function accepts parallel prompt and response batches and
+    returns one binary prompt-level reward per pair. Unknown prompts receive
+    missing_prompt_reward so streaming trainers can continue safely.
+
+    Args:
+        examples: IFBench examples (InputExample objects or json-like dicts)
+            used to look up the constraints for each prompt.
+        mode: Either "loose" or "strict".
+        missing_prompt_reward: Reward returned for prompts that are not in
+            examples.
+
+    Returns:
+        A callable reward_fn(prompts, responses) -> list of float rewards. It
+        raises ValueError when the two batches differ in length.
+
+    Raises:
+        ValueError: If mode is neither "loose" nor "strict".
+    """
+    # Fail fast: without this check a misspelled mode would only surface the
+    # first time a known prompt is scored, and never for unknown prompts.
+    _scoring_function(mode)
+    prompt_index = build_prompt_index(examples)
+
+    def reward_fn(prompts: Sequence[str], responses: Sequence[str]) -> list[float]:
+        _check_same_length(prompts, responses)
+
+        rewards = []
+        for prompt, response in zip(prompts, responses):
+            example = prompt_index.get(prompt)
+            if example is None:
+                rewards.append(missing_prompt_reward)
+            else:
+                rewards.append(score_response(example, response, mode).prompt_reward)
+        return rewards
+
+    return reward_fn
+
+
+def score_response_batch(
+    prompt_index: Mapping[str, evaluation_lib.InputExample],
+    prompts: Sequence[str],
+    responses: Sequence[str],
+    mode: str = "loose",
+) -> list[RewardResult]:
+    """Scores a batch and returns structured per-example results.
+
+    Unlike the function returned by make_reward_fn, unknown prompts are an
+    error here rather than receiving a fallback reward.
+
+    Args:
+        prompt_index: Mapping from prompt text to its example, e.g. the result
+            of build_prompt_index.
+        prompts: Prompts to score, parallel to responses.
+        responses: Model responses, parallel to prompts.
+        mode: Either "loose" or "strict".
+
+    Returns:
+        One RewardResult per prompt/response pair, in input order.
+
+    Raises:
+        ValueError: If the batches differ in length or mode is invalid.
+        KeyError: If a prompt is missing from prompt_index.
+    """
+    _check_same_length(prompts, responses)
+
+    results = []
+    for prompt, response in zip(prompts, responses):
+        if prompt not in prompt_index:
+            raise KeyError(f"prompt not found in IFBench inputs: {prompt!r}")
+        results.append(score_response(prompt_index[prompt], response, mode))
+    return results
+
+
+def summarize_results(results: Sequence[RewardResult]) -> dict[str, float]:
+    """Summarizes prompt-level and instruction-level rewards.
+
+    Returns:
+        A dict with "example_count" and the arithmetic means
+        "prompt_reward_mean" and "instruction_reward_mean". Both means are 0.0
+        for an empty input.
+    """
+    if not results:
+        return {
+            "example_count": 0,
+            "prompt_reward_mean": 0.0,
+            "instruction_reward_mean": 0.0,
+        }
+
+    return {
+        "example_count": len(results),
+        "prompt_reward_mean": sum(r.prompt_reward for r in results) / len(results),
+        "instruction_reward_mean": sum(r.instruction_reward for r in results)
+        / len(results),
+    }
diff --git a/reward_lib_test.py b/reward_lib_test.py
new file mode 100644
index 0000000..c6a9c9b
--- /dev/null
+++ b/reward_lib_test.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2026 Allen Institute for AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for reward_lib.py."""
+
+from absl.testing import absltest
+
+import evaluation_lib
+import reward_lib
+
+
+class RewardLibTest(absltest.TestCase):
+    """Exercises scoring, batching and summary helpers in reward_lib.
+
+    NOTE(review): these tests assume evaluation_lib registers the
+    "sentence:keyword" and "count:numbers" verifiers with the kwargs used
+    below -- confirm against the instruction registry.
+    """
+
+    def _example(self, instruction_ids=None, kwargs=None):
+        """Builds a one-prompt example; defaults to a single keyword check."""
+        return evaluation_lib.InputExample(
+            key=1,
+            instruction_id_list=instruction_ids or ["sentence:keyword"],
+            prompt="Write one sentence.",
+            kwargs=kwargs or [{"word": "giraffe", "N": 1}],
+        )
+
+    def test_score_response_returns_prompt_and_instruction_rewards(self):
+        result = reward_lib.score_response(
+            self._example(), "A giraffe walks carefully."
+        )
+
+        self.assertEqual(result.prompt_reward, 1.0)
+        self.assertEqual(result.instruction_reward, 1.0)
+        self.assertEqual(result.follow_instruction_list, [True])
+
+    def test_score_response_reports_partial_instruction_reward(self):
+        result = reward_lib.score_response(
+            self._example(
+                instruction_ids=["sentence:keyword", "count:numbers"],
+                kwargs=[{"word": "giraffe", "N": 1}, {"N": 2}],
+            ),
+            "A giraffe walks carefully.",
+        )
+
+        self.assertEqual(result.prompt_reward, 0.0)
+        self.assertEqual(result.instruction_reward, 0.5)
+        self.assertEqual(result.follow_instruction_list, [True, False])
+
+    def test_score_response_accepts_json_like_dict(self):
+        # Dict-shaped rows (as read from jsonl) must score like InputExample.
+        result = reward_lib.score_response(
+            {
+                "key": 1,
+                "instruction_id_list": ["sentence:keyword"],
+                "prompt": "Write one sentence.",
+                "kwargs": [{"word": "giraffe", "N": 1}],
+            },
+            "A giraffe walks carefully.",
+        )
+
+        self.assertEqual(result.prompt_reward, 1.0)
+        self.assertEqual(result.prompt, "Write one sentence.")
+
+    def test_score_response_rejects_unknown_mode(self):
+        with self.assertRaisesRegex(ValueError, "mode must be one of"):
+            reward_lib.score_response(
+                self._example(), "A giraffe walks carefully.", mode="fuzzy"
+            )
+
+    def test_make_reward_fn_scores_known_prompts_and_handles_unknown(self):
+        reward_fn = reward_lib.make_reward_fn(
+            [self._example()], missing_prompt_reward=-1.0
+        )
+
+        rewards = reward_fn(
+            ["Write one sentence.", "Unknown prompt."],
+            ["A giraffe walks carefully.", "Anything."],
+        )
+
+        self.assertEqual(rewards, [1.0, -1.0])
+
+    def test_score_response_filters_none_kwargs_for_loose_scoring(self):
+        example = self._example(
+            instruction_ids=["sentence:keyword"],
+            kwargs=[{"word": "giraffe", "N": 1, "unused": None}],
+        )
+
+        result = reward_lib.score_response(example, "A giraffe walks carefully.")
+
+        self.assertEqual(result.prompt_reward, 1.0)
+        # Filtering happens on a copy; the caller's example must be untouched.
+        self.assertEqual(
+            example.kwargs, [{"word": "giraffe", "N": 1, "unused": None}]
+        )
+
+    def test_batch_helpers_validate_lengths(self):
+        reward_fn = reward_lib.make_reward_fn([self._example()])
+
+        with self.assertRaisesRegex(ValueError, "same length"):
+            reward_fn(["Write one sentence."], [])
+
+        with self.assertRaisesRegex(ValueError, "same length"):
+            reward_lib.score_response_batch(
+                reward_lib.build_prompt_index([self._example()]),
+                ["Write one sentence."],
+                [],
+            )
+
+    def test_score_response_batch_raises_for_unknown_prompt(self):
+        # The structured batch helper is strict about unknown prompts, unlike
+        # the reward function, which falls back to missing_prompt_reward.
+        with self.assertRaises(KeyError):
+            reward_lib.score_response_batch(
+                reward_lib.build_prompt_index([self._example()]),
+                ["Unknown prompt."],
+                ["Anything."],
+            )
+
+    def test_summarize_results_reports_means(self):
+        results = [
+            reward_lib.score_response(self._example(), "A giraffe walks carefully."),
+            reward_lib.score_response(self._example(), "A cat walks carefully."),
+        ]
+
+        summary = reward_lib.summarize_results(results)
+
+        self.assertEqual(summary["example_count"], 2)
+        self.assertEqual(summary["prompt_reward_mean"], 0.5)
+        self.assertEqual(summary["instruction_reward_mean"], 0.5)
+
+    def test_summarize_results_handles_empty_input(self):
+        summary = reward_lib.summarize_results([])
+
+        self.assertEqual(summary["example_count"], 0)
+        self.assertEqual(summary["prompt_reward_mean"], 0.0)
+        self.assertEqual(summary["instruction_reward_mean"], 0.0)
+
+
+if __name__ == "__main__":
+    absltest.main()
diff --git a/run_reward.py b/run_reward.py
new file mode 100644
index 0000000..1508a48
--- /dev/null
+++ b/run_reward.py
@@ -0,0 +1,82 @@
+# coding=utf-8
+# Copyright 2026 Allen Institute for AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Smoke runner for using IFBench verifiers as RLVR rewards."""
+
+import dataclasses
+import json
+
+from absl import app
+from absl import flags
+from absl import logging
+
+import evaluation_lib
+import reward_lib
+
+
+_INPUT_DATA = flags.DEFINE_string(
+    "input_data", None, "Path to IFBench input jsonl.", required=True
+)
+_INPUT_RESPONSE_DATA = flags.DEFINE_string(
+    "input_response_data",
+    None,
+    "Path to jsonl rows with prompt and response fields.",
+    required=True,
+)
+_MODE = flags.DEFINE_enum(
+    "mode", "loose", ["loose", "strict"], "IFBench verifier mode."
+)
+# lower_bound=0 rejects negative limits at flag-parse time; a negative value
+# would otherwise slice from the end of the example list.
+_LIMIT = flags.DEFINE_integer(
+    "limit",
+    None,
+    "Optional maximum number of examples to score.",
+    lower_bound=0,
+)
+_OUTPUT_JSONL = flags.DEFINE_string(
+    "output_jsonl", None, "Optional path for per-example reward results."
+)
+
+
+def _write_jsonl(path: str, results: list[reward_lib.RewardResult]) -> None:
+    """Writes one JSON object per RewardResult to `path`, one per line."""
+    # Explicit encoding keeps the output independent of the platform locale.
+    with open(path, "w", encoding="utf-8") as f:
+        for result in results:
+            f.write(json.dumps(dataclasses.asdict(result)))
+            f.write("\n")
+
+
+def main(argv):
+    """Scores responses for the input prompts and prints a JSON summary.
+
+    Prompts without a matching response are scored against the empty string.
+    The summary is printed to stdout; per-example results are written only
+    when --output_jsonl is set.
+    """
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+
+    examples = evaluation_lib.read_prompt_list(_INPUT_DATA.value)
+    if _LIMIT.value is not None:
+        examples = examples[: _LIMIT.value]
+    prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
+        _INPUT_RESPONSE_DATA.value
+    )
+
+    # Surface prompt/response mismatches instead of silently scoring "".
+    missing_count = sum(
+        example.prompt not in prompt_to_response for example in examples
+    )
+    if missing_count:
+        logging.warning(
+            "%d of %d prompts have no response; scoring them as empty strings.",
+            missing_count,
+            len(examples),
+        )
+
+    results = [
+        reward_lib.score_response(
+            example,
+            prompt_to_response.get(example.prompt, ""),
+            mode=_MODE.value,
+        )
+        for example in examples
+    ]
+    summary = reward_lib.summarize_results(results)
+    print(json.dumps(summary, sort_keys=True))
+
+    if _OUTPUT_JSONL.value:
+        _write_jsonl(_OUTPUT_JSONL.value, results)
+
+
+if __name__ == "__main__":
+    app.run(main)