Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,36 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/

The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG

You can also use the IFBench verifiers directly as a reward function in
training or local smoke tests:

```python
import evaluation_lib
import reward_lib

examples = evaluation_lib.read_prompt_list("data/IFBench_test.jsonl")
reward_fn = reward_lib.make_reward_fn(examples, mode="loose")

rewards = reward_fn(
[examples[0].prompt],
["A model response that attempts to satisfy the prompt constraints."],
)
```

For debugging reward shaping, `reward_lib.score_response(...)` returns a
`RewardResult` containing both the binary prompt-level reward (`prompt_reward`)
and the fractional instruction-level reward (`instruction_reward`).

To run a reproducible local reward smoke test against prompt/response jsonl
files, use:

```bash
python3 -m run_reward \
--input_data=data/IFBench_test.jsonl \
--input_response_data=data/sample_output.jsonl \
--mode=loose \
--limit=5
```

## 📊 Model Performance Leaderboard

| Rank | Model | IFBench Score | IFEval Score |
Expand Down
168 changes: 168 additions & 0 deletions reward_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# coding=utf-8
# Copyright 2026 Allen Institute for AI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Reward helpers for using IFBench verifiers in RLVR training loops."""

import copy
import dataclasses
from typing import Callable, Iterable, Mapping, Sequence

import evaluation_lib


@dataclasses.dataclass(frozen=True)
class RewardResult:
    """Immutable record of how one response scored against one prompt.

    Attributes:
        instruction_id_list: Ids of the instructions that were checked.
        prompt: Prompt text the response was scored against.
        response: Response text that was scored.
        mode: Verifier mode used for scoring.
        prompt_reward: Prompt-level reward for the response.
        instruction_reward: Instruction-level reward for the response.
        follow_instruction_list: One pass/fail flag per checked instruction.
    """

    # What was scored.
    instruction_id_list: list[str]
    prompt: str
    response: str
    mode: str

    # How it scored.
    prompt_reward: float
    instruction_reward: float
    follow_instruction_list: list[bool]


def _as_input_example(example):
    """Returns `example` as an evaluation_lib.InputExample.

    An InputExample is handed back as-is (the same object). Anything else is
    indexed like a mapping and must provide the "key", "instruction_id_list",
    "prompt", and "kwargs" entries.
    """
    if not isinstance(example, evaluation_lib.InputExample):
        field_names = ("key", "instruction_id_list", "prompt", "kwargs")
        example = evaluation_lib.InputExample(
            **{name: example[name] for name in field_names}
        )
    return example


def _copy_input_example(example):
    """Builds a fresh InputExample whose kwargs carry no None values.

    The input is first normalized with _as_input_example. The instruction id
    list is copied and the kwargs are deep-copied, so the returned example
    shares neither container with the caller's data.
    """
    source = _as_input_example(example)
    key = source.key
    instruction_ids = list(source.instruction_id_list)
    prompt = source.prompt

    cleaned_kwargs = []
    for instruction_kwargs in copy.deepcopy(source.kwargs):
        # Entries whose value is None are dropped before scoring.
        cleaned_kwargs.append(
            {
                name: value
                for name, value in instruction_kwargs.items()
                if value is not None
            }
        )

    return evaluation_lib.InputExample(
        key=key,
        instruction_id_list=instruction_ids,
        prompt=prompt,
        kwargs=cleaned_kwargs,
    )


def score_response(example, response: str, mode: str = "loose") -> RewardResult:
    """Runs the IFBench verifier for one prompt/response pair.

    Args:
        example: An evaluation_lib.InputExample, or a json-like dict that
            _as_input_example can convert into one.
        response: Model response to score.
        mode: "loose" or "strict"; selects which evaluation_lib verifier runs.

    Returns:
        A RewardResult. prompt_reward is 1.0 when the verifier reports that all
        instructions were followed and 0.0 otherwise. instruction_reward is the
        fraction of instructions followed, or 0.0 when there are none.

    Raises:
        ValueError: If mode is neither "loose" nor "strict".
    """
    verifiers = {
        "loose": evaluation_lib.test_instruction_following_loose,
        "strict": evaluation_lib.test_instruction_following_strict,
    }
    verifier = verifiers.get(mode)
    if verifier is None:
        raise ValueError(f"mode must be one of {sorted(verifiers)}")

    # Score a normalized copy so the caller's example is left untouched.
    inp = _copy_input_example(example)
    output = verifier(inp, {inp.prompt: response})

    followed = list(output.follow_instruction_list)
    if followed:
        fraction_followed = sum(followed) / len(followed)
    else:
        fraction_followed = 0.0

    return RewardResult(
        instruction_id_list=list(output.instruction_id_list),
        prompt=output.prompt,
        response=output.response,
        mode=mode,
        prompt_reward=float(bool(output.follow_all_instructions)),
        instruction_reward=fraction_followed,
        follow_instruction_list=followed,
    )


def build_prompt_index(examples: Iterable) -> dict[str, evaluation_lib.InputExample]:
    """Maps each example's prompt text to its InputExample.

    Every item is normalized with _as_input_example first. When several
    examples share a prompt, the last one seen is the one kept.
    """
    index = {}
    for raw_example in examples:
        normalized = _as_input_example(raw_example)
        index[normalized.prompt] = normalized
    return index


def make_reward_fn(
    examples: Iterable,
    mode: str = "loose",
    *,
    missing_prompt_reward: float = 0.0,
) -> Callable[[Sequence[str], Sequence[str]], list[float]]:
    """Builds a batch reward callable backed by the given examples.

    Args:
        examples: InputExamples or json-like dicts to index by prompt.
        mode: Verifier mode forwarded to score_response for every pair.
        missing_prompt_reward: Reward returned for a prompt that is not in the
            index, instead of raising.

    Returns:
        A function taking parallel prompt and response sequences and returning
        one prompt-level reward per pair. It raises ValueError when the two
        sequences differ in length.
    """
    prompt_index = build_prompt_index(examples)

    def _reward_for_pair(prompt: str, response: str) -> float:
        example = prompt_index.get(prompt)
        if example is None:
            return missing_prompt_reward
        return score_response(example, response, mode).prompt_reward

    def reward_fn(prompts: Sequence[str], responses: Sequence[str]) -> list[float]:
        if len(prompts) != len(responses):
            raise ValueError("prompts and responses must have the same length")
        return [
            _reward_for_pair(prompt, response)
            for prompt, response in zip(prompts, responses)
        ]

    return reward_fn


def score_response_batch(
    prompt_index: Mapping[str, evaluation_lib.InputExample],
    prompts: Sequence[str],
    responses: Sequence[str],
    mode: str = "loose",
) -> list[RewardResult]:
    """Scores parallel prompt/response batches into structured results.

    Args:
        prompt_index: Prompt-to-example mapping, e.g. from build_prompt_index.
        prompts: Prompts to look up in prompt_index.
        responses: Responses, positionally aligned with prompts.
        mode: Verifier mode forwarded to score_response.

    Returns:
        One RewardResult per pair, in input order.

    Raises:
        ValueError: If prompts and responses differ in length.
        KeyError: If a prompt is not present in prompt_index.
    """
    if len(prompts) != len(responses):
        raise ValueError("prompts and responses must have the same length")

    def _score_pair(prompt, response):
        if prompt in prompt_index:
            return score_response(prompt_index[prompt], response, mode)
        raise KeyError(f"prompt not found in IFBench inputs: {prompt!r}")

    return [_score_pair(prompt, response) for prompt, response in zip(prompts, responses)]


def summarize_results(results: Sequence[RewardResult]) -> dict[str, float]:
    """Aggregates reward results into a count and two mean rewards.

    Returns:
        A dict with "example_count", "prompt_reward_mean", and
        "instruction_reward_mean". For empty input the count is 0 and both
        means are 0.0.
    """
    example_count = 0
    prompt_mean = 0.0
    instruction_mean = 0.0

    if results:
        example_count = len(results)
        prompt_total = sum(result.prompt_reward for result in results)
        instruction_total = sum(result.instruction_reward for result in results)
        prompt_mean = prompt_total / example_count
        instruction_mean = instruction_total / example_count

    return {
        "example_count": example_count,
        "prompt_reward_mean": prompt_mean,
        "instruction_reward_mean": instruction_mean,
    }
105 changes: 105 additions & 0 deletions reward_lib_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# coding=utf-8
# Copyright 2026 Allen Institute for AI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for reward_lib.py."""

from absl.testing import absltest

import evaluation_lib
import reward_lib


class RewardLibTest(absltest.TestCase):
    """Tests for the scoring, batching, and summary helpers in reward_lib."""

    def _example(self, instruction_ids=None, kwargs=None):
        """Returns an InputExample, defaulting to a single keyword instruction."""
        return evaluation_lib.InputExample(
            key=1,
            instruction_id_list=instruction_ids or ["sentence:keyword"],
            prompt="Write one sentence.",
            kwargs=kwargs or [{"word": "giraffe", "N": 1}],
        )

    def test_score_response_returns_prompt_and_instruction_rewards(self):
        result = reward_lib.score_response(
            self._example(), "A giraffe walks carefully."
        )

        self.assertEqual(result.prompt_reward, 1.0)
        self.assertEqual(result.instruction_reward, 1.0)
        self.assertEqual(result.follow_instruction_list, [True])

    def test_score_response_reports_partial_instruction_reward(self):
        result = reward_lib.score_response(
            self._example(
                instruction_ids=["sentence:keyword", "count:numbers"],
                kwargs=[{"word": "giraffe", "N": 1}, {"N": 2}],
            ),
            "A giraffe walks carefully.",
        )

        self.assertEqual(result.prompt_reward, 0.0)
        self.assertEqual(result.instruction_reward, 0.5)
        self.assertEqual(result.follow_instruction_list, [True, False])

    def test_score_response_rejects_unknown_mode(self):
        with self.assertRaisesRegex(ValueError, "mode must be one of"):
            reward_lib.score_response(self._example(), "Anything.", mode="lenient")

    def test_make_reward_fn_scores_known_prompts_and_handles_unknown(self):
        reward_fn = reward_lib.make_reward_fn(
            [self._example()], missing_prompt_reward=-1.0
        )

        rewards = reward_fn(
            ["Write one sentence.", "Unknown prompt."],
            ["A giraffe walks carefully.", "Anything."],
        )

        self.assertEqual(rewards, [1.0, -1.0])

    def test_score_response_filters_none_kwargs_for_loose_scoring(self):
        example = self._example(
            instruction_ids=["sentence:keyword"],
            kwargs=[{"word": "giraffe", "N": 1, "unused": None}],
        )

        result = reward_lib.score_response(example, "A giraffe walks carefully.")

        self.assertEqual(result.prompt_reward, 1.0)

    def test_batch_helpers_validate_lengths(self):
        # Both batch entry points must reject mismatched batch sizes.
        reward_fn = reward_lib.make_reward_fn([self._example()])
        prompt_index = reward_lib.build_prompt_index([self._example()])

        with self.assertRaisesRegex(ValueError, "same length"):
            reward_fn(["Write one sentence."], [])

        with self.assertRaisesRegex(ValueError, "same length"):
            reward_lib.score_response_batch(
                prompt_index, ["Write one sentence."], []
            )

    def test_score_response_batch_raises_for_unknown_prompt(self):
        # Unlike make_reward_fn, the structured batch helper does not fall back
        # to a default reward for prompts missing from the index.
        prompt_index = reward_lib.build_prompt_index([self._example()])

        with self.assertRaises(KeyError):
            reward_lib.score_response_batch(
                prompt_index, ["Unknown prompt."], ["Anything."]
            )

    def test_summarize_results_reports_means(self):
        results = [
            reward_lib.score_response(self._example(), "A giraffe walks carefully."),
            reward_lib.score_response(self._example(), "A cat walks carefully."),
        ]

        summary = reward_lib.summarize_results(results)

        self.assertEqual(summary["example_count"], 2)
        self.assertEqual(summary["prompt_reward_mean"], 0.5)
        self.assertEqual(summary["instruction_reward_mean"], 0.5)

    def test_summarize_results_handles_empty_input(self):
        summary = reward_lib.summarize_results([])

        self.assertEqual(summary["example_count"], 0)
        self.assertEqual(summary["prompt_reward_mean"], 0.0)
        self.assertEqual(summary["instruction_reward_mean"], 0.0)


# Run the tests through absltest when this file is executed as a script.
if __name__ == "__main__":
    absltest.main()
82 changes: 82 additions & 0 deletions run_reward.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# coding=utf-8
# Copyright 2026 Allen Institute for AI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Smoke runner for using IFBench verifiers as RLVR rewards."""

import dataclasses
import json

from absl import app
from absl import flags

import evaluation_lib
import reward_lib


# Required: IFBench examples, loaded in main() with
# evaluation_lib.read_prompt_list.
_INPUT_DATA = flags.DEFINE_string(
    "input_data", None, "Path to IFBench input jsonl.", required=True
)
# Required: responses, loaded in main() with
# evaluation_lib.read_prompt_to_response_dict.
_INPUT_RESPONSE_DATA = flags.DEFINE_string(
    "input_response_data",
    None,
    "Path to jsonl rows with prompt and response fields.",
    required=True,
)
# Verifier mode passed to reward_lib.score_response; defaults to "loose".
_MODE = flags.DEFINE_enum(
    "mode", "loose", ["loose", "strict"], "IFBench verifier mode."
)
# Optional: when set, main() uses it as a slice bound on the example list.
_LIMIT = flags.DEFINE_integer(
    "limit", None, "Optional maximum number of examples to score."
)
# Optional: when set, main() writes per-example results to this path.
_OUTPUT_JSONL = flags.DEFINE_string(
    "output_jsonl", None, "Optional path for per-example reward results."
)


def _write_jsonl(path: str, results: list[reward_lib.RewardResult]) -> None:
    """Writes reward results to `path` as JSON Lines.

    Args:
        path: Destination file; opened in write mode, so existing content is
            replaced.
        results: Results to serialize, one JSON object per line, each produced
            with dataclasses.asdict.
    """
    # Pin the encoding so the output file does not depend on the platform's
    # locale default.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(dataclasses.asdict(result)) + "\n" for result in results
        )


def main(argv):
    """Scores IFBench examples against stored responses and prints a summary.

    Reads the examples from --input_data and the responses from
    --input_response_data, scores each example with reward_lib.score_response
    in the selected --mode, prints the summary as one JSON line, and writes
    per-example results to --output_jsonl when that flag is set.

    Args:
        argv: Positional command-line arguments left over after flag parsing.

    Raises:
        app.UsageError: If extra positional arguments are given or --limit is
            negative.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    limit = _LIMIT.value
    if limit is not None and limit < 0:
        # Used as a slice bound, a negative limit would silently drop examples
        # from the end of the list instead of capping how many are scored.
        raise app.UsageError("--limit must be non-negative.")

    examples = evaluation_lib.read_prompt_list(_INPUT_DATA.value)
    if limit is not None:
        examples = examples[:limit]
    prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
        _INPUT_RESPONSE_DATA.value
    )

    # Examples without a stored response are scored against an empty string.
    results = [
        reward_lib.score_response(
            example,
            prompt_to_response.get(example.prompt, ""),
            mode=_MODE.value,
        )
        for example in examples
    ]
    summary = reward_lib.summarize_results(results)
    print(json.dumps(summary, sort_keys=True))

    if _OUTPUT_JSONL.value:
        _write_jsonl(_OUTPUT_JSONL.value, results)


# Hand control to absl, which parses the flags and then calls main().
if __name__ == "__main__":
    app.run(main)