allenai · nam157 · May 20, 2026 · May 20, 2026
diff --git a/README.md b/README.md
@@ -29,6 +29,36 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/
 
 The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG
 
+You can also use the IFBench verifiers directly as a reward function in
+training or local smoke tests:
+
+```python
+import evaluation_lib
+import reward_lib
+
+examples = evaluation_lib.read_prompt_list("data/IFBench_test.jsonl")
+reward_fn = reward_lib.make_reward_fn(examples, mode="loose")
+
+rewards = reward_fn(
+    [examples[0].prompt],
+    ["A model response that attempts to satisfy the prompt constraints."],
+)
+```
+
+For debugging reward shaping, `reward_lib.score_response(...)` returns both the
+binary prompt-level reward and the fractional instruction-level reward.
+
+To run a reproducible local reward smoke test against prompt/response jsonl
+files, use:
+
+```bash
+python3 -m run_reward \
+  --input_data=data/IFBench_test.jsonl \
+  --input_response_data=data/sample_output.jsonl \
+  --mode=loose \
+  --limit=5
+```
+
 ## 📊 Model Performance Leaderboard
 
 | Rank | Model | IFBench Score | IFEval Score |

diff --git a/reward_lib.py b/reward_lib.py
@@ -0,0 +1,168 @@
+# coding=utf-8
+# Copyright 2026 Allen Institute for AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Reward helpers for using IFBench verifiers in RLVR training loops."""
+
+import copy
+import dataclasses
+from typing import Callable, Iterable, Mapping, Sequence
+
+import evaluation_lib
+
+
+@dataclasses.dataclass(frozen=True)
+class RewardResult:
+  """Structured reward output for a single prompt/response pair.
+
+  Instances are produced by score_response. The dataclass is frozen, so
+  attributes cannot be rebound, but the list-valued fields themselves remain
+  ordinary mutable lists.
+
+  Attributes:
+    instruction_id_list: Copy of the instruction ids reported by the verifier
+      output.
+    prompt: Prompt text reported by the verifier output.
+    response: Response text reported by the verifier output.
+    mode: Scoring mode that was used, "loose" or "strict".
+    prompt_reward: 1.0 when the verifier reports that all instructions were
+      followed, otherwise 0.0.
+    instruction_reward: Number of followed instructions divided by the number
+      of instructions; 0.0 when the verifier reports no instructions.
+    follow_instruction_list: Copy of the per-instruction pass/fail flags
+      reported by the verifier (presumably aligned with instruction_id_list;
+      confirm against evaluation_lib).
+  """
+
+  instruction_id_list: list[str]
+  prompt: str
+  response: str
+  mode: str
+  prompt_reward: float
+  instruction_reward: float
+  follow_instruction_list: list[bool]
+
+
+def _as_input_example(example):
+  """Returns `example` as an evaluation_lib.InputExample.
+
+  An InputExample is passed through unchanged (the same object is returned).
+  Anything else is indexed like a mapping and must provide the "key",
+  "instruction_id_list", "prompt" and "kwargs" entries.
+  """
+  if not isinstance(example, evaluation_lib.InputExample):
+    example = evaluation_lib.InputExample(
+        key=example["key"],
+        instruction_id_list=example["instruction_id_list"],
+        prompt=example["prompt"],
+        kwargs=example["kwargs"],
+    )
+  return example
+
+
+def _copy_input_example(example):
+  """Builds an independent InputExample with None-valued kwargs removed.
+
+  The instruction id list is shallow-copied and the kwargs are deep-copied, so
+  the returned example shares no mutable containers with the input. Within each
+  per-instruction kwargs dict, entries whose value is None are dropped.
+  """
+  source = _as_input_example(example)
+  instruction_ids = list(source.instruction_id_list)
+
+  cleaned_kwargs = []
+  for instruction_kwargs in copy.deepcopy(source.kwargs):
+    cleaned_kwargs.append({
+        name: value
+        for name, value in instruction_kwargs.items()
+        if value is not None
+    })
+
+  return evaluation_lib.InputExample(
+      key=source.key,
+      instruction_id_list=instruction_ids,
+      prompt=source.prompt,
+      kwargs=cleaned_kwargs,
+  )
+
+
+def score_response(example, response: str, mode: str = "loose") -> RewardResult:
+  """Runs the IFBench verifiers on one response and packages the rewards.
+
+  Args:
+    example: An evaluation_lib.InputExample or json-like dict from IFBench data.
+    response: Model response to score.
+    mode: Either "loose" or "strict", matching IFBench evaluation modes.
+
+  Returns:
+    RewardResult whose prompt_reward is 1.0 only when the verifier reports that
+    all instructions were followed, and whose instruction_reward is the
+    fraction of instructions followed (0.0 when there are none).
+
+  Raises:
+    ValueError: If mode is not one of the supported modes.
+  """
+  verifiers = {
+      "loose": evaluation_lib.test_instruction_following_loose,
+      "strict": evaluation_lib.test_instruction_following_strict,
+  }
+  verifier = verifiers.get(mode)
+  if verifier is None:
+    raise ValueError(f"mode must be one of {sorted(verifiers)}")
+
+  # The verifier receives a private copy (with None-valued kwargs removed), not
+  # the caller's example.
+  scored_example = _copy_input_example(example)
+  outcome = verifier(scored_example, {scored_example.prompt: response})
+
+  followed = list(outcome.follow_instruction_list)
+  if followed:
+    instruction_reward = sum(followed) / len(followed)
+  else:
+    instruction_reward = 0.0
+  prompt_reward = 1.0 if outcome.follow_all_instructions else 0.0
+
+  return RewardResult(
+      instruction_id_list=list(outcome.instruction_id_list),
+      prompt=outcome.prompt,
+      response=outcome.response,
+      mode=mode,
+      prompt_reward=prompt_reward,
+      instruction_reward=instruction_reward,
+      follow_instruction_list=followed,
+  )
+
+
+def build_prompt_index(examples: Iterable) -> dict[str, evaluation_lib.InputExample]:
+  """Maps each example's prompt text to its InputExample.
+
+  Json-like examples are converted to InputExample first. When several examples
+  share the same prompt, the last one in iteration order is kept.
+  """
+  index = {}
+  for raw_example in examples:
+    converted = _as_input_example(raw_example)
+    index[converted.prompt] = converted
+  return index
+
+
+def make_reward_fn(
+    examples: Iterable,
+    mode: str = "loose",
+    *,
+    missing_prompt_reward: float = 0.0,
+) -> Callable[[Sequence[str], Sequence[str]], list[float]]:
+  """Creates a batch reward function for RLVR trainers.
+
+  The returned function accepts parallel prompt and response batches and returns
+  one binary prompt-level reward per pair. Unknown prompts receive
+  missing_prompt_reward so streaming trainers can continue safely.
+
+  Args:
+    examples: IFBench examples (InputExample objects or json-like dicts) whose
+      prompts the reward function should recognize.
+    mode: Either "loose" or "strict"; forwarded to score_response.
+    missing_prompt_reward: Reward returned for prompts not found in examples.
+
+  Returns:
+    A function reward_fn(prompts, responses) returning one float per pair.
+
+  Raises:
+    ValueError: If mode is not a supported mode. The returned function raises
+      ValueError when prompts and responses differ in length.
+  """
+  # Fail fast: score_response only validates the mode when a known prompt is
+  # actually scored, so without this check a typo in `mode` would go unnoticed
+  # for batches that are empty or contain only unknown prompts. Keep this tuple
+  # in sync with the modes score_response supports.
+  valid_modes = ("loose", "strict")
+  if mode not in valid_modes:
+    raise ValueError(f"mode must be one of {sorted(valid_modes)}")
+
+  prompt_index = build_prompt_index(examples)
+
+  def reward_fn(prompts: Sequence[str], responses: Sequence[str]) -> list[float]:
+    if len(prompts) != len(responses):
+      raise ValueError("prompts and responses must have the same length")
+
+    rewards = []
+    for prompt, response in zip(prompts, responses):
+      example = prompt_index.get(prompt)
+      if example is None:
+        rewards.append(missing_prompt_reward)
+      else:
+        rewards.append(score_response(example, response, mode).prompt_reward)
+    return rewards
+
+  return reward_fn
+
+
+def score_response_batch(
+    prompt_index: Mapping[str, evaluation_lib.InputExample],
+    prompts: Sequence[str],
+    responses: Sequence[str],
+    mode: str = "loose",
+) -> list[RewardResult]:
+  """Scores parallel prompt/response batches into structured results.
+
+  Unlike the function built by make_reward_fn, an unknown prompt is an error
+  here rather than a default reward.
+
+  Raises:
+    ValueError: If prompts and responses differ in length.
+    KeyError: If a prompt is not present in prompt_index.
+  """
+  if len(prompts) != len(responses):
+    raise ValueError("prompts and responses must have the same length")
+
+  def _lookup(prompt_text):
+    # Membership is checked explicitly so the error carries a readable message.
+    if prompt_text not in prompt_index:
+      raise KeyError(f"prompt not found in IFBench inputs: {prompt_text!r}")
+    return prompt_index[prompt_text]
+
+  return [
+      score_response(_lookup(prompt_text), response_text, mode)
+      for prompt_text, response_text in zip(prompts, responses)
+  ]
+
+
+def summarize_results(results: Sequence[RewardResult]) -> dict[str, float]:
+  """Aggregates results into a count plus mean prompt/instruction rewards.
+
+  An empty input yields a count of 0 and both means as 0.0 instead of dividing
+  by zero.
+  """
+  if results:
+    count = len(results)
+    prompt_mean = sum(item.prompt_reward for item in results) / count
+    instruction_mean = sum(item.instruction_reward for item in results) / count
+  else:
+    count, prompt_mean, instruction_mean = 0, 0.0, 0.0
+
+  return {
+      "example_count": count,
+      "prompt_reward_mean": prompt_mean,
+      "instruction_reward_mean": instruction_mean,
+  }
diff --git a/reward_lib_test.py b/reward_lib_test.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2026 Allen Institute for AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for reward_lib.py."""
+
+from absl.testing import absltest
+
+import evaluation_lib
+import reward_lib
+
+
+class RewardLibTest(absltest.TestCase):
+  """Unit tests for the reward helpers in reward_lib."""
+
+  def _example(self, instruction_ids=None, kwargs=None):
+    """Builds an InputExample, defaulting to a single keyword instruction."""
+    return evaluation_lib.InputExample(
+        key=1,
+        instruction_id_list=instruction_ids or ["sentence:keyword"],
+        prompt="Write one sentence.",
+        kwargs=kwargs or [{"word": "giraffe", "N": 1}],
+    )
+
+  def test_score_response_returns_prompt_and_instruction_rewards(self):
+    result = reward_lib.score_response(
+        self._example(), "A giraffe walks carefully."
+    )
+
+    self.assertEqual(result.prompt_reward, 1.0)
+    self.assertEqual(result.instruction_reward, 1.0)
+    self.assertEqual(result.follow_instruction_list, [True])
+
+  def test_score_response_reports_partial_instruction_reward(self):
+    result = reward_lib.score_response(
+        self._example(
+            instruction_ids=["sentence:keyword", "count:numbers"],
+            kwargs=[{"word": "giraffe", "N": 1}, {"N": 2}],
+        ),
+        "A giraffe walks carefully.",
+    )
+
+    self.assertEqual(result.prompt_reward, 0.0)
+    self.assertEqual(result.instruction_reward, 0.5)
+    self.assertEqual(result.follow_instruction_list, [True, False])
+
+  def test_score_response_accepts_dict_examples(self):
+    # Json-like rows must be scored the same way as InputExample objects.
+    example = {
+        "key": 1,
+        "instruction_id_list": ["sentence:keyword"],
+        "prompt": "Write one sentence.",
+        "kwargs": [{"word": "giraffe", "N": 1}],
+    }
+
+    result = reward_lib.score_response(example, "A giraffe walks carefully.")
+
+    self.assertEqual(result.prompt_reward, 1.0)
+    self.assertEqual(result.prompt, "Write one sentence.")
+
+  def test_score_response_rejects_unknown_mode(self):
+    with self.assertRaisesRegex(ValueError, "mode must be one of"):
+      reward_lib.score_response(self._example(), "Anything.", mode="fuzzy")
+
+  def test_make_reward_fn_scores_known_prompts_and_handles_unknown(self):
+    reward_fn = reward_lib.make_reward_fn(
+        [self._example()], missing_prompt_reward=-1.0
+    )
+
+    rewards = reward_fn(
+        ["Write one sentence.", "Unknown prompt."],
+        ["A giraffe walks carefully.", "Anything."],
+    )
+
+    self.assertEqual(rewards, [1.0, -1.0])
+
+  def test_score_response_filters_none_kwargs_for_loose_scoring(self):
+    example = self._example(
+        instruction_ids=["sentence:keyword"],
+        kwargs=[{"word": "giraffe", "N": 1, "unused": None}],
+    )
+
+    result = reward_lib.score_response(example, "A giraffe walks carefully.")
+
+    self.assertEqual(result.prompt_reward, 1.0)
+
+  def test_batch_helpers_validate_lengths(self):
+    reward_fn = reward_lib.make_reward_fn([self._example()])
+
+    with self.assertRaisesRegex(ValueError, "same length"):
+      reward_fn(["Write one sentence."], [])
+
+    # score_response_batch is the second batch helper and performs the same
+    # length check.
+    prompt_index = reward_lib.build_prompt_index([self._example()])
+    with self.assertRaisesRegex(ValueError, "same length"):
+      reward_lib.score_response_batch(prompt_index, ["Write one sentence."], [])
+
+  def test_score_response_batch_scores_known_and_rejects_unknown(self):
+    prompt_index = reward_lib.build_prompt_index([self._example()])
+
+    results = reward_lib.score_response_batch(
+        prompt_index,
+        ["Write one sentence."],
+        ["A giraffe walks carefully."],
+    )
+
+    self.assertLen(results, 1)
+    self.assertEqual(results[0].prompt_reward, 1.0)
+
+    # Unlike make_reward_fn, the structured batch helper raises on prompts it
+    # does not know instead of substituting a default reward.
+    with self.assertRaises(KeyError):
+      reward_lib.score_response_batch(
+          prompt_index, ["Unknown prompt."], ["Anything."]
+      )
+
+  def test_summarize_results_reports_means(self):
+    results = [
+        reward_lib.score_response(self._example(), "A giraffe walks carefully."),
+        reward_lib.score_response(self._example(), "A cat walks carefully."),
+    ]
+
+    summary = reward_lib.summarize_results(results)
+
+    self.assertEqual(summary["example_count"], 2)
+    self.assertEqual(summary["prompt_reward_mean"], 0.5)
+    self.assertEqual(summary["instruction_reward_mean"], 0.5)
+
+  def test_summarize_results_handles_empty_input(self):
+    summary = reward_lib.summarize_results([])
+
+    self.assertEqual(summary["example_count"], 0)
+    self.assertEqual(summary["prompt_reward_mean"], 0.0)
+    self.assertEqual(summary["instruction_reward_mean"], 0.0)
+
+
+if __name__ == "__main__":
+  absltest.main()
diff --git a/run_reward.py b/run_reward.py
@@ -0,0 +1,82 @@
+# coding=utf-8
+# Copyright 2026 Allen Institute for AI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Smoke runner for using IFBench verifiers as RLVR rewards."""
+
+import dataclasses
+import json
+
+from absl import app
+from absl import flags
+
+import evaluation_lib
+import reward_lib
+
+
+# Command-line flags for the smoke runner. Both input paths are required; the
+# remaining flags are optional.
+_INPUT_DATA = flags.DEFINE_string(
+    "input_data", None, "Path to IFBench input jsonl.", required=True
+)
+_INPUT_RESPONSE_DATA = flags.DEFINE_string(
+    "input_response_data",
+    None,
+    "Path to jsonl rows with prompt and response fields.",
+    required=True,
+)
+_MODE = flags.DEFINE_enum(
+    "mode", "loose", ["loose", "strict"], "IFBench verifier mode."
+)
+# lower_bound=0 rejects negative limits at flag-parsing time. The limit is
+# applied as a slice (examples[:limit]), so a negative value would silently
+# drop examples from the END of the list instead of capping the count.
+_LIMIT = flags.DEFINE_integer(
+    "limit",
+    None,
+    "Optional maximum number of examples to score.",
+    lower_bound=0,
+)
+_OUTPUT_JSONL = flags.DEFINE_string(
+    "output_jsonl", None, "Optional path for per-example reward results."
+)
+
+
+def _write_jsonl(path: str, results: list[reward_lib.RewardResult]) -> None:
+  """Writes one JSON object per result to `path`, one per line.
+
+  Args:
+    path: Destination file; it is created or truncated.
+    results: Reward results to serialize via dataclasses.asdict.
+  """
+  # Pin the encoding so the output does not depend on the platform's locale
+  # default.
+  with open(path, "w", encoding="utf-8") as f:
+    for result in results:
+      f.write(json.dumps(dataclasses.asdict(result)))
+      f.write("\n")
+
+
+def main(argv):
+  """Scores responses with the IFBench verifiers and prints a JSON summary.
+
+  Reads examples from --input_data (truncated to --limit when set) and
+  responses from --input_response_data, scores every example in --mode, prints
+  the summary as a single JSON line on stdout, and writes per-example results
+  to --output_jsonl when that flag is set.
+
+  Args:
+    argv: Positional command-line arguments left after flag parsing; only the
+      program name is accepted.
+
+  Raises:
+    app.UsageError: If extra positional arguments are given.
+  """
+  # Imported locally so this function stays self-contained with respect to the
+  # module's existing import block.
+  from absl import logging  # pylint: disable=g-import-not-at-top
+
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+
+  examples = evaluation_lib.read_prompt_list(_INPUT_DATA.value)
+  if _LIMIT.value is not None:
+    examples = examples[: _LIMIT.value]
+  prompt_to_response = evaluation_lib.read_prompt_to_response_dict(
+      _INPUT_RESPONSE_DATA.value
+  )
+
+  # Examples without a response are scored against "" below. Warn about them
+  # (on the log stream, so the JSON on stdout is unchanged) so that a
+  # mismatched response file is not mistaken for a genuinely poor score.
+  missing_count = sum(
+      example.prompt not in prompt_to_response for example in examples
+  )
+  if missing_count:
+    logging.warning(
+        "%d of %d examples have no response in %s; scoring them as empty"
+        " responses.",
+        missing_count,
+        len(examples),
+        _INPUT_RESPONSE_DATA.value,
+    )
+
+  results = [
+      reward_lib.score_response(
+          example,
+          prompt_to_response.get(example.prompt, ""),
+          mode=_MODE.value,
+      )
+      for example in examples
+  ]
+  summary = reward_lib.summarize_results(results)
+  print(json.dumps(summary, sort_keys=True))
+
+  if _OUTPUT_JSONL.value:
+    _write_jsonl(_OUTPUT_JSONL.value, results)
+
+
+if __name__ == "__main__":
+  app.run(main)