allenai · yannickvilleneuve-hash · May 26, 2026
diff --git a/README.md b/README.md
@@ -29,6 +29,42 @@ We also release our IF-RLVR code, as part of [open-instruct](https://github.com/
 
 The new training constraints and verification functions are here: https://github.com/allenai/open-instruct/tree/main/open_instruct/IFEvalG
 
+## RLVR task export and rewards
+
+This repo also includes a lightweight adapter for using IFBench prompts and
+verifiers as train-ready RLVR tasks.
+
+Export IFBench rows as chat-message JSONL tasks:
+
+```
+python3 -m rlvr_adapter export \
+  --input-data data/IFBench_test.jsonl \
+  --output-data data/ifbench_rlvr_tasks.jsonl
+```
+
+Score model responses with scalar rewards:
+
+```
+python3 -m rlvr_adapter score \
+  --input-data data/IFBench_test.jsonl \
+  --response-data data/sample_output.jsonl \
+  --output-data eval/ifbench_rewards.jsonl \
+  --reward-mode all
+```
+
+For Prime Intellect Verifiers or prime-rl, install the optional dependencies
+and load the environment from `ifbench_verifiers.py`:
+
+```
+uv sync --extra rlvr
+python3 - <<'PY'
+from ifbench_verifiers import IFBenchEnvConfig, load_environment
+
+env = load_environment(IFBenchEnvConfig())
+print(len(env.taskset.get_dataset()))
+PY
+```
+
 ## 📊 Model Performance Leaderboard
 
 | Rank | Model | IFBench Score | IFEval Score |

diff --git a/evaluation_lib.py b/evaluation_lib.py
@@ -17,6 +17,7 @@
 
 import collections
 import dataclasses
+import copy
 import json
 from typing import Dict, Optional, Union
 
@@ -40,6 +41,22 @@ class OutputExample:
   follow_instruction_list: list[bool]
 
 
+def _lookup_response(prompt_to_response, prompt):
+  """Looks up `prompt` exactly, then whitespace-stripped; None if absent."""
+  key = prompt
+  if key not in prompt_to_response:
+    # Retry with surrounding whitespace removed to absorb harmless drift.
+    key = prompt.strip()
+    if key not in prompt_to_response:
+      return None
+
+  return prompt_to_response[key]
+
+
+def _drop_none_values(kwargs):  # Filters out entries whose value is None.
+  return {name: arg for name, arg in kwargs.items() if arg is not None}
+
+
 def read_prompt_list(input_jsonl_filename):
   """Read inputs from jsonl."""
   inputs = []
@@ -77,15 +94,15 @@ def test_instruction_following_strict(
     prompt_to_response,
 ):
   """Tests response to see if instrutions are followed."""
-  response = prompt_to_response[inp.prompt]
+  response = _lookup_response(prompt_to_response, inp.prompt) or ""
   instruction_list = inp.instruction_id_list
   is_following_list = []
 
   for index, instruction_id in enumerate(instruction_list):
     instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
     instruction = instruction_cls(instruction_id)
-    inp.kwargs[index] = {key: value for key, value in inp.kwargs[index].items() if value is not None}
-    instruction.build_description(**inp.kwargs[index])
+    instruction_kwargs = _drop_none_values(inp.kwargs[index])
+    instruction.build_description(**instruction_kwargs)
     args = instruction.get_instruction_args()
     if args and "prompt" in args:
       instruction.build_description(prompt=inp.prompt)
@@ -109,7 +126,7 @@ def test_instruction_following_loose(
     prompt_to_response,
 ):
   """Tests response for an upper bound for following instructions."""
-  response = prompt_to_response[inp.prompt]
+  response = _lookup_response(prompt_to_response, inp.prompt)
   if response is None:
       return OutputExample(
           instruction_id_list=inp.instruction_id_list,
@@ -144,7 +161,7 @@ def test_instruction_following_loose(
     instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
     instruction = instruction_cls(instruction_id)
 
-    instruction.build_description(**inp.kwargs[index])
+    instruction.build_description(**_drop_none_values(inp.kwargs[index]))
     args = instruction.get_instruction_args()
     if args and "prompt" in args:
       instruction.build_description(prompt=inp.prompt)
@@ -166,13 +183,26 @@ def test_instruction_following_loose(
   )
 
 
+def evaluate_response(inp, response, loose=False):
+  """Scores one response for `inp` with the strict or loose checker.
+
+  A one-entry prompt-to-response mapping replaces the usual jsonl file.
+  """
+  checker = (test_instruction_following_loose if loose
+             else test_instruction_following_strict)
+  # The checker gets a deep copy, so it cannot modify the caller's `inp`.
+  return checker(copy.deepcopy(inp), {inp.prompt: response})
+
+
 def read_prompt_to_response_dict(input_jsonl_filename):
   """Creates dictionary matching prompt and response."""
   return_dict = {}
   with open(input_jsonl_filename, "r") as f:
     for l in f:
       example = json.loads(l)
-      return_dict[example["prompt"]] = example["response"]
+      prompt = example["prompt"]  # Also stored stripped, if that key is free.
+      return_dict[prompt] = example["response"]
+      return_dict.setdefault(prompt.strip(), example["response"])
   return return_dict
 
 

diff --git a/ifbench_verifiers.py b/ifbench_verifiers.py
@@ -0,0 +1,120 @@
+"""Prime Intellect Verifiers environment for IFBench.
+
+This module is optional: install the ``rlvr`` extra before loading it with
+Verifiers or prime-rl.
+"""
+
+from collections.abc import Mapping
+from typing import Literal
+
+import evaluation_lib
+import rlvr_adapter
+
+try:
+  import verifiers as vf
+except ImportError as exc:  # pragma: no cover - exercised only without extra deps.
+  raise ImportError(
+      "ifbench_verifiers requires the optional rlvr dependencies. "
+      "Install with `uv sync --extra rlvr` or `pip install .[rlvr]`."
+  ) from exc
+
+
+RewardMode = Literal["all", "mean"]
+
+
+class IFBenchTasksetConfig(vf.TasksetConfig):  # Settings for IFBenchTaskset.
+  input_path: str = "data/IFBench_test.jsonl"  # Rows served by `source`.
+  eval_input_path: str | None = None  # Falls back to input_path when unset.
+  limit: int | None = None  # Slice bound for `source`; None keeps all rows.
+  eval_limit: int | None = None  # Slice bound for `eval_source`; None = all.
+  loose: bool = False  # Passed as `loose` to rlvr_adapter.score_response.
+  reward_mode: RewardMode = "all"  # Passed as `reward_mode` when scoring.
+  system_prompt: str | None = None  # Forwarded to vf.Taskset.__init__.
+
+
+def _input_from_task(task) -> evaluation_lib.InputExample:
+  """Rebuilds an InputExample from a task row; key falls back to example_id."""
+  fallback_key = task.get("example_id", "")
+  instruction_ids = list(task["instruction_id_list"])
+  return evaluation_lib.InputExample(
+      key=task.get("key", fallback_key), instruction_id_list=instruction_ids,
+      prompt=task["question"], kwargs=list(task["kwargs"]))
+
+
+def _assistant_response(completion) -> str:
+  """Returns the content of the last assistant message as a string.
+
+  Handles mapping and attribute-style messages; "" if none is found.
+  """
+  for msg in reversed(completion or []):
+    if isinstance(msg, Mapping):
+      role, content = msg.get("role"), msg.get("content")
+    else:
+      role, content = getattr(msg, "role", None), getattr(msg, "content", None)
+    if role == "assistant":
+      return str(content or "")
+  return ""
+
+
+class IFBenchTaskset(vf.Taskset):
+  """Taskset serving IFBench rows and scoring completions against them."""
+  config_type = IFBenchTasksetConfig
+  config: IFBenchTasksetConfig
+
+  def __init__(self, config: IFBenchTasksetConfig | dict | None = None):
+    cfg = self.config_type.from_config(config)
+    # Evaluation reuses the training file unless its own path is configured.
+    eval_path = cfg.eval_input_path or cfg.input_path
+    super().__init__(
+        # Sources are callables: rows are read only when one is invoked.
+        source=lambda: list(self._load_ifbench_rows(cfg.input_path, cfg.limit)),
+        eval_source=lambda: list(
+            self._load_ifbench_rows(eval_path, cfg.eval_limit)
+        ),
+        system_prompt=cfg.system_prompt,
+        config=cfg,
+    )
+
+  def _load_ifbench_rows(self, path: str, limit: int | None):
+    """Yields a task row per example in `path`, sliced to `[:limit]` if set."""
+    examples = evaluation_lib.read_prompt_list(path)
+    for example in (examples if limit is None else examples[:limit]):
+      task = rlvr_adapter.task_from_input(example)
+      key = str(example.key)
+      yield {
+          "example_id": key,
+          "key": key,
+          "prompt": task["messages"],
+          "question": example.prompt,
+          "instruction_id_list": example.instruction_id_list,
+          "kwargs": example.kwargs,
+          "info": {"reward_spec": task["reward_spec"]},
+      }
+
+  @vf.reward(weight=1.0)
+  async def ifbench_reward(self, task, state) -> float:
+    """Returns the scalar reward for the last assistant message in `state`."""
+    reply = _assistant_response(state.get("completion"))
+    outcome = rlvr_adapter.score_response(
+        _input_from_task(task),
+        reply,
+        loose=self.config.loose,
+        reward_mode=self.config.reward_mode,
+    )
+    return float(outcome["reward"])
+
+
+class IFBenchEnvConfig(vf.EnvConfig):  # Config consumed by load_environment.
+  taskset: IFBenchTasksetConfig = IFBenchTasksetConfig()  # -> IFBenchTaskset
+  harness: vf.HarnessConfig = vf.HarnessConfig()  # -> vf.Harness
+
+
+def load_taskset(config: IFBenchTasksetConfig) -> IFBenchTaskset:
+  return IFBenchTaskset(config=config)  # Same as calling the constructor.
+
+
+def load_environment(config: IFBenchEnvConfig) -> vf.Env:
+  """Builds a vf.Env from an IFBenchTaskset and a vf.Harness per `config`."""
+  taskset = IFBenchTaskset(config=config.taskset)
+  harness = vf.Harness(config=config.harness)
+  return vf.Env(taskset=taskset, harness=harness)
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,10 @@ dependencies = [
 dev = [
     "pytest",
 ]
+rlvr = [
+    "datasets",
+    "verifiers>=0.1.14",
+]
 
 [build-system]
 requires = ["hatchling"]