From b43ee152e3c7ca125c5ab2429eba4449c7e67cdf Mon Sep 17 00:00:00 2001
From: partyplatter08-lab
 <226721044+partyplatter08-lab@users.noreply.github.com>
Date: Sun, 31 May 2026 13:11:15 -0400
Subject: [PATCH 1/4] Add IFBench RLVR verifiers environment

---
 environments/ifbench_rlvr/README.md       |  45 +++++
 environments/ifbench_rlvr/ifbench_rlvr.py | 207 ++++++++++++++++++++++
 environments/ifbench_rlvr/pyproject.toml  |  27 +++
 tests/test_ifbench_rlvr.py                | 138 +++++++++++++++
 4 files changed, 417 insertions(+)
 create mode 100644 environments/ifbench_rlvr/README.md
 create mode 100644 environments/ifbench_rlvr/ifbench_rlvr.py
 create mode 100644 environments/ifbench_rlvr/pyproject.toml
 create mode 100644 tests/test_ifbench_rlvr.py

diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md
new file mode 100644
index 0000000..d0e4102
--- /dev/null
+++ b/environments/ifbench_rlvr/README.md
@@ -0,0 +1,45 @@
+# ifbench-rlvr
+
+Verifiers environment for IFBench instruction-following RLVR.
+
+## Overview
+
+- Environment ID: `ifbench-rlvr`
+- Task type: single-turn instruction following
+- Reward: fraction of IFBench verifier constraints satisfied by the model response
+- Primary training dataset: `allenai/IF_multi_constraints_upto5`
+- Default eval dataset: `allenai/IFBench_test`
+
+The environment converts IFBench examples into `verifiers` single-turn tasks. Each
+task prompt is the IFBench user prompt, and the reward calls the same verifier
+classes used by IFBench. The released IF-RLVR training split references the
+additional verifier registry from `allenai/open-instruct`, so that package is
+declared as an environment dependency.
+
+## Quickstart
+
+From this directory:
+
+```bash
+prime eval run . -m openai/gpt-5-nano -n 20 -r 1
+```
+
+Use a small local smoke test with the packaged IFBench sample data:
+
+```bash
+python -m pytest tests/test_ifbench_rlvr.py
+```
+
+## Environment Arguments
+
+| Arg | Type | Default | Description |
+| --- | --- | --- | --- |
+| `dataset_name` | str | `allenai/IF_multi_constraints_upto5` | Hugging Face train dataset. |
+| `dataset_split` | str | `train` | Train split. |
+| `eval_dataset_name` | str | `allenai/IFBench_test` | Hugging Face eval dataset. |
+| `eval_dataset_split` | str | `train` | Eval split. |
+| `train_jsonl` | str or null | null | Optional local IFBench-format train JSONL. |
+| `eval_jsonl` | str or null | null | Optional local IFBench-format eval JSONL. |
+| `num_train_examples` | int | `-1` | Limit train examples; `-1` means all. |
+| `num_eval_examples` | int | `-1` | Limit eval examples; `-1` means all. |
+| `strict` | bool | `true` | Use strict IFBench checking. If false, use loose checking. |
diff --git a/environments/ifbench_rlvr/ifbench_rlvr.py b/environments/ifbench_rlvr/ifbench_rlvr.py
new file mode 100644
index 0000000..6747eac
--- /dev/null
+++ b/environments/ifbench_rlvr/ifbench_rlvr.py
@@ -0,0 +1,207 @@
+import ast
+import json
+from pathlib import Path
+from typing import Any
+
+from datasets import Dataset, load_dataset
+
+import instructions_registry
+import verifiers as vf
+
+try:
+    from open_instruct.IFEvalG import instructions_registry as rlvr_instructions_registry
+except ImportError:
+    rlvr_instructions_registry = None
+
+
+DEFAULT_TRAIN_DATASET = "allenai/IF_multi_constraints_upto5"
+DEFAULT_EVAL_DATASET = "allenai/IFBench_test"
+
+
+def _normalise_kwargs(kwargs: Any) -> dict[str, Any]:
+    """Return ``kwargs`` without its ``None``-valued entries; ``None`` yields ``{}``."""
+    if kwargs is not None and not isinstance(kwargs, dict):
+        raise TypeError(f"Expected instruction kwargs to be a dict or None, got {type(kwargs)!r}")
+    present = kwargs or {}
+    return {name: setting for name, setting in present.items() if setting is not None}
+
+
+def _flatten_ground_truth(ground_truth: str) -> tuple[list[str], list[dict[str, Any]]]:
+    """Parse a Python-literal ground-truth string into parallel id and kwargs lists."""
+    flat_ids: list[str] = []
+    flat_kwargs: list[dict[str, Any]] = []
+
+    for group in ast.literal_eval(ground_truth):
+        group_ids = group.get("instruction_id", [])
+        group_args = group.get("kwargs", [])
+        if isinstance(group_ids, str):
+            group_ids = [group_ids]
+        if not isinstance(group_args, list):
+            group_args = [group_args]
+        # Pad with empty kwargs so zip() below does not drop trailing instruction ids.
+        group_args = [*group_args, *([{}] * (len(group_ids) - len(group_args)))]
+        for instruction_id, raw_kwargs in zip(group_ids, group_args):
+            flat_ids.append(instruction_id)
+            flat_kwargs.append(_normalise_kwargs(raw_kwargs))
+
+    return flat_ids, flat_kwargs
+
+
+def normalise_example(example: dict[str, Any]) -> dict[str, Any]:
+    """Convert an IFBench eval row or IF-RLVR train row into a verifiers task row."""
+    if "prompt" in example:
+        prompt = example["prompt"]
+        instruction_ids = list(example["instruction_id_list"])
+        kwargs = [_normalise_kwargs(item) for item in example["kwargs"]]
+    else:
+        user_turns = (turn["content"] for turn in example["messages"] if turn["role"] == "user")
+        # Default avoids a bare StopIteration when the row has no user turn.
+        prompt = next(user_turns, None)
+        if prompt is None:
+            raise ValueError("IFBench training example has no user message")
+        instruction_ids, kwargs = _flatten_ground_truth(example["ground_truth"])
+
+    key = example.get("key")
+    answer_payload = {
+        "key": key,
+        "prompt": prompt,
+        "instruction_id_list": instruction_ids,
+        "kwargs": kwargs,
+    }
+    return {
+        "question": prompt,
+        "answer": json.dumps(answer_payload, ensure_ascii=False),
+        "info": {"key": key, "instruction_id_list": instruction_ids},
+    }
+
+
+def _read_jsonl(path: str | Path) -> Dataset:
+    """Load a UTF-8 JSONL file, skipping blank lines, as normalised task rows."""
+    with Path(path).open("r", encoding="utf-8") as handle:
+        rows = [
+            normalise_example(json.loads(line)) for line in handle if line.strip()
+        ]
+    return Dataset.from_list(rows)
+
+
+def _load_hf_dataset(name: str, split: str, limit: int) -> Dataset:
+    """Load a Hugging Face split, keep the first ``limit`` rows (-1 keeps all), normalise it."""
+    raw = load_dataset(name, split=split)
+    subset = raw if limit == -1 else raw.select(range(min(limit, len(raw))))
+    return subset.map(normalise_example, remove_columns=subset.column_names)
+
+
+def _get_instruction_cls(instruction_id: str):
+    """Look up the verifier class, trying IFBench first and then open-instruct."""
+    for registry in (instructions_registry, rlvr_instructions_registry):
+        # The open-instruct registry is None when that optional import failed.
+        if registry is None:
+            continue
+        if instruction_id in registry.INSTRUCTION_DICT:
+            return registry.INSTRUCTION_DICT[instruction_id]
+    raise KeyError(
+        f"Unknown IFBench instruction id {instruction_id!r}. "
+        "Install allenai/open-instruct for IF-RLVR training constraints."
+    )
+
+
+def _candidate_responses(response: str, strict: bool) -> list[str]:
+    """Return the response variants to check: one if strict, eight if loose."""
+    if strict:
+        return [response]
+    lines = response.split("\n")
+    # Loose mode retries without the first and/or last line and without "*".
+    trimmed = [
+        "\n".join(lines[1:]).strip(),
+        "\n".join(lines[:-1]).strip(),
+        "\n".join(lines[1:-1]).strip(),
+    ]
+    return [
+        response,
+        response.replace("*", ""),
+        *trimmed,
+        *(variant.replace("*", "") for variant in trimmed),
+    ]
+
+
+def instruction_following_reward(
+    completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs
+) -> float:
+    response = parser.parse_answer(completion) or ""
+    payload = json.loads(answer)
+    followed = []
+
+    for instruction_id, instruction_kwargs in zip(
+        payload["instruction_id_list"], payload["kwargs"]
+    ):
+        instruction_cls = _get_instruction_cls(instruction_id)
+        instruction = instruction_cls(instruction_id)
+        instruction.build_description(**instruction_kwargs)
+        args = instruction.get_instruction_args()
+        if args and "prompt" in args:  # rebuild with the task prompt when the args include "prompt"
+            instruction.build_description(prompt=payload["prompt"])
+        followed.append(
+            any(  # the constraint counts as followed if any non-blank candidate passes
+                candidate.strip() and instruction.check_following(candidate)
+                for candidate in _candidate_responses(response, strict)
+            )
+        )
+
+    if not followed:  # no constraints in the payload: score 0.0 instead of dividing by zero
+        return 0.0
+    return sum(followed) / len(followed)
+
+
+def all_constraints_reward(completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs) -> float:
+    return 1.0 if instruction_following_reward(completion, answer, parser, strict=strict) == 1.0 else 0.0
+
+
+def load_environment(
+    dataset_name: str = DEFAULT_TRAIN_DATASET,
+    dataset_split: str = "train",
+    eval_dataset_name: str = DEFAULT_EVAL_DATASET,
+    eval_dataset_split: str = "train",
+    train_jsonl: str | None = None,
+    eval_jsonl: str | None = None,
+    num_train_examples: int = -1,
+    num_eval_examples: int = -1,
+    strict: bool = True,
+    system_prompt: str | None = None,
+) -> vf.Environment:
+    """Build the single-turn IFBench environment; a JSONL path overrides its HF dataset."""
+    def build_split(jsonl: str | None, name: str, split: str, limit: int) -> Dataset:
+        if not jsonl:
+            return _load_hf_dataset(name, split, limit)
+        dataset = _read_jsonl(jsonl)
+        if limit != -1:
+            dataset = dataset.select(range(min(limit, len(dataset))))
+        return dataset
+
+    def build_dataset() -> Dataset:
+        return build_split(train_jsonl, dataset_name, dataset_split, num_train_examples)
+
+    def build_eval_dataset() -> Dataset:
+        return build_split(eval_jsonl, eval_dataset_name, eval_dataset_split, num_eval_examples)
+
+    parser = vf.Parser()
+    # Named closures rather than lambdas so each reward has a distinct __name__; extra
+    # rubric kwargs are accepted but not forwarded because the helpers never read them.
+    def instruction_following(completion, answer, **kwargs) -> float:
+        return instruction_following_reward(completion, answer, parser, strict=strict)
+
+    def all_constraints(completion, answer, **kwargs) -> float:
+        return all_constraints_reward(completion, answer, parser, strict=strict)
+
+    rubric = vf.Rubric(
+        funcs=[instruction_following, all_constraints],
+        weights=[1.0, 0.0],
+        parser=parser,
+    )
+
+    return vf.SingleTurnEnv(
+        dataset=build_dataset,
+        eval_dataset=build_eval_dataset,
+        system_prompt=system_prompt,
+        parser=parser,
+        rubric=rubric,
+    )
diff --git a/environments/ifbench_rlvr/pyproject.toml b/environments/ifbench_rlvr/pyproject.toml
new file mode 100644
index 0000000..6bb30d0
--- /dev/null
+++ b/environments/ifbench_rlvr/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "ifbench-rlvr"
+version = "0.1.0"
+description = "Prime Verifiers environment for IFBench instruction-following RLVR."
+license = "Apache-2.0"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "datasets",
+    "ifbench @ git+https://github.com/allenai/IFBench.git",
+    "open-instruct @ git+https://github.com/allenai/open-instruct.git",
+    "verifiers>=0.1.14",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["ifbench_rlvr.py", "pyproject.toml", "README.md"]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.verifiers.eval]
+num_examples = 20
+rollouts_per_example = 1
diff --git a/tests/test_ifbench_rlvr.py b/tests/test_ifbench_rlvr.py
new file mode 100644
index 0000000..19bd8d3
--- /dev/null
+++ b/tests/test_ifbench_rlvr.py
@@ -0,0 +1,138 @@
+import json
+import sys
+import types
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+ENV_DIR = ROOT / "environments" / "ifbench_rlvr"
+sys.path.insert(0, str(ROOT))
+sys.path.insert(0, str(ENV_DIR))
+
+
+class DummyDataset(list):  # list-backed stand-in for the parts of datasets.Dataset used here
+    @classmethod
+    def from_list(cls, rows):
+        return cls(rows)
+
+    @property
+    def column_names(self):
+        return list(self[0].keys()) if self else []  # taken from the first row; [] when empty
+
+    def select(self, indices):
+        return DummyDataset([self[index] for index in indices])
+
+    def map(self, function, remove_columns=None):  # remove_columns is accepted but ignored
+        return DummyDataset([function(row) for row in self])
+
+
+datasets_stub = types.ModuleType("datasets")
+datasets_stub.Dataset = DummyDataset
+datasets_stub.load_dataset = lambda *args, **kwargs: DummyDataset()
+sys.modules.setdefault("datasets", datasets_stub)  # no-op if a "datasets" module is already loaded
+
+verifiers_stub = types.ModuleType("verifiers")
+verifiers_stub.Environment = object
+verifiers_stub.Parser = object
+verifiers_stub.Rubric = lambda *args, **kwargs: object()
+verifiers_stub.SingleTurnEnv = lambda *args, **kwargs: {"args": args, "kwargs": kwargs}
+sys.modules.setdefault("verifiers", verifiers_stub)  # likewise only fills a missing entry
+
+import ifbench_rlvr
+
+
+class DummyParser:  # identity parser: hands the completion back unchanged
+    def parse_answer(self, completion):
+        return completion
+
+
+def test_normalise_eval_example():
+    """Eval-format rows keep their ids and drop ``None``-valued kwargs."""
+    raw = {
+        "key": "case-1",
+        "prompt": "Say hello. End with done",
+        "instruction_id_list": ["last_word:last_word_answer"],
+        "kwargs": [{"last_word": "done", "unused": None}],
+    }
+
+    task = ifbench_rlvr.normalise_example(raw)
+    payload = json.loads(task["answer"])
+    assert task["question"] == "Say hello. End with done"
+    assert payload["instruction_id_list"] == ["last_word:last_word_answer"]
+    assert payload["kwargs"] == [{"last_word": "done"}]
+
+
+def test_normalise_training_ground_truth():
+    """Train-format rows are flattened into parallel id and kwargs lists."""
+    example = {
+        "key": "train-1",
+        "messages": [
+            {"role": "user", "content": "Connect all sentences with hyphens and end with brief."},
+        ],
+        # The ground truth is a Python-literal string, hence the single quotes and None.
+        "ground_truth": (
+            "[{'instruction_id': ['detectable_format:sentence_hyphens', "
+            "'last_word:last_word_answer'], 'kwargs': [None, {'last_word': 'brief'}]}]"
+        ),
+    }
+
+    row = ifbench_rlvr.normalise_example(example)
+    answer = json.loads(row["answer"])
+
+    assert row["question"] == "Connect all sentences with hyphens and end with brief."
+    assert answer["instruction_id_list"] == [
+        "detectable_format:sentence_hyphens",
+        "last_word:last_word_answer",
+    ]
+    # The None kwargs entry comes back as an empty dict.
+    assert answer["kwargs"] == [{}, {"last_word": "brief"}]
+
+
+def test_instruction_reward_scores_fractional_constraints():
+    """The reward is the fraction of listed constraints that the response satisfies."""
+    task = ifbench_rlvr.normalise_example(
+        {
+            "key": "case-2",
+            "prompt": "Include foo twice and end with done.",
+            "instruction_id_list": ["count:numbers", "count:conjunctions"],
+            "kwargs": [{"N": 2}, {"small_n": 2}],
+        }
+    )
+    reward = ifbench_rlvr.instruction_following_reward
+    parser = DummyParser()
+
+    # Half of the constraints are expected to pass for this response.
+    assert reward(
+        "There are 1 and 2 numbers.",
+        task["answer"],
+        parser,
+    ) == 0.5
+
+    # Both constraints are expected to pass for this response.
+    assert reward(
+        "There are 1 and 2 numbers, and this conjunction plus but makes two.",
+        task["answer"],
+        parser,
+    ) == 1.0
+
+
+def test_read_jsonl_builds_verifiers_rows(tmp_path):
+    input_path = tmp_path / "ifbench.jsonl"
+    input_path.write_text(
+        json.dumps(
+            {
+                "key": "case-3",
+                "prompt": "End with done.",
+                "instruction_id_list": ["last_word:last_word_answer"],
+                "kwargs": [{"last_word": "done"}],
+            }
+        )
+        + "\n",  # JSONL: one JSON object per line
+        encoding="utf-8",
+    )
+
+    dataset = ifbench_rlvr._read_jsonl(input_path)  # rows come back already normalised
+
+    assert len(dataset) == 1
+    assert dataset[0]["question"] == "End with done."
+    assert json.loads(dataset[0]["answer"])["kwargs"] == [{"last_word": "done"}]

From 3d9d4067572347697e697546440602ee47e81b46 Mon Sep 17 00:00:00 2001
From: partyplatter08-lab
 <226721044+partyplatter08-lab@users.noreply.github.com>
Date: Sun, 31 May 2026 14:06:23 -0400
Subject: [PATCH 2/4] Fix IFBench RLVR environment dependency metadata

---
 environments/ifbench_rlvr/README.md      | 1 +
 environments/ifbench_rlvr/pyproject.toml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md
index d0e4102..7bb3acd 100644
--- a/environments/ifbench_rlvr/README.md
+++ b/environments/ifbench_rlvr/README.md
@@ -21,6 +21,7 @@ declared as an environment dependency.
 From this directory:
 
 ```bash
+uv python pin 3.12
 prime eval run . -m openai/gpt-5-nano -n 20 -r 1
 ```
 
diff --git a/environments/ifbench_rlvr/pyproject.toml b/environments/ifbench_rlvr/pyproject.toml
index 6bb30d0..5c3ea85 100644
--- a/environments/ifbench_rlvr/pyproject.toml
+++ b/environments/ifbench_rlvr/pyproject.toml
@@ -4,11 +4,11 @@ version = "0.1.0"
 description = "Prime Verifiers environment for IFBench instruction-following RLVR."
 license = "Apache-2.0"
 readme = "README.md"
-requires-python = ">=3.10"
+requires-python = ">=3.12,<3.13"
 dependencies = [
     "datasets",
     "ifbench @ git+https://github.com/allenai/IFBench.git",
-    "open-instruct @ git+https://github.com/allenai/open-instruct.git",
+    "open-instruct @ git+https://github.com/allenai/open-instruct.git@38fb335",
     "verifiers>=0.1.14",
 ]
 

From 4f7fdf8b83a843d427f6c1c4720941630836a586 Mon Sep 17 00:00:00 2001
From: partyplatter08-lab
 <226721044+partyplatter08-lab@users.noreply.github.com>
Date: Sun, 31 May 2026 18:08:16 -0400
Subject: [PATCH 3/4] Clarify IFBench RLVR smoke test command

---
 environments/ifbench_rlvr/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md
index 7bb3acd..53199b4 100644
--- a/environments/ifbench_rlvr/README.md
+++ b/environments/ifbench_rlvr/README.md
@@ -25,10 +25,11 @@ uv python pin 3.12
 prime eval run . -m openai/gpt-5-nano -n 20 -r 1
 ```
 
-Use a small local smoke test with the packaged IFBench sample data:
+From the repository root, use the focused smoke tests with the packaged IFBench
+sample data:
 
 ```bash
-python -m pytest tests/test_ifbench_rlvr.py
+PYTHONDONTWRITEBYTECODE=1 uv run pytest tests/test_ifbench_rlvr.py
 ```
 
 ## Environment Arguments

From 8348b4a8b1f2fbab6663a3590fe6df57d92a5002 Mon Sep 17 00:00:00 2001
From: partyplatter08-lab
 <226721044+partyplatter08-lab@users.noreply.github.com>
Date: Mon, 1 Jun 2026 02:44:52 -0400
Subject: [PATCH 4/4] Fix IFBench RLVR reward kwargs handling

---
 environments/ifbench_rlvr/ifbench_rlvr.py | 17 ++++++++++---
 tests/test_ifbench_rlvr.py                | 30 +++++++++++++++++++++++
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/environments/ifbench_rlvr/ifbench_rlvr.py b/environments/ifbench_rlvr/ifbench_rlvr.py
index 6747eac..bc1b2d3 100644
--- a/environments/ifbench_rlvr/ifbench_rlvr.py
+++ b/environments/ifbench_rlvr/ifbench_rlvr.py
@@ -125,9 +125,9 @@ def _candidate_responses(response: str, strict: bool) -> list[str]:
 
 
 def instruction_following_reward(
-    completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs
+    completion, answer: str, response_parser: vf.Parser, strict: bool = True, **kwargs
 ) -> float:
-    response = parser.parse_answer(completion) or ""
+    response = response_parser.parse_answer(completion) or ""
     payload = json.loads(answer)
     followed = []
 
@@ -152,8 +152,17 @@ def instruction_following_reward(
     return sum(followed) / len(followed)
 
 
-def all_constraints_reward(completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs) -> float:
-    return 1.0 if instruction_following_reward(completion, answer, parser, strict=strict) == 1.0 else 0.0
+def all_constraints_reward(
+    completion, answer: str, response_parser: vf.Parser, strict: bool = True, **kwargs
+) -> float:
+    return (
+        1.0
+        if instruction_following_reward(
+            completion, answer, response_parser, strict=strict
+        )
+        == 1.0
+        else 0.0
+    )
 
 
 def load_environment(
diff --git a/tests/test_ifbench_rlvr.py b/tests/test_ifbench_rlvr.py
index 19bd8d3..e473f55 100644
--- a/tests/test_ifbench_rlvr.py
+++ b/tests/test_ifbench_rlvr.py
@@ -116,6 +116,36 @@ def test_instruction_reward_scores_fractional_constraints():
     ) == 1.0
 
 
+def test_reward_helpers_ignore_verifiers_parser_kwarg():
+    row = ifbench_rlvr.normalise_example(
+        {
+            "key": "case-parser-kwarg",
+            "prompt": "Include exactly two numbers.",
+            "instruction_id_list": ["count:numbers"],
+            "kwargs": [{"N": 2}],
+        }
+    )
+
+    assert (
+        ifbench_rlvr.instruction_following_reward(
+            "The numbers are 1 and 2.",
+            row["answer"],
+            DummyParser(),
+            parser=DummyParser(),  # extra keyword; must be absorbed by **kwargs
+        )
+        == 1.0
+    )
+    assert (
+        ifbench_rlvr.all_constraints_reward(
+            "The numbers are 1 and 2.",
+            row["answer"],
+            DummyParser(),
+            parser=DummyParser(),  # extra keyword; must be absorbed by **kwargs
+        )
+        == 1.0
+    )
+
+
 def test_read_jsonl_builds_verifiers_rows(tmp_path):
     input_path = tmp_path / "ifbench.jsonl"
     input_path.write_text(