From b43ee152e3c7ca125c5ab2429eba4449c7e67cdf Mon Sep 17 00:00:00 2001 From: partyplatter08-lab <226721044+partyplatter08-lab@users.noreply.github.com> Date: Sun, 31 May 2026 13:11:15 -0400 Subject: [PATCH 1/4] Add IFBench RLVR verifiers environment --- environments/ifbench_rlvr/README.md | 45 +++++ environments/ifbench_rlvr/ifbench_rlvr.py | 207 ++++++++++++++++++++++ environments/ifbench_rlvr/pyproject.toml | 27 +++ tests/test_ifbench_rlvr.py | 138 +++++++++++++++ 4 files changed, 417 insertions(+) create mode 100644 environments/ifbench_rlvr/README.md create mode 100644 environments/ifbench_rlvr/ifbench_rlvr.py create mode 100644 environments/ifbench_rlvr/pyproject.toml create mode 100644 tests/test_ifbench_rlvr.py diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md new file mode 100644 index 0000000..d0e4102 --- /dev/null +++ b/environments/ifbench_rlvr/README.md @@ -0,0 +1,45 @@ +# ifbench-rlvr + +Verifiers environment for IFBench instruction-following RLVR. + +## Overview + +- Environment ID: `ifbench-rlvr` +- Task type: single-turn instruction following +- Reward: fraction of IFBench verifier constraints satisfied by the model response +- Primary training dataset: `allenai/IF_multi_constraints_upto5` +- Default eval dataset: `allenai/IFBench_test` + +The environment converts IFBench examples into `verifiers` single-turn tasks. Each +task prompt is the IFBench user prompt, and the reward calls the same verifier +classes used by IFBench. The released IF-RLVR training split references the +additional verifier registry from `allenai/open-instruct`, so that package is +declared as an environment dependency. + +## Quickstart + +From this directory: + +```bash +prime eval run . -m openai/gpt-5-nano -n 20 -r 1 +``` + +Use a small local smoke test with the packaged IFBench sample data: + +```bash +python -m pytest tests/test_ifbench_rlvr.py +``` + +## Environment Arguments + +| Arg | Type | Default | Description | +| --- | --- | --- | --- | +| `dataset_name` | str | `allenai/IF_multi_constraints_upto5` | Hugging Face train dataset. | +| `dataset_split` | str | `train` | Train split. | +| `eval_dataset_name` | str | `allenai/IFBench_test` | Hugging Face eval dataset. | +| `eval_dataset_split` | str | `train` | Eval split. | +| `train_jsonl` | str or null | null | Optional local IFBench-format train JSONL. | +| `eval_jsonl` | str or null | null | Optional local IFBench-format eval JSONL. | +| `num_train_examples` | int | `-1` | Limit train examples; `-1` means all. | +| `num_eval_examples` | int | `-1` | Limit eval examples; `-1` means all. | +| `strict` | bool | `true` | Use strict IFBench checking. If false, use loose checking. | diff --git a/environments/ifbench_rlvr/ifbench_rlvr.py b/environments/ifbench_rlvr/ifbench_rlvr.py new file mode 100644 index 0000000..6747eac --- /dev/null +++ b/environments/ifbench_rlvr/ifbench_rlvr.py @@ -0,0 +1,207 @@ +import ast +import json +from pathlib import Path +from typing import Any + +from datasets import Dataset, load_dataset + +import instructions_registry +import verifiers as vf + +try: + from open_instruct.IFEvalG import instructions_registry as rlvr_instructions_registry +except ImportError: + rlvr_instructions_registry = None + + +DEFAULT_TRAIN_DATASET = "allenai/IF_multi_constraints_upto5" +DEFAULT_EVAL_DATASET = "allenai/IFBench_test" + + +def _normalise_kwargs(kwargs: Any) -> dict[str, Any]: + if kwargs is None: + return {} + if not isinstance(kwargs, dict): + raise TypeError(f"Expected instruction kwargs to be a dict or None, got {type(kwargs)!r}") + return {key: value for key, value in kwargs.items() if value is not None} + + +def _flatten_ground_truth(ground_truth: str) -> tuple[list[str], list[dict[str, Any]]]: + parsed = ast.literal_eval(ground_truth) + instruction_ids: list[str] = [] + kwargs: list[dict[str, Any]] = [] + + for entry in parsed: + ids = entry.get("instruction_id", []) + args = entry.get("kwargs", []) + if isinstance(ids, str): + ids = [ids] + if not isinstance(args, list): + args = [args] + if len(args) < len(ids): + args = [*args, *([{}] * (len(ids) - len(args)))] + for instruction_id, instruction_kwargs in zip(ids, args): + instruction_ids.append(instruction_id) + kwargs.append(_normalise_kwargs(instruction_kwargs)) + + return instruction_ids, kwargs + + +def normalise_example(example: dict[str, Any]) -> dict[str, Any]: + """Convert released IFBench train/eval formats into a verifiers task row.""" + if "prompt" in example: + prompt = example["prompt"] + instruction_ids = list(example["instruction_id_list"]) + kwargs = [_normalise_kwargs(item) for item in example["kwargs"]] + else: + messages = example["messages"] + prompt = next(message["content"] for message in messages if message["role"] == "user") + instruction_ids, kwargs = _flatten_ground_truth(example["ground_truth"]) + + answer_payload = { + "key": example.get("key"), + "prompt": prompt, + "instruction_id_list": instruction_ids, + "kwargs": kwargs, + } + + return { + "question": prompt, + "answer": json.dumps(answer_payload, ensure_ascii=False), + "info": { + "key": example.get("key"), + "instruction_id_list": instruction_ids, + }, + } + + +def _read_jsonl(path: str | Path) -> Dataset: + rows = [] + with Path(path).open("r", encoding="utf-8") as handle: + for line in handle: + if line.strip(): + rows.append(normalise_example(json.loads(line))) + return Dataset.from_list(rows) + + +def _load_hf_dataset(name: str, split: str, limit: int) -> Dataset: + dataset = load_dataset(name, split=split) + if limit != -1: + dataset = dataset.select(range(min(limit, len(dataset)))) + return dataset.map(normalise_example, remove_columns=dataset.column_names) + + +def _get_instruction_cls(instruction_id: str): + if instruction_id in instructions_registry.INSTRUCTION_DICT: + return instructions_registry.INSTRUCTION_DICT[instruction_id] + if ( + rlvr_instructions_registry is not None + and instruction_id in rlvr_instructions_registry.INSTRUCTION_DICT + ): + return rlvr_instructions_registry.INSTRUCTION_DICT[instruction_id] + raise KeyError( + f"Unknown IFBench instruction id {instruction_id!r}. " + "Install allenai/open-instruct for IF-RLVR training constraints." + ) + + +def _candidate_responses(response: str, strict: bool) -> list[str]: + if strict: + return [response] + lines = response.split("\n") + response_remove_first = "\n".join(lines[1:]).strip() + response_remove_last = "\n".join(lines[:-1]).strip() + response_remove_both = "\n".join(lines[1:-1]).strip() + return [ + response, + response.replace("*", ""), + response_remove_first, + response_remove_last, + response_remove_both, + response_remove_first.replace("*", ""), + response_remove_last.replace("*", ""), + response_remove_both.replace("*", ""), + ] + + +def instruction_following_reward( + completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs +) -> float: + response = parser.parse_answer(completion) or "" + payload = json.loads(answer) + followed = [] + + for instruction_id, instruction_kwargs in zip( + payload["instruction_id_list"], payload["kwargs"] + ): + instruction_cls = _get_instruction_cls(instruction_id) + instruction = instruction_cls(instruction_id) + instruction.build_description(**instruction_kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=payload["prompt"]) + followed.append( + any( + candidate.strip() and instruction.check_following(candidate) + for candidate in _candidate_responses(response, strict) + ) + ) + + if not followed: + return 0.0 + return sum(followed) / len(followed) + + +def all_constraints_reward(completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs) -> float: + return 1.0 if instruction_following_reward(completion, answer, parser, strict=strict) == 1.0 else 0.0 + + +def load_environment( + dataset_name: str = DEFAULT_TRAIN_DATASET, + dataset_split: str = "train", + eval_dataset_name: str = DEFAULT_EVAL_DATASET, + eval_dataset_split: str = "train", + train_jsonl: str | None = None, + eval_jsonl: str | None = None, + num_train_examples: int = -1, + num_eval_examples: int = -1, + strict: bool = True, + system_prompt: str | None = None, +) -> vf.Environment: + def build_dataset() -> Dataset: + if train_jsonl: + dataset = _read_jsonl(train_jsonl) + if num_train_examples != -1: + dataset = dataset.select(range(min(num_train_examples, len(dataset)))) + return dataset + return _load_hf_dataset(dataset_name, dataset_split, num_train_examples) + + def build_eval_dataset() -> Dataset: + if eval_jsonl: + dataset = _read_jsonl(eval_jsonl) + if num_eval_examples != -1: + dataset = dataset.select(range(min(num_eval_examples, len(dataset)))) + return dataset + return _load_hf_dataset(eval_dataset_name, eval_dataset_split, num_eval_examples) + + parser = vf.Parser() + rubric = vf.Rubric( + funcs=[ + lambda completion, answer, **kwargs: instruction_following_reward( + completion, answer, parser, strict=strict, **kwargs + ), + lambda completion, answer, **kwargs: all_constraints_reward( + completion, answer, parser, strict=strict, **kwargs + ), + ], + weights=[1.0, 0.0], + parser=parser, + ) + + return vf.SingleTurnEnv( + dataset=build_dataset, + eval_dataset=build_eval_dataset, + system_prompt=system_prompt, + parser=parser, + rubric=rubric, + ) diff --git a/environments/ifbench_rlvr/pyproject.toml b/environments/ifbench_rlvr/pyproject.toml new file mode 100644 index 0000000..6bb30d0 --- /dev/null +++ b/environments/ifbench_rlvr/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "ifbench-rlvr" +version = "0.1.0" +description = "Prime Verifiers environment for IFBench instruction-following RLVR." +license = "Apache-2.0" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "datasets", + "ifbench @ git+https://github.com/allenai/IFBench.git", + "open-instruct @ git+https://github.com/allenai/open-instruct.git", + "verifiers>=0.1.14", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["ifbench_rlvr.py", "pyproject.toml", "README.md"] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.verifiers.eval] +num_examples = 20 +rollouts_per_example = 1 diff --git a/tests/test_ifbench_rlvr.py b/tests/test_ifbench_rlvr.py new file mode 100644 index 0000000..19bd8d3 --- /dev/null +++ b/tests/test_ifbench_rlvr.py @@ -0,0 +1,138 @@ +import json +import sys +import types +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +ENV_DIR = ROOT / "environments" / "ifbench_rlvr" +sys.path.insert(0, str(ROOT)) +sys.path.insert(0, str(ENV_DIR)) + + +class DummyDataset(list): + @classmethod + def from_list(cls, rows): + return cls(rows) + + @property + def column_names(self): + return list(self[0].keys()) if self else [] + + def select(self, indices): + return DummyDataset([self[index] for index in indices]) + + def map(self, function, remove_columns=None): + return DummyDataset([function(row) for row in self]) + + +datasets_stub = types.ModuleType("datasets") +datasets_stub.Dataset = DummyDataset +datasets_stub.load_dataset = lambda *args, **kwargs: DummyDataset() +sys.modules.setdefault("datasets", datasets_stub) + +verifiers_stub = types.ModuleType("verifiers") +verifiers_stub.Environment = object +verifiers_stub.Parser = object +verifiers_stub.Rubric = lambda *args, **kwargs: object() +verifiers_stub.SingleTurnEnv = lambda *args, **kwargs: {"args": args, "kwargs": kwargs} +sys.modules.setdefault("verifiers", verifiers_stub) + +import ifbench_rlvr + + +class DummyParser: + def parse_answer(self, completion): + return completion + + +def test_normalise_eval_example(): + example = { + "key": "case-1", + "prompt": "Say hello. End with done", + "instruction_id_list": ["last_word:last_word_answer"], + "kwargs": [{"last_word": "done", "unused": None}], + } + + row = ifbench_rlvr.normalise_example(example) + answer = json.loads(row["answer"]) + + assert row["question"] == "Say hello. End with done" + assert answer["instruction_id_list"] == ["last_word:last_word_answer"] + assert answer["kwargs"] == [{"last_word": "done"}] + + +def test_normalise_training_ground_truth(): + example = { + "key": "train-1", + "messages": [ + { + "role": "user", + "content": "Connect all sentences with hyphens and end with brief.", + } + ], + "ground_truth": ( + "[{'instruction_id': ['detectable_format:sentence_hyphens', " + "'last_word:last_word_answer'], 'kwargs': [None, {'last_word': 'brief'}]}]" + ), + } + + row = ifbench_rlvr.normalise_example(example) + answer = json.loads(row["answer"]) + + assert row["question"] == "Connect all sentences with hyphens and end with brief." + assert answer["instruction_id_list"] == [ + "detectable_format:sentence_hyphens", + "last_word:last_word_answer", + ] + assert answer["kwargs"] == [{}, {"last_word": "brief"}] + + +def test_instruction_reward_scores_fractional_constraints(): + row = ifbench_rlvr.normalise_example( + { + "key": "case-2", + "prompt": "Include foo twice and end with done.", + "instruction_id_list": [ + "count:numbers", + "count:conjunctions", + ], + "kwargs": [ + {"N": 2}, + {"small_n": 2}, + ], + } + ) + + assert ifbench_rlvr.instruction_following_reward( + "There are 1 and 2 numbers.", + row["answer"], + DummyParser(), + ) == 0.5 + assert ifbench_rlvr.instruction_following_reward( + "There are 1 and 2 numbers, and this conjunction plus but makes two.", + row["answer"], + DummyParser(), + ) == 1.0 + + +def test_read_jsonl_builds_verifiers_rows(tmp_path): + input_path = tmp_path / "ifbench.jsonl" + input_path.write_text( + json.dumps( + { + "key": "case-3", + "prompt": "End with done.", + "instruction_id_list": ["last_word:last_word_answer"], + "kwargs": [{"last_word": "done"}], + } + ) + + "\n", + encoding="utf-8", + ) + + dataset = ifbench_rlvr._read_jsonl(input_path) + + assert len(dataset) == 1 + assert dataset[0]["question"] == "End with done." + assert json.loads(dataset[0]["answer"])["kwargs"] == [{"last_word": "done"}] From 3d9d4067572347697e697546440602ee47e81b46 Mon Sep 17 00:00:00 2001 From: partyplatter08-lab <226721044+partyplatter08-lab@users.noreply.github.com> Date: Sun, 31 May 2026 14:06:23 -0400 Subject: [PATCH 2/4] Fix IFBench RLVR environment dependency metadata --- environments/ifbench_rlvr/README.md | 1 + environments/ifbench_rlvr/pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md index d0e4102..7bb3acd 100644 --- a/environments/ifbench_rlvr/README.md +++ b/environments/ifbench_rlvr/README.md @@ -21,6 +21,7 @@ declared as an environment dependency. From this directory: ```bash +uv python pin 3.12 prime eval run . -m openai/gpt-5-nano -n 20 -r 1 ``` diff --git a/environments/ifbench_rlvr/pyproject.toml b/environments/ifbench_rlvr/pyproject.toml index 6bb30d0..5c3ea85 100644 --- a/environments/ifbench_rlvr/pyproject.toml +++ b/environments/ifbench_rlvr/pyproject.toml @@ -4,11 +4,11 @@ version = "0.1.0" description = "Prime Verifiers environment for IFBench instruction-following RLVR." license = "Apache-2.0" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.12,<3.13" dependencies = [ "datasets", "ifbench @ git+https://github.com/allenai/IFBench.git", - "open-instruct @ git+https://github.com/allenai/open-instruct.git", + "open-instruct @ git+https://github.com/allenai/open-instruct.git@38fb335", "verifiers>=0.1.14", ] From 4f7fdf8b83a843d427f6c1c4720941630836a586 Mon Sep 17 00:00:00 2001 From: partyplatter08-lab <226721044+partyplatter08-lab@users.noreply.github.com> Date: Sun, 31 May 2026 18:08:16 -0400 Subject: [PATCH 3/4] Clarify IFBench RLVR smoke test command --- environments/ifbench_rlvr/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/ifbench_rlvr/README.md b/environments/ifbench_rlvr/README.md index 7bb3acd..53199b4 100644 --- a/environments/ifbench_rlvr/README.md +++ b/environments/ifbench_rlvr/README.md @@ -25,10 +25,11 @@ uv python pin 3.12 prime eval run . -m openai/gpt-5-nano -n 20 -r 1 ``` -Use a small local smoke test with the packaged IFBench sample data: +From the repository root, use the focused smoke tests with the packaged IFBench +sample data: ```bash -python -m pytest tests/test_ifbench_rlvr.py +PYTHONDONTWRITEBYTECODE=1 uv run pytest tests/test_ifbench_rlvr.py ``` ## Environment Arguments From 8348b4a8b1f2fbab6663a3590fe6df57d92a5002 Mon Sep 17 00:00:00 2001 From: partyplatter08-lab <226721044+partyplatter08-lab@users.noreply.github.com> Date: Mon, 1 Jun 2026 02:44:52 -0400 Subject: [PATCH 4/4] Fix IFBench RLVR reward kwargs handling --- environments/ifbench_rlvr/ifbench_rlvr.py | 17 ++++++++++--- tests/test_ifbench_rlvr.py | 30 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/environments/ifbench_rlvr/ifbench_rlvr.py b/environments/ifbench_rlvr/ifbench_rlvr.py index 6747eac..bc1b2d3 100644 --- a/environments/ifbench_rlvr/ifbench_rlvr.py +++ b/environments/ifbench_rlvr/ifbench_rlvr.py @@ -125,9 +125,9 @@ def _candidate_responses(response: str, strict: bool) -> list[str]: def instruction_following_reward( - completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs + completion, answer: str, response_parser: vf.Parser, strict: bool = True, **kwargs ) -> float: - response = parser.parse_answer(completion) or "" + response = response_parser.parse_answer(completion) or "" payload = json.loads(answer) followed = [] @@ -152,8 +152,17 @@ def instruction_following_reward( return sum(followed) / len(followed) -def all_constraints_reward(completion, answer: str, parser: vf.Parser, strict: bool = True, **kwargs) -> float: - return 1.0 if instruction_following_reward(completion, answer, parser, strict=strict) == 1.0 else 0.0 +def all_constraints_reward( + completion, answer: str, response_parser: vf.Parser, strict: bool = True, **kwargs +) -> float: + return ( + 1.0 + if instruction_following_reward( + completion, answer, response_parser, strict=strict + ) + == 1.0 + else 0.0 + ) def load_environment( diff --git a/tests/test_ifbench_rlvr.py b/tests/test_ifbench_rlvr.py index 19bd8d3..e473f55 100644 --- a/tests/test_ifbench_rlvr.py +++ b/tests/test_ifbench_rlvr.py @@ -116,6 +116,36 @@ def test_instruction_reward_scores_fractional_constraints(): ) == 1.0 +def test_reward_helpers_ignore_verifiers_parser_kwarg(): + row = ifbench_rlvr.normalise_example( + { + "key": "case-parser-kwarg", + "prompt": "Include exactly two numbers.", + "instruction_id_list": ["count:numbers"], + "kwargs": [{"N": 2}], + } + ) + + assert ( + ifbench_rlvr.instruction_following_reward( + "The numbers are 1 and 2.", + row["answer"], + DummyParser(), + parser=DummyParser(), + ) + == 1.0 + ) + assert ( + ifbench_rlvr.all_constraints_reward( + "The numbers are 1 and 2.", + row["answer"], + DummyParser(), + parser=DummyParser(), + ) + == 1.0 + ) + + def test_read_jsonl_builds_verifiers_rows(tmp_path): input_path = tmp_path / "ifbench.jsonl" input_path.write_text(