RPG/index.html at main · complex-reasoning/RPG · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners; fill these in appropriately, as they are the page's "business card" -->
  <meta name="description" content="On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning. Introducing RPG (Regularized Policy Gradient)">
  <meta property="og:title" content="On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning">
  <meta property="og:description" content="On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning. Introducing RPG (Regularized Policy Gradient)">
  <meta property="og:url" content="https://github.com/complex-reasoning/RPG">

  <meta name="twitter:title" content="On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning">
  <meta name="twitter:description" content="On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning. Introducing RPG (Regularized Policy Gradient)">
  <!-- Keywords for the paper to be indexed by -->
  <meta name="keywords" content="LLM, RLHF, Reinforcement Learning, RPG">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning</title>
  <link rel="icon" type="image/x-icon" href="static/images/rpg.ico">

  <!-- Stylesheets: web fonts first, then Bulma and its plugins, icon fonts, and finally the page's own rules -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts: load order is kept as-is (jQuery first, index.js last); index.js is not visible here,
       so its dependencies on the preceding scripts are assumed rather than verified -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Hero banner: paper title, author list, affiliations, and resource links -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning</h1>

            <!-- Paper authors; the * and † markers are explained in the legend below -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://yifzhang.com" target="_blank" rel="noopener">Yifan Zhang</a><sup>*</sup>,</span>
              <span class="author-block">
                <a href="https://lauyikfung.github.io/" target="_blank" rel="noopener">Yifeng Liu</a><sup>*</sup>,</span>
              <span class="author-block">
                <a href="https://scholar.google.com/citations?user=8foZzX4AAAAJ" target="_blank" rel="noopener">Huizhuo Yuan</a>,</span>
              <span class="author-block">
                <a href="https://web.cs.ucla.edu/~qgu/" target="_blank" rel="noopener">Quanquan Gu</a><sup>†</sup>,</span>
              <span class="author-block">
                <a href="https://en.wikipedia.org/wiki/Andrew_Yao" target="_blank" rel="noopener">Andrew C Yao</a><sup>†</sup></span>
            </div>

            <!-- Affiliations and author-marker legend -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">IIIS, Tsinghua University,&nbsp;&nbsp;&nbsp;Shanghai Qi Zhi Institute,&nbsp;&nbsp;&nbsp;University of California, Los Angeles</span><br>
              <span class="eql-cntrb"><small><sup>*</sup>Equal contribution</small></span>
              <span class="eql-cntrb"><small><sup>†</sup>Corresponding author</small></span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- Paper PDF link (relative href, RPG.pdf) -->
                <span class="link-block">
                  <a href="RPG.pdf" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- GitHub repository link -->
                <span class="link-block">
                  <a href="https://github.com/complex-reasoning/RPG" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github" aria-hidden="true"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>

                <!-- arXiv abstract link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2505.17508" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>

                <!-- Hugging Face paper page link -->
                <span class="link-block">
                  <a href="https://huggingface.co/papers/2505.17508" target="_blank" rel="noopener"
                     class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <!-- Decorative logo: the adjacent text already names the link, so alt is empty -->
                      <img src="static/images/hf-logo.svg" alt="">
                    </span>
                    <span>Huggingface</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Policy gradient algorithms have been successfully applied to enhance the reasoning capabilities
            of large language models (LLMs). KL regularization is ubiquitous, yet the design surface, choice
            of KL direction (forward vs. reverse), normalization (normalized vs. unnormalized), and estimator
            (k1/k2/k3), is scattered across the literature and often intertwined with off-policy estimation.
            We ask a focused question: under the off-policy setting, what weighting is required for each KL
            variant so that the surrogate we optimize yields the exact gradient of the intended KL-regularized
            objective? We answer this with a compact, unified derivation we call the Regularized Policy
            Gradient (<b>RPG</b>) view. RPG (i) unifies normalized and unnormalized KL variants and shows
            that the widely-used k3 penalty is exactly the unnormalized KL; (ii) specifies conditions under
            which REINFORCE-style losses with stop-gradient are gradient-equivalent to fully differentiable
            surrogates; (iii) identifies and corrects an off-policy importance-weighting mismatch in GRPO’s
            KL term; and (iv) introduces RPG-Style Clip, a clipped-importance-sampling step within
            RPG-REINFORCE that enables stable, off-policy policy-gradient training at scale. On mathematical
            reasoning benchmarks (AIME24, AIME25), RPG-REINFORCE with RPG-Style Clip improves
            accuracy by up to +6 absolute percentage points over DAPO. Notably, RPG is a stable and
            scalable RL algorithm for LLM reasoning, realized via (a) a KL-correct objective, (b) clipped
            importance sampling, and (c) an iterative reference-policy update scheme.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->

<!-- Method overview: section heading followed by the framework diagram -->
<div class="columns is-centered">
  <div class="column is-three-fifths">
    <!-- Spacer that keeps the gap above the heading; a div carries the same Bulma
         classes as before without putting an empty heading in the document outline -->
    <div class="title is-3"></div>
    <h2 class="title is-3">Regularized Policy Gradient</h2>
    <p class="has-text-centered">
      <img src="static/images/framework.png" width="1024" height="768" alt="Overview of the Regularized Policy Gradient (RPG) framework">
    </p>
  </div>
</div>
<!-- Summary of contributions -->
<div class="columns is-centered">
  <div class="column is-three-fifths">
    <div class="content has-text-justified">
      <ul>
        <li>
          We derive policy gradients and corresponding surrogate losses for Forward/Reverse KL, in normalized (KL) and unnormalized (UKL) forms, under off-policy sampling with importance weights.
        </li>
        <li>
          We give both fully differentiable surrogates and REINFORCE-style losses (with stop-gradient) and prove their gradient-equivalence to the intended regularized objective (Proposition 4.1, Appendix J).
        </li>
        <li>
          We introduce RPG-Style Clip, a clipped-importance-weighted REINFORCE estimator that substantially improves stability and variance control while preserving the RPG gradients.
        </li>
        <li>
          We reveal the equality between the k3 estimator and unnormalized KL (Appendix B), and show that GRPO’s KL penalty omits an essential importance weight under off-policy sampling. We provide a corrected estimator and loss consistent with the intended objective.
        </li>
        <li>
          We present an iterative training framework that periodically updates the reference model to satisfy KL constraints while allowing the policy to depart meaningfully from the initial checkpoint.
        </li>
        <li>
          On math reasoning, RPG-REINFORCE (with RPG-Style Clip) yields stable and scalable training and outperforms DAPO by up to +6 absolute points on AIME24/25.
        </li>
      </ul>
    </div>
  </div>
</div>
<!--/ Summary of contributions -->

<!-- Experimental results: benchmark tables and training-dynamics figures -->
<div class="columns is-centered">
  <div class="column is-three-fifths">
    <div class="content has-text-justified">
      <h4 class="title">Experimental Results</h4>
      <!-- Each image gets its own <figure>: a figure may contain at most one <figcaption> -->
      <figure>
        <p class="has-text-centered">
          <img src="static/images/table3.png" width="800" height="800" alt="4K context length results">
        </p>
        <figcaption class="has-text-centered">
          Combined performance metrics on the AIME24 and AIME25 mathematical reasoning benchmarks, showing "Last" and "Best" scores for 4K context length.
          The "Last" score is from the 400th training step, assuming the training process remained stable to that point.
          The highest score in each column is <b>bolded</b>, and the second highest is <span style="text-decoration:underline;">underlined</span>.
          RPG and RPG-REINFORCE methods are highlighted with light cyan and light green backgrounds, respectively.
        </figcaption>
      </figure>
      <figure>
        <p class="has-text-centered">
          <img src="static/images/table5.png" width="800" height="800" alt="2K context length results">
        </p>
        <figcaption class="has-text-centered">
          Combined performance metrics on the AIME24 and AIME25 mathematical reasoning benchmarks, showing "Last" and "Best" scores for 2K context length.
          The "Last" score is from the 400th training step, assuming the training process remained stable to that point.
          The highest score in each column is <b>bolded</b>, and the second highest is <span style="text-decoration:underline;">underlined</span>.
          RPG and RPG-REINFORCE methods are highlighted with light cyan and light green backgrounds, respectively.
        </figcaption>
      </figure>
      <figure>
        <p class="has-text-centered">
          <!-- alt mirrors the caption; the previous "Training loss plot" did not match it -->
          <img src="static/images/figure2.png" width="900" height="600" alt="Training dynamics and benchmark performance of RPG and REINFORCE-style RPG versus GRPO and DAPO at 4K context length">
        </p>
        <figcaption class="has-text-centered">
          Training dynamics and benchmark performance for RPG and REINFORCE-style RPG compared to baselines (GRPO, DAPO) with 4K context length.
        </figcaption>
      </figure>
      <figure>
        <p class="has-text-centered">
          <!-- alt mirrors the caption; the previous "Validation loss plot" did not match it -->
          <img src="static/images/figure3.png" width="900" height="600" alt="Training dynamics and benchmark performance of RPG and REINFORCE-style RPG versus GRPO and DAPO at 2K context length">
        </p>
        <figcaption class="has-text-centered">
          Training dynamics and benchmark performance for RPG and REINFORCE-style RPG compared to baselines (GRPO, DAPO) with 2K context length.
        </figcaption>
      </figure>
    </div>
  </div>
</div>
<!--/ Experimental results -->

<!-- Table 1 image: fully differentiable surrogate losses -->
<div class="columns is-centered">
  <div class="column is-three-fifths">
    <div class="content has-text-justified">
      <h4 class="title">Regularized Policy Gradients with fully differentiable surrogate loss functions</h4>
      <figure>
        <p class="has-text-centered">
          <img src="static/images/table1.png" width="900" height="600" alt="Regularized Policy Gradients with fully differentiable surrogate loss functions">
        </p>
      </figure>
    </div>
  </div>
</div>
<!--/ Table 1 image -->

<!-- Table 2 image: REINFORCE-style losses -->
<div class="columns is-centered">
  <div class="column is-three-fifths">
    <div class="content has-text-justified">
      <h4 class="title">REINFORCE-Style Regularized Policy Gradients</h4>
      <figure>
        <p class="has-text-centered">
          <img src="static/images/table2.png" width="900" height="600" alt="REINFORCE-Style Regularized Policy Gradients">
        </p>
      </figure>
    </div>
  </div>
</div>
<!--/ Table 2 image -->

<!-- BibTeX citation -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">Citation</h2>
    <p>
      Please cite the paper and star this <a href="https://github.com/complex-reasoning/RPG" target="_blank" rel="noopener">repo</a> if you use RPG and find it interesting/useful, thanks!
    </p>
    <!-- Whitespace inside <pre> is rendered verbatim, so the entry below is intentionally not re-indented -->
    <pre><code>@article{zhang2025design,
    title={On the Design of KL-Regularized Policy Gradient Algorithms for LLM Reasoning},
    author={Zhang, Yifan and Liu, Yifeng and Yuan, Huizhuo and Gu, Quanquan and Yao, Andrew C},
    journal={arXiv preprint arXiv:2505.17508},
    year={2025},
}</code></pre>
  </div>
</section>
<!-- End BibTeX citation -->


  <!-- Page footer: template attribution and content licence -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->

<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>