From 724531af9febb33a7a5eba8cb1ea30b9253d319d Mon Sep 17 00:00:00 2001
From: Yaseen Hamdulay <yaseen@cerebrium.ai>
Date: Thu, 11 Jun 2026 11:34:25 +0200
Subject: [PATCH 1/3] update checkpointing docs

---
 other-topics/checkpointing.mdx | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/other-topics/checkpointing.mdx b/other-topics/checkpointing.mdx
index aa0e9d8..fb4a446 100644
--- a/other-topics/checkpointing.mdx
+++ b/other-topics/checkpointing.mdx
@@ -16,8 +16,8 @@ Cerebrium has native checkpointing and restore functionality built in to the pla
 Checkpointing is available on our v2 runtime environment. Add the following to your `cerebrium.toml` to upgrade.
 
 ```
-[cerebrium.runtime]
-container_runtime = "v2"
+[cerebrium.experimental]
+checkpointing = true
 ```
 
 To create a checkpoint your application has to send a trigger to our runtime after it has performed its initialization and is ready. When this trigger is received, the runtime verifies if a new checkpoint is required. To save resources, the system will not create a new checkpoint if:
@@ -38,17 +38,28 @@ A checkpoint is tightly coupled to a single deployment. To disable restoring fro
 ```python
 from vllm import AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs
+import http.client
+import urllib.request
+
 # Init vLLM engine
 engine_args = AsyncEngineArgs(
     model="Qwen/Qwen2.5-0.5B-Instruct",
-    async_scheduling=False
+    async_scheduling=False,
+    enable_sleep_mode=True
 )
-AsyncLLMEngine.from_engine_args(engine_args)
+engine = AsyncLLMEngine.from_engine_args(engine_args)
 
+# Drop KV cache for reduced GPU memory footprint.
+engine.sleep(level=1)
 # Trigger checkpoint
-urllib.request.urlopen("http://169.254.169.253:8234/checkpoint/", method="POST")
-# Wait for it to complete
-urllib.request.urlopen("http://169.254.169.253:8234/checkpoint/wait")
+try:
+    urllib.request.urlopen("http://169.254.169.253:8234/checkpoint/", method="POST")
+except (http.client.RemoteDisconnect):
+    # TCP connections disconnect on restore and throw remote
+    pass
+
+# Restore KV cache
+engine.wake_up()
 ```
 
 ## Limitations
@@ -70,3 +81,7 @@ urllib.request.urlopen("http://169.254.169.253:8234/checkpoint/wait")
 vLLM checkpointing support is not complete but still possible. See https://github.com/vllm-project/vllm/issues/34303 and other issues.
 
 If you are getting an EngineCoreDead exception add `async_scheduling=False` to your AsyncEngineArgs and it should succeed.
+
+The larger the memory checkpoint, the slower the restore. We can substantially reduce the size of the snapshot, and improve startup times, by dropping the KV cache before the checkpoint and recreating it after the restore. vLLM has this functionality built in as part of [vLLM Sleep Mode](https://docs.vllm.ai/en/latest/features/sleep_mode/).
+
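+You enable it by turning on sleep mode in the engine arguments, calling `engine.sleep(level=1)` before triggering the checkpoint and `engine.wake_up()` after the restore, as shown in the example above.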

From 79120af93d32c169518b4f4dcca882a5afe2373e Mon Sep 17 00:00:00 2001
From: Yaseen Hamdulay <yaseen@cerebrium.ai>
Date: Thu, 11 Jun 2026 14:27:40 +0200
Subject: [PATCH 2/3] remove trailing slash

---
 other-topics/checkpointing.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/other-topics/checkpointing.mdx b/other-topics/checkpointing.mdx
index fb4a446..1e2687d 100644
--- a/other-topics/checkpointing.mdx
+++ b/other-topics/checkpointing.mdx
@@ -53,7 +53,7 @@ engine = AsyncLLMEngine.from_engine_args(engine_args)
 engine.sleep(level=1)
 # Trigger checkpoint
 try:
-    urllib.request.urlopen("http://169.254.169.253:8234/checkpoint/", method="POST")
+    urllib.request.urlopen(urllib.request.Request("http://169.254.169.253:8234/checkpoint", method="POST"))
 except (http.client.RemoteDisconnect):
     # TCP connections disconnect on restore and throw remote
     pass

From e1211bcb236741764d87af482bb67ef9fd88a779 Mon Sep 17 00:00:00 2001
From: Yaseen Hamdulay <yaseen@cerebrium.ai>
Date: Thu, 11 Jun 2026 14:28:16 +0200
Subject: [PATCH 3/3] fix except

---
 other-topics/checkpointing.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/other-topics/checkpointing.mdx b/other-topics/checkpointing.mdx
index 1e2687d..894ae5d 100644
--- a/other-topics/checkpointing.mdx
+++ b/other-topics/checkpointing.mdx
@@ -54,7 +54,7 @@ engine.sleep(level=1)
 # Trigger checkpoint
 try:
     urllib.request.urlopen(urllib.request.Request("http://169.254.169.253:8234/checkpoint", method="POST"))
-except (http.client.RemoteDisconnect):
+except http.client.RemoteDisconnected:
     # TCP connections disconnect on restore and throw remote
     pass