CerebriumAI · milo157 · Jun 23, 2026
diff --git a/5-large-language-models/1-openai-compatible-endpoint/cerebrium.toml b/5-large-language-models/1-openai-compatible-endpoint/cerebrium.toml
@@ -1,14 +1,18 @@
 [cerebrium.deployment]
 name = "1-openai-compatible-endpoint"
 python_version = "3.11"
-docker_base_image_url = "debian:bookworm-slim"
+docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
+disable_auth = true
+deployment_initialization_timeout = 830
 include = ["./*", "main.py", "cerebrium.toml"]
 exclude = [".*"]
 
 [cerebrium.hardware]
-cpu = 2
-memory = 12.0
-compute = "AMPERE_A10"
+cpu = 8
+memory = 60.0
+compute = "ADA_L40"
+provider = "aws"
+region = "us-east-1"
 
 [cerebrium.scaling]
 min_replicas = 0
@@ -18,3 +22,7 @@ cooldown = 30
 [cerebrium.dependencies.pip]
 vllm = "latest"
 pydantic = "latest"
+huggingface_hub = "latest"
+
+[cerebrium.experimental]
+checkpointing = true
diff --git a/5-large-language-models/1-openai-compatible-endpoint/main.py b/5-large-language-models/1-openai-compatible-endpoint/main.py
@@ -1,21 +1,46 @@
+import http.client
 import json
 import os
 import time
-
+import urllib.request
 
 from huggingface_hub import login
 from pydantic import BaseModel
 from vllm import SamplingParams, AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs
 
+CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"
+
+
+def _trigger_snapshot() -> None:
+    print("[init] requesting GPU snapshot", flush=True)
+    try:
+        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
+        urllib.request.urlopen(req, timeout=300)
+        print("[init] snapshot complete", flush=True)
+    except http.client.RemoteDisconnected:
+        # Expected: restore drops the open TCP connection, so urlopen raises RemoteDisconnected.
+        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
+    except Exception as exc:
+        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)
+
+
+print("[init] starting", flush=True)
 login(token=os.environ.get("HF_TOKEN"))
 
 engine_args = AsyncEngineArgs(
     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
     gpu_memory_utilization=0.9,
     max_model_len=8192,
+    async_scheduling=False,
 )
+
+print("[init] building vLLM engine", flush=True)
 engine = AsyncLLMEngine.from_engine_args(engine_args)
+print("[init] vLLM engine ready", flush=True)
+
+_trigger_snapshot()
+print("[init] handler ready", flush=True)
 
 
 class Message(BaseModel):
@@ -35,7 +60,9 @@ async def run(
     prompt = " ".join(
         [f"{Message(**msg).role}: {Message(**msg).content}" for msg in messages]
     )
-    sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
+    sampling_params = SamplingParams(
+        temperature=temperature, top_p=top_p, max_tokens=max_tokens
+    )
     results_generator = engine.generate(prompt, sampling_params, run_id)
 
     previous_text = ""
@@ -54,21 +81,17 @@ async def run(
             "choices": [{"index": 0, "delta": {}, "finish_reason": None}],
         }
 
-        # Include role in the first chunk
         if first_chunk:
             chunk["choices"][0]["delta"]["role"] = "assistant"
             first_chunk = False
 
-        # Add new text to delta if there is any
         if new_text:
             chunk["choices"][0]["delta"]["content"] = new_text
 
-        # Check for a finish_reason
         finish_reason = prompt_output[0].finish_reason
         if finish_reason and finish_reason != "none":
             chunk["choices"][0]["finish_reason"] = finish_reason
 
         yield f"data: {json.dumps(chunk)}\n\n"
 
-    # After all chunks, send [DONE]
     yield "data: [DONE]\n\n"
diff --git a/5-large-language-models/4-llama-openai-chat-compatible-endpoint/cerebrium.toml b/5-large-language-models/4-llama-openai-chat-compatible-endpoint/cerebrium.toml
@@ -1,20 +1,28 @@
 [cerebrium.deployment]
-name = "llm-run-3u"
+name = "4-llama-openai-compatible"
 python_version = "3.11"
-docker_base_image_url = "debian:bookworm-slim"
+docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
+disable_auth = true
+deployment_initialization_timeout = 830
 include = ["./*", "main.py", "cerebrium.toml"]
 exclude = [".*"]
 
 [cerebrium.hardware]
-cpu = 2
-memory = 12.0
-compute = "AMPERE_A10"
+cpu = 8
+memory = 60.0
+compute = "ADA_L40"
+provider = "aws"
+region = "us-east-1"
 
 [cerebrium.scaling]
 min_replicas = 0
 max_replicas = 5
-cooldown = 60
+cooldown = 30
 
 [cerebrium.dependencies.pip]
 vllm = "latest"
 pydantic = "latest"
+huggingface_hub = "latest"
+
+[cerebrium.experimental]
+checkpointing = true
diff --git a/5-large-language-models/4-llama-openai-chat-compatible-endpoint/main.py b/5-large-language-models/4-llama-openai-chat-compatible-endpoint/main.py
@@ -1,20 +1,46 @@
+import http.client
+import json
 import os
 import time
-import json
+import urllib.request
 
 from huggingface_hub import login
 from pydantic import BaseModel
 from vllm import SamplingParams, AsyncLLMEngine
 from vllm.engine.arg_utils import AsyncEngineArgs
 
+CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"
+
+
+def _trigger_snapshot() -> None:
+    print("[init] requesting GPU snapshot", flush=True)
+    try:
+        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
+        urllib.request.urlopen(req, timeout=300)
+        print("[init] snapshot complete", flush=True)
+    except http.client.RemoteDisconnected:
+        # Expected: restore drops the open TCP connection, so urlopen raises RemoteDisconnected.
+        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
+    except Exception as exc:
+        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)
+
+
+print("[init] starting", flush=True)
 login(token=os.environ.get("HF_TOKEN"))
 
 engine_args = AsyncEngineArgs(
     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
     gpu_memory_utilization=0.9,
     max_model_len=8192,
+    async_scheduling=False,
 )
+
+print("[init] building vLLM engine", flush=True)
 engine = AsyncLLMEngine.from_engine_args(engine_args)
+print("[init] vLLM engine ready", flush=True)
+
+_trigger_snapshot()
+print("[init] handler ready", flush=True)
 
 
 class Message(BaseModel):
@@ -45,7 +71,6 @@ async def run(
     top_p: float = 0.95,
     max_tokens: int = 4096,
 ):
-    # Format your prompt for llama-friendly usage:
     prompt = format_chat_prompt(messages)
 
     sampling_params = SamplingParams(
@@ -61,36 +86,25 @@ async def run(
         new_text = prompt_output[0].text[len(previous_text) :]
         previous_text = prompt_output[0].text
 
-        # Construct OpenAI-compatible chunk
         chunk = {
             "id": run_id,
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {},
-                    "finish_reason": None,
-                }
-            ],
+            "choices": [{"index": 0, "delta": {}, "finish_reason": None}],
         }
 
-        # Include the role in the first chunk
         if first_chunk:
             chunk["choices"][0]["delta"]["role"] = "assistant"
             first_chunk = False
 
-        # Add new text to the delta if any
         if new_text:
             chunk["choices"][0]["delta"]["content"] = new_text
 
-        # Capture a finish reason if it's provided
-        finish_reason = prompt_output[0].finish_reason or None
+        finish_reason = prompt_output[0].finish_reason
         if finish_reason and finish_reason != "none":
             chunk["choices"][0]["finish_reason"] = finish_reason
 
         yield f"data: {json.dumps(chunk)}\n\n"
 
-    # Send the final [DONE] message
     yield "data: [DONE]\n\n"
diff --git a/5-large-language-models/6-deepseek-sglang/cerebrium.toml b/5-large-language-models/6-deepseek-sglang/cerebrium.toml
@@ -1,18 +1,21 @@
 [cerebrium.deployment]
 name = "6-deepseek-sglang"
 python_version = "3.11"
-docker_base_image_url = "debian:bookworm-slim"
-disable_auth = false
-include = ['./*', 'main.py', 'cerebrium.toml']
-exclude = ['.*']
+docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
+disable_auth = true
+deployment_initialization_timeout = 830
+include = ["./*", "main.py", "cerebrium.toml"]
+exclude = [".*"]
 pre_build_commands = [
     "pip install sglang[all]>=0.4.2.post2 --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer"
 ]
 
 [cerebrium.hardware]
-cpu = 4
-memory = 12.0
-compute = "AMPERE_A10"
+cpu = 8
+memory = 60.0
+compute = "ADA_L40"
+provider = "aws"
+region = "us-east-1"
 
 [cerebrium.scaling]
 min_replicas = 0
@@ -25,4 +28,10 @@ scaling_target = 100
 
 [cerebrium.dependencies.pip]
 huggingface_hub = "latest"
-pydantic = "latest"
+pydantic = "latest"
+
+[cerebrium.dependencies.apt]
+libnuma-dev = "latest"
+
+[cerebrium.experimental]
+checkpointing = true
diff --git a/5-large-language-models/6-deepseek-sglang/main.py b/5-large-language-models/6-deepseek-sglang/main.py
@@ -1,23 +1,47 @@
-# launch the offline engine
-from huggingface_hub import login
-from sglang import Runtime
-from pydantic import BaseModel
+import http.client
 import json
-from typing import List, Dict, Any
-import time
 import os
+import time
+import urllib.request
+
+from huggingface_hub import login
+from sglang import Runtime
+
+CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"
 
 os.environ["HF_TRANSFER"] = "1"
 os.environ["HF_HUB_VERBOSE"] = "1"
 os.environ["HF_HUB_ENABLE_PROGRESS_BARS"] = "1"
 
+
+def _trigger_snapshot() -> None:
+    print("[init] requesting GPU snapshot", flush=True)
+    try:
+        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
+        urllib.request.urlopen(req, timeout=300)
+        print("[init] snapshot complete", flush=True)
+    except http.client.RemoteDisconnected:
+        # Expected: restore drops the open TCP connection, so urlopen raises RemoteDisconnected.
+        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
+    except Exception as exc:
+        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)
+
+
+print("[init] starting", flush=True)
 login(token=os.environ.get("HF_TOKEN"))
 
-# model_id = "deepseek-ai/DeepSeek-R1" ##uncomment for R1
+# model_id = "deepseek-ai/DeepSeek-R1"  # uncomment for R1
 model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+
+print("[init] building SGLang runtime", flush=True)
 runtime = Runtime(
-    model_path=model_id, tp_size=1
-)  # change tp_size=8 if serving R1 on H200
+    model_path=model_id,
+    tp_size=1,  # change tp_size=8 if serving R1 on H200
+)
+print("[init] SGLang runtime ready", flush=True)
+
+_trigger_snapshot()
+print("[init] handler ready", flush=True)
 
 
 async def run(
@@ -29,7 +53,6 @@ async def run(
     top_p: float = 0.95,
     max_tokens: int = 4096,
 ):
-
     sampling_params = {"temperature": temperature, "top_p": top_p}
     tokenizer = runtime.get_tokenizer()
 
@@ -38,24 +61,15 @@ async def run(
     )
 
     stream = runtime.add_request(prompt, sampling_params)
-    full_text = ""
     first_chunk = True
 
     async for output in stream:
-        full_text += output
-
         chunk = {
             "id": run_id,
             "object": "chat.completion.chunk",
             "created": int(time.time()),
             "model": model,
-            "choices": [
-                {
-                    "index": 0,
-                    "delta": {},
-                    "finish_reason": None,
-                }
-            ],
+            "choices": [{"index": 0, "delta": {}, "finish_reason": None}],
         }
 
         if first_chunk: