Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
[cerebrium.deployment]
name = "1-openai-compatible-endpoint"
python_version = "3.11"
docker_base_image_url = "debian:bookworm-slim"
docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
disable_auth = true
deployment_initialization_timeout = 830
include = ["./*", "main.py", "cerebrium.toml"]
exclude = [".*"]

[cerebrium.hardware]
cpu = 2
memory = 12.0
compute = "AMPERE_A10"
cpu = 8
memory = 60.0
compute = "ADA_L40"
provider = "aws"
region = "us-east-1"

[cerebrium.scaling]
min_replicas = 0
Expand All @@ -18,3 +22,7 @@ cooldown = 30
[cerebrium.dependencies.pip]
vllm = "latest"
pydantic = "latest"
huggingface_hub = "latest"

[cerebrium.experimental]
checkpointing = true
35 changes: 29 additions & 6 deletions 5-large-language-models/1-openai-compatible-endpoint/main.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,46 @@
import http.client
import json
import os
import time

import urllib.request

from huggingface_hub import login
from pydantic import BaseModel
from vllm import SamplingParams, AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"


def _trigger_snapshot() -> None:
    """Ask the checkpoint endpoint to snapshot this process, best-effort.

    Sends a body-less POST to ``CHECKPOINT_URL`` and waits up to 300 seconds
    for the reply. Every outcome is reported on stdout with an ``[init]``
    prefix. No ``Exception`` subclass propagates to the caller, so a failed
    snapshot never aborts module initialisation.
    """
    print("[init] requesting GPU snapshot", flush=True)
    try:
        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
        # Only completion of the call matters; the body is ignored. Close the
        # response explicitly so its socket is not leaked for the lifetime of
        # the process.
        with urllib.request.urlopen(req, timeout=300):
            pass
        print("[init] snapshot complete", flush=True)
    except http.client.RemoteDisconnected:
        # When the process is restored from a snapshot its TCP connections
        # are gone, so the in-flight request surfaces as RemoteDisconnected.
        # That is the expected outcome after a restore, not a failure.
        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
    except Exception as exc:
        # Deliberately broad: snapshotting is an optimisation, so log and
        # continue rather than crash start-up.
        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)


# Module-level initialisation: these statements run when the module is imported.
print("[init] starting", flush=True)
# Log in to the Hugging Face Hub with the HF_TOKEN environment variable;
# os.environ.get yields None when the variable is unset.
login(token=os.environ.get("HF_TOKEN"))

# Engine configuration passed to AsyncLLMEngine.from_engine_args below.
engine_args = AsyncEngineArgs(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
gpu_memory_utilization=0.9,
max_model_len=8192,
async_scheduling=False,
)

print("[init] building vLLM engine", flush=True)
# Build the engine once and keep it in a module-level name.
engine = AsyncLLMEngine.from_engine_args(engine_args)
print("[init] vLLM engine ready", flush=True)

# The snapshot is requested only after the engine has been constructed
# (presumably so the checkpoint includes the loaded model -- TODO confirm).
_trigger_snapshot()
print("[init] handler ready", flush=True)


class Message(BaseModel):
Expand All @@ -35,7 +60,9 @@ async def run(
prompt = " ".join(
[f"{Message(**msg).role}: {Message(**msg).content}" for msg in messages]
)
sampling_params = SamplingParams(temperature=temperature, top_p=top_p)
sampling_params = SamplingParams(
temperature=temperature, top_p=top_p, max_tokens=max_tokens
)
results_generator = engine.generate(prompt, sampling_params, run_id)

previous_text = ""
Expand All @@ -54,21 +81,17 @@ async def run(
"choices": [{"index": 0, "delta": {}, "finish_reason": None}],
}

# Include role in the first chunk
if first_chunk:
chunk["choices"][0]["delta"]["role"] = "assistant"
first_chunk = False

# Add new text to delta if there is any
if new_text:
chunk["choices"][0]["delta"]["content"] = new_text

# Check for a finish_reason
finish_reason = prompt_output[0].finish_reason
if finish_reason and finish_reason != "none":
chunk["choices"][0]["finish_reason"] = finish_reason

yield f"data: {json.dumps(chunk)}\n\n"

# After all chunks, send [DONE]
yield "data: [DONE]\n\n"
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
[cerebrium.deployment]
name = "llm-run-3u"
name = "4-llama-openai-compatible"
python_version = "3.11"
docker_base_image_url = "debian:bookworm-slim"
docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
disable_auth = true
deployment_initialization_timeout = 830
include = ["./*", "main.py", "cerebrium.toml"]
exclude = [".*"]

[cerebrium.hardware]
cpu = 2
memory = 12.0
compute = "AMPERE_A10"
cpu = 8
memory = 60.0
compute = "ADA_L40"
provider = "aws"
region = "us-east-1"

[cerebrium.scaling]
min_replicas = 0
max_replicas = 5
cooldown = 60
cooldown = 30

[cerebrium.dependencies.pip]
vllm = "latest"
pydantic = "latest"
huggingface_hub = "latest"

[cerebrium.experimental]
checkpointing = true
Original file line number Diff line number Diff line change
@@ -1,20 +1,46 @@
import http.client
import json
import os
import time
import json
import urllib.request

from huggingface_hub import login
from pydantic import BaseModel
from vllm import SamplingParams, AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"


def _trigger_snapshot() -> None:
    """Ask the checkpoint endpoint to snapshot this process, best-effort.

    Sends a body-less POST to ``CHECKPOINT_URL`` and waits up to 300 seconds
    for the reply. Every outcome is reported on stdout with an ``[init]``
    prefix. No ``Exception`` subclass propagates to the caller, so a failed
    snapshot never aborts module initialisation.
    """
    print("[init] requesting GPU snapshot", flush=True)
    try:
        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
        # Only completion of the call matters; the body is ignored. Close the
        # response explicitly so its socket is not leaked for the lifetime of
        # the process.
        with urllib.request.urlopen(req, timeout=300):
            pass
        print("[init] snapshot complete", flush=True)
    except http.client.RemoteDisconnected:
        # When the process is restored from a snapshot its TCP connections
        # are gone, so the in-flight request surfaces as RemoteDisconnected.
        # That is the expected outcome after a restore, not a failure.
        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
    except Exception as exc:
        # Deliberately broad: snapshotting is an optimisation, so log and
        # continue rather than crash start-up.
        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)


# Module-level initialisation: these statements run when the module is imported.
print("[init] starting", flush=True)
# Log in to the Hugging Face Hub with the HF_TOKEN environment variable;
# os.environ.get yields None when the variable is unset.
login(token=os.environ.get("HF_TOKEN"))

# Engine configuration passed to AsyncLLMEngine.from_engine_args below.
engine_args = AsyncEngineArgs(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
gpu_memory_utilization=0.9,
max_model_len=8192,
async_scheduling=False,
)

print("[init] building vLLM engine", flush=True)
# Build the engine once and keep it in a module-level name.
engine = AsyncLLMEngine.from_engine_args(engine_args)
print("[init] vLLM engine ready", flush=True)

# The snapshot is requested only after the engine has been constructed
# (presumably so the checkpoint includes the loaded model -- TODO confirm).
_trigger_snapshot()
print("[init] handler ready", flush=True)


class Message(BaseModel):
Expand Down Expand Up @@ -45,7 +71,6 @@ async def run(
top_p: float = 0.95,
max_tokens: int = 4096,
):
# Format your prompt for llama-friendly usage:
prompt = format_chat_prompt(messages)

sampling_params = SamplingParams(
Expand All @@ -61,36 +86,25 @@ async def run(
new_text = prompt_output[0].text[len(previous_text) :]
previous_text = prompt_output[0].text

# Construct OpenAI-compatible chunk
chunk = {
"id": run_id,
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": None,
}
],
"choices": [{"index": 0, "delta": {}, "finish_reason": None}],
}

# Include the role in the first chunk
if first_chunk:
chunk["choices"][0]["delta"]["role"] = "assistant"
first_chunk = False

# Add new text to the delta if any
if new_text:
chunk["choices"][0]["delta"]["content"] = new_text

# Capture a finish reason if it's provided
finish_reason = prompt_output[0].finish_reason or None
finish_reason = prompt_output[0].finish_reason
if finish_reason and finish_reason != "none":
chunk["choices"][0]["finish_reason"] = finish_reason

yield f"data: {json.dumps(chunk)}\n\n"

# Send the final [DONE] message
yield "data: [DONE]\n\n"
25 changes: 17 additions & 8 deletions 5-large-language-models/6-deepseek-sglang/cerebrium.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
[cerebrium.deployment]
name = "6-deepseek-sglang"
python_version = "3.11"
docker_base_image_url = "debian:bookworm-slim"
disable_auth = false
include = ['./*', 'main.py', 'cerebrium.toml']
exclude = ['.*']
docker_base_image_url = "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
disable_auth = true
deployment_initialization_timeout = 830
include = ["./*", "main.py", "cerebrium.toml"]
exclude = [".*"]
pre_build_commands = [
    # The requirement specifier is single-quoted on purpose: assuming these
    # commands are run through a shell, an unquoted ">=" is parsed as an
    # output redirection (dropping the version constraint and creating a file
    # named "=0.4.2.post2") and "[all]" is subject to glob expansion.
    "pip install 'sglang[all]>=0.4.2.post2' --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer"
]

[cerebrium.hardware]
cpu = 4
memory = 12.0
compute = "AMPERE_A10"
cpu = 8
memory = 60.0
compute = "ADA_L40"
provider = "aws"
region = "us-east-1"

[cerebrium.scaling]
min_replicas = 0
Expand All @@ -25,4 +28,10 @@ scaling_target = 100

[cerebrium.dependencies.pip]
huggingface_hub = "latest"
pydantic = "latest"
pydantic = "latest"

[cerebrium.dependencies.apt]
libnuma-dev = "latest"

[cerebrium.experimental]
checkpointing = true
54 changes: 34 additions & 20 deletions 5-large-language-models/6-deepseek-sglang/main.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,47 @@
# launch the offline engine
from huggingface_hub import login
from sglang import Runtime
from pydantic import BaseModel
import http.client
import json
from typing import List, Dict, Any
import time
import os
import time
import urllib.request

from huggingface_hub import login
from sglang import Runtime

CHECKPOINT_URL = "http://169.254.169.253:8234/checkpoint"

os.environ["HF_TRANSFER"] = "1"
os.environ["HF_HUB_VERBOSE"] = "1"
os.environ["HF_HUB_ENABLE_PROGRESS_BARS"] = "1"


def _trigger_snapshot() -> None:
    """Ask the checkpoint endpoint to snapshot this process, best-effort.

    Sends a body-less POST to ``CHECKPOINT_URL`` and waits up to 300 seconds
    for the reply. Every outcome is reported on stdout with an ``[init]``
    prefix. No ``Exception`` subclass propagates to the caller, so a failed
    snapshot never aborts module initialisation.
    """
    print("[init] requesting GPU snapshot", flush=True)
    try:
        req = urllib.request.Request(CHECKPOINT_URL, method="POST")
        # Only completion of the call matters; the body is ignored. Close the
        # response explicitly so its socket is not leaked for the lifetime of
        # the process.
        with urllib.request.urlopen(req, timeout=300):
            pass
        print("[init] snapshot complete", flush=True)
    except http.client.RemoteDisconnected:
        # When the process is restored from a snapshot its TCP connections
        # are gone, so the in-flight request surfaces as RemoteDisconnected.
        # That is the expected outcome after a restore, not a failure.
        print("[init] snapshot complete (RemoteDisconnected)", flush=True)
    except Exception as exc:
        # Deliberately broad: snapshotting is an optimisation, so log and
        # continue rather than crash start-up.
        print(f"[init] snapshot failed: {type(exc).__name__}: {exc}", flush=True)


print("[init] starting", flush=True)
login(token=os.environ.get("HF_TOKEN"))

# model_id = "deepseek-ai/DeepSeek-R1" ##uncomment for R1
# model_id = "deepseek-ai/DeepSeek-R1" # uncomment for R1
model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

print("[init] building SGLang runtime", flush=True)
runtime = Runtime(
model_path=model_id, tp_size=1
) # change tp_size=8 if serving R1 on H200
model_path=model_id,
tp_size=1, # change tp_size=8 if serving R1 on H200
)
print("[init] SGLang runtime ready", flush=True)

_trigger_snapshot()
print("[init] handler ready", flush=True)


async def run(
Expand All @@ -29,7 +53,6 @@ async def run(
top_p: float = 0.95,
max_tokens: int = 4096,
):

sampling_params = {"temperature": temperature, "top_p": top_p}
tokenizer = runtime.get_tokenizer()

Expand All @@ -38,24 +61,15 @@ async def run(
)

stream = runtime.add_request(prompt, sampling_params)
full_text = ""
first_chunk = True

async for output in stream:
full_text += output

chunk = {
"id": run_id,
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": model,
"choices": [
{
"index": 0,
"delta": {},
"finish_reason": None,
}
],
"choices": [{"index": 0, "delta": {}, "finish_reason": None}],
}

if first_chunk:
Expand Down
Loading