Skip to main content

Run a vLLM Server

This example creates a reusable vLLM template, starts a GPU pod, and prints the OpenAI-compatible /v1/models endpoint.

Authenticate before running the script:

lium init

For CI, scripts, or temporary overrides, set LIUM_API_KEY instead. For gated Hugging Face models, also set HF_TOKEN.

#!/usr/bin/env python3
"""Run vLLM on a Lium GPU pod."""

import os

from lium.sdk import Lium

lium = Lium()
GPU_TYPE = "A100"

# Register a reusable template for the vLLM OpenAI-compatible server.
# The Hugging Face cache directory is listed as a volume so downloaded
# models can be reused between pods instead of being fetched again.
template = lium.create_template(
    name="vllm-smollm",
    docker_image="vllm/vllm-openai",
    ports=[22, 8000],
    environment={
        "HF_HOME": "/root/.cache/huggingface",
        # Empty when HF_TOKEN is unset locally; only gated models need it.
        "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
    },
    # The vllm/vllm-openai image expects API-server args, not a shell command.
    start_command="--model HuggingFaceTB/SmolLM-135M --host 0.0.0.0 --port 8000",
    volumes=["/workspace", "/root/.cache/huggingface"],
)

# Pick the first executor that offers the requested GPU type.
executors = lium.ls(gpu_type=GPU_TYPE)
if not executors:
    raise RuntimeError(f"No {GPU_TYPE} executors are currently available")

first = executors[0]
pod = lium.up(executor_id=first.id, template_id=template.id)
ready_pod = lium.wait_ready(pod["id"], timeout=600)

if not ready_pod:
    raise RuntimeError("Pod did not become ready before the timeout")

# Re-fetch the pod so the port mapping and executor address are populated.
pod = lium.pod(pod_id=pod["id"])

# Container port 8000 is published on a different external port; look it up
# rather than assuming it. Report clearly when the mapping is missing instead
# of exiting silently or failing on an undefined host/port.
if "8000" in pod["ports_mapping"]:
    port = pod["ports_mapping"]["8000"]
    host = pod["executor"]["executor_ip_address"]

    print("vLLM is loading. Check the model endpoint:")
    print(f"http://{host}:{port}/v1/models")
else:
    print(f"Pod {pod['id']} has no external mapping for port 8000 yet.")

Once vLLM finishes loading, check the endpoint printed by the script:

curl http://<host>:<port>/v1/models

You can later switch to a different model by editing the pod's startup command and environment:

# Swap the served model on the existing pod. The environment is passed again
# in full, with the same cache location and token as the template; set
# HF_TOKEN locally first if the new model is gated.
# NOTE(review): the template call uses `start_command` while this call uses
# `startup_commands` — confirm the keyword against the SDK.
pod_id = pod["id"]
new_start_args = "--model meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0 --port 8000"
hf_environment = {
    "HF_HOME": "/root/.cache/huggingface",
    "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
}
lium.edit(pod_id, startup_commands=new_start_args, environment=hf_environment)

Stop it when you are done:

# `ready_pod` is the value returned by lium.wait_ready() in the script above.
# NOTE(review): presumably this releases the GPU pod — confirm against the SDK.
lium.down(ready_pod)