Run a vLLM Server
This example creates a reusable vLLM template, starts a GPU pod, and prints the OpenAI-compatible /v1/models endpoint.
Authenticate before running the script:
lium init
For CI, scripts, or temporary overrides, set LIUM_API_KEY instead. For gated Hugging Face models, also set HF_TOKEN.
#!/usr/bin/env python3
"""Run vLLM on a Lium GPU pod."""
import os
from lium.sdk import Lium
GPU_TYPE = "A100"
lium = Lium()

# Hugging Face settings passed to the container. HF_TOKEN is copied from the
# local environment and falls back to an empty string when it is unset.
hf_environment = {
    "HF_HOME": "/root/.cache/huggingface",
    "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
}

# The vllm/vllm-openai image expects API-server args, not a shell command.
server_args = "--model HuggingFaceTB/SmolLM-135M --host 0.0.0.0 --port 8000"

# Volume paths cache Hugging Face models between pods.
template = lium.create_template(
    name="vllm-smollm",
    docker_image="vllm/vllm-openai",
    ports=[22, 8000],
    environment=hf_environment,
    start_command=server_args,
    volumes=["/workspace", "/root/.cache/huggingface"],
)
# How long wait_ready() is given before the script gives up on the pod.
READY_TIMEOUT = 600

# List executors offering the requested GPU type and stop early if there are none.
executors = lium.ls(gpu_type=GPU_TYPE)
if not executors:
    raise RuntimeError(f"No {GPU_TYPE} executors are currently available")

# Use the first executor in the list and start a pod on it from the template.
chosen_executor = executors[0]
pod = lium.up(executor_id=chosen_executor.id, template_id=template.id)

# A falsy result from wait_ready() is treated as "not ready within the timeout".
ready_pod = lium.wait_ready(pod["id"], timeout=READY_TIMEOUT)
if not ready_pod:
    raise RuntimeError("Pod did not become ready before the timeout")
# Fetch the pod record again; the port mapping and executor address are read
# from it. Keep the id separately so it is still available for messages.
pod_id = pod["id"]
pod = lium.pod(pod_id=pod_id)
ports_mapping = pod["ports_mapping"]

if "8000" in ports_mapping:
    # Port 8000 is the port vLLM is started on; look up the port it is mapped to.
    port = ports_mapping["8000"]
    host = pod["executor"]["executor_ip_address"]
    print("vLLM is loading. Check the model endpoint:")
    print(f"http://{host}:{port}/v1/models")
else:
    # Do not finish silently: a pod has already been started, so tell the user
    # why no endpoint was printed and which pod to inspect or stop.
    print(
        f"Pod {pod_id} is ready, but port 8000 is missing from its port "
        "mapping, so no endpoint can be printed."
    )
Once vLLM finishes loading, check the endpoint printed by the script:
curl http://<host>:<port>/v1/models
To serve a different model later, edit the pod and pass a new --model argument:
# Same API-server arguments as the template, with a different --model value.
# NOTE(review): create_template() above is called with `start_command`, while
# this call uses `startup_commands` — confirm the keyword against the SDK.
new_server_args = "--model meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0 --port 8000"

# The Hugging Face settings are passed again alongside the new arguments.
hf_environment = {
    "HF_HOME": "/root/.cache/huggingface",
    "HF_TOKEN": os.environ.get("HF_TOKEN", ""),
}

lium.edit(
    pod["id"],
    startup_commands=new_server_args,
    environment=hf_environment,
)
Stop it when you are done:
# Stops the pod using the object returned by wait_ready().
# NOTE(review): the other calls in this example pass pod["id"]; confirm that
# down() accepts the pod object as well as an id.
lium.down(ready_pod)