llama-stack-mirror/examples/inference.py

from llama_stack_client import LlamaStackClient

# Connect to a locally running Llama Stack server and tag requests with
# telemetry headers so they can be attributed to this example service.
client = LlamaStackClient(
    base_url="http://localhost:8321",
    default_headers={
        "X-Telemetry-Service": "llama-stack-inference",
        "X-Telemetry-Version": "1.0.0",
    },
)

# List available models
models = client.models.list()

# Select the first LLM served by the vLLM provider.
# Note: next() raises StopIteration if no matching model is registered.
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "vllm")
model_id = llm.identifier
print("Model:", model_id)

# Run a chat completion against the selected model.
response = client.chat.completions.create(
    model=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about coding"},
    ],
)
print(response)
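
# A minimal sketch of printing only the assistant's reply, assuming the server
# returns the OpenAI-compatible chat completion shape (choices -> message ->
# content). This is an assumption about the response schema, not something
# verified against this particular deployment; adjust if your server differs.
if getattr(response, "choices", None):
    print("Assistant:", response.choices[0].message.content)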