docs: make inference model configurable (#4385)

Allow users to specify the inference model via the INFERENCE_MODEL
environment variable instead of hardcoding it, falling back to
ollama/llama3.2:3b when the variable is not set.
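
For example, a minimal sketch of the resulting lookup (the variable name and
default come from this change; the override value shown is only illustrative):

    import os

    # Uses INFERENCE_MODEL when set, e.g. `export INFERENCE_MODEL=ollama/llama3.1:8b`
    # (illustrative value), and falls back to the documented default otherwise.
    model = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")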

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>

Author: Costa Shulyupin <costa.shul@redhat.com>, 2025-12-15 12:02:28 +02:00 (committed by GitHub)
Commit: 2b85600a7e
Parent: 62f7818051

@@ -16,6 +16,7 @@ Run this script after starting a Llama Stack server:
 """
 import io
+import os
 import requests
 from openai import OpenAI
@@ -53,7 +54,7 @@ print("=" * 80)
 print(f"Query: {query}\n")
 resp = client.responses.create(
-    model="ollama/llama3.2:3b",  # feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     input=query,
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
     include=["file_search_call.results"],
@@ -93,7 +94,7 @@ print(f"Found {len(context_chunks)} relevant chunks\n")
 # Step 3: Use Chat Completions with retrieved context
 print("Generating response with chat completions...")
 completion = client.chat.completions.create(
-    model="ollama/llama3.2:3b",  # Feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     messages=[
         {
             "role": "system",