From 2b85600a7ea88aa7b679a3f3626a971f9fc01eba Mon Sep 17 00:00:00 2001
From: Costa Shulyupin
Date: Mon, 15 Dec 2025 12:02:28 +0200
Subject: [PATCH] docs: make inference model configurable (#4385)

Allow users to specify the inference model through the INFERENCE_MODEL
environment variable instead of hardcoding it, with fallback to
ollama/llama3.2:3b if not set.

Signed-off-by: Costa Shulyupin
---
 docs/docs/getting_started/demo_script.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/docs/getting_started/demo_script.py b/docs/docs/getting_started/demo_script.py
index eaa4d4d02..fcae51d86 100644
--- a/docs/docs/getting_started/demo_script.py
+++ b/docs/docs/getting_started/demo_script.py
@@ -16,6 +16,7 @@ Run this script after starting a Llama Stack server:
 """
 
 import io
+import os
 
 import requests
 from openai import OpenAI
@@ -53,7 +54,7 @@ print("=" * 80)
 print(f"Query: {query}\n")
 
 resp = client.responses.create(
-    model="ollama/llama3.2:3b",  # feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     input=query,
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
     include=["file_search_call.results"],
@@ -93,7 +94,7 @@ print(f"Found {len(context_chunks)} relevant chunks\n")
 # Step 3: Use Chat Completions with retrieved context
 print("Generating response with chat completions...")
 completion = client.chat.completions.create(
-    model="ollama/llama3.2:3b",  # Feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     messages=[
         {
             "role": "system",
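
For context, a minimal sketch of the lookup this patch introduces, assuming only what the diff shows (the INFERENCE_MODEL variable name and the ollama/llama3.2:3b fallback); the standalone snippet and its print statement are illustrative and not part of the patch:

    import os

    # Resolve the model the same way the patched demo script does:
    # use INFERENCE_MODEL when it is set, otherwise fall back to the default.
    model = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")
    print(f"Using inference model: {model}")

With this in place, exporting INFERENCE_MODEL in the shell before running the demo script overrides the default model without editing the file.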