From 2b85600a7ea88aa7b679a3f3626a971f9fc01eba Mon Sep 17 00:00:00 2001
From: Costa Shulyupin
Date: Mon, 15 Dec 2025 12:02:28 +0200
Subject: [PATCH] docs: make inference model configurable (#4385)

Allow users to specify the inference model through the INFERENCE_MODEL
environment variable instead of hardcoding it, with fallback to
ollama/llama3.2:3b if not set.

Signed-off-by: Costa Shulyupin
---
 docs/docs/getting_started/demo_script.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/docs/getting_started/demo_script.py b/docs/docs/getting_started/demo_script.py
index eaa4d4d02..fcae51d86 100644
--- a/docs/docs/getting_started/demo_script.py
+++ b/docs/docs/getting_started/demo_script.py
@@ -16,6 +16,7 @@ Run this script after starting a Llama Stack server:
 """
 
 import io
+import os
 
 import requests
 from openai import OpenAI
@@ -53,7 +54,7 @@ print("=" * 80)
 print(f"Query: {query}\n")
 
 resp = client.responses.create(
-    model="ollama/llama3.2:3b",  # feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     input=query,
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
     include=["file_search_call.results"],
@@ -93,7 +94,7 @@ print(f"Found {len(context_chunks)} relevant chunks\n")
 # Step 3: Use Chat Completions with retrieved context
 print("Generating response with chat completions...")
 completion = client.chat.completions.create(
-    model="ollama/llama3.2:3b",  # Feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     messages=[
         {
             "role": "system",
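
For context, a minimal sketch of the lookup this patch introduces, assuming only what the diff shows (the INFERENCE_MODEL variable name and the ollama/llama3.2:3b fallback); the standalone snippet and its print statement are illustrative and not part of the patch:

    import os

    # Resolve the model the same way the patched demo script does:
    # use INFERENCE_MODEL when it is set, otherwise fall back to the default.
    model = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")
    print(f"Using inference model: {model}")

With this in place, exporting INFERENCE_MODEL in the shell before running the demo script overrides the default model without editing the file.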