Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-17 09:29:47 +00:00)
docs: make inference model configurable (#4385)
Allow users to specify the inference model through the INFERENCE_MODEL environment variable instead of hardcoding it, with a fallback to ollama/llama3.2:3b when the variable is not set.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
This commit is contained in:
parent 62f7818051
commit 2b85600a7e
1 changed file with 3 additions and 2 deletions
@@ -16,6 +16,7 @@ Run this script after starting a Llama Stack server:
 """
 
 import io
+import os
 
 import requests
 from openai import OpenAI
@@ -53,7 +54,7 @@ print("=" * 80)
 print(f"Query: {query}\n")
 
 resp = client.responses.create(
-    model="ollama/llama3.2:3b",  # feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     input=query,
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
     include=["file_search_call.results"],
@@ -93,7 +94,7 @@ print(f"Found {len(context_chunks)} relevant chunks\n")
 # Step 3: Use Chat Completions with retrieved context
 print("Generating response with chat completions...")
 completion = client.chat.completions.create(
-    model="ollama/llama3.2:3b",  # Feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     messages=[
         {
             "role": "system",
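For reference, a minimal sketch of the pattern this change introduces, assuming only the Python standard library; the INFERENCE_MODEL variable and the ollama/llama3.2:3b fallback come from the diff above, while the print statement is purely illustrative:

import os

# Resolve the inference model from the environment; fall back to the
# default baked into the script when INFERENCE_MODEL is not set.
model = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")
print(f"Using inference model: {model}")

With this change, exporting INFERENCE_MODEL before running the script (set to whatever model identifier the Llama Stack server is serving) overrides the previously hardcoded default in both the Responses and the Chat Completions calls.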