Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-17 09:29:47 +00:00)
docs: make inference model configurable (#4385)
Allow users to specify the inference model through the INFERENCE_MODEL environment variable instead of hardcoding it, with a fallback to ollama/llama3.2:3b when the variable is not set.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
This commit is contained in:
parent 62f7818051
commit 2b85600a7e
1 changed file with 3 additions and 2 deletions
@@ -16,6 +16,7 @@ Run this script after starting a Llama Stack server:
 """
 
 import io
+import os
 
 import requests
 from openai import OpenAI
@@ -53,7 +54,7 @@ print("=" * 80)
 print(f"Query: {query}\n")
 
 resp = client.responses.create(
-    model="ollama/llama3.2:3b",  # feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     input=query,
     tools=[{"type": "file_search", "vector_store_ids": [vs.id]}],
     include=["file_search_call.results"],
@@ -93,7 +94,7 @@ print(f"Found {len(context_chunks)} relevant chunks\n")
 # Step 3: Use Chat Completions with retrieved context
 print("Generating response with chat completions...")
 completion = client.chat.completions.create(
-    model="ollama/llama3.2:3b",  # Feel free to change this to any other model
+    model=os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b"),
     messages=[
         {
             "role": "system",
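For reference, a minimal sketch of the pattern this change introduces, assuming only the Python standard library; the INFERENCE_MODEL variable and the ollama/llama3.2:3b fallback come from the diff above, while the print statement is purely illustrative:

import os

# Resolve the inference model from the environment; fall back to the
# default baked into the script when INFERENCE_MODEL is not set.
model = os.getenv("INFERENCE_MODEL", "ollama/llama3.2:3b")
print(f"Using inference model: {model}")

With this change, exporting INFERENCE_MODEL before running the script (set to whatever model identifier the Llama Stack server is serving) overrides the previously hardcoded default in both the Responses and the Chat Completions calls.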