From 4bca4af3e42ea2290973e97555fb8736246c62d3 Mon Sep 17 00:00:00 2001
From: Wen Zhou
Date: Sun, 6 Jul 2025 05:37:37 +0200
Subject: [PATCH] refactor: set proper name for embedding all-minilm:l6-v2 and
 update to use "starter" in detailed_tutorial (#2627)

# What does this PR do?
- We refer to the embedding model as `all-minilm:l6-v2`, but the model we actually download from Ollama is `all-minilm:latest`:
  - latest: https://ollama.com/library/all-minilm:latest (1b226e2802db)
  - l6-v2: https://ollama.com/library/all-minilm:l6-v2 (pinned at 1b226e2802db)
- The two tags currently resolve to exactly the same model, but if the library is updated (for example via [all-minilm:l12-v2](https://ollama.com/library/all-minilm:l12-v2)), `latest` might no longer match `l6-v2`.
- The only functional change in this PR is pinning the model ID on the Ollama side.
- Also updates `detailed_tutorial` to use the `starter` template in place of the deprecated `ollama` template.
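As a quick local sanity check (a sketch, not part of this change; it assumes a local Ollama install and only shells out to the standard `ollama pull` and `ollama list` commands), one can confirm that both tags currently resolve to the same model ID:

```python
import subprocess

# Pull both tags, then compare the IDs that `ollama list` reports.
# As of this PR, both rows should show the same ID (1b226e2802db).
for tag in ("all-minilm:latest", "all-minilm:l6-v2"):
    subprocess.run(["ollama", "pull", tag], check=True)

listing = subprocess.run(
    ["ollama", "list"], check=True, capture_output=True, text=True
)
for line in listing.stdout.splitlines():
    if "all-minilm" in line:
        print(line)  # columns: NAME, ID, SIZE, MODIFIED; the IDs should match
```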
## Test Plan
```
>INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
>llama stack build --run --template ollama --image-type venv
...
Build Successful!
You can find the newly-built template here: /home/wenzhou/zdtsw-forking/lls/llama-stack/llama_stack/templates/ollama/run.yaml
....
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  model_type: !!python/object/apply:llama_stack.apis.models.models.ModelType
  - embedding
  provider_id: ollama
  provider_model_id: all-minilm:l6-v2
...
```

test
```
>llama-stack-client inference chat-completion --message "Write me a 2-sentence poem about the moon"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/openai/v1/chat/completions "HTTP/1.1 200 OK"
OpenAIChatCompletion(
    id='chatcmpl-04f99071-3da2-44ba-a19f-03b5b7fc70b7',
    choices=[
        OpenAIChatCompletionChoice(
            finish_reason='stop',
            index=0,
            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
                role='assistant',
                content="Here is a 2-sentence poem about the moon:\n\nSilver crescent in the midnight sky,\nLuna's gentle face, a beauty to the eye.",
                name=None,
                tool_calls=None,
                refusal=None,
                annotations=None,
                audio=None,
                function_call=None
            ),
            logprobs=None
        )
    ],
    created=1751644429,
    model='llama3.2:3b-instruct-fp16',
    object='chat.completion',
    service_tier=None,
    system_fingerprint='fp_ollama',
    usage={'completion_tokens': 33, 'prompt_tokens': 36, 'total_tokens': 69, 'completion_tokens_details': None, 'prompt_tokens_details': None}
)
```
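A further check that could be run here (a sketch, not part of the committed test plan; it assumes a running server on `localhost:8321` and the `llama-stack-client` Python package, and relies on the `provider_resource_id` field shown in the models table below): list the registered models and assert that the embedding model is pinned to `all-minilm:l6-v2` rather than `all-minilm:latest`.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Find the Ollama-served embedding model among the registered models.
embedding = next(
    m
    for m in client.models.list()
    if m.model_type == "embedding" and m.provider_id == "ollama"
)

# With this PR the provider-side ID should be the pinned tag.
assert embedding.provider_resource_id == "all-minilm:l6-v2", embedding.provider_resource_id
print("embedding model pinned to:", embedding.provider_resource_id)
```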
---------

Signed-off-by: Wen Zhou
---
 docs/source/distributions/building_distro.md  |  22 +--
 .../getting_started/detailed_tutorial.md      | 132 ++++++++++--------
 .../remote/inference/ollama/models.py         |   2 +-
 tests/Containerfile                           |   2 +-
 .../llama-stack-provider-ollama/run.yaml      |   2 +-
 5 files changed, 91 insertions(+), 69 deletions(-)

diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md
index d3fb28947..f24974dd3 100644
--- a/docs/source/distributions/building_distro.md
+++ b/docs/source/distributions/building_distro.md
@@ -393,17 +393,17 @@ llama stack list
 ```
 ```
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| Stack Name                   | Path                                                                        | Build Config | Run Config |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| together                     | /home/wenzhou/.llama/distributions/together                                 | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| bedrock                      | /home/wenzhou/.llama/distributions/bedrock                                  | Yes          | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| starter                      | /home/wenzhou/.llama/distributions/starter                                  | No           | No         |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
-| remote-vllm                  | /home/wenzhou/.llama/distributions/remote-vllm                              | Yes          | Yes        |
-+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
++------------------------------+-----------------------------------------------------------------+--------------+------------+
+| Stack Name                   | Path                                                            | Build Config | Run Config |
++------------------------------+-----------------------------------------------------------------+--------------+------------+
+| together                     | ~/.llama/distributions/together                                 | Yes          | No         |
++------------------------------+-----------------------------------------------------------------+--------------+------------+
+| bedrock                      | ~/.llama/distributions/bedrock                                  | Yes          | No         |
++------------------------------+-----------------------------------------------------------------+--------------+------------+
+| starter                      | ~/.llama/distributions/starter                                  | Yes          | Yes        |
++------------------------------+-----------------------------------------------------------------+--------------+------------+
+| remote-vllm                  | ~/.llama/distributions/remote-vllm                              | Yes          | Yes        |
++------------------------------+-----------------------------------------------------------------+--------------+------------+
 ```
 
 ### Removing a Distribution
diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md
index d80ec3554..35cb7f02e 100644
--- a/docs/source/getting_started/detailed_tutorial.md
+++ b/docs/source/getting_started/detailed_tutorial.md
@@ -42,7 +42,7 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
 Setup your virtual environment.
 
 ```bash
-uv sync --python 3.10
+uv sync --python 3.12
 source .venv/bin/activate
 ```
 ## Step 2: Run Llama Stack
@@ -56,9 +56,10 @@ You can use Python to build and run the Llama Stack server, which is useful for
 Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
 which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.
+We use the `starter` template. By default all providers are disabled, so Ollama must be enabled explicitly by passing environment variables.
 
 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type venv --run
+ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type venv --run
 ```
 :::
 :::{tab-item} Using `conda`
@@ -69,17 +70,18 @@ which defines the providers and their settings.
 Now let's build and run the Llama Stack config for Ollama.
 
 ```bash
-INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type conda --image-name llama3-3b-conda --run
+ENABLE_OLLAMA=ollama INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type conda --run
 ```
 :::
 :::{tab-item} Using a Container
 You can use a container image to run the Llama Stack server. We provide several container images for the server
 component that works with different inference providers out of the box. For this guide, we will use
-`llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the
+`llamastack/distribution-starter` as the container image. If you'd like to build your own image or customize the
 configurations, please check out [this guide](../references/index.md).
 First lets setup some environment variables and create a local directory to mount into the container’s file system.
 ```bash
 export INFERENCE_MODEL="llama3.2:3b"
+export ENABLE_OLLAMA=ollama
 export LLAMA_STACK_PORT=8321
 mkdir -p ~/.llama
 ```
@@ -90,7 +92,7 @@ docker run -it \
   --pull always \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
-  llamastack/distribution-ollama \
+  llamastack/distribution-starter \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://host.docker.internal:11434
@@ -112,7 +114,7 @@ docker run -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
   --network=host \
-  llamastack/distribution-ollama \
+  llamastack/distribution-starter \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://localhost:11434
@@ -146,7 +148,7 @@ source .venv/bin/activate
 
 :::{tab-item} Install with `venv`
 ```bash
-uv venv client --python 3.10
+uv venv client --python 3.12
 source client/bin/activate
 pip install llama-stack-client
 ```
 :::
@@ -154,7 +156,7 @@
 :::{tab-item} Install with `conda`
 ```bash
-yes | conda create -n stack-client python=3.10
+yes | conda create -n stack-client python=3.12
 conda activate stack-client
 pip install llama-stack-client
 ```
@@ -177,37 +179,56 @@ List the models
 llama-stack-client models list
 
 Available Models
 
-┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
-┃ model_type      ┃ identifier                          ┃ provider_resource_id                ┃ metadata                                  ┃ provider_id     ┃
-┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
-│ embedding       │ all-MiniLM-L6-v2                    │ all-minilm:latest                   │ {'embedding_dimension': 384.0}            │ ollama          │
-├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼─────────────────┤
-│ llm             │ llama3.2:3b                         │ llama3.2:3b                         │                                           │ ollama          │
-└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴─────────────────┘
-
-Total models: 2
+┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ model_type      ┃ identifier                          ┃ provider_resource_id                ┃ metadata                                  ┃ provider_id           ┃
+┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩
+│ embedding       │ ollama/all-minilm:l6-v2             │ all-minilm:l6-v2                    │ {'embedding_dimension': 384.0}            │ ollama                │
+├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
+│ ...             │ ...                                 │ ...                                 │                                           │ ...                   │
+├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────┤
+│ llm             │ ollama/llama3.2:3b                  │ llama3.2:3b                         │                                           │ ollama                │
+└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────┘
 ```
 
 You can test basic Llama inference completion using the CLI.
 
 ```bash
-llama-stack-client inference chat-completion --message "tell me a joke"
+llama-stack-client inference chat-completion --model-id "ollama/llama3.2:3b" --message "tell me a joke"
 ```
 Sample output:
 ```python
-ChatCompletionResponse(
-    completion_message=CompletionMessage(
-        content="Here's one:\n\nWhat do you call a fake noodle?\n\nAn impasta!",
-        role="assistant",
-        stop_reason="end_of_turn",
-        tool_calls=[],
-    ),
-    logprobs=None,
-    metrics=[
-        Metric(metric="prompt_tokens", value=14.0, unit=None),
-        Metric(metric="completion_tokens", value=27.0, unit=None),
-        Metric(metric="total_tokens", value=41.0, unit=None),
+OpenAIChatCompletion(
+    id="chatcmpl-08d7b2be-40f3-47ed-8f16-a6f29f2436af",
+    choices=[
+        OpenAIChatCompletionChoice(
+            finish_reason="stop",
+            index=0,
+            message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
+                role="assistant",
+                content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired.",
+                name=None,
+                tool_calls=None,
+                refusal=None,
+                annotations=None,
+                audio=None,
+                function_call=None,
+            ),
+            logprobs=None,
+        )
     ],
+    created=1751725254,
+    model="llama3.2:3b",
+    object="chat.completion",
+    service_tier=None,
+    system_fingerprint="fp_ollama",
+    usage={
+        "completion_tokens": 18,
+        "prompt_tokens": 29,
+        "total_tokens": 47,
+        "completion_tokens_details": None,
+        "prompt_tokens_details": None,
+    },
 )
 ```
@@ -233,19 +254,19 @@ client = LlamaStackClient(base_url="http://localhost:8321")
 models = client.models.list()
 
 # Select the first LLM
-llm = next(m for m in models if m.model_type == "llm")
+llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
 model_id = llm.identifier
 print("Model:", model_id)
 
-response = client.inference.chat_completion(
-    model_id=model_id,
+response = client.chat.completions.create(
+    model=model_id,
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write a haiku about coding"},
     ],
 )
 
-print(response.completion_message.content)
+print(response)
 ```
 
 ### ii. Run the Script
@@ -255,12 +276,8 @@ uv run python inference.py
 ```
 Which will output:
 ```
-Model: llama3.2:3b
-Here is a haiku about coding:
-
-Lines of code unfold
-Logic flows through digital night
-Beauty in the bits
+Model: ollama/llama3.2:3b
+OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices=[OpenAIChatCompletionChoice(finish_reason='stop', index=0, message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(role='assistant', content="Lines of code unfold\nAlgorithms dance with ease\nLogic's gentle kiss", name=None, tool_calls=None, refusal=None, annotations=None, audio=None, function_call=None), logprobs=None)], created=1751732480, model='llama3.2:3b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage={'completion_tokens': 16, 'prompt_tokens': 37, 'total_tokens': 53, 'completion_tokens_details': None, 'prompt_tokens_details': None})
 ```
 
 :::
@@ -278,7 +295,7 @@ import uuid
 client = LlamaStackClient(base_url=f"http://localhost:8321")
 
 models = client.models.list()
-llm = next(m for m in models if m.model_type == "llm")
+llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
 model_id = llm.identifier
 
 agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
@@ -315,19 +332,20 @@ uv run python agent.py
 ```{dropdown} 👋 Click here to see the sample output
     Non-streaming ...
-    agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I'm here to provide information, answer questions, and help with tasks to the best of my abilities.
+    agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I can provide information, answer questions, and help with tasks to the best of my abilities.
 
-    I can be used for a wide range of purposes, such as:
+    I'm a large language model, which means I've been trained on a massive dataset of text from various sources, allowing me to understand and respond to a wide range of topics and questions. My purpose is to provide helpful and accurate information, and I'm constantly learning and improving my responses based on the interactions I have with users like you.
 
+    I can help with:
+
+    * Answering questions on various subjects
     * Providing definitions and explanations
     * Offering suggestions and ideas
-    * Helping with language translation
-    * Assisting with writing and proofreading
-    * Generating text or responses to questions
-    * Playing simple games or chatting about topics of interest
-
-    I'm constantly learning and improving my abilities, so feel free to ask me anything, and I'll do my best to help!
+    * Assisting with language-related tasks, such as proofreading and editing
+    * Generating text and content
+    * And more!
 
+    Feel free to ask me anything, and I'll do my best to help!
     Streaming ...
     AgentTurnResponseStreamChunk(
    │ event=TurnResponseEvent(
     ...
@@ -421,15 +439,15 @@ uv run python agent.py
 
     Streaming with print helper...
 
-    inference> Déjà vu!
+    inference> Déjà vu! You're asking me again!
 
-    As I mentioned earlier, I'm an artificial intelligence language model. I don't have a personal identity or consciousness like humans do. I exist solely to process and respond to text-based inputs, providing information and assistance on a wide range of topics.
+    As I mentioned earlier, I'm a computer program designed to simulate conversation and answer questions. I don't have a personal identity or consciousness like a human would. I exist solely as a digital entity, running on computer servers and responding to inputs from users like you.
 
-    I'm a computer program designed to simulate human-like conversations, using natural language processing (NLP) and machine learning algorithms to understand and generate responses. My purpose is to help users like you with their questions, provide information, and engage in conversation.
+    I'm a type of artificial intelligence (AI) called a large language model, which means I've been trained on a massive dataset of text from various sources. This training allows me to understand and respond to a wide range of questions and topics.
 
-    Think of me as a virtual companion, a helpful tool designed to make your interactions more efficient and enjoyable. I don't have personal opinions, emotions, or biases, but I'm here to provide accurate and informative responses to the best of my abilities.
+    My purpose is to provide helpful and accurate information, answer questions, and assist users like you with tasks and conversations. I don't have personal preferences, emotions, or opinions like humans do. My goal is to be informative, neutral, and respectful in my responses.
 
-    So, who am I? I'm just a computer program designed to help you!
+    So, that's me in a nutshell!
 ```
 
 :::
@@ -483,7 +501,11 @@ client.tool_runtime.rag_tool.insert(
 )
 
 # Get the model being served
-llm = next(m for m in client.models.list() if m.model_type == "llm")
+llm = next(
+    m
+    for m in client.models.list()
+    if m.model_type == "llm" and m.provider_id == "ollama"
+)
 model = llm.identifier
 
 # Create the RAG agent
diff --git a/llama_stack/providers/remote/inference/ollama/models.py b/llama_stack/providers/remote/inference/ollama/models.py
index cacf88861..64ddb23d9 100644
--- a/llama_stack/providers/remote/inference/ollama/models.py
+++ b/llama_stack/providers/remote/inference/ollama/models.py
@@ -84,7 +84,7 @@ MODEL_ENTRIES = [
         CoreModelId.llama_guard_3_1b.value,
     ),
     ProviderModelEntry(
-        provider_model_id="all-minilm:latest",
+        provider_model_id="all-minilm:l6-v2",
         aliases=["all-minilm"],
         model_type=ModelType.embedding,
         metadata={
diff --git a/tests/Containerfile b/tests/Containerfile
index 3080d053a..441d276c2 100644
--- a/tests/Containerfile
+++ b/tests/Containerfile
@@ -7,7 +7,7 @@ FROM --platform=linux/amd64 ollama/ollama:latest
 RUN ollama serve & \
     sleep 5 && \
     ollama pull llama3.2:3b-instruct-fp16 && \
-    ollama pull all-minilm:latest
+    ollama pull all-minilm:l6-v2
 
 # Set the entrypoint to start ollama serve
 ENTRYPOINT ["ollama", "serve"]
diff --git a/tests/external-provider/llama-stack-provider-ollama/run.yaml b/tests/external-provider/llama-stack-provider-ollama/run.yaml
index 60cff7503..65fd7571c 100644
--- a/tests/external-provider/llama-stack-provider-ollama/run.yaml
+++ b/tests/external-provider/llama-stack-provider-ollama/run.yaml
@@ -105,7 +105,7 @@ models:
     embedding_dimension: 384
   model_id: all-MiniLM-L6-v2
   provider_id: custom_ollama
-  provider_model_id: all-minilm:latest
+  provider_model_id: all-minilm:l6-v2
   model_type: embedding
 shields: []
 vector_dbs: []
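As an end-to-end follow-up check (a sketch, not part of the patch; it assumes a running server on `localhost:8321` built from this branch, and uses the `inference.embeddings` call from the `llama-stack-client` Python package, whose exact method name may differ between client versions): embed a test string with the pinned model and confirm the vector matches the registered `embedding_dimension: 384` metadata shown in the diffs above.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Embed a test string with the pinned model; the registered metadata
# (embedding_dimension: 384) should match the length of the returned vector.
response = client.inference.embeddings(
    model_id="ollama/all-minilm:l6-v2",
    contents=["Llama Stack pins all-minilm to the l6-v2 tag"],
)
vector = response.embeddings[0]
assert len(vector) == 384, f"unexpected embedding dimension: {len(vector)}"
print("embedding dimension:", len(vector))
```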