chore: remove straggler references to llama-models (#1345)

Straggler references cleanup
Ashwin Bharambe 2025-03-01 14:26:03 -08:00 committed by GitHub
parent 8bbd52bb9f
commit 46b0a404e8
19 changed files with 827 additions and 74 deletions


@@ -123,15 +123,15 @@ Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
-Building a stack image (conda / docker) will use the production version of the `llama-stack`, `llama-models` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_MODELS_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash
$ cd work/
$ git clone https://github.com/meta-llama/llama-stack.git
-$ git clone https://github.com/meta-llama/llama-models.git
+$ git clone https://github.com/meta-llama/llama-stack-client-python.git
$ cd llama-stack
-$ LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template <...>
+$ LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
```


@@ -3,7 +3,7 @@ The RFC Specification (OpenAPI format) is generated from the set of API endpoint
Please install the following packages before running the script:
```
-pip install fire PyYAML llama-models
+pip install fire PyYAML
```
Then simply run `sh run_openapi_generator.sh`


@@ -28,6 +28,5 @@ if [ ${#missing_packages[@]} -ne 0 ]; then
fi
stack_dir=$(dirname $(dirname $THIS_DIR))
-models_dir=$(dirname $stack_dir)/llama-models
-PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir \
+PYTHONPATH=$PYTHONPATH:$stack_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static


@@ -7,11 +7,14 @@
import argparse
import textwrap
from io import StringIO
+from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
+ROOT_DIR = Path(__file__).parent.parent
class ModelPromptFormat(Subcommand):
    """Llama model cli for describe a model prompt format (message formats)"""
@@ -77,9 +80,9 @@ class ModelPromptFormat(Subcommand):
        if model_id not in supported_model_ids:
            self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}")
-        llama_3_1_file = importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
-        llama_3_2_text_file = importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
-        llama_3_2_vision_file = importlib.resources.files("llama_models") / "llama3_2/vision_prompt_format.md"
+        llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md"
+        llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md"
+        llama_3_2_vision_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "vision_prompt_format.md"
        if model_family(model_id) == ModelFamily.llama3_1:
            with importlib.resources.as_file(llama_3_1_file) as f:
                content = f.open("r").read()


@@ -6,8 +6,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
+LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -16,8 +16,8 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
-if [ -n "$LLAMA_MODELS_DIR" ]; then
-  echo "Using llama-models-dir=$LLAMA_MODELS_DIR"
+if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
+  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi
if [ "$#" -lt 3 ]; then
@@ -87,8 +87,6 @@ ensure_conda_env_python310() {
  # these packages are damaged in test-pypi, so install them first
  uv pip install fastapi libcst
  uv pip install --extra-index-url https://test.pypi.org/simple/ \
-    llama-models==$TEST_PYPI_VERSION \
-    llama-stack-client==$TEST_PYPI_VERSION \
    llama-stack==$TEST_PYPI_VERSION \
    $pip_dependencies
  if [ -n "$special_pip_deps" ]; then
@@ -111,22 +109,21 @@ ensure_conda_env_python310() {
  else
    PYPI_VERSION="${PYPI_VERSION:-}"
    if [ -n "$PYPI_VERSION" ]; then
-      SPEC_VERSION="llama-stack==${PYPI_VERSION} llama-models==${PYPI_VERSION} llama-stack-client==${PYPI_VERSION}"
+      SPEC_VERSION="llama-stack==${PYPI_VERSION}"
    else
      SPEC_VERSION="llama-stack"
    fi
    uv pip install --no-cache-dir $SPEC_VERSION
  fi
-  if [ -n "$LLAMA_MODELS_DIR" ]; then
-    if [ ! -d "$LLAMA_MODELS_DIR" ]; then
-      printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}\n" >&2
+  if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
+    if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
+      printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
      exit 1
    fi
-    printf "Installing from LLAMA_MODELS_DIR: $LLAMA_MODELS_DIR\n"
-    uv pip uninstall llama-models
-    uv pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
+    printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
+    uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
  fi
  # Install pip dependencies


@@ -6,7 +6,6 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
-LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
@@ -116,7 +115,6 @@ EOF
fi
stack_mount="/app/llama-stack-source"
-models_mount="/app/llama-models-source"
client_mount="/app/llama-stack-client-source"
install_local_package() {
@@ -140,10 +138,6 @@ EOF
}
-if [ -n "$LLAMA_MODELS_DIR" ]; then
-  install_local_package "$LLAMA_MODELS_DIR" "$models_mount" "LLAMA_MODELS_DIR"
-fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
  install_local_package "$LLAMA_STACK_CLIENT_DIR" "$client_mount" "LLAMA_STACK_CLIENT_DIR"
fi
@@ -213,9 +207,6 @@ if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then
  if [ -n "$LLAMA_STACK_DIR" ]; then
    CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_DIR"):$stack_mount")
  fi
-  if [ -n "$LLAMA_MODELS_DIR" ]; then
-    CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_MODELS_DIR"):$models_mount")
-  fi
  if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
    CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_CLIENT_DIR"):$client_mount")
  fi
@@ -231,7 +222,7 @@ if [ -n "$PYPI_VERSION" ]; then
  version_tag="$PYPI_VERSION"
elif [ -n "$TEST_PYPI_VERSION" ]; then
  version_tag="test-$TEST_PYPI_VERSION"
-elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_MODELS_DIR" ]]; then
+elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_STACK_CLIENT_DIR" ]]; then
  version_tag="dev"
else
  URL="https://pypi.org/pypi/llama-stack/json"


@@ -9,8 +9,8 @@
# TODO: combine this with build_conda_env.sh since it is almost identical
# the only difference is that we don't do any conda-specific setup
-LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
+LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
@@ -21,8 +21,8 @@ VIRTUAL_ENV=${VIRTUAL_ENV:-}
if [ -n "$LLAMA_STACK_DIR" ]; then
  echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi
-if [ -n "$LLAMA_MODELS_DIR" ]; then
-  echo "Using llama-models-dir=$LLAMA_MODELS_DIR"
+if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
+  echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi
if [ "$#" -lt 2 ]; then
@@ -95,7 +95,7 @@ run() {
  # we are building a command line so word splitting is expected
  uv pip install --extra-index-url https://test.pypi.org/simple/ \
    --index-strategy unsafe-best-match \
-    llama-models=="$TEST_PYPI_VERSION" llama-stack=="$TEST_PYPI_VERSION" \
+    llama-stack=="$TEST_PYPI_VERSION" \
    $pip_dependencies
  if [ -n "$special_pip_deps" ]; then
    IFS='#' read -ra parts <<<"$special_pip_deps"
@@ -120,15 +120,14 @@ run() {
    uv pip install --no-cache-dir llama-stack
  fi
-  if [ -n "$LLAMA_MODELS_DIR" ]; then
-    if [ ! -d "$LLAMA_MODELS_DIR" ]; then
-      printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_MODELS_DIR" >&2
+  if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
+    if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
+      printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
      exit 1
    fi
-    printf "Installing from LLAMA_MODELS_DIR: %s\n" "$LLAMA_MODELS_DIR"
-    uv pip uninstall llama-models
-    uv pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
+    printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
+    uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
  fi
  # Install pip dependencies


@@ -27,11 +27,7 @@ from fairscale.nn.model_parallel.layers import (
)
from torch import nn
-from ..api import ModelArgs
-# **NOTE**: This code is not runnable without installing `torch` and `fairscale`
-# dependencies. These dependencies are not part of the default dependencies
-# (requirements.txt) of the `llama-models` package.
+from .args import ModelArgs
class RMSNorm(torch.nn.Module):


@@ -0,0 +1,358 @@
# Llama 3.1 - Prompt Formats
## Tokens
Here is a list of special tokens that are supported by Llama 3.1:
- `<|begin_of_text|>`: Specifies the start of the prompt
- `<|end_of_text|>`: Model will cease to generate more tokens. This token is generated only by the base models.
- `<|finetune_right_pad_id|>`: This token is used for padding text sequences to the same length in a batch.
- `<|start_header_id|>` and `<|end_header_id|>`: These tokens enclose the role for a particular message. The possible roles are: [system, user, assistant and ipython]
- `<|eom_id|>`: End of message. A message represents a possible stopping point for execution where the model can inform the executor that a tool call needs to be made. This is used for multi-step interactions between the model and any available tools. This token is emitted by the model when the Environment: ipython instruction is used in the system prompt, or if the model calls for a built-in tool.
- `<|eot_id|>`: End of turn. Represents when the model has determined that it has finished interacting with the user message that initiated its response. This is used in two scenarios:
- at the end of a direct interaction between the model and the user
- at the end of multiple interactions between the model and any available tools
This token signals to the executor that the model has finished generating a response.
- `<|python_tag|>`: A special tag used in the model's response to signify a tool call.
There are 4 different roles that are supported by Llama 3.1 (a prompt-assembly sketch follows this list):
- `system`: Sets the context in which to interact with the AI model. It typically includes rules, guidelines, or necessary information that helps the model respond effectively.
- `user`: Represents the human interacting with the model. It includes the inputs, commands, and questions to the model.
- `ipython`: A new role introduced in Llama 3.1. Semantically, this role means "tool". This role is used to mark messages with the output of a tool call when sent back to the model from the executor.
- `assistant`: Represents the response generated by the AI model based on the context provided in the `system`, `ipython` and `user` prompts.
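Putting the tokens and roles above together, an instruct prompt is just a string assembled from these pieces. Below is a minimal, illustrative sketch of that assembly; the `encode_dialog` helper is made up for this explanation, and the real chat-format implementation lives in the Llama tokenizer/formatter code.
```python
def encode_dialog(messages: list[dict]) -> str:
    """Assemble a Llama 3.1 instruct prompt from role/content messages."""
    prompt = "<|begin_of_text|>"
    for message in messages:
        # Each message is wrapped with a role header and terminated with <|eot_id|>
        prompt += f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n"
        prompt += f"{message['content']}<|eot_id|>"
    # End with an empty assistant header to cue the model to generate its reply
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return prompt


print(encode_dialog([
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Answer who are you in the form of jeopardy?"},
]))
```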
## Llama 3.1 Base Model
Text completion for Llama 3.1 base model uses this format.
##### Input Prompt Format
```
<|begin_of_text|>Color of sky is blue but sometimes can also be
```
##### Model Response Format
```
red, orange, yellow, green, purple, pink, brown, gray, black, white, and even rainbow colors. The color of the sky can change due to various reasons such as time of day, weather conditions, pollution, and atmospheric phenomena.
The color of the sky is primarily blue because of a phenomenon called
```
Note the special `<|begin_of_text|>` tag at the start of the prompt.
## Llama 3.1 Instruct Model
## User and assistant conversation
Here is a regular multi-turn user-assistant conversation and how it is formatted.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
Answer who are you in the form of jeopardy?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
Here's my response
"What is a helpful assistant?"<|eot_id|>
```
## Tool Calling Formats
The three built-in tools (brave_search, wolfram_alpha, and code interpreter) can be turned on using the system prompt:
- Brave Search: Tool call to perform web searches.
- Wolfram Alpha: Tool call to perform complex mathematical calculations.
- Code Interpreter: Enables the model to output python code.
## Builtin Tool Calling
Here is an example of a conversation using brave search
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Tools: brave_search, wolfram_alpha
Cutting Knowledge Date: December 2023
Today Date: 21 September 2024
You are a helpful assistant.
<|eot_id|><|start_header_id|>user<|end_header_id|>
Search the web for the latest price of 1oz gold?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<|python_tag|>brave_search.call(query="latest price of 1oz gold")<|eom_id|>
```
- Just including Environment: ipython turns on code interpreter; therefore, you don't need to specify code interpretation on the Tools: line. The model can generate python code which is interpreted by the executor, with the result provided back to the model.
- The message body of the assistant response starts with a special tag <|python_tag|>
- As alluded to above, in such an environment, the model can generate <|eom_id|> instead of just the standard <|eot_id|>. The latter indicates the turn is finished, while the former indicates continued multi-step reasoning. That is, the model is expecting a continuation message with the output of the tool call.
- The model tool call response is of the form `tool.call(query="...")`, where `tool` is `brave_search` or `wolfram_alpha` (see the dispatch sketch below)
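To make the <|eom_id|> / <|eot_id|> distinction concrete, here is a minimal sketch of how an executor might handle a response; the `dispatch` helper and its regex are illustrative only, not part of any Llama Stack API.
```python
import re

BUILTIN_CALL = re.compile(r'<\|python_tag\|>(?P<tool>\w+)\.call\(query="(?P<query>[^"]*)"\)')


def dispatch(response: str):
    """Return ("tool_call", tool, query) for a builtin tool request, else ("final", text)."""
    if response.endswith("<|eom_id|>"):
        match = BUILTIN_CALL.search(response)
        if match:
            # The model expects a follow-up message containing the tool's output
            return ("tool_call", match.group("tool"), match.group("query"))
    # <|eot_id|> (or no recognizable call): the turn is finished
    return ("final", response.removesuffix("<|eot_id|>"))


print(dispatch('<|python_tag|>brave_search.call(query="latest price of 1oz gold")<|eom_id|>'))
```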
## Builtin Code Interpreter
Here is an actual example of the model responding with code
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython<|eot_id|><|start_header_id|>user<|end_header_id|>
Write code to check if number is prime, use that to see if the number 7 is prime<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<|python_tag|>def is_prime(n):
if n <= 1
return False
for i in range(2, int(n**0.5) + 1):
if n % i == 0:
return False
return True
print(is_prime(7)) # Output: True<|eom_id|>
```
- The model starts with <|python_tag|> and continues writing python code that needs to be executed
- No explicit mention of code_interpreter in system prompt. `Environment: ipython` implicitly enables it.
## Built-in tools full interaction
Here is a full interaction with the built-in tools including the tool response and the final assistant response.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Tools: brave_search, wolfram_alpha
<|eot_id|><|start_header_id|>user<|end_header_id|>
What is the 100th decimal of pi?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<|python_tag|>wolfram_alpha.call(query="100th decimal of pi")<|eom_id|><|start_header_id|>ipython<|end_header_id|>
{
"queryresult": {
"success": true,
"inputstring": "100th decimal of pi",
"pods": [
{
"title": "Input interpretation",
"subpods": [
{
"title": "",
"plaintext": "100th digit | π"
}
]
},
{
"title": "Nearby digits",
"subpods": [
{
"title": "",
"plaintext": "...86208998628034825342117067982148086513282306647093..."
}
]
},
{
"title": "Result",
"primary": true,
"subpods": [
{
"title": "",
"plaintext": "7"
}
]
}
]
}
}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
The 100th decimal of pi is 7.<|eot_id|>
```
- Note the `<|python_tag|>` in the assistant response.
- Role is `ipython` for the wolfram alpha response that is passed back to the model.
- Final message from assistant has <|eot_id|> tag.
## Zero shot tool calling
## JSON based tool calling
Llama models can now output custom tool calls from a single message to allow easier tool calling.
The following prompts provide an example of how custom tools can be called from the output of the model.
It's important to note that the model itself does not execute the calls; it provides structured output to facilitate calling by an executor.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Cutting Knowledge Date: December 2023
Today Date: 21 September 2024
You are a helpful assistant.
<|eot_id|><|start_header_id|>user<|end_header_id|>
Answer the user's question by making use of the following functions if needed.
If none of the function can be used, please say so.
Here is a list of functions in JSON format:
{
"type": "function",
"function": {
"name": "trending_songs",
"description": "Returns the trending songs on a Music site",
"parameters": {
"type": "object",
"properties": [
{
"n": {
"type": "object",
"description": "The number of songs to return"
}
},
{
"genre": {
"type": "object",
"description": "The genre of the songs to return"
}
}
],
"required": ["n"]
}
}
}
Return function calls in JSON format.<|eot_id|><|start_header_id|>user<|end_header_id|>
Use tools to get latest trending songs<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<|python_tag|>{
"type": "function",
"name": "trending_songs",
"parameters": {
"n": "10",
"genre": "all"
}
}<|eom_id|>
```
- JSON format for providing tools needs name, description and parameters
- Model responds with `<|python_tag|>` and `<|eom_id|>` as `Environment: ipython` was in the system prompt
- Instructions for tools added as a user message
- Only single tool calls are supported as of now
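For reference, recovering the call from such a response is a matter of stripping the special tokens and parsing the JSON body. The sketch below is illustrative only; the helper name is made up, and real integrations should validate the result against the declared tool schema.
```python
import json


def parse_json_tool_call(response: str) -> tuple[str, dict]:
    """Parse a '<|python_tag|>{...}<|eom_id|>' response into (name, parameters)."""
    body = response.removeprefix("<|python_tag|>").removesuffix("<|eom_id|>").strip()
    call = json.loads(body)
    return call["name"], call.get("parameters", {})


name, params = parse_json_tool_call(
    '<|python_tag|>{"type": "function", "name": "trending_songs", "parameters": {"n": "10", "genre": "all"}}<|eom_id|>'
)
print(name, params)  # trending_songs {'n': '10', 'genre': 'all'}
```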
## Example of a user defined tool calling
## `<function>` based tool calling
Here is an example of how you could also write custom instructions for the model to do zero-shot tool calling.
In this example, we define a custom tool calling format using the `<function>` tag.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Cutting Knowledge Date: December 2023
Today Date: 21 September 2024
You are a helpful assistant.
<|eot_id|><|start_header_id|>user<|end_header_id|>
You have access to the following functions:
Use the function 'trending_songs' to 'Returns the trending songs on a Music site':
{"name": "trending_songs", "description": "Returns the trending songs on a Music site", "parameters": {"genre": {"description": "The genre of the songs to return", "param_type": "str", "required": false}, "n": {"description": "The number of songs to return", "param_type": "int", "required": true}}}
Think very carefully before calling functions.
If you choose to call a function ONLY reply in the following format with no prefix or suffix:
<function=example_function_name>{"example_name": "example_value"}</function>
Reminder:
- If looking for real time information use relevant functions before falling back to brave_search
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Put the entire function call reply on one line<|eot_id|><|start_header_id|>user<|end_header_id|>
Use tools to get latest trending songs<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<function=trending_songs>{"n": 10}</function><|eot_id|>
```
- In this case, the model does NOT respond with `<|python_tag|>` and ends with `<|eot_id|>` (see the parsing sketch after these notes)
- Instructions for tools added as a user message
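A response in this custom format can be recovered with a small regular expression, for example as in the sketch below (helper name is made up; it assumes the argument payload is valid JSON).
```python
import json
import re

FUNCTION_TAG = re.compile(r"<function=(?P<name>[^>]+)>(?P<args>.*?)</function>", re.DOTALL)


def parse_function_tag(response: str):
    """Parse '<function=name>{...}</function>' into (name, arguments), or None if absent."""
    match = FUNCTION_TAG.search(response)
    if match is None:
        return None
    return match.group("name"), json.loads(match.group("args"))


print(parse_function_tag('<function=trending_songs>{"n": 10}</function><|eot_id|>'))
# ('trending_songs', {'n': 10})
```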
Thank You!


@@ -0,0 +1,286 @@
## User and assistant conversation
Here is a regular multi-turn user-assistant conversation and how it is formatted.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
I'm an AI assistant, which means I'm a computer program designed to simulate conversation and answer questions to the best of my ability. I'm here to help you with any questions or tasks you may have, and I'll do my best to provide you with accurate and helpful information.
I don't have a personal name, but I'm often referred to as a "virtual assistant" or a "chatbot." I'm a machine learning model, which means I've been trained on a large dataset of text and can generate responses based on patterns and context.
I can help with a wide range of topics, from general knowledge and trivia to more specialized subjects like science, history, and technology. I can also assist with tasks like language translation, text summarization, and even generating creative content like stories or poetry.
So, what can I help you with today?<|eot_id|>
```
##### Notes
This format is unchanged from Llama3.1
## Zero shot function calling
For Llama3.2 1B and 3B instruct models, we are introducing a new format for zero shot function calling.
This new format is designed to be more flexible and powerful than the previous format.
All available functions can be provided in the system message. A key difference is in the format of how the assistant responds with function calls.
It is pythonic, in the form of `[func1(params_name=params_value, params_name2=params_value2...), func2(params)]`, instead of the `json` or `<function>` tag formats that were defined in Llama3.1.
Here is an example:
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{
"name": "get_weather",
"description": "Get weather info for places",
"parameters": {
"type": "dict",
"required": [
"city"
],
"properties": {
"city": {
"type": "string",
"description": "The name of the city to get the weather for"
},
"metric": {
"type": "string",
"description": "The metric for weather. Options are: celsius, fahrenheit",
"default": "celsius"
}
}
}
}
]<|eot_id|><|start_header_id|>user<|end_header_id|>
What is the weather in SF and Seattle?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')]<|eot_id|>
```
##### Notes
- The output supports multiple tool calls natively
- JSON format for defining the functions in the system prompt is similar to Llama3.1
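Because the calls are pythonic, they can be parsed with the standard `ast` module. The following sketch is illustrative only; the helper name is made up, and production code should validate the extracted calls against the declared function schemas.
```python
import ast


def parse_pythonic_tool_calls(response: str) -> list[tuple[str, dict]]:
    """Parse "[func1(a=1), func2(b='x')]" style output into (name, kwargs) pairs."""
    text = response.replace("<|eot_id|>", "").replace("<|python_tag|>", "").strip()
    tree = ast.parse(text, mode="eval")
    calls = []
    if isinstance(tree.body, ast.List):
        for node in tree.body.elts:
            if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
                kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
                calls.append((node.func.id, kwargs))
    return calls


print(parse_pythonic_tool_calls(
    "[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')]<|eot_id|>"
))
```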
## Zero shot function calling with user message
While the default is to provide all function calls in a system message, in Llama3.2 text models you can also provide information for all the available tools in a user message.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Questions: Can you retrieve the details for the user with the ID 7890, who has black as their special request?
Here is a list of functions in JSON format that you can invoke:
[
{
"name": "get_user_info",
"description": "Retrieve details for a specific user by their unique identifier. Note that the provided function is in Python 3 syntax.",
"parameters": {
"type": "dict",
"required": [
"user_id"
],
"properties": {
"user_id": {
"type": "integer",
"description": "The unique identifier of the user. It is used to fetch the specific user details from the database."
},
"special": {
"type": "string",
"description": "Any special information or parameters that need to be considered while fetching user details.",
"default": "none"
}
}
}
}
]
Should you decide to return the function call(s),Put it in the format of [func1(params_name=params_value, params_name2=params_value2...), func2(params)]
NO other text MUST be included.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
[get_user_info(user_id=7890, special='black')]<|eot_id|>
```
##### Notes
- The tool call format for the model is the same whether your function calls are provided in the system or user message.
- While builtin tool calls end with a <|eom_id|>, notice the <|eot_id|> for zero shot tool calls.
## Code Interpreter
Code Interpreter continues to work in the 3.2 text models just as in the Llama 3.1 model family.
Here is an example:
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Cutting Knowledge Date: December 2023
Today Date: 24 September 2024
<|eot_id|><|start_header_id|>user<|end_header_id|>
Write code to check if number is prime. Use it to verify if number 7 is prime<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<|python_tag|>def is_prime(n):
if n <= 1:
return False
if n == 2:
return True
if n % 2 == 0:
return False
max_divisor = int(n**0.5) + 1
for d in range(3, max_divisor, 2):
if n % d == 0:
return False
return True
print(is_prime(7)) # Output: True<|eom_id|>
```
##### Notes
- Note `Environment: ipython` in the system prompt.
- Note that the response starts with `<|python_tag|>` and ends with `<|eom_id|>`
## Zero shot function calling E2E format
Here is an example of the end-to-end cycle of tool calls with the model in a multi-step interaction.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
[
{
"name": "get_weather",
"description": "Get weather info for places",
"parameters": {
"type": "dict",
"required": [
"city"
],
"properties": {
"city": {
"type": "string",
"description": "The name of the city to get the weather for"
},
"metric": {
"type": "string",
"description": "The metric for weather. Options are: celsius, fahrenheit",
"default": "celsius"
}
}
}
}
]<|eot_id|><|start_header_id|>user<|end_header_id|>
What is the weather in SF?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<|python_tag|>[get_weather(city="San Francisco", metric="celsius")]<|eot_id|><|start_header_id|>ipython<|end_header_id|>
"25 C"<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
The weather in San Francisco is 25 C.<|eot_id|>
```
##### Notes
- The output of the function call is provided back to the model as a tool response (in JSON format).
- Notice `<|start_header_id|>ipython<|end_header_id|>` as the header message preceding the tool response.
- The model finally summarizes the information from the tool response and returns the result to the user.
## Prompt format for base models
For base models (Llama3.2-1B and Llama3.2-3B), the prompt format for a simple completion is as follows
##### Input Prompt Format
```
<|begin_of_text|>The color of the sky is blue but sometimes it can also be
```
##### Model Response Format
```
gray or even purple. The color of the sky can change depending on the time of day, the weather, and the amount of pollution in the air. The color of the sky can also be affected by the presence of dust, smoke, and other particles in the air.
## Step 1: Identify the factors that
```
##### Notes
Same as Llama3.1
Thank You!


@@ -0,0 +1,141 @@
## User and assistant conversation
Here is a regular multi-turn user-assistant conversation and how it is formatted.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
Who are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
I'm a helpful assistant, here to provide information, answer questions, and assist with tasks to the best of my abilities. I'm a large language model, which means I can understand and respond to natural language inputs, and I'm constantly learning and improving to provide more accurate and helpful responses.
I can help with a wide range of topics, from general knowledge and trivia to more specific areas like science, history, technology, and more. I can also assist with tasks like language translation, text summarization, and even generating creative content like stories or dialogues.
What can I help you with today?<|eot_id|>
```
##### Notes
This format is unchanged from Llama3.1
## User and assistant conversation with Images
This example shows how to pass an image to the model as part of the messages.
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
<|image|>Describe this image in two sentences<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
The image depicts a small dog standing on a skateboard, with its front paws firmly planted on the board and its back paws slightly raised. The dog's fur is predominantly brown and white, with a distinctive black stripe running down its back, and it is wearing a black collar around its neck.<|eot_id|>
```
##### Notes
- The `<|image|>` tag is used to indicate the presence of the image
- The model isn't an early-fusion model, so it doesn't actually translate an image into several tokens. Instead, the cross-attention layers take input "on the side" from a vision encoder
![Image](mm-model.png)
- It's important to position the <|image|> tag appropriately in the prompt. The image will only attend to the subsequent text tokens (see the sketch after this list)
- The <|image|> tag is part of the user message body, implying that it should only come after the header `<|start_header_id|>{role}<|end_header_id|>` in the message body
- We recommend using a single image in one prompt
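To make the placement concrete, here is a minimal sketch of assembling such a prompt as a plain string; the helper is illustrative only and ignores tokenization and the actual image bytes, which are passed separately to the vision encoder.
```python
def vision_user_turn(text: str) -> str:
    """Build a user message whose text attends to one image: <|image|> comes right
    after the role header and before the text tokens."""
    return f"<|start_header_id|>user<|end_header_id|>\n\n<|image|>{text}<|eot_id|>"


prompt = (
    "<|begin_of_text|>"
    + vision_user_turn("Describe this image in two sentences")
    + "<|start_header_id|>assistant<|end_header_id|>\n\n"
)
print(prompt)
```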
## Builtin and Zero Shot Tool Calling
Llama3.2 vision models follow the same tool calling format as Llama3.1 models when inputs are text only.
Use `Environment: ipython` to enable tools.
Add `Tools: {{tool_name1}},{{tool_name2}}` for each of the builtin tools.
The same builtin tools as Llama3.1 are available,
- code_interpreter (for executing python code)
- brave_search (to search the web)
- wolfram_alpha (for querying wolfram alpha for mathematical questions)
##### Input Prompt Format
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Tools: brave_search, wolfram_alpha
Cutting Knowledge Date: December 2023
Today Date: 23 September 2024
You are a helpful assistant.
<|eot_id|><|start_header_id|>user<|end_header_id|>
Search the web for the latest price of 1oz gold?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
##### Model Response Format
```
<|python_tag|>brave_search.call(query="latest price of 1oz gold")<|eom_id|>
```
##### Notes
- Note the `<|python_tag|>` before `brave_search` function call.
- The `<|eom_id|>` tag is used to indicate the end of the message.
- Similar to Llama3.1, code_interpreter is not explicitly mentioned but is enabled via `Environment: ipython`.
- Tool Calling does NOT work with images in the prompt as of now.
## Prompt format for base models
For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), the prompt format for a simple completion is as follows
##### Input Prompt Format
```
<|begin_of_text|>The color of the sky is blue but sometimes it can also be
```
##### Model Response Format
```
red, orange, pink, purple, and even black. The color of the sky is determined by the amount of sunlight that is scattered by the atmosphere and the amount of dust and water vapor present in the atmosphere. During sunrise and sunset, the sky can take on a range of colors due to the scattering of light by
```
##### Notes
- Same as Llama3.1
## Prompt format for base models with Image
For base models (Llama3.2-11B-Vision and Llama3.2-90B-Vision), here is an example of how the text completion format looks with an image,
##### Input Prompt Format
```
<|begin_of_text|><|image|>If I had to write a haiku for this one
```
##### Model Response Format
```
, it would be: A skateboarder's delight, a puppy on a board, a furry little thrill-seeker. This puppy is a true skateboarding enthusiast, always eager to hit the streets and show off his skills. He's a master of the board, gliding effortlessly across the pavement with grace and style.
```
##### Notes
- Note the placement of the special tags <|begin_of_text|> and <|image|>
Thank You!


@@ -111,7 +111,7 @@ class MetaReferenceInferenceImpl(
        )
        if llama_model is None:
            raise ValueError(
-                "Please make sure your llama_model in model metadata or model identifier is in llama-models SKU list"
+                "Please make sure your llama_model in model metadata or model identifier is in Llama SKU list"
            )
        self.model_registry_helper = ModelRegistryHelper(


@@ -21,7 +21,7 @@ NPROC=$7
echo $MASTER_HOST, $RUN_ID, $CKPT_DIR, $QUANT_CKPT_DIR
-NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" PYTHONPATH="/home/$USER/llama-models:/home/$USER/llama-stack" \
+NCCL_NET=Socket NCCL_SOCKET_IFNAME=eth TIKTOKEN_CACHE_DIR="" PYTHONPATH="/home/$USER/llama-stack" \
torchrun \
  --nnodes=$NNODES --nproc_per_node=$NPROC \
  --rdzv_id=$RUN_ID \


@@ -59,7 +59,8 @@ class BedrockSafetyAdapter(Safety, ShieldsProtocolPrivate):
        if not shield:
            raise ValueError(f"Shield {shield_id} not found")
-        """This is the implementation for the bedrock guardrails. The input to the guardrails is to be of this format
+        """
+        This is the implementation for the bedrock guardrails. The input to the guardrails is to be of this format
        ```content = [
            {
                "text": {
@@ -67,10 +68,8 @@ class BedrockSafetyAdapter(Safety, ShieldsProtocolPrivate):
                }
            }
        ]```
-        However the incoming messages are of this type UserMessage(content=....) coming from
-        https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/datatypes.py
-        They contain content, role . For now we will extract the content and default the "qualifiers": ["query"]
+        Incoming messages contain content, role . For now we will extract the content and
+        default the "qualifiers": ["query"]
        """
        shield_params = shield.params
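In other words, the adapter wraps each message's text in the guardrails content structure described in that docstring, roughly as in this sketch; the `to_guardrails_content` helper is hypothetical, not the adapter's actual method.
```python
def to_guardrails_content(message_text: str) -> list[dict]:
    """Wrap a message's text in the Bedrock guardrails content format,
    defaulting the qualifiers to ["query"] as the docstring describes."""
    return [
        {
            "text": {
                "text": message_text,
                "qualifiers": ["query"],
            }
        }
    ]


print(to_guardrails_content("is this a bomb"))
```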


@@ -1,15 +0,0 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
VERSION="$1"
set -euo pipefail
set -x
uv pip install -U --extra-index-url https://test.pypi.org/simple \
llama-stack==$VERSION llama-models==$VERSION llama-stack-client==$VERSION


@@ -12,5 +12,4 @@ set -euo pipefail
set -x
stack_dir=$(dirname $(dirname $THIS_DIR))
-models_dir=$(dirname $stack_dir)/llama-models
-PYTHONPATH=$models_dir:$stack_dir pytest -p no:warnings --asyncio-mode auto --tb=short
+PYTHONPATH=$stack_dir pytest -p no:warnings --asyncio-mode auto --tb=short


@@ -111,8 +111,8 @@ docker run -it \
  --network host \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
-  # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
-  -v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v /home/hjshah/git/llama-stack:/app/llama-stack-source \
  # localhost/distribution-dell:dev if building / testing locally
  llamastack/distribution-{{ name }}\
  --port $LLAMA_STACK_PORT \


@@ -160,5 +160,5 @@ exclude = [
[[tool.mypy.overrides]]
# packages that lack typing annotations, do not have stubs, or are unavailable.
-module = ["llama_models.*", "yaml", "fire"]
+module = ["yaml", "fire"]
ignore_missing_imports = true


@@ -65,7 +65,7 @@ We define the Llama Stack as a layer cake shown below.
![Figure 3: Llama Stack](../docs/resources/llama-stack.png)
-The API is defined in the [YAML](../docs/_static/llama-stack-spec.yaml) and [HTML](../docs/_static/llama-stack-spec.html) files. These files were generated using the Pydantic definitions in (api/datatypes.py and api/endpoints.py) files that are in the llama-models, llama-stack, and llama-agentic-system repositories.
+The API is defined in the [YAML](../docs/_static/llama-stack-spec.yaml) and [HTML](../docs/_static/llama-stack-spec.html) files.
## Sample implementations