Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-28 04:21:58 +00:00)

Merge branch 'meta-llama:main' into feat/litellm_sambanova_usage

Commit e49bcd46fe: 90 changed files with 3142 additions and 586 deletions
docs/_static/llama-stack-spec.html (vendored), 189 changed lines
@@ -363,6 +363,37 @@
       }
     },
     "/v1/agents": {
+      "get": {
+        "responses": {
+          "200": {
+            "description": "A ListAgentsResponse.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ListAgentsResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Agents"
+        ],
+        "description": "List all agents.",
+        "parameters": []
+      },
       "post": {
         "responses": {
           "200": {
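As an aside, the list endpoint added above can be exercised with a plain HTTP call once a stack server is running. This is only a sketch: the base URL/port and the absence of auth headers are assumptions, not part of this diff; only the path and the ListAgentsResponse shape come from the spec.

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

# GET /v1/agents returns a ListAgentsResponse: {"data": [Agent, ...]}
resp = requests.get(f"{BASE_URL}/v1/agents")
resp.raise_for_status()

for agent in resp.json()["data"]:
    # Each Agent carries agent_id, agent_config and created_at (see the Agent schema later in this diff).
    print(agent["agent_id"], agent["created_at"])
```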
@@ -609,6 +640,47 @@
       }
     },
     "/v1/agents/{agent_id}": {
+      "get": {
+        "responses": {
+          "200": {
+            "description": "An Agent of the agent.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/Agent"
+                }
+              }
+            }
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Agents"
+        ],
+        "description": "Describe an agent by its ID.",
+        "parameters": [
+          {
+            "name": "agent_id",
+            "in": "path",
+            "description": "ID of the agent.",
+            "required": true,
+            "schema": {
+              "type": "string"
+            }
+          }
+        ]
+      },
       "delete": {
         "responses": {
           "200": {
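Similarly, the describe endpoint added above can be sketched with raw HTTP; the agent ID below is a placeholder and the error handling is illustrative only.

```python
import requests

BASE_URL = "http://localhost:8321"   # assumed local server
agent_id = "your-agent-id"           # placeholder

# GET /v1/agents/{agent_id} returns an Agent on 200, or one of the shared error responses.
resp = requests.get(f"{BASE_URL}/v1/agents/{agent_id}")
if resp.ok:
    agent = resp.json()
    print(agent["agent_config"])
else:
    print("error:", resp.status_code, resp.text)
```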
@@ -2276,6 +2348,49 @@
         ]
       }
     },
+    "/v1/agents/{agent_id}/sessions": {
+      "get": {
+        "responses": {
+          "200": {
+            "description": "A ListAgentSessionsResponse.",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ListAgentSessionsResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "$ref": "#/components/responses/BadRequest400"
+          },
+          "429": {
+            "$ref": "#/components/responses/TooManyRequests429"
+          },
+          "500": {
+            "$ref": "#/components/responses/InternalServerError500"
+          },
+          "default": {
+            "$ref": "#/components/responses/DefaultError"
+          }
+        },
+        "tags": [
+          "Agents"
+        ],
+        "description": "List all session(s) of a given agent.",
+        "parameters": [
+          {
+            "name": "agent_id",
+            "in": "path",
+            "description": "The ID of the agent to list sessions for.",
+            "required": true,
+            "schema": {
+              "type": "string"
+            }
+          }
+        ]
+      }
+    },
     "/v1/eval/benchmarks": {
       "get": {
         "responses": {
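The sessions listing added above follows the same pattern; again the base URL and agent ID are placeholders, and only the path and the ListAgentSessionsResponse shape are taken from the spec.

```python
import requests

BASE_URL = "http://localhost:8321"   # assumed
agent_id = "your-agent-id"           # placeholder

# GET /v1/agents/{agent_id}/sessions -> ListAgentSessionsResponse: {"data": [Session, ...]}
resp = requests.get(f"{BASE_URL}/v1/agents/{agent_id}/sessions")
resp.raise_for_status()
sessions = resp.json()["data"]
print(f"agent {agent_id} has {len(sessions)} session(s)")
```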
@@ -6565,6 +6680,28 @@
         "title": "ScoringResult",
         "description": "A scoring result for a single row."
       },
+      "Agent": {
+        "type": "object",
+        "properties": {
+          "agent_id": {
+            "type": "string"
+          },
+          "agent_config": {
+            "$ref": "#/components/schemas/AgentConfig"
+          },
+          "created_at": {
+            "type": "string",
+            "format": "date-time"
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "agent_id",
+          "agent_config",
+          "created_at"
+        ],
+        "title": "Agent"
+      },
       "Session": {
         "type": "object",
         "properties": {
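The Agent schema above maps onto a small Pydantic model, which can be handy when post-processing responses from the new endpoints. This is an illustrative mirror of the schema only; AgentConfig is collapsed to a plain dict here, which is a simplification.

```python
from datetime import datetime
from pydantic import BaseModel


class Agent(BaseModel):
    # Mirrors the "Agent" schema above: all three fields are required.
    agent_id: str
    agent_config: dict      # stands in for the full AgentConfig schema
    created_at: datetime    # "type": "string", "format": "date-time"


agent = Agent(
    agent_id="agent-123",          # illustrative values
    agent_config={},
    created_at="2025-03-01T12:00:00Z",
)
print(agent.created_at.isoformat())
```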
@@ -7907,6 +8044,38 @@
         ],
         "title": "ToolInvocationResult"
       },
+      "ListAgentSessionsResponse": {
+        "type": "object",
+        "properties": {
+          "data": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/Session"
+            }
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "data"
+        ],
+        "title": "ListAgentSessionsResponse"
+      },
+      "ListAgentsResponse": {
+        "type": "object",
+        "properties": {
+          "data": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/Agent"
+            }
+          }
+        },
+        "additionalProperties": false,
+        "required": [
+          "data"
+        ],
+        "title": "ListAgentsResponse"
+      },
       "BucketResponse": {
         "type": "object",
         "properties": {
@@ -9321,21 +9490,11 @@
         "type": "object",
         "properties": {
           "tool_responses": {
-            "oneOf": [
-              {
-                "type": "array",
-                "items": {
-                  "$ref": "#/components/schemas/ToolResponse"
-                }
-              },
-              {
-                "type": "array",
-                "items": {
-                  "$ref": "#/components/schemas/ToolResponseMessage"
-                }
-              }
-            ],
-            "description": "The tool call responses to resume the turn with. NOTE: ToolResponseMessage will be deprecated. Use ToolResponse."
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/ToolResponse"
+            },
+            "description": "The tool call responses to resume the turn with."
           },
           "stream": {
             "type": "boolean",
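The hunk above narrows `tool_responses` on the resume-turn request from a union of `ToolResponse` / `ToolResponseMessage` arrays to a plain `ToolResponse` array. A hedged sketch of the resulting request body follows; the individual fields inside each entry and the surrounding call are assumptions, since only the `tool_responses` / `stream` shape appears in this hunk.

```python
# Illustrative payload for resuming an agent turn after a client-side tool call.
# Only the "tool_responses is a flat list of ToolResponse objects" part is taken
# from this diff; the field names inside each entry are assumptions.
resume_body = {
    "tool_responses": [
        {
            "call_id": "call-1",
            "tool_name": "get_weather",
            "content": '{"temp_c": 21}',
        }
    ],
    "stream": True,
}
```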
docs/_static/llama-stack-spec.yaml (vendored), 131 changed lines
@@ -238,6 +238,28 @@ paths:
               $ref: '#/components/schemas/CompletionRequest'
         required: true
   /v1/agents:
+    get:
+      responses:
+        '200':
+          description: A ListAgentsResponse.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListAgentsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Agents
+      description: List all agents.
+      parameters: []
     post:
       responses:
         '200':
@@ -410,6 +432,34 @@ paths:
               $ref: '#/components/schemas/CreateUploadSessionRequest'
         required: true
   /v1/agents/{agent_id}:
+    get:
+      responses:
+        '200':
+          description: An Agent of the agent.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/Agent'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Agents
+      description: Describe an agent by its ID.
+      parameters:
+        - name: agent_id
+          in: path
+          description: ID of the agent.
+          required: true
+          schema:
+            type: string
     delete:
       responses:
         '200':
@@ -1528,6 +1578,36 @@ paths:
           required: true
           schema:
             type: string
+  /v1/agents/{agent_id}/sessions:
+    get:
+      responses:
+        '200':
+          description: A ListAgentSessionsResponse.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListAgentSessionsResponse'
+        '400':
+          $ref: '#/components/responses/BadRequest400'
+        '429':
+          $ref: >-
+            #/components/responses/TooManyRequests429
+        '500':
+          $ref: >-
+            #/components/responses/InternalServerError500
+        default:
+          $ref: '#/components/responses/DefaultError'
+      tags:
+        - Agents
+      description: List all session(s) of a given agent.
+      parameters:
+        - name: agent_id
+          in: path
+          description: >-
+            The ID of the agent to list sessions for.
+          required: true
+          schema:
+            type: string
   /v1/eval/benchmarks:
     get:
       responses:
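Taken together with the list endpoint earlier in this diff, the YAML above supports a simple client-side sweep over agents and their sessions. As before, this is a sketch: the base URL is an assumption and error handling is omitted.

```python
import requests

BASE_URL = "http://localhost:8321"  # assumed local Llama Stack server

# Combine GET /v1/agents with GET /v1/agents/{agent_id}/sessions.
agents = requests.get(f"{BASE_URL}/v1/agents").json()["data"]
for agent in agents:
    agent_id = agent["agent_id"]
    sessions = requests.get(f"{BASE_URL}/v1/agents/{agent_id}/sessions").json()["data"]
    print(f"{agent_id}: {len(sessions)} session(s)")
```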
@@ -4549,6 +4629,22 @@ components:
         - aggregated_results
       title: ScoringResult
       description: A scoring result for a single row.
+    Agent:
+      type: object
+      properties:
+        agent_id:
+          type: string
+        agent_config:
+          $ref: '#/components/schemas/AgentConfig'
+        created_at:
+          type: string
+          format: date-time
+      additionalProperties: false
+      required:
+        - agent_id
+        - agent_config
+        - created_at
+      title: Agent
     Session:
       type: object
       properties:
@@ -5385,6 +5481,28 @@ components:
       required:
        - content
       title: ToolInvocationResult
+    ListAgentSessionsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/Session'
+      additionalProperties: false
+      required:
+        - data
+      title: ListAgentSessionsResponse
+    ListAgentsResponse:
+      type: object
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/Agent'
+      additionalProperties: false
+      required:
+        - data
+      title: ListAgentsResponse
     BucketResponse:
       type: object
       properties:
@@ -6287,16 +6405,11 @@ components:
       type: object
       properties:
         tool_responses:
-          oneOf:
-            - type: array
-              items:
-                $ref: '#/components/schemas/ToolResponse'
-            - type: array
-              items:
-                $ref: '#/components/schemas/ToolResponseMessage'
+          type: array
+          items:
+            $ref: '#/components/schemas/ToolResponse'
           description: >-
-            The tool call responses to resume the turn with. NOTE: ToolResponseMessage
-            will be deprecated. Use ToolResponse.
+            The tool call responses to resume the turn with.
         stream:
           type: boolean
           description: Whether to stream the response.
@@ -1267,7 +1267,6 @@
    }
   ],
   "source": [
    "# NBVAL_SKIP\n",
    "from pydantic import BaseModel\n",
    "\n",
    "\n",
@@ -1279,7 +1278,7 @@
    "\n",
    "user_input = \"Michael Jordan was born in 1963. He played basketball for the Chicago Bulls. He retired in 2003. Extract this information into JSON for me. \"\n",
    "response = client.inference.completion(\n",
-    " model_id=model_id,\n",
+    " model_id=\"meta-llama/Llama-3.1-8B-Instruct\",\n",
    " content=user_input,\n",
    " stream=False,\n",
    " sampling_params={\n",
@@ -1640,7 +1639,7 @@
    "agent = Agent(\n",
    " client, \n",
    " model=model_id,\n",
-    " instructions=\"You are a helpful assistant\",\n",
+    " instructions=\"You are a helpful assistant. Use websearch tool to help answer questions.\",\n",
    " tools=[\"builtin::websearch\"],\n",
    ")\n",
    "user_prompts = [\n",
@@ -1,9 +1 @@
 The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
-
-Please install the following packages before running the script:
-
-```
-pip install fire PyYAML
-```
-
-Then simply run `sh run_openapi_generator.sh`
@@ -23,9 +23,12 @@ In this example, we will show you how to:

##### Building a Search Agent
```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger

client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")

agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
@@ -33,7 +36,7 @@ agent = Agent(
     tools=["builtin::websearch"],
 )
 user_prompts = [
-    "Which teams played in the NBA western conference finals of 2024. Search the web for the answer.",
+    "Which teams played in the NBA Western Conference Finals of 2024. Search the web for the answer.",
     "In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
     "What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
 ]
@@ -33,6 +33,8 @@ Can be set to any of the following log levels:

 The default global log level is `info`. `all` sets the log level for all components.

+A user can also set `LLAMA_STACK_LOG_FILE` which will pipe the logs to the specified path as well as to the terminal. An example would be: `export LLAMA_STACK_LOG_FILE=server.log`
+
 ### Llama Stack Build

 In order to build your own distribution, we recommend you clone the `llama-stack` repository.
@@ -40,7 +40,6 @@ The following models are available by default:
- `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `accounts/fireworks/models/llama-v3p2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
@@ -23,7 +23,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
 | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
-| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
+| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |


 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
@@ -130,7 +130,7 @@ llama stack run ./run-with-safety.yaml \
 ### (Optional) Update Model Serving Configuration

 ```{note}
-Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
+Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models.
 ```

 To serve a new model with `ollama`
@@ -1,6 +1,6 @@
 # llama (server-side) CLI Reference

-The `llama` CLI tool helps you setup and use the Llama Stack. It should be available on your path after installing the `llama-stack` package.
+The `llama` CLI tool helps you set up and use the Llama Stack. The CLI is available on your path after installing the `llama-stack` package.

 ## Installation

@@ -27,9 +27,9 @@ You have two ways to install Llama Stack:


 ## `llama` subcommands
-1. `download`: `llama` cli tools supports downloading the model from Meta or Hugging Face.
-2. `model`: Lists available models and their properties.
-3. `stack`: Allows you to build and run a Llama Stack server. You can read more about this [here](../../distributions/building_distro).
+1. `download`: Supports downloading models from Meta or Hugging Face. [Downloading models](#downloading-models)
+2. `model`: Lists available models and their properties. [Understanding models](#understand-the-models)
+3. `stack`: Allows you to build a stack using the `llama stack` distribution and run a Llama Stack server. You can read more about how to build a Llama Stack distribution in the [Build your own Distribution](../../distributions/building_distro) documentation.

 ### Sample Usage

@@ -117,7 +117,7 @@ You should see a table like this:
 +----------------------------------+------------------------------------------+----------------+
 ```

-To download models, you can use the llama download command.
+To download models, you can use the `llama download` command.

 ### Downloading from [Meta](https://llama.meta.com/llama-downloads/)

@@ -191,7 +191,7 @@ You should see a table like this:
 The `llama model` command helps you explore the model’s interface.

 1. `download`: Download the model from different sources. (meta, huggingface)
-2. `list`: Lists all the models available for download with hardware requirements to deploy the models.
+2. `list`: Lists all the models available for download with hardware requirements for deploying the models.
 3. `prompt-format`: Show llama model message formats.
 4. `describe`: Describes all the properties of the model.

@@ -262,13 +262,12 @@ llama model prompt-format -m Llama3.2-3B-Instruct




 You will be shown a Markdown formatted description of the model interface and how prompts / messages are formatted for various scenarios.

 **NOTE**: Outputs in terminal are color printed to show special tokens.

 ### Remove model
-You can run `llama model remove` to remove unecessary model:
+You can run `llama model remove` to remove an unnecessary model:

 ```
 llama model remove -m Llama-Guard-3-8B-int8
@@ -40,7 +40,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 ollama run llama3.2:3b-instruct-fp16 --keepalive -1m
 ```
 **Note**:
-- The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43)
+- The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py)
 - `keepalive -1m` is used so that ollama continues to keep the model in memory indefinitely. Otherwise, ollama frees up memory and you would have to run `ollama run` again.

 ---