Merge branch 'meta-llama:main' into feat/litellm_sambanova_usage

Jorge Piedrahita Ortiz 2025-03-17 09:42:15 -05:00 committed by GitHub
commit 716cb09056
145 changed files with 21384 additions and 1283 deletions

View file

@@ -6,8 +6,8 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
<script type="module" src="https://cdn.jsdelivr.net/npm/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@stoplight/elements/styles.min.css">
<style>
body {
margin: 0;
@@ -2151,6 +2151,48 @@
}
}
},
"/v1/providers/{provider_id}": {
"get": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ProviderInfo"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest400"
},
"429": {
"$ref": "#/components/responses/TooManyRequests429"
},
"500": {
"$ref": "#/components/responses/InternalServerError500"
},
"default": {
"$ref": "#/components/responses/DefaultError"
}
},
"tags": [
"Providers"
],
"description": "",
"parameters": [
{
"name": "provider_id",
"in": "path",
"required": true,
"schema": {
"type": "string"
}
}
]
}
},
"/v1/tool-runtime/invoke": {
"post": {
"responses": {
@@ -2642,7 +2684,7 @@
}
}
},
"/v1/inspect/providers": {
"/v1/providers": {
"get": {
"responses": {
"200": {
@@ -4347,24 +4389,6 @@
"type": "string",
"description": "Unique identifier for the tool call this response is for"
},
"tool_name": {
"oneOf": [
{
"type": "string",
"enum": [
"brave_search",
"wolfram_alpha",
"photogen",
"code_interpreter"
],
"title": "BuiltinTool"
},
{
"type": "string"
}
],
"description": "Name of the tool that was called"
},
"content": {
"$ref": "#/components/schemas/InterleavedContent",
"description": "The response content from the tool"
@@ -4374,7 +4398,6 @@
"required": [
"role",
"call_id",
"tool_name",
"content"
],
"title": "ToolResponseMessage",
@@ -4549,7 +4572,7 @@
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricEvent"
"$ref": "#/components/schemas/MetricInResponse"
}
},
"completion_message": {
@@ -4571,46 +4594,9 @@
"title": "ChatCompletionResponse",
"description": "Response from a chat completion request."
},
"MetricEvent": {
"MetricInResponse": {
"type": "object",
"properties": {
"trace_id": {
"type": "string"
},
"span_id": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
}
},
"type": {
"type": "string",
"const": "metric",
"default": "metric"
},
"metric": {
"type": "string"
},
@@ -4630,15 +4616,10 @@
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"metric",
"value",
"unit"
"value"
],
"title": "MetricEvent"
"title": "MetricInResponse"
},
"TokenLogProbs": {
"type": "object",
@@ -4715,6 +4696,12 @@
"CompletionResponse": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
}
},
"content": {
"type": "string",
"description": "The generated completion text"
@@ -4924,7 +4911,7 @@
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricEvent"
"$ref": "#/components/schemas/MetricInResponse"
}
},
"event": {
@@ -5082,6 +5069,12 @@
"CompletionResponseStreamChunk": {
"type": "object",
"properties": {
"metrics": {
"type": "array",
"items": {
"$ref": "#/components/schemas/MetricInResponse"
}
},
"delta": {
"type": "string",
"description": "New content generated since last chunk. This can be one or more tokens."
@@ -7961,6 +7954,53 @@
],
"title": "InsertChunksRequest"
},
"ProviderInfo": {
"type": "object",
"properties": {
"api": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"provider_type": {
"type": "string"
},
"config": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "null"
},
{
"type": "boolean"
},
{
"type": "number"
},
{
"type": "string"
},
{
"type": "array"
},
{
"type": "object"
}
]
}
}
},
"additionalProperties": false,
"required": [
"api",
"provider_id",
"provider_type",
"config"
],
"title": "ProviderInfo"
},
"InvokeToolRequest": {
"type": "object",
"properties": {
@@ -8173,27 +8213,6 @@
],
"title": "ListModelsResponse"
},
"ProviderInfo": {
"type": "object",
"properties": {
"api": {
"type": "string"
},
"provider_id": {
"type": "string"
},
"provider_type": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"api",
"provider_id",
"provider_type"
],
"title": "ProviderInfo"
},
"ListProvidersResponse": {
"type": "object",
"properties": {
@@ -8363,6 +8382,75 @@
],
"title": "LogSeverity"
},
"MetricEvent": {
"type": "object",
"properties": {
"trace_id": {
"type": "string"
},
"span_id": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"attributes": {
"type": "object",
"additionalProperties": {
"oneOf": [
{
"type": "string"
},
{
"type": "integer"
},
{
"type": "number"
},
{
"type": "boolean"
},
{
"type": "null"
}
]
}
},
"type": {
"type": "string",
"const": "metric",
"default": "metric"
},
"metric": {
"type": "string"
},
"value": {
"oneOf": [
{
"type": "integer"
},
{
"type": "number"
}
]
},
"unit": {
"type": "string"
}
},
"additionalProperties": false,
"required": [
"trace_id",
"span_id",
"timestamp",
"type",
"metric",
"value",
"unit"
],
"title": "MetricEvent"
},
"SpanEndPayload": {
"type": "object",
"properties": {
@@ -10125,6 +10213,10 @@
{
"name": "PostTraining (Coming Soon)"
},
{
"name": "Providers",
"x-displayName": "Providers API for inspecting, listing, and modifying providers and their configurations."
},
{
"name": "Safety"
},
@@ -10171,6 +10263,7 @@
"Inspect",
"Models",
"PostTraining (Coming Soon)",
"Providers",
"Safety",
"Scoring",
"ScoringFunctions",

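The hunks above move provider listing from `/v1/inspect/providers` to `/v1/providers` and add a `GET /v1/providers/{provider_id}` route whose `ProviderInfo` response now carries the provider's `config`. As a rough, editor-added illustration only (not part of this commit), the Python sketch below calls both routes over plain HTTP; the base URL, the `ollama` provider id, and the `data` wrapper on the list response are assumptions to adapt to your deployment.

```python
# Editor-added sketch, not from this commit: exercise the Providers routes above.
# Assumptions: a Llama Stack server on http://localhost:5001, a registered provider
# with id "ollama", and a ListProvidersResponse shaped as {"data": [ProviderInfo, ...]}.
import requests

base_url = "http://localhost:5001"

# GET /v1/providers -- list every registered provider
providers = requests.get(f"{base_url}/v1/providers").json()["data"]
for p in providers:
    print(p["api"], p["provider_id"], p["provider_type"])

# GET /v1/providers/{provider_id} -- fetch one provider, including its config map
info = requests.get(f"{base_url}/v1/providers/ollama").json()
print(info["config"])
```
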
View file

@@ -1444,6 +1444,34 @@ paths:
schema:
$ref: '#/components/schemas/InsertChunksRequest'
required: true
/v1/providers/{provider_id}:
get:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/ProviderInfo'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
$ref: >-
#/components/responses/TooManyRequests429
'500':
$ref: >-
#/components/responses/InternalServerError500
default:
$ref: '#/components/responses/DefaultError'
tags:
- Providers
description: ''
parameters:
- name: provider_id
in: path
required: true
schema:
type: string
/v1/tool-runtime/invoke:
post:
responses:
@@ -1782,7 +1810,7 @@ paths:
schema:
$ref: '#/components/schemas/RegisterModelRequest'
required: true
/v1/inspect/providers:
/v1/providers:
get:
responses:
'200':
@@ -2943,17 +2971,6 @@ components:
type: string
description: >-
Unique identifier for the tool call this response is for
tool_name:
oneOf:
- type: string
enum:
- brave_search
- wolfram_alpha
- photogen
- code_interpreter
title: BuiltinTool
- type: string
description: Name of the tool that was called
content:
$ref: '#/components/schemas/InterleavedContent'
description: The response content from the tool
@@ -2961,7 +2978,6 @@ components:
required:
- role
- call_id
- tool_name
- content
title: ToolResponseMessage
description: >-
@@ -3101,7 +3117,7 @@ components:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricEvent'
$ref: '#/components/schemas/MetricInResponse'
completion_message:
$ref: '#/components/schemas/CompletionMessage'
description: The complete response message
@@ -3116,29 +3132,9 @@ components:
- completion_message
title: ChatCompletionResponse
description: Response from a chat completion request.
MetricEvent:
MetricInResponse:
type: object
properties:
trace_id:
type: string
span_id:
type: string
timestamp:
type: string
format: date-time
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
type:
type: string
const: metric
default: metric
metric:
type: string
value:
@@ -3149,14 +3145,9 @@ components:
type: string
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- metric
- value
- unit
title: MetricEvent
title: MetricInResponse
TokenLogProbs:
type: object
properties:
@@ -3213,6 +3204,10 @@ components:
CompletionResponse:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
content:
type: string
description: The generated completion text
@@ -3412,7 +3407,7 @@ components:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricEvent'
$ref: '#/components/schemas/MetricInResponse'
event:
$ref: '#/components/schemas/ChatCompletionResponseEvent'
description: The event containing the new content
@@ -3531,6 +3526,10 @@ components:
CompletionResponseStreamChunk:
type: object
properties:
metrics:
type: array
items:
$ref: '#/components/schemas/MetricInResponse'
delta:
type: string
description: >-
@@ -5438,6 +5437,32 @@ components:
- vector_db_id
- chunks
title: InsertChunksRequest
ProviderInfo:
type: object
properties:
api:
type: string
provider_id:
type: string
provider_type:
type: string
config:
type: object
additionalProperties:
oneOf:
- type: 'null'
- type: boolean
- type: number
- type: string
- type: array
- type: object
additionalProperties: false
required:
- api
- provider_id
- provider_type
- config
title: ProviderInfo
InvokeToolRequest:
type: object
properties:
@@ -5573,21 +5598,6 @@ components:
required:
- data
title: ListModelsResponse
ProviderInfo:
type: object
properties:
api:
type: string
provider_id:
type: string
provider_type:
type: string
additionalProperties: false
required:
- api
- provider_id
- provider_type
title: ProviderInfo
ListProvidersResponse:
type: object
properties:
@@ -5703,6 +5713,47 @@ components:
- error
- critical
title: LogSeverity
MetricEvent:
type: object
properties:
trace_id:
type: string
span_id:
type: string
timestamp:
type: string
format: date-time
attributes:
type: object
additionalProperties:
oneOf:
- type: string
- type: integer
- type: number
- type: boolean
- type: 'null'
type:
type: string
const: metric
default: metric
metric:
type: string
value:
oneOf:
- type: integer
- type: number
unit:
type: string
additionalProperties: false
required:
- trace_id
- span_id
- timestamp
- type
- metric
- value
- unit
title: MetricEvent
SpanEndPayload:
type: object
properties:
@@ -6820,6 +6871,9 @@ tags:
- name: Inspect
- name: Models
- name: PostTraining (Coming Soon)
- name: Providers
x-displayName: >-
Providers API for inspecting, listing, and modifying providers and their configurations.
- name: Safety
- name: Scoring
- name: ScoringFunctions
@@ -6844,6 +6898,7 @@ x-tagGroups:
- Inspect
- Models
- PostTraining (Coming Soon)
- Providers
- Safety
- Scoring
- ScoringFunctions

View file

@@ -6,8 +6,8 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
<script type="module" src="https://cdn.jsdelivr.net/npm/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@stoplight/elements/styles.min.css">
<style>
body {
margin: 0;

View file

@@ -71,4 +71,4 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
**On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)
**On-device Distro**: To run Llama Stack directly on an edge device (mobile phone or a tablet), we provide Distros for [iOS](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/ios_sdk.html) and [Android](https://llama-stack.readthedocs.io/en/latest/distributions/ondevice_distro/android_sdk.html).

View file

@@ -8,12 +8,12 @@ Features:
- Remote Inferencing: Perform inferencing tasks remotely with Llama models hosted on a remote server (or serverless localhost).
- Simple Integration: With easy-to-use APIs, a developer can quickly integrate Llama Stack into their Android app. The difference between local and remote inferencing is minimal.
Latest Release Notes: [v0.0.58](https://github.com/meta-llama/llama-stack-client-kotlin/releases/tag/v0.0.58)
Latest Release Notes: [link](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release)
*Tagged releases are stable versions of the project. While we strive to maintain a stable main branch, it's not guaranteed to be free of bugs or issues.*
## Android Demo App
Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-apps/tree/android-kotlin-app-latest/examples/android_app)
Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-client-kotlin/tree/examples/android_app)
The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlamaStackRemoteInference.kts`, and `MainActivity.java`. Together with the surrounding business logic, they show how to use Llama Stack in both environments.
@@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
Add the following dependency in your `build.gradle.kts` file:
```
dependencies {
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.0.58")
implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
}
```
This will download the JAR files into your Gradle cache, in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`.
@@ -36,13 +36,13 @@ If you plan on doing remote inferencing this is sufficient to get started.
For local inferencing, it is required to include the ExecuTorch library into your app.
Include the ExecuTorch library by:
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/blob/release/0.0.58/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
2. Move the script to the top level of your Android app where the app directory resides:
<p align="center">
<img src="https://raw.githubusercontent.com/meta-llama/llama-stack-client-kotlin/refs/heads/release/0.0.58/doc/img/example_android_app_directory.png" style="width:300px">
<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
</p>
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate with commit: [0a12e33](https://github.com/pytorch/executorch/commit/0a12e33d22a3d44d1aa2af5f0d0673d45b962553).
3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
```
dependencies {
@@ -58,12 +58,12 @@ Breaking down the demo app, this section will show the core pieces that are used
### Setup Remote Inferencing
Start a Llama Stack server on localhost. Here is an example of how you can do this using the fireworks.ai distribution:
```
conda create -n stack-fireworks python=3.10
conda activate stack-fireworks
pip install llama-stack=0.0.58
pip install --no-cache llama-stack==0.1.4
llama stack build --template fireworks --image-type conda
export FIREWORKS_API_KEY=<SOME_KEY>
llama stack run /Users/<your_username>/.llama/distributions/llamastack-fireworks/fireworks-run.yaml --port=5050
llama stack run fireworks --port 5050
```
Ensure the Llama Stack server version is the same as the Kotlin SDK Library for maximum compatibility.
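
As an editor-added illustration (not in the original docs), one way to check this is to query the server's version before wiring up the SDK. The sketch below assumes the server started above is listening on port 5050 and exposes the Inspect API's `GET /v1/version` route.

```python
# Editor-added sketch: confirm the running server's version matches the Kotlin SDK.
# Assumes the fireworks server started above listens on localhost:5050 and that the
# Inspect API's GET /v1/version route is available.
import requests

resp = requests.get("http://localhost:5050/v1/version")
resp.raise_for_status()
print(resp.json())  # e.g. {"version": "0.1.4"} -- compare with your llama-stack-client-kotlin version
```
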
@@ -146,7 +146,7 @@ The purpose of this section is to share more details with users that would like
### Prerequisite
You must complete the following steps:
1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b release/0.0.58`)
1. Clone the repo (`git clone https://github.com/meta-llama/llama-stack-client-kotlin.git -b latest-release`)
2. Port the appropriate ExecuTorch libraries over into your Llama Stack Kotlin library environment.
```
cd llama-stack-client-kotlin-client-local

View file

@@ -1,9 +1,8 @@
# iOS SDK
We offer both remote and on-device use of Llama Stack in Swift via two components:
1. [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/)
2. [LocalInferenceImpl](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/ios/inference)
We offer both remote and on-device use of Llama Stack in Swift via a single SDK [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/) that contains two components:
1. LlamaStackClient for remote
2. Local Inference for on-device
```{image} ../../../_static/remote_or_local.gif
:alt: Seamlessly switching between local, on-device inference and remote hosted inference
@@ -42,7 +41,7 @@ let request = Components.Schemas.CreateAgentTurnRequest(
// ...
```
Check out [iOSCalendarAssistant](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/ios_calendar_assistant) for a complete app demo.
Check out [iOSCalendarAssistant](https://github.com/meta-llama/llama-stack-client-swift/tree/main/examples/ios_calendar_assistant) for a complete app demo.
## LocalInference
@@ -58,7 +57,7 @@ let inference = LocalInference(queue: runnerQueue)
let agents = LocalAgents(inference: self.inference)
```
Check out [iOSCalendarAssistantWithLocalInf](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/ios_calendar_assistant) for a complete app demo.
Check out [iOSCalendarAssistantWithLocalInf](https://github.com/meta-llama/llama-stack-client-swift/tree/main/examples/ios_calendar_assistant) for a complete app demo.
### Installation
@@ -68,47 +67,6 @@ We're working on making LocalInference easier to set up. For now, you'll need t
1. Install [CMake](https://cmake.org/) for the ExecuTorch build
1. Drag `LocalInference.xcodeproj` into your project
1. Add `LocalInference` as a framework in your app target
1. Add a package dependency on https://github.com/pytorch/executorch (branch latest)
1. Add all the kernels / backends from executorch (but not executorch itself!) as frameworks in your app target:
- backend_coreml
- backend_mps
- backend_xnnpack
- kernels_custom
- kernels_optimized
- kernels_portable
- kernels_quantized
1. In "Build Settings" > "Other Linker Flags" > "Any iOS Simulator SDK", add:
```
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a
```
1. In "Build Settings" > "Other Linker Flags" > "Any iOS SDK", add:
```
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a
-force_load
$(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a
```
### Preparing a model

View file

@@ -0,0 +1,42 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Passthrough Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-passthrough` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::passthrough`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables
The following environment variables can be configured:
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `PASSTHROUGH_API_KEY`: Passthrough API Key (default: ``)
- `PASSTHROUGH_URL`: Passthrough URL (default: ``)
### Models
The following models are available by default:
- `llama3.1-8b-instruct`
- `llama3.2-11b-vision-instruct`
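
As a brief, editor-added illustration (not part of the generated page), once the distribution is running you can point the Python client at it and list the models above. The sketch assumes the server is reachable on the default port `5001`, that `llama-stack-client` is installed, and that listed models expose an `identifier` field.

```python
# Editor-added sketch: connect to a running distribution-passthrough server.
# Assumptions: server on http://localhost:5001 (LLAMA_STACK_PORT default) and the
# llama-stack-client Python package installed (pip install llama-stack-client).
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

# List the models served through the passthrough provider
for model in client.models.list():
    print(model.identifier)
```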

View file

@@ -88,11 +88,19 @@ docker run -it \
:::{dropdown} Installing the Llama Stack client CLI and SDK
You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using the following command. Note that you must be using Python 3.10 or newer:
You can interact with the Llama Stack server using various client SDKs. Note that you must be using Python 3.10 or newer. We will use the Python SDK which you can install via `conda` or `virtualenv`.
For `conda`:
```bash
yes | conda create -n stack-client python=3.10
conda activate stack-client
pip install llama-stack-client
```
For `virtualenv`:
```bash
python -m venv stack-client
source stack-client/bin/activate
pip install llama-stack-client
```
@@ -173,6 +181,13 @@ response = client.inference.chat_completion(
print(response.completion_message.content)
```
To run the above example, put the code in a file called `inference.py`, ensure your `conda` or `virtualenv` environment is active, and run the following:
```bash
pip install llama_stack
llama stack build --template ollama --image-type <conda|venv>
python inference.py
```
### 4. Your first RAG agent
Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agent which can answer questions about TorchTune documentation.
@@ -273,6 +288,13 @@ for prompt in user_prompts:
log.print()
```
To run the above example, put the code in a file called `rag.py`, ensure your `conda` or `virtualenv` environment is active, and run the following:
```bash
pip install llama_stack
llama stack build --template ollama --image-type <conda|venv>
python rag.py
```
## Next Steps
- Learn more about Llama Stack [Concepts](../concepts/index.md)

View file

@@ -38,9 +38,9 @@ We have a number of client-side SDKs available for different languages.
| **Language** | **Client SDK** | **Package** |
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
## Supported Llama Stack Implementations
@@ -61,6 +61,10 @@ A number of "adapters" are available for some popular Inference and Vector Store
| Groq | Hosted |
| SambaNova | Hosted |
| PyTorch ExecuTorch | On-device iOS, Android |
| OpenAI | Hosted |
| Anthropic | Hosted |
| Gemini | Hosted |
**Vector IO API**
| **Provider** | **Environments** |