From eba9d1ea14a2307375dfe7aeab7783103a953b55 Mon Sep 17 00:00:00 2001
From: Russell Bryant <russell.bryant@gmail.com>
Date: Thu, 10 Oct 2024 14:21:59 -0400
Subject: [PATCH 01/10] ci: Run pre-commit checks in CI (#176)

Run the pre-commit checks in a GitHub workflow to validate that a PR
or a direct push to the repo does not introduce new errors.
---
 .flake8                          | 10 +++----
 .github/workflows/pre-commit.yml | 45 ++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/pre-commit.yml
diff --git a/.flake8 b/.flake8
index af5005b0d..7cadda2a9 100644
--- a/.flake8
+++ b/.flake8
@@ -21,11 +21,11 @@ ignore =
 optional-ascii-coding = True
 exclude =
     ./.git,
-    ./docs
-    ./build
+    ./docs/*,
+    ./build,
     ./scripts,
     ./venv,
-    *.pyi
-    .pre-commit-config.yaml
-    *.md
+    *.pyi,
+    .pre-commit-config.yaml,
+    *.md,
     .flake8
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 000000000..502753976
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,45 @@
+name: Pre-commit
+
+on: [pull_request]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
+
+      - name: Set up Python
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: '3.11'
+          cache: pip
+          cache-dependency-path: |
+            **/requirements*.txt
+            .pre-commit-config.yaml
+
+      - name: Install pre-commit
+        run: |
+          python -m pip install --upgrade pip
+          pip install pre-commit
+
+      - name: Fetch base branch
+        run: git fetch origin ${{ github.event.pull_request.base.ref }}:refs/remotes/origin/${{ github.event.pull_request.base.ref }}
+
+      - name: Fetch head commit from PR
+        run: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
+
+      - name: Get changed files
+        id: changed-files
+        run: |
+          git diff --name-only origin/${{ github.event.pull_request.base.ref }} pr-${{ github.event.pull_request.number }} > changed_files.txt
+          cat changed_files.txt
+
+      - name: Run pre-commit
+        run: |
+          if [ -s changed_files.txt ]; then
+            pre-commit run --files $(cat changed_files.txt | tr '\n' ' ')
+          else
+            echo "No changed files to run pre-commit on."
+          fi

From a3e65d58a9b1297aa6b0167ba3cbb7a4332845ba Mon Sep 17 00:00:00 2001
From: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Date: Thu, 10 Oct 2024 15:04:21 -0400
Subject: [PATCH 02/10] Add logo

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 050a71aff..2a76913d6 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+<img src="https://github.com/user-attachments/assets/2fedfe0f-6df7-4441-98b2-87a1fd95ee1c" width="300" title="Llama Stack Logo" alt="Llama Stack Logo"/>
+
 # Llama Stack
 
 [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)

From 7ff5800dea3feb328df2b73e26e56087061f6848 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Thu, 10 Oct 2024 15:30:34 -0700
Subject: [PATCH 03/10] generate openapi

---
 docs/resources/llama-stack-spec.html | 66 ++++++++++++++--------------
 docs/resources/llama-stack-spec.yaml | 26 +++++------
 2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 96ef7e4bb..a2f92b6e4 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "0.0.1",
-        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-10-09 21:10:09.073430"
+        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-10-10 15:29:56.831109"
     },
     "servers": [
         {
@@ -355,7 +355,7 @@
                     "200": {
                         "description": "OK",
                         "content": {
-                            "application/json": {
+                            "text/event-stream": {
                                 "schema": {
                                     "$ref": "#/components/schemas/AgentTurnResponseStreamChunk"
                                 }
@@ -6074,36 +6074,6 @@
         }
     ],
     "tags": [
-        {
-            "name": "RewardScoring"
-        },
-        {
-            "name": "Memory"
-        },
-        {
-            "name": "SyntheticDataGeneration"
-        },
-        {
-            "name": "Models"
-        },
-        {
-            "name": "Safety"
-        },
-        {
-            "name": "BatchInference"
-        },
-        {
-            "name": "Agents"
-        },
-        {
-            "name": "MemoryBanks"
-        },
-        {
-            "name": "Shields"
-        },
-        {
-            "name": "Datasets"
-        },
         {
             "name": "Evaluations"
         },
@@ -6111,14 +6081,44 @@
             "name": "Inspect"
         },
         {
-            "name": "PostTraining"
+            "name": "RewardScoring"
+        },
+        {
+            "name": "Datasets"
+        },
+        {
+            "name": "Models"
         },
         {
             "name": "Telemetry"
         },
+        {
+            "name": "PostTraining"
+        },
+        {
+            "name": "SyntheticDataGeneration"
+        },
+        {
+            "name": "BatchInference"
+        },
         {
             "name": "Inference"
         },
+        {
+            "name": "Agents"
+        },
+        {
+            "name": "Memory"
+        },
+        {
+            "name": "Safety"
+        },
+        {
+            "name": "Shields"
+        },
+        {
+            "name": "MemoryBanks"
+        },
         {
             "name": "BuiltinTool",
             "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BuiltinTool\" />"
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 9307ee47b..c9822d6ca 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -2507,7 +2507,7 @@ info:
   description: "This is the specification of the llama stack that provides\n     \
     \           a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n                best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n                Generated at 2024-10-09 21:10:09.073430"
+    \ draft and subject to change.\n                Generated at 2024-10-10 15:29:56.831109"
   title: '[DRAFT] Llama Stack Specification'
   version: 0.0.1
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -2693,7 +2693,7 @@ paths:
       responses:
         '200':
           content:
-            application/json:
+            text/event-stream:
               schema:
                 $ref: '#/components/schemas/AgentTurnResponseStreamChunk'
           description: OK
@@ -3712,21 +3712,21 @@ security:
 servers:
 - url: http://any-hosted-llama-stack.com
 tags:
-- name: RewardScoring
-- name: Memory
-- name: SyntheticDataGeneration
-- name: Models
-- name: Safety
-- name: BatchInference
-- name: Agents
-- name: MemoryBanks
-- name: Shields
-- name: Datasets
 - name: Evaluations
 - name: Inspect
-- name: PostTraining
+- name: RewardScoring
+- name: Datasets
+- name: Models
 - name: Telemetry
+- name: PostTraining
+- name: SyntheticDataGeneration
+- name: BatchInference
 - name: Inference
+- name: Agents
+- name: Memory
+- name: Safety
+- name: Shields
+- name: MemoryBanks
 - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
   name: BuiltinTool
 - description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage"

From 1ff0476002de90a1de54af9f1e4f4b9c75fc91b8 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Thu, 10 Oct 2024 15:54:08 -0700
Subject: [PATCH 04/10] Split off meta-reference-quantized provider

---
 llama_stack/cli/stack/build.py                |  6 +++-
 .../agents/tests/test_chat_agent.py           |  4 +--
 .../meta_reference/inference/__init__.py      | 13 ++++----
 .../impls/meta_reference/inference/config.py  | 11 +++----
 .../meta_reference/inference/generation.py    | 33 ++++++-------------
 .../meta_reference/inference/inference.py     |  4 +--
 .../inference/model_parallel.py               |  7 ++--
 .../inference/parallel_utils.py               |  4 +--
 .../inference/quantization/loader.py          | 13 ++------
 llama_stack/providers/registry/inference.py   | 17 +++++++++-
 10 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py
index 3fe615e6e..3c59e8c20 100644
--- a/llama_stack/cli/stack/build.py
+++ b/llama_stack/cli/stack/build.py
@@ -149,6 +149,7 @@ class StackBuild(Subcommand):
 
     def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
         import json
+
         from llama_stack.cli.table import print_table
 
         # eventually, this should query a registry at llama.meta.com/llamastack/distributions
@@ -175,6 +176,7 @@ class StackBuild(Subcommand):
 
     def _run_stack_build_command(self, args: argparse.Namespace) -> None:
         import textwrap
+
         import yaml
         from llama_stack.distribution.distribution import get_provider_registry
         from prompt_toolkit import prompt
@@ -256,7 +258,9 @@ class StackBuild(Subcommand):
             providers = dict()
             for api, providers_for_api in get_provider_registry().items():
                 available_providers = [
-                    x for x in providers_for_api.keys() if x != "remote"
+                    x
+                    for x in providers_for_api.keys()
+                    if x not in ("remote", "remote::sample")
                 ]
                 api_provider = prompt(
                     "> Enter provider for API {}: ".format(api.value),
diff --git a/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py b/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py
index 9d941edc9..46423814b 100644
--- a/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py
+++ b/llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py
@@ -16,7 +16,7 @@ from llama_stack.apis.agents import *  # noqa: F403
 from ..agents import (
     AGENT_INSTANCES_BY_ID,
     MetaReferenceAgentsImpl,
-    MetaReferenceImplConfig,
+    MetaReferenceInferenceConfig,
 )
 
 
@@ -166,7 +166,7 @@ def mock_memory_api():
 @pytest.fixture
 async def chat_agent(mock_inference_api, mock_safety_api, mock_memory_api):
     impl = MetaReferenceAgentsImpl(
-        config=MetaReferenceImplConfig(),
+        config=MetaReferenceInferenceConfig(),
         inference_api=mock_inference_api,
         safety_api=mock_safety_api,
         memory_api=mock_memory_api,
diff --git a/llama_stack/providers/impls/meta_reference/inference/__init__.py b/llama_stack/providers/impls/meta_reference/inference/__init__.py
index 64d315e79..9c923490d 100644
--- a/llama_stack/providers/impls/meta_reference/inference/__init__.py
+++ b/llama_stack/providers/impls/meta_reference/inference/__init__.py
@@ -4,16 +4,17 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from .config import MetaReferenceImplConfig  # noqa
+from typing import Union
+
+from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
 
 
-async def get_provider_impl(config: MetaReferenceImplConfig, _deps):
+async def get_provider_impl(
+    config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig],
+    _deps,
+):
     from .inference import MetaReferenceInferenceImpl
 
-    assert isinstance(
-        config, MetaReferenceImplConfig
-    ), f"Unexpected config type: {type(config)}"
-
     impl = MetaReferenceInferenceImpl(config)
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/impls/meta_reference/inference/config.py b/llama_stack/providers/impls/meta_reference/inference/config.py
index ba5eddd53..901a8c7fb 100644
--- a/llama_stack/providers/impls/meta_reference/inference/config.py
+++ b/llama_stack/providers/impls/meta_reference/inference/config.py
@@ -15,12 +15,11 @@ from pydantic import BaseModel, Field, field_validator
 from llama_stack.providers.utils.inference import supported_inference_models
 
 
-class MetaReferenceImplConfig(BaseModel):
+class MetaReferenceInferenceConfig(BaseModel):
     model: str = Field(
         default="Llama3.1-8B-Instruct",
         description="Model descriptor from `llama model list`",
     )
-    quantization: Optional[QuantizationConfig] = None
     torch_seed: Optional[int] = None
     max_seq_len: int = 4096
     max_batch_size: int = 1
@@ -38,9 +37,9 @@ class MetaReferenceImplConfig(BaseModel):
 
     @property
     def model_parallel_size(self) -> int:
-        # HACK ALERT: this will be fixed when we move inference configuration
-        # to ModelsRegistry and we can explicitly ask for `model_parallel_size`
-        # as configuration there
         resolved = resolve_model(self.model)
-        assert resolved is not None
         return resolved.pth_file_count
+
+
+class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
+    quantization: QuantizationConfig
diff --git a/llama_stack/providers/impls/meta_reference/inference/generation.py b/llama_stack/providers/impls/meta_reference/inference/generation.py
index 37aef5ede..8d94a20d1 100644
--- a/llama_stack/providers/impls/meta_reference/inference/generation.py
+++ b/llama_stack/providers/impls/meta_reference/inference/generation.py
@@ -11,9 +11,8 @@ import json
 import os
 import sys
 import time
-from dataclasses import dataclass
 from pathlib import Path
-from typing import Generator, List, Optional
+from typing import Generator, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -36,14 +35,12 @@ from llama_models.llama3.reference_impl.multimodal.model import (
 )
 from llama_models.sku_list import resolve_model
 
-from llama_stack.apis.inference import QuantizationType
-
-from llama_stack.distribution.utils.model_utils import model_local_dir
-
 from pydantic import BaseModel
 from termcolor import cprint
 
-from .config import MetaReferenceImplConfig
+from llama_stack.distribution.utils.model_utils import model_local_dir
+
+from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
 
 
 def model_checkpoint_dir(model) -> str:
@@ -68,7 +65,11 @@ class TokenResult(BaseModel):
 
 class Llama:
     @staticmethod
-    def build(config: MetaReferenceImplConfig):
+    def build(
+        config: Union[
+            MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig
+        ]
+    ):
         """
         Build a Llama instance by initializing and loading a model checkpoint.
 
@@ -78,15 +79,6 @@ class Llama:
         """
         model = resolve_model(config.model)
 
-        if (
-            config.quantization
-            and config.quantization.type == QuantizationType.fp8.value
-        ):
-            from .quantization.loader import is_fbgemm_available
-
-            if not is_fbgemm_available():
-                raise ImportError("fbgemm-gpu is required for FP8 quantization")
-
         if not torch.distributed.is_initialized():
             torch.distributed.init_process_group("nccl")
 
@@ -134,12 +126,7 @@ class Llama:
             model_args.vocab_size == tokenizer.n_words
         ), f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}"
 
-        fp8 = (
-            config.quantization
-            and config.quantization.type == QuantizationType.fp8.value
-        )
-
-        if fp8:
+        if isinstance(config, MetaReferenceQuantizedInferenceConfig):
             from .quantization.loader import convert_to_quantized_model
 
             # load on CPU in bf16 so that fp8 conversion does not find an
diff --git a/llama_stack/providers/impls/meta_reference/inference/inference.py b/llama_stack/providers/impls/meta_reference/inference/inference.py
index a8afcea54..6696762c9 100644
--- a/llama_stack/providers/impls/meta_reference/inference/inference.py
+++ b/llama_stack/providers/impls/meta_reference/inference/inference.py
@@ -17,7 +17,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
     chat_completion_request_to_messages,
 )
 
-from .config import MetaReferenceImplConfig
+from .config import MetaReferenceInferenceConfig
 from .model_parallel import LlamaModelParallelGenerator
 
 # there's a single model parallel process running serving the model. for now,
@@ -26,7 +26,7 @@ SEMAPHORE = asyncio.Semaphore(1)
 
 
 class MetaReferenceInferenceImpl(Inference, ModelsProtocolPrivate):
-    def __init__(self, config: MetaReferenceImplConfig) -> None:
+    def __init__(self, config: MetaReferenceInferenceConfig) -> None:
         self.config = config
         model = resolve_model(config.model)
         if model is None:
diff --git a/llama_stack/providers/impls/meta_reference/inference/model_parallel.py b/llama_stack/providers/impls/meta_reference/inference/model_parallel.py
index 798fadcbe..e8f483f30 100644
--- a/llama_stack/providers/impls/meta_reference/inference/model_parallel.py
+++ b/llama_stack/providers/impls/meta_reference/inference/model_parallel.py
@@ -6,7 +6,6 @@
 
 import os
 from copy import deepcopy
-from dataclasses import dataclass
 from functools import partial
 from typing import Generator, List, Optional
 
@@ -15,7 +14,7 @@ from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.sku_list import resolve_model
 
-from .config import MetaReferenceImplConfig
+from .config import MetaReferenceInferenceConfig
 from .generation import Llama, model_checkpoint_dir
 from .parallel_utils import InferenceArgs, ModelParallelProcessGroup
 
@@ -36,7 +35,7 @@ class ModelRunner:
         )
 
 
-def init_model_cb(config: MetaReferenceImplConfig):
+def init_model_cb(config: MetaReferenceInferenceConfig):
     llama = Llama.build(config)
     return ModelRunner(llama)
 
@@ -52,7 +51,7 @@ class LlamaModelParallelGenerator:
     clear at the callsite why we need to use a context manager.
     """
 
-    def __init__(self, config: MetaReferenceImplConfig):
+    def __init__(self, config: MetaReferenceInferenceConfig):
         self.config = config
         self.model = resolve_model(self.config.model)
         # this is a hack because Agent's loop uses this to tokenize and check if input is too long
diff --git a/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py b/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py
index c6eacc73c..7dbedd0f0 100644
--- a/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py
+++ b/llama_stack/providers/impls/meta_reference/inference/parallel_utils.py
@@ -11,7 +11,7 @@ import tempfile
 import time
 import uuid
 from enum import Enum
-from typing import Any, Callable, Generator, List, Literal, Optional, Union
+from typing import Callable, Generator, List, Literal, Optional, Union
 
 import torch
 
@@ -317,7 +317,7 @@ def start_model_parallel_process(
 
     request_socket.send(encode_msg(ReadyRequest()))
     response = request_socket.recv()
-    print(f"Finished model load {response}")
+    print("Loaded model...")
 
     return request_socket, process
 
diff --git a/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py b/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py
index 1df86cb84..92b3a6ce3 100644
--- a/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py
+++ b/llama_stack/providers/impls/meta_reference/inference/quantization/loader.py
@@ -22,19 +22,10 @@ from torch import Tensor
 from llama_stack.apis.inference import QuantizationType
 
 from llama_stack.providers.impls.meta_reference.inference.config import (
-    MetaReferenceImplConfig,
+    MetaReferenceQuantizedInferenceConfig,
 )
 
 
-def is_fbgemm_available() -> bool:
-    try:
-        import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
-        return True
-    except ImportError:
-        return False
-
-
 def swiglu_wrapper(
     self,
     x: Tensor,
@@ -47,7 +38,7 @@ def swiglu_wrapper(
 
 def convert_to_quantized_model(
     model: Transformer,
-    config: MetaReferenceImplConfig,
+    config: MetaReferenceQuantizedInferenceConfig,
     fp8_activation_scale_ub: Optional[float] = 1200.0,
 ) -> Transformer:
     if config.quantization.type == QuantizationType.bf16.value:
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index ddfd4ff40..686fc273b 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -14,6 +14,21 @@ def available_providers() -> List[ProviderSpec]:
         InlineProviderSpec(
             api=Api.inference,
             provider_type="meta-reference",
+            pip_packages=[
+                "accelerate",
+                "blobfile",
+                "fairscale",
+                "torch",
+                "torchvision",
+                "transformers",
+                "zmq",
+            ],
+            module="llama_stack.providers.impls.meta_reference.inference",
+            config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceInferenceConfig",
+        ),
+        InlineProviderSpec(
+            api=Api.inference,
+            provider_type="meta-reference-quantized",
             pip_packages=[
                 "accelerate",
                 "blobfile",
@@ -25,7 +40,7 @@ def available_providers() -> List[ProviderSpec]:
                 "zmq",
             ],
             module="llama_stack.providers.impls.meta_reference.inference",
-            config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceImplConfig",
+            config_class="llama_stack.providers.impls.meta_reference.inference.MetaReferenceQuantizedInferenceConfig",
         ),
         remote_provider_spec(
             api=Api.inference,

From ca29980c6b591c0cd500aebe7bf6e4620fb51c40 Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Thu, 10 Oct 2024 20:17:29 -0700
Subject: [PATCH 05/10] fix agents context retriever

---
 .../impls/meta_reference/agents/rag/context_retriever.py     | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
index 6b59479b3..b668dc0d6 100644
--- a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
+++ b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
@@ -63,13 +63,12 @@ async def llm_rag_query_generator(
 
     model = config.model
     message = UserMessage(content=content)
-    response = inference_api.chat_completion(
+    response = await inference_api.chat_completion(
         model=model,
         messages=[message],
         stream=False,
     )
 
-    async for chunk in response:
-        query = chunk.completion_message.content
+    query = response.completion_message.content
 
     return query

From 9fbe8852aa2ecf6e029e0bfe16177883fc7f831f Mon Sep 17 00:00:00 2001
From: Dalton Flanagan <6599399+dltn@users.noreply.github.com>
Date: Thu, 10 Oct 2024 23:39:25 -0400
Subject: [PATCH 06/10] Add Swift Package Index badge

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2a76913d6..238475840 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ The `llama` CLI makes it easy to work with the Llama Stack set of tools. Please
 |  **Language** |  **Client SDK** | **Package** |
 | :----: | :----: | :----: |
 | Python |  [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
-| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) |
+| Swift  | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
 | Node   | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
 | Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) |
 

From 2128e61da2d3f660deee7e8c2fdf454dc0168a2d Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Fri, 11 Oct 2024 11:47:57 -0400
Subject: [PATCH 07/10] Fix incorrect completion() signature for Databricks
 provider (#236)

---
 docs/getting_started.md                                  | 2 +-
 .../providers/adapters/inference/databricks/__init__.py  | 3 ++-
 .../providers/adapters/inference/databricks/config.py    | 3 +--
 .../adapters/inference/databricks/databricks.py          | 9 ++++++++-
 .../providers/impls/meta_reference/safety/llama_guard.py | 2 +-
 llama_stack/providers/impls/vllm/__init__.py             | 6 ++++++
 6 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/docs/getting_started.md b/docs/getting_started.md
index 32f4d2d15..6c8c902c0 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -73,7 +73,7 @@ docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack-local
 ```
 
 > [!NOTE]
-> `~/.llama` should be the path containing downloaded weights of Llama models. 
+> `~/.llama` should be the path containing downloaded weights of Llama models.
 
 
 #### Via conda
diff --git a/llama_stack/providers/adapters/inference/databricks/__init__.py b/llama_stack/providers/adapters/inference/databricks/__init__.py
index 097579d25..ca2a0a103 100644
--- a/llama_stack/providers/adapters/inference/databricks/__init__.py
+++ b/llama_stack/providers/adapters/inference/databricks/__init__.py
@@ -7,10 +7,11 @@
 from .config import DatabricksImplConfig
 from .databricks import DatabricksInferenceAdapter
 
+
 async def get_adapter_impl(config: DatabricksImplConfig, _deps):
     assert isinstance(
         config, DatabricksImplConfig
     ), f"Unexpected config type: {type(config)}"
     impl = DatabricksInferenceAdapter(config)
     await impl.initialize()
-    return impl
\ No newline at end of file
+    return impl
diff --git a/llama_stack/providers/adapters/inference/databricks/config.py b/llama_stack/providers/adapters/inference/databricks/config.py
index 927bb474c..ae2b056ea 100644
--- a/llama_stack/providers/adapters/inference/databricks/config.py
+++ b/llama_stack/providers/adapters/inference/databricks/config.py
@@ -4,7 +4,6 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
 
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@@ -19,4 +18,4 @@ class DatabricksImplConfig(BaseModel):
     api_token: str = Field(
         default=None,
         description="The Databricks API token",
-    )
\ No newline at end of file
+    )
diff --git a/llama_stack/providers/adapters/inference/databricks/databricks.py b/llama_stack/providers/adapters/inference/databricks/databricks.py
index 2d7427253..7e8263dbf 100644
--- a/llama_stack/providers/adapters/inference/databricks/databricks.py
+++ b/llama_stack/providers/adapters/inference/databricks/databricks.py
@@ -48,7 +48,14 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     async def shutdown(self) -> None:
         pass
 
-    def completion(self, request: CompletionRequest) -> AsyncGenerator:
+    def completion(
+        self,
+        model: str,
+        content: InterleavedTextMedia,
+        sampling_params: Optional[SamplingParams] = SamplingParams(),
+        stream: Optional[bool] = False,
+        logprobs: Optional[LogProbConfig] = None,
+    ) -> AsyncGenerator:
         raise NotImplementedError()
 
     def chat_completion(
diff --git a/llama_stack/providers/impls/meta_reference/safety/llama_guard.py b/llama_stack/providers/impls/meta_reference/safety/llama_guard.py
index 19a20a899..a6f450fae 100644
--- a/llama_stack/providers/impls/meta_reference/safety/llama_guard.py
+++ b/llama_stack/providers/impls/meta_reference/safety/llama_guard.py
@@ -170,7 +170,7 @@ class LlamaGuardShield(ShieldBase):
         for i in range(1, len(messages)):
             if messages[i].role == messages[i - 1].role:
                 raise ValueError(
-                    f"Messages must alternate between user and assistant. Message {i} has the same role as message {i-1}"
+                    f"Messages must alternate between user and assistant. Message {i} has the same role as message {i - 1}"
                 )
         return messages
 
diff --git a/llama_stack/providers/impls/vllm/__init__.py b/llama_stack/providers/impls/vllm/__init__.py
index 3d5a81ad9..aa0c4b101 100644
--- a/llama_stack/providers/impls/vllm/__init__.py
+++ b/llama_stack/providers/impls/vllm/__init__.py
@@ -1,3 +1,9 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
 from typing import Any
 
 from .config import VLLMConfig

From 05282d12349d9c1034b93f5833e4c8840aa48c15 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Fri, 11 Oct 2024 13:03:59 -0400
Subject: [PATCH 08/10] Enable pre-commit on main branch (#237)

---
 .github/workflows/pre-commit.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 502753976..871e91f4a 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -1,6 +1,9 @@
 name: Pre-commit
 
-on: [pull_request]
+on:
+  pull_request:
+  push:
+    branches: [main]
 
 jobs:
   pre-commit:

From a2b87ed0cb5be83022e13f99bf0b5c17a6524072 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Fri, 11 Oct 2024 14:09:11 -0400
Subject: [PATCH 09/10] Switch to pre-commit/action (#239)

---
 .github/workflows/pre-commit.yml | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 871e91f4a..dd1a5c6cd 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -22,27 +22,4 @@ jobs:
             **/requirements*.txt
             .pre-commit-config.yaml
 
-      - name: Install pre-commit
-        run: |
-          python -m pip install --upgrade pip
-          pip install pre-commit
-
-      - name: Fetch base branch
-        run: git fetch origin ${{ github.event.pull_request.base.ref }}:refs/remotes/origin/${{ github.event.pull_request.base.ref }}
-
-      - name: Fetch head commit from PR
-        run: git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
-
-      - name: Get changed files
-        id: changed-files
-        run: |
-          git diff --name-only origin/${{ github.event.pull_request.base.ref }} pr-${{ github.event.pull_request.number }} > changed_files.txt
-          cat changed_files.txt
-
-      - name: Run pre-commit
-        run: |
-          if [ -s changed_files.txt ]; then
-            pre-commit run --files $(cat changed_files.txt | tr '\n' ' ')
-          else
-            echo "No changed files to run pre-commit on."
-          fi
+      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1

From 209cd3d35ed34c7db550969539d9b7d29e09f01b Mon Sep 17 00:00:00 2001
From: Xi Yan <xiyan@meta.com>
Date: Mon, 14 Oct 2024 11:13:04 -0700
Subject: [PATCH 10/10] Bump version to 0.0.42

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 3a24cff21..767f06be8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ blobfile
 fire
 httpx
 huggingface-hub
-llama-models>=0.0.41
+llama-models>=0.0.42
 prompt-toolkit
 python-dotenv
 pydantic>=2
diff --git a/setup.py b/setup.py
index 4f4ea7713..466ca655f 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@ def read_requirements():
 
 setup(
     name="llama_stack",
-    version="0.0.41",
+    version="0.0.42",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",