Support for Llama3.2 models and Swift SDK (#98)

2025-12-03 09:53:45 +00:00 · 2024-09-25 10:29:58 -07:00 · 2024-09-25 10:29:58 -07:00 · 56aed59eb4
commit 56aed59eb4
parent 95abbf576b
56 changed files with 3745 additions and 630 deletions
--- a/llama_stack/apis/agents/client.py
+++ b/llama_stack/apis/agents/client.py
@ -94,14 +94,16 @@ class AgentsClient(Agents):
                            print(f"Error with parsing or validation: {e}")


-async def _run_agent(api, tool_definitions, user_prompts, attachments=None):
+async def _run_agent(
+    api, model, tool_definitions, tool_prompt_format, user_prompts, attachments=None
+):
    agent_config = AgentConfig(
-        model="Meta-Llama3.1-8B-Instruct",
+        model=model,
        instructions="You are a helpful assistant",
-        sampling_params=SamplingParams(temperature=1.0, top_p=0.9),
+        sampling_params=SamplingParams(temperature=0.6, top_p=0.9),
        tools=tool_definitions,
        tool_choice=ToolChoice.auto,
-        tool_prompt_format=ToolPromptFormat.function_tag,
+        tool_prompt_format=tool_prompt_format,
        enable_session_persistence=False,
    )

@ -130,7 +132,8 @@ async def _run_agent(api, tool_definitions, user_prompts, attachments=None):
                log.print()


-async def run_main(host: str, port: int):
+async def run_llama_3_1(host: str, port: int):
+    model = "Llama3.1-8B-Instruct"
    api = AgentsClient(f"http://{host}:{port}")

    tool_definitions = [
@ -167,10 +170,11 @@ async def run_main(host: str, port: int):
        "Write code to check if a number is prime. Use that to check if 7 is prime",
        "What is the boiling point of polyjuicepotion ?",
    ]
-    await _run_agent(api, tool_definitions, user_prompts)
+    await _run_agent(api, model, tool_definitions, ToolPromptFormat.json, user_prompts)


-async def run_rag(host: str, port: int):
+async def run_llama_3_2_rag(host: str, port: int):
+    model = "Llama3.2-3B-Instruct"
    api = AgentsClient(f"http://{host}:{port}")

    urls = [
@ -206,12 +210,71 @@ async def run_rag(host: str, port: int):
        "Tell me briefly about llama3 and torchtune",
    ]

-    await _run_agent(api, tool_definitions, user_prompts, attachments)
+    await _run_agent(
+        api, model, tool_definitions, ToolPromptFormat.json, user_prompts, attachments
+    )


-def main(host: str, port: int, rag: bool = False):
-    fn = run_rag if rag else run_main
-    asyncio.run(fn(host, port))
+async def run_llama_3_2(host: str, port: int):
+    """Run the zero-shot custom-tool demo against ``Llama3.2-3B-Instruct``.
+
+    Builds two function-call tool definitions and sends a fixed list of user
+    prompts through ``_run_agent`` using the ``python_list`` tool prompt format.
+
+    Args:
+        host: Host name used to build the ``http://host:port`` base URL.
+        port: Port used to build the base URL.
+    """
+    api = AgentsClient(f"http://{host}:{port}")
+
+    # zero shot tools for llama3.2 text models
+    boiling_point_tool = FunctionCallToolDefinition(
+        function_name="get_boiling_point",
+        description="Get the boiling point of a imaginary liquids (eg. polyjuice)",
+        parameters=dict(
+            liquid_name=ToolParamDefinition(
+                param_type="str",
+                description="The name of the liquid",
+                required=True,
+            ),
+            celcius=ToolParamDefinition(
+                param_type="bool",
+                description="Whether to return the boiling point in Celcius",
+                required=False,
+            ),
+        ),
+    )
+    web_search_tool = FunctionCallToolDefinition(
+        function_name="make_web_search",
+        description="Search the web / internet for more realtime information",
+        parameters=dict(
+            query=ToolParamDefinition(
+                param_type="str",
+                description="the query to search for",
+                required=True,
+            ),
+        ),
+    )
+
+    prompts = [
+        "Who are you?",
+        "what is the 100th prime number?",
+        "Who was 44th President of USA?",
+        # multiple tool calls in a single prompt
+        "What is the boiling point of polyjuicepotion and pinkponklyjuice?",
+    ]
+
+    await _run_agent(
+        api,
+        "Llama3.2-3B-Instruct",
+        [boiling_point_tool, web_search_tool],
+        ToolPromptFormat.python_list,
+        prompts,
+    )
+
+
+def main(host: str, port: int, run_type: str):
+    """Run one of the demo agent scenarios against a server.
+
+    Args:
+        host: Host name passed to the selected scenario.
+        port: Port passed to the selected scenario.
+        run_type: Scenario to run; one of ``tools_llama_3_1``,
+            ``tools_llama_3_2`` or ``rag_llama_3_2``.
+
+    Raises:
+        ValueError: If ``run_type`` is not one of the known scenarios.
+    """
+    # Single source of truth for the valid run types and their entry points.
+    runners = {
+        "tools_llama_3_1": run_llama_3_1,
+        "tools_llama_3_2": run_llama_3_2,
+        "rag_llama_3_2": run_llama_3_2_rag,
+    }
+    # Validate explicitly: an `assert` is stripped under `python -O`, which
+    # would turn a bad run type into a bare KeyError on the lookup below.
+    if run_type not in runners:
+        raise ValueError(
+            f"Invalid run type {run_type}, must be one of {', '.join(runners)}"
+        )
+    asyncio.run(runners[run_type](host, port))


 if __name__ == "__main__":
--- a/llama_stack/apis/inference/client.py
+++ b/llama_stack/apis/inference/client.py
@ -10,6 +10,10 @@ from typing import Any, AsyncGenerator, List, Optional

 import fire
 import httpx
+
+from llama_models.llama3.api.datatypes import ImageMedia, URL
+
+from PIL import Image as PIL_Image
 from pydantic import BaseModel

 from llama_models.llama3.api import *  # noqa: F403
@ -105,7 +109,7 @@ async def run_main(host: str, port: int, stream: bool):
    )
    cprint(f"User>{message.content}", "green")
    iterator = client.chat_completion(
-        model="Meta-Llama3.1-8B-Instruct",
+        model="Llama3.1-8B-Instruct",
        messages=[message],
        stream=stream,
    )
@ -113,8 +117,34 @@ async def run_main(host: str, port: int, stream: bool):
        log.print()


-def main(host: str, port: int, stream: bool = True):
-    asyncio.run(run_main(host, port, stream))
+async def run_mm_main(host: str, port: int, stream: bool, path: str):
+    """Ask ``Llama3.2-11B-Vision-Instruct`` to describe the image at ``path``.
+
+    The image is referenced in the request as a ``file://`` URL and every
+    event produced by ``EventLogger`` for the response is printed.
+
+    Args:
+        host: Host name used to build the ``http://host:port`` base URL.
+        port: Port used to build the base URL.
+        stream: Forwarded to ``chat_completion`` as ``stream``.
+        path: Filesystem path of the image to describe.
+    """
+    client = InferenceClient(f"http://{host}:{port}")
+
+    # `img` is currently unused by the request below; it is only needed for
+    # the inline alternative `ImageMedia(image=img)`.
+    with open(path, "rb") as image_file:
+        img = PIL_Image.open(image_file).convert("RGB")
+
+    image_url = URL(uri=f"file://{path}")
+    message = UserMessage(
+        content=[
+            ImageMedia(image=image_url),
+            "Describe this image in two sentences",
+        ],
+    )
+    cprint(f"User>{message.content}", "green")
+
+    response_events = client.chat_completion(
+        model="Llama3.2-11B-Vision-Instruct",
+        messages=[message],
+        stream=stream,
+    )
+    async for event in EventLogger().log(response_events):
+        event.print()
+
+
+def main(
+    host: str,
+    port: int,
+    stream: bool = True,
+    mm: bool = False,
+    file: Optional[str] = None,
+):
+    """Run the text or the multimodal inference demo.
+
+    Args:
+        host: Host name passed to the selected demo.
+        port: Port passed to the selected demo.
+        stream: Passed through to the demo as its ``stream`` argument.
+        mm: When true, run the multimodal demo (``run_mm_main``) instead of
+            the text demo (``run_main``).
+        file: Path of the image to describe; required when ``mm`` is true.
+
+    Raises:
+        ValueError: If ``mm`` is true and no ``file`` was given.
+    """
+    if mm:
+        # Fail with a clear message instead of letting `None` reach `open()`
+        # inside run_mm_main, which raises an opaque TypeError.
+        if file is None:
+            raise ValueError("A file path must be provided when mm is set")
+        asyncio.run(run_mm_main(host, port, stream, file))
+    else:
+        asyncio.run(run_main(host, port, stream))


 if __name__ == "__main__":
--- a/llama_stack/apis/memory_banks/memory_banks.py
+++ b/llama_stack/apis/memory_banks/memory_banks.py
@ -7,11 +7,11 @@
 from typing import List, Optional, Protocol

 from llama_models.schema_utils import json_schema_type, webmethod
+from pydantic import BaseModel, Field

 from llama_stack.apis.memory import MemoryBankType

 from llama_stack.distribution.datatypes import GenericProviderConfig
-from pydantic import BaseModel, Field


@json_schema_type
--- a/llama_stack/apis/safety/client.py
+++ b/llama_stack/apis/safety/client.py
@ -51,6 +51,11 @@ class SafetyClient(Safety):
                ),
                headers={
                    "Content-Type": "application/json",
+                    "X-LlamaStack-ProviderData": json.dumps(
+                        {
+                            "together_api_key": "1882f9a484fc7c6ce3e4dc90272d5db52346c93838daab3d704803181f396b22"
+                        }
+                    ),
                },
                timeout=20,
            )
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@ -44,7 +44,7 @@ def setup_download_parser(parser: argparse.ArgumentParser) -> None:
    parser.add_argument(
        "--source",
        choices=["meta", "huggingface"],
-        required=True,
+        default="meta",
    )
    parser.add_argument(
        "--model-id",
@ -116,7 +116,7 @@ def _hf_download(
            "You can find your token by visiting https://huggingface.co/settings/tokens"
        )
    except RepositoryNotFoundError:
-        parser.error(f"Repository '{args.repo_id}' not found on the Hugging Face Hub.")
+        parser.error(f"Repository '{repo_id}' not found on the Hugging Face Hub.")
    except Exception as e:
        parser.error(e)

--- a/llama_stack/cli/model/model.py
+++ b/llama_stack/cli/model/model.py
@ -9,7 +9,7 @@ import argparse
 from llama_stack.cli.model.describe import ModelDescribe
 from llama_stack.cli.model.download import ModelDownload
 from llama_stack.cli.model.list import ModelList
-from llama_stack.cli.model.template import ModelTemplate
+from llama_stack.cli.model.prompt_format import ModelPromptFormat

 from llama_stack.cli.subcommand import Subcommand

@ -30,5 +30,5 @@ class ModelParser(Subcommand):
        # Add sub-commands
        ModelDownload.create(subparsers)
        ModelList.create(subparsers)
-        ModelTemplate.create(subparsers)
+        ModelPromptFormat.create(subparsers)
        ModelDescribe.create(subparsers)
--- a/llama_stack/cli/model/prompt_format.py
+++ b/llama_stack/cli/model/prompt_format.py
@ -0,0 +1,116 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import argparse
+import subprocess
+import textwrap
+from io import StringIO
+
+from llama_models.datatypes import CoreModelId, is_multimodal, model_family, ModelFamily
+
+from llama_stack.cli.subcommand import Subcommand
+
+
+class ModelPromptFormat(Subcommand):
+    """CLI sub-command that shows the prompt (message) format of a Llama model."""
+
+    def __init__(self, subparsers: argparse._SubParsersAction):
+        """Register the ``prompt-format`` sub-parser and bind its handler."""
+        super().__init__()
+        self.parser = subparsers.add_parser(
+            "prompt-format",
+            prog="llama model prompt-format",
+            description="Show llama model message formats",
+            epilog=textwrap.dedent(
+                """
+                Example:
+                    llama model prompt-format <options>
+                """
+            ),
+            formatter_class=argparse.RawTextHelpFormatter,
+        )
+        self._add_arguments()
+        self.parser.set_defaults(func=self._run_model_template_cmd)
+
+    def _add_arguments(self):
+        """Add the ``-m/--model-name`` option."""
+        # NOTE(review): the default and help text name a model *family*, but
+        # the handler parses the value with CoreModelId(...) - confirm that
+        # "llama3_1" is actually accepted there.
+        self.parser.add_argument(
+            "-m",
+            "--model-name",
+            type=str,
+            default="llama3_1",
+            help="Model Family (llama3_1, llama3_X, etc.)",
+        )
+
+    def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
+        """Look up the prompt-format document for ``args.model_name`` and page it.
+
+        Raises:
+            argparse.ArgumentTypeError: If the name is not a ``CoreModelId``
+                value or the model is not in the Llama 3.1 / 3.2 families.
+        """
+        import pkg_resources
+
+        # Only Llama 3.1 and 3.2 are supported
+        supported_model_ids = [
+            m
+            for m in CoreModelId
+            if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
+        ]
+        model_str = "\n".join(m.value for m in supported_model_ids)
+        try:
+            model_id = CoreModelId(args.model_name)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}"
+            ) from None
+
+        if model_id not in supported_model_ids:
+            raise argparse.ArgumentTypeError(
+                f"{model_id} is not a valid Model. Choose one from --\n {model_str}"
+            )
+
+        # model_id is now known to be in the 3.1 or 3.2 family, so exactly one
+        # branch applies and `resource` is always bound.
+        if model_family(model_id) == ModelFamily.llama3_1:
+            resource = "llama3_1/prompt_format.md"
+        elif is_multimodal(model_id):
+            resource = "llama3_2/vision_prompt_format.md"
+        else:
+            resource = "llama3_2/text_prompt_format.md"
+
+        # Resolve only the document that is needed, and read it as UTF-8 so
+        # the result does not depend on the platform's default encoding.
+        prompt_file = pkg_resources.resource_filename("llama_models", resource)
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        render_markdown_to_pager(content)
+
+
+def render_markdown_to_pager(markdown_content: str):
+    """Render Markdown with rich and display it through the ``less`` pager.
+
+    The text is rendered with terminal styling at a fixed width of 100
+    columns into an in-memory buffer and then piped to ``less -R``. When
+    ``less`` is not found on PATH, the rendered text is written to stdout.
+
+    Args:
+        markdown_content: Markdown source text to render.
+    """
+    import shutil
+    import sys
+
+    from rich.console import Console
+    from rich.markdown import Markdown
+    from rich.style import Style
+    from rich.text import Text
+
+    class LeftAlignedHeaderMarkdown(Markdown):
+        # Emits headers as "#"-prefixed bold blue text.
+        # NOTE(review): confirm the installed rich version still calls
+        # `parse_header`; otherwise this override has no effect.
+        def parse_header(self, token):
+            level = token.type.count("h")
+            content = Text(token.content)
+            header_style = Style(color="bright_blue", bold=True)
+            header = Text(f"{'#' * level} ", style=header_style) + content
+            self.add_text(header)
+
+    # Render the Markdown
+    md = LeftAlignedHeaderMarkdown(markdown_content)
+
+    # Capture the rendered output
+    output = StringIO()
+    console = Console(file=output, force_terminal=True, width=100)  # Set a fixed width
+    console.print(md)
+    rendered_content = output.getvalue()
+
+    # Without `less` on PATH, Popen would raise FileNotFoundError; fall back
+    # to writing the rendered text directly.
+    if shutil.which("less") is None:
+        sys.stdout.write(rendered_content)
+        return
+
+    # Pipe to pager
+    pager = subprocess.Popen(["less", "-R"], stdin=subprocess.PIPE)
+    pager.communicate(input=rendered_content.encode())
--- a/llama_stack/cli/model/template.py
+++ b/llama_stack/cli/model/template.py
@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import argparse
-import textwrap
-
-from termcolor import colored
-
-from llama_stack.cli.subcommand import Subcommand
-
-
-class ModelTemplate(Subcommand):
-    """Llama model cli for describe a model template (message formats)"""
-
-    def __init__(self, subparsers: argparse._SubParsersAction):
-        super().__init__()
-        self.parser = subparsers.add_parser(
-            "template",
-            prog="llama model template",
-            description="Show llama model message formats",
-            epilog=textwrap.dedent(
-                """
-                Example:
-                    llama model template <options>
-                """
-            ),
-            formatter_class=argparse.RawTextHelpFormatter,
-        )
-        self._add_arguments()
-        self.parser.set_defaults(func=self._run_model_template_cmd)
-
-    def _prompt_type(self, value):
-        from llama_models.llama3.api.datatypes import ToolPromptFormat
-
-        try:
-            return ToolPromptFormat(value.lower())
-        except ValueError:
-            raise argparse.ArgumentTypeError(
-                f"{value} is not a valid ToolPromptFormat. Choose from {', '.join(t.value for t in ToolPromptFormat)}"
-            ) from None
-
-    def _add_arguments(self):
-        self.parser.add_argument(
-            "-m",
-            "--model-family",
-            type=str,
-            default="llama3_1",
-            help="Model Family (llama3_1, llama3_X, etc.)",
-        )
-        self.parser.add_argument(
-            "--name",
-            type=str,
-            help="Usecase template name (system_message, user_message, assistant_message, tool_message)...",
-            required=False,
-        )
-        self.parser.add_argument(
-            "--format",
-            type=str,
-            help="ToolPromptFormat (json or function_tag). This flag is used to print the template in a specific formats.",
-            required=False,
-            default="json",
-        )
-        self.parser.add_argument(
-            "--raw",
-            action="store_true",
-            help="If set to true, don't pretty-print into a table. Useful to copy-paste.",
-        )
-
-    def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
-        from llama_models.llama3.api.interface import (
-            list_jinja_templates,
-            render_jinja_template,
-        )
-
-        from llama_stack.cli.table import print_table
-
-        if args.name:
-            tool_prompt_format = self._prompt_type(args.format)
-            template, tokens_info = render_jinja_template(args.name, tool_prompt_format)
-            rendered = ""
-            for tok, is_special in tokens_info:
-                if is_special:
-                    rendered += colored(tok, "yellow", attrs=["bold"])
-                else:
-                    rendered += tok
-
-            if not args.raw:
-                rendered = rendered.replace("\n", "↵\n")
-                print_table(
-                    [
-                        (
-                            "Name",
-                            colored(template.template_name, "white", attrs=["bold"]),
-                        ),
-                        ("Template", rendered),
-                        ("Notes", template.notes),
-                    ],
-                    separate_rows=True,
-                )
-            else:
-                print("Template: ", template.template_name)
-                print("=" * 40)
-                print(rendered)
-        else:
-            templates = list_jinja_templates()
-            headers = ["Role", "Template Name"]
-            print_table(
-                [(t.role, t.template_name) for t in templates],
-                headers,
-            )
--- a/llama_stack/distribution/control_plane/init.py
+++ b/llama_stack/distribution/control_plane/init.py
@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
--- a/llama_stack/distribution/start_container.sh
+++ b/llama_stack/distribution/start_container.sh
@ -8,6 +8,7 @@

 DOCKER_BINARY=${DOCKER_BINARY:-docker}
 DOCKER_OPTS=${DOCKER_OPTS:-}
+LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-}

 set -euo pipefail

@ -37,10 +38,25 @@ port="$1"
 shift

 set -x
-$DOCKER_BINARY run $DOCKER_OPTS -it \
-  -p $port:$port \
-  -v "$yaml_config:/app/config.yaml" \
-  $docker_image \
-  python -m llama_stack.distribution.server.server \
-  --yaml_config /app/config.yaml \
-  --port $port "$@"
+
+# With LLAMA_CHECKPOINT_DIR set, additionally mount it at /root/.llama inside
+# the container and pass --gpus=all to the docker run command.
+if [ -n "$LLAMA_CHECKPOINT_DIR" ]; then
+  $DOCKER_BINARY run $DOCKER_OPTS -it \
+    -p $port:$port \
+    -v "$yaml_config:/app/config.yaml" \
+    -v "$LLAMA_CHECKPOINT_DIR:/root/.llama" \
+    --gpus=all \
+    $docker_image \
+    python -m llama_stack.distribution.server.server \
+    --yaml_config /app/config.yaml \
+    --port $port "$@"
+fi
+
+# With LLAMA_CHECKPOINT_DIR empty, start the server with only the config file
+# mounted. This test is the complement of the one above, so exactly one of the
+# two docker commands runs.
+if [ -z "$LLAMA_CHECKPOINT_DIR" ]; then
+  $DOCKER_BINARY run $DOCKER_OPTS -it \
+    -p $port:$port \
+    -v "$yaml_config:/app/config.yaml" \
+    $docker_image \
+    python -m llama_stack.distribution.server.server \
+    --yaml_config /app/config.yaml \
+    --port $port "$@"
+fi
--- a/llama_stack/providers/adapters/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/adapters/inference/fireworks/fireworks.py
@ -15,14 +15,16 @@ from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.sku_list import resolve_model

 from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 from .config import FireworksImplConfig

 FIREWORKS_SUPPORTED_MODELS = {
-    "Meta-Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
-    "Meta-Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
-    "Meta-Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
+    "Llama3.1-8B-Instruct": "fireworks/llama-v3p1-8b-instruct",
+    "Llama3.1-70B-Instruct": "fireworks/llama-v3p1-70b-instruct",
+    "Llama3.1-405B-Instruct": "fireworks/llama-v3p1-405b-instruct",
 }


@ -106,7 +108,7 @@ class FireworksInferenceAdapter(Inference):
            logprobs=logprobs,
        )

-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)

        # accumulate sampling params and other options to pass to fireworks
        options = self.get_fireworks_chat_options(request)
--- a/llama_stack/providers/adapters/inference/ollama/ollama.py
+++ b/llama_stack/providers/adapters/inference/ollama/ollama.py
@ -16,14 +16,16 @@ from llama_models.sku_list import resolve_model
 from ollama import AsyncClient

 from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 # TODO: Eventually this will move to the llama cli model list command
 # mapping of Model SKUs to ollama models
 OLLAMA_SUPPORTED_SKUS = {
-    # "Meta-Llama3.1-8B-Instruct": "llama3.1",
-    "Meta-Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
-    "Meta-Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
+    # "Llama3.1-8B-Instruct": "llama3.1",
+    "Llama3.1-8B-Instruct": "llama3.1:8b-instruct-fp16",
+    "Llama3.1-70B-Instruct": "llama3.1:70b-instruct-fp16",
 }


@ -115,7 +117,7 @@ class OllamaInferenceAdapter(Inference):
            logprobs=logprobs,
        )

-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)
        # accumulate sampling params and other options to pass to ollama
        options = self.get_ollama_chat_options(request)
        ollama_model = self.resolve_ollama_model(request.model)
--- a/llama_stack/providers/adapters/inference/tgi/tgi.py
+++ b/llama_stack/providers/adapters/inference/tgi/tgi.py
@ -14,7 +14,9 @@ from llama_models.llama3.api.chat_format import ChatFormat
 from llama_models.llama3.api.datatypes import StopReason
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 from .config import TGIImplConfig

@ -95,7 +97,7 @@ class TGIAdapter(Inference):
            logprobs=logprobs,
        )

-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)
        model_input = self.formatter.encode_dialog_prompt(messages)
        prompt = self.tokenizer.decode(model_input.tokens)

--- a/llama_stack/providers/adapters/inference/together/together.py
+++ b/llama_stack/providers/adapters/inference/together/together.py
@ -15,14 +15,16 @@ from llama_models.sku_list import resolve_model
 from together import Together

 from llama_stack.apis.inference import *  # noqa: F403
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 from .config import TogetherImplConfig

 TOGETHER_SUPPORTED_MODELS = {
-    "Meta-Llama3.1-8B-Instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
-    "Meta-Llama3.1-70B-Instruct": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-    "Meta-Llama3.1-405B-Instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+    "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct-Turbo",
+    "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct-Turbo",
+    "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-Turbo",
 }


@ -110,7 +112,7 @@ class TogetherInferenceAdapter(Inference):
        # accumulate sampling params and other options to pass to together
        options = self.get_together_chat_options(request)
        together_model = self.resolve_together_model(request.model)
-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)

        if not request.stream:
            # TODO: might need to add back an async here
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.pbxproj
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.pbxproj
@ -0,0 +1,548 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 60;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		5C03561F2CA3AB9600E3BB46 /* LlamaStackClient in Frameworks */ = {isa = PBXBuildFile; productRef = 5C03561E2CA3AB9600E3BB46 /* LlamaStackClient */; };
+		5C5B6E212CA3D89F00AF6130 /* LlamaStackClient in Frameworks */ = {isa = PBXBuildFile; productRef = 5C5B6E202CA3D89F00AF6130 /* LlamaStackClient */; };
+		5CCBC60C2CA1F04A00E958D0 /* LocalInference.h in Headers */ = {isa = PBXBuildFile; fileRef = 5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		5CCBC6752CA1F45800E958D0 /* executorch_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 5CCBC6742CA1F45800E958D0 /* executorch_debug */; };
+		5CCBC6862CA1F64A00E958D0 /* LLaMARunner.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */; platformFilter = ios; };
+		5CCBC6872CA1F64A00E958D0 /* LLaMARunner.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */; platformFilter = ios; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
+		5CCBC68D2CA1F7A100E958D0 /* PromptTemplate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */; };
+		5CCBC68E2CA1F7A100E958D0 /* LocalInference.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */; };
+		5CCBC68F2CA1F7A100E958D0 /* Parsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */; };
+		5CCBC6902CA1F7A100E958D0 /* SystemPrompts.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */; };
+		5CCBC6932CA1F7D000E958D0 /* Stencil in Frameworks */ = {isa = PBXBuildFile; productRef = 5CCBC6922CA1F7D000E958D0 /* Stencil */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXContainerItemProxy section */
+		5CCBC67D2CA1F63F00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 036CAF9D2BB1444500D6C2D5;
+			remoteInfo = LLaMA;
+		};
+		5CCBC67F2CA1F63F00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 03729ED52BB1F8DE00152F2E;
+			remoteInfo = LLaMARunner;
+		};
+		5CCBC69E2CA2036B00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 5CCBC6982CA2036A00E958D0;
+			remoteInfo = LLaMAPerfBenchmark;
+		};
+		5CCBC6A02CA2036B00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 5CCBC6992CA2036A00E958D0;
+			remoteInfo = LLaMAPerfBenchmarkTests;
+		};
+/* End PBXContainerItemProxy section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		5CCBC6882CA1F64A00E958D0 /* Embed Frameworks */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 10;
+			files = (
+				5CCBC6872CA1F64A00E958D0 /* LLaMARunner.framework in Embed Frameworks */,
+			);
+			name = "Embed Frameworks";
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		5CCBC6082CA1F04A00E958D0 /* LocalInference.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LocalInference.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LocalInference.h; sourceTree = "<group>"; };
+		5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = LLaMA.xcodeproj; path = "executorch/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj"; sourceTree = "<group>"; };
+		5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PromptTemplate.swift; sourceTree = "<group>"; };
+		5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = LocalInference.swift; sourceTree = "<group>"; };
+		5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Parsing.swift; sourceTree = "<group>"; };
+		5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SystemPrompts.swift; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		5CCBC6052CA1F04A00E958D0 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5C03561F2CA3AB9600E3BB46 /* LlamaStackClient in Frameworks */,
+				5C5B6E212CA3D89F00AF6130 /* LlamaStackClient in Frameworks */,
+				5CCBC6932CA1F7D000E958D0 /* Stencil in Frameworks */,
+				5CCBC6862CA1F64A00E958D0 /* LLaMARunner.framework in Frameworks */,
+				5CCBC6752CA1F45800E958D0 /* executorch_debug in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		5CCBC5FE2CA1F04A00E958D0 = {
+			isa = PBXGroup;
+			children = (
+				5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */,
+				5CCBC60A2CA1F04A00E958D0 /* LocalInference */,
+				5CCBC6092CA1F04A00E958D0 /* Products */,
+				5CCBC6852CA1F64A00E958D0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5CCBC6092CA1F04A00E958D0 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC6082CA1F04A00E958D0 /* LocalInference.framework */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		5CCBC60A2CA1F04A00E958D0 /* LocalInference */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */,
+				5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */,
+				5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */,
+				5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */,
+				5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */,
+			);
+			path = LocalInference;
+			sourceTree = "<group>";
+		};
+		5CCBC6772CA1F63F00E958D0 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC67E2CA1F63F00E958D0 /* LLaMA.app */,
+				5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */,
+				5CCBC69F2CA2036B00E958D0 /* LLaMAPerfBenchmark.app */,
+				5CCBC6A12CA2036B00E958D0 /* LLaMAPerfBenchmarkTests.xctest */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		5CCBC6852CA1F64A00E958D0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXHeadersBuildPhase section */
+		5CCBC6032CA1F04A00E958D0 /* Headers */ = {
+			isa = PBXHeadersBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5CCBC60C2CA1F04A00E958D0 /* LocalInference.h in Headers */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXHeadersBuildPhase section */
+
+/* Begin PBXNativeTarget section */
+		5CCBC6072CA1F04A00E958D0 /* LocalInference */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 5CCBC60F2CA1F04A00E958D0 /* Build configuration list for PBXNativeTarget "LocalInference" */;
+			buildPhases = (
+				5CCBC6032CA1F04A00E958D0 /* Headers */,
+				5CCBC6042CA1F04A00E958D0 /* Sources */,
+				5CCBC6052CA1F04A00E958D0 /* Frameworks */,
+				5CCBC6062CA1F04A00E958D0 /* Resources */,
+				5CCBC6882CA1F64A00E958D0 /* Embed Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = LocalInference;
+			packageProductDependencies = (
+				5CCBC6742CA1F45800E958D0 /* executorch_debug */,
+				5CCBC6922CA1F7D000E958D0 /* Stencil */,
+				5C03561E2CA3AB9600E3BB46 /* LlamaStackClient */,
+				5C5B6E202CA3D89F00AF6130 /* LlamaStackClient */,
+			);
+			productName = LocalInferenceProvider;
+			productReference = 5CCBC6082CA1F04A00E958D0 /* LocalInference.framework */;
+			productType = "com.apple.product-type.framework";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		5CCBC5FF2CA1F04A00E958D0 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastUpgradeCheck = 1540;
+				TargetAttributes = {
+					5CCBC6072CA1F04A00E958D0 = {
+						CreatedOnToolsVersion = 15.4;
+						LastSwiftMigration = 1540;
+					};
+				};
+			};
+			buildConfigurationList = 5CCBC6022CA1F04A00E958D0 /* Build configuration list for PBXProject "LocalInference" */;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 5CCBC5FE2CA1F04A00E958D0;
+			packageReferences = (
+				5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */,
+				5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */,
+				5C5B6E1F2CA3D89F00AF6130 /* XCLocalSwiftPackageReference "internal-llama-stack-client-swift" */,
+			);
+			productRefGroup = 5CCBC6092CA1F04A00E958D0 /* Products */;
+			projectDirPath = "";
+			projectReferences = (
+				{
+					ProductGroup = 5CCBC6772CA1F63F00E958D0 /* Products */;
+					ProjectRef = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+				},
+			);
+			projectRoot = "";
+			targets = (
+				5CCBC6072CA1F04A00E958D0 /* LocalInference */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXReferenceProxy section */
+		5CCBC67E2CA1F63F00E958D0 /* LLaMA.app */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.application;
+			path = LLaMA.app;
+			remoteRef = 5CCBC67D2CA1F63F00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.framework;
+			path = LLaMARunner.framework;
+			remoteRef = 5CCBC67F2CA1F63F00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC69F2CA2036B00E958D0 /* LLaMAPerfBenchmark.app */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.application;
+			path = LLaMAPerfBenchmark.app;
+			remoteRef = 5CCBC69E2CA2036B00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC6A12CA2036B00E958D0 /* LLaMAPerfBenchmarkTests.xctest */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.cfbundle;
+			path = LLaMAPerfBenchmarkTests.xctest;
+			remoteRef = 5CCBC6A02CA2036B00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+/* End PBXReferenceProxy section */
+
+/* Begin PBXResourcesBuildPhase section */
+		5CCBC6062CA1F04A00E958D0 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		5CCBC6042CA1F04A00E958D0 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5CCBC6902CA1F7A100E958D0 /* SystemPrompts.swift in Sources */,
+				5CCBC68D2CA1F7A100E958D0 /* PromptTemplate.swift in Sources */,
+				5CCBC68F2CA1F7A100E958D0 /* Parsing.swift in Sources */,
+				5CCBC68E2CA1F7A100E958D0 /* LocalInference.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		5CCBC60D2CA1F04A00E958D0 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				CURRENT_PROJECT_VERSION = 1;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.5;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_FAST_MATH = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				VERSIONING_SYSTEM = "apple-generic";
+				VERSION_INFO_PREFIX = "";
+			};
+			name = Debug;
+		};
+		5CCBC60E2CA1F04A00E958D0 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				CURRENT_PROJECT_VERSION = 1;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.5;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_FAST_MATH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				VALIDATE_PRODUCT = YES;
+				VERSIONING_SYSTEM = "apple-generic";
+				VERSION_INFO_PREFIX = "";
+			};
+			name = Release;
+		};
+		5CCBC6102CA1F04A00E958D0 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				BUILD_LIBRARY_FOR_DISTRIBUTION = YES;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEFINES_MODULE = YES;
+				DYLIB_COMPATIBILITY_VERSION = 1;
+				DYLIB_CURRENT_VERSION = 1;
+				DYLIB_INSTALL_NAME_BASE = "@rpath";
+				ENABLE_MODULE_VERIFIER = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				HEADER_SEARCH_PATHS = "";
+				INFOPLIST_KEY_NSHumanReadableCopyright = "";
+				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+					"@loader_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++";
+				MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu17 gnu++20";
+				OTHER_LDFLAGS = "";
+				PRODUCT_BUNDLE_IDENTIFIER = meta.llamatsack.LocalInferenceProvider;
+				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
+				SKIP_INSTALL = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_INSTALL_OBJC_HEADER = NO;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		5CCBC6112CA1F04A00E958D0 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				BUILD_LIBRARY_FOR_DISTRIBUTION = YES;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEFINES_MODULE = YES;
+				DYLIB_COMPATIBILITY_VERSION = 1;
+				DYLIB_CURRENT_VERSION = 1;
+				DYLIB_INSTALL_NAME_BASE = "@rpath";
+				ENABLE_MODULE_VERIFIER = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				HEADER_SEARCH_PATHS = "";
+				INFOPLIST_KEY_NSHumanReadableCopyright = "";
+				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+					"@loader_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++";
+				MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu17 gnu++20";
+				OTHER_LDFLAGS = "";
+				PRODUCT_BUNDLE_IDENTIFIER = meta.llamatsack.LocalInferenceProvider;
+				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
+				SKIP_INSTALL = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_INSTALL_OBJC_HEADER = NO;
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		5CCBC6022CA1F04A00E958D0 /* Build configuration list for PBXProject "LocalInference" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				5CCBC60D2CA1F04A00E958D0 /* Debug */,
+				5CCBC60E2CA1F04A00E958D0 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		5CCBC60F2CA1F04A00E958D0 /* Build configuration list for PBXNativeTarget "LocalInference" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				5CCBC6102CA1F04A00E958D0 /* Debug */,
+				5CCBC6112CA1F04A00E958D0 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+
+/* Begin XCLocalSwiftPackageReference section */
+		5C5B6E1F2CA3D89F00AF6130 /* XCLocalSwiftPackageReference "internal-llama-stack-client-swift" */ = {
+			isa = XCLocalSwiftPackageReference;
+			relativePath = "internal-llama-stack-client-swift";
+		};
+/* End XCLocalSwiftPackageReference section */
+
+/* Begin XCRemoteSwiftPackageReference section */
+		5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */ = {
+			isa = XCRemoteSwiftPackageReference;
+			repositoryURL = "https://github.com/pytorch/executorch";
+			requirement = {
+				branch = latest;
+				kind = branch;
+			};
+		};
+		5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */ = {
+			isa = XCRemoteSwiftPackageReference;
+			repositoryURL = "https://github.com/stencilproject/Stencil";
+			requirement = {
+				kind = upToNextMajorVersion;
+				minimumVersion = 0.15.1;
+			};
+		};
+/* End XCRemoteSwiftPackageReference section */
+
+/* Begin XCSwiftPackageProductDependency section */
+		5C03561E2CA3AB9600E3BB46 /* LlamaStackClient */ = {
+			isa = XCSwiftPackageProductDependency;
+			productName = LlamaStackClient;
+		};
+		5C5B6E202CA3D89F00AF6130 /* LlamaStackClient */ = {
+			isa = XCSwiftPackageProductDependency;
+			productName = LlamaStackClient;
+		};
+		5CCBC6742CA1F45800E958D0 /* executorch_debug */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */;
+			productName = executorch_debug;
+		};
+		5CCBC6922CA1F7D000E958D0 /* Stencil */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */;
+			productName = Stencil;
+		};
+/* End XCSwiftPackageProductDependency section */
+	};
+	rootObject = 5CCBC5FF2CA1F04A00E958D0 /* Project object */;
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:">
+   </FileRef>
+</Workspace>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/LocalInference.h
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/LocalInference.h
@ -0,0 +1,16 @@
+//
+//  LocalInference.h
+//  LocalInference
+//
+//  Created by Dalton Flanagan on 9/23/24.
+//
+
+#import <Foundation/Foundation.h>
+
+//! Project version number for LocalInference.
+FOUNDATION_EXPORT double LocalInferenceVersionNumber;
+
+//! Project version string for LocalInference.
+FOUNDATION_EXPORT const unsigned char LocalInferenceVersionString[];
+
+// In this header, you should import all the public headers of your framework using statements like #import <LocalInference/PublicHeader.h>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/LocalInference.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/LocalInference.swift
@ -0,0 +1,167 @@
+import Foundation
+
+import LLaMARunner
+import LlamaStackClient
+
+/// Reference-type box around the optional `Runner` used by `LocalInference`.
+///
+/// `runner` starts out as `nil` and is assigned by `LocalInference.loadModel`.
+class RunnerHolder: ObservableObject {
+  var runner: Runner? = nil
+}
+
+/// `Inference` implementation that runs generation through a `Runner` from `LLaMARunner`.
+///
+/// Model loading and generation are both dispatched onto the queue passed to `init`.
+public class LocalInference: Inference {
+  private var runnerHolder = RunnerHolder()
+  private let runnerQueue: DispatchQueue
+
+  /// - Parameter queue: queue on which `loadModel` and `chatCompletion` do their work.
+  public init (queue: DispatchQueue) {
+    runnerQueue = queue
+  }
+
+  /// Creates the runner if none exists yet, then loads it asynchronously on the runner queue.
+  ///
+  /// An already-created runner is reused, so the paths of later calls are ignored.
+  ///
+  /// - Parameters:
+  ///   - modelPath: forwarded to `Runner(modelPath:tokenizerPath:)`.
+  ///   - tokenizerPath: forwarded to `Runner(modelPath:tokenizerPath:)`.
+  ///   - completion: invoked from the runner queue with `.success` or the error thrown by `load()`.
+  public func loadModel(modelPath: String, tokenizerPath: String, completion: @escaping (Result<Void, Error>) -> Void) {
+    runnerHolder.runner = runnerHolder.runner ?? Runner(
+      modelPath: modelPath,
+      tokenizerPath: tokenizerPath
+    )
+
+    runnerQueue.async {
+      let runner = self.runnerHolder.runner
+      do {
+        // Non-nil here: the holder was populated just above, before dispatching.
+        try runner!.load()
+        completion(.success(()))
+      } catch let loadError {
+        print("error: " + loadError.localizedDescription)
+        completion(.failure(loadError))
+      }
+    }
+  }
+
+  /// Streams a chat completion for `request` as a sequence of response chunks.
+  ///
+  /// Text tokens are yielded as `.progress` events; output that starts with `<|python_tag|>` or `[`
+  /// is reported as tool-call deltas and, once generation ends, as parsed tool calls (or a single
+  /// `.failure` delta when nothing could be parsed). A final `.complete` event is yielded on
+  /// success. The stream is always finished, including when generation throws.
+  ///
+  /// - Parameter request: the chat completion request to encode into a prompt.
+  /// - Returns: a stream of chunks produced on the runner queue.
+  public func chatCompletion(request: Components.Schemas.ChatCompletionRequest) -> AsyncStream<Components.Schemas.ChatCompletionResponseStreamChunk> {
+    return AsyncStream { continuation in
+      runnerQueue.async {
+        // Terminate the stream on every exit path; otherwise `for await` consumers never return.
+        defer { continuation.finish() }
+
+        do {
+          var tokens: [String] = []
+
+          let prompt = try encodeDialogPrompt(messages: prepareMessages(request: request))
+          var stopReason: Components.Schemas.StopReason? = nil
+          var buffer = ""
+          var ipython = false
+          var echoDropped = false
+
+          try self.runnerHolder.runner?.generate(prompt, sequenceLength: 4096) { token in
+            buffer += token
+
+            // HACK: Workaround until LlamaRunner exposes echo param
+            // Swallow tokens until the accumulated buffer starts with the prompt, then strip it.
+            if (!echoDropped) {
+              if (buffer.hasPrefix(prompt)) {
+                buffer = String(buffer.dropFirst(prompt.count))
+                echoDropped = true
+              }
+              return
+            }
+
+            tokens.append(token)
+
+            // First sign of a tool call: announce it once, then report later tokens as tool-call deltas.
+            if !ipython && (buffer.starts(with: "<|python_tag|>") || buffer.starts(with: "[") ) {
+              ipython = true
+              continuation.yield(
+                Components.Schemas.ChatCompletionResponseStreamChunk(
+                  event: Components.Schemas.ChatCompletionResponseEvent(
+                    delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                      content: .case1(""),
+                      parse_status: Components.Schemas.ToolCallParseStatus.started
+                      )
+                    ),
+                    event_type: .progress
+                  )
+                )
+              )
+
+              if (buffer.starts(with: "<|python_tag|>")) {
+                buffer = String(buffer.dropFirst("<|python_tag|>".count))
+              }
+            }
+
+            // TODO: Non-streaming logprobs
+
+            // Stop tokens set the stop reason and are not forwarded as text.
+            var text = ""
+            if token == "<|eot_id|>" {
+              stopReason = Components.Schemas.StopReason.end_of_turn
+            } else if token == "<|eom_id|>" {
+              stopReason = Components.Schemas.StopReason.end_of_message
+            } else {
+              text = token
+            }
+
+            var delta: Components.Schemas.ChatCompletionResponseEvent.deltaPayload
+            if ipython {
+              delta = .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                content: .case1(text),
+                parse_status: .in_progress
+              ))
+            } else {
+              delta = .case1(text)
+            }
+
+            // Once a stop reason has been recorded, no further progress deltas are emitted.
+            if stopReason == nil {
+              continuation.yield(
+                Components.Schemas.ChatCompletionResponseStreamChunk(
+                  event: Components.Schemas.ChatCompletionResponseEvent(
+                    delta: delta,
+                    event_type: .progress
+                  )
+                )
+              )
+            }
+          }
+
+          // Generation ended without a stop token.
+          if stopReason == nil {
+            stopReason = Components.Schemas.StopReason.out_of_tokens
+          }
+
+          let message = decodeAssistantMessage(tokens: tokens.joined(), stopReason: stopReason!)
+          // TODO: non-streaming support
+
+          let didParseToolCalls = message.tool_calls.count > 0
+          if ipython && !didParseToolCalls {
+            continuation.yield(
+              Components.Schemas.ChatCompletionResponseStreamChunk(
+                event: Components.Schemas.ChatCompletionResponseEvent(
+                  delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(content: .case1(""), parse_status: .failure)),
+                  event_type: .progress
+                )
+                // TODO: stopReason
+              )
+            )
+          }
+
+          for toolCall in message.tool_calls {
+            continuation.yield(
+              Components.Schemas.ChatCompletionResponseStreamChunk(
+                event: Components.Schemas.ChatCompletionResponseEvent(
+                  delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                    content: .ToolCall(toolCall),
+                    parse_status: .success
+                  )),
+                  event_type: .progress
+                )
+                // TODO: stopReason
+              )
+            )
+          }
+
+          continuation.yield(
+            Components.Schemas.ChatCompletionResponseStreamChunk(
+              event: Components.Schemas.ChatCompletionResponseEvent(
+                delta: .case1(""),
+                event_type: .complete
+              )
+              // TODO: stopReason
+            )
+          )
+        }
+        catch (let error) {
+          print("Inference error: " + error.localizedDescription)
+        }
+      }
+    }
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/Parsing.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/Parsing.swift
@ -0,0 +1,235 @@
+import Foundation
+
+import LlamaStackClient
+
+/// Builds the header that introduces a message written by `role`.
+///
+/// - Parameter role: Role name placed between the header tokens.
+/// - Returns: `role` wrapped in the start/end header tokens, followed by two newlines.
+func encodeHeader(role: String) -> String {
+  let openTag = "<|start_header_id|>"
+  let closeTag = "<|end_header_id|>"
+  return openTag + role + closeTag + "\n\n"
+}
+
+/// Encodes a whole conversation into a single prompt string.
+///
+/// The prompt is the begin-of-text token, every message encoded in order, and
+/// finally an open assistant header so the model continues as the assistant.
+///
+/// - Parameter messages: Conversation messages, oldest first.
+/// - Returns: The prompt string.
+func encodeDialogPrompt(messages: [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload]) -> String {
+  let encodedMessages = messages.map { encodeMessage(message: $0) }.joined()
+  return "<|begin_of_text|>" + encodedMessages + encodeHeader(role: "assistant")
+}
+
+/// Returns the raw role string carried by whichever message variant `message` holds.
+///
+/// - Parameter message: A chat message payload of any supported variant.
+/// - Returns: The `rawValue` of that variant's `role`.
+func getRole(message: Components.Schemas.ChatCompletionRequest.messagesPayloadPayload) -> String {
+  let role: String
+  switch message {
+  case .CompletionMessage(let completion):
+    role = completion.role.rawValue
+  case .ToolResponseMessage(let toolResponse):
+    role = toolResponse.role.rawValue
+  case .SystemMessage(let system):
+    role = system.role.rawValue
+  case .UserMessage(let user):
+    role = user.role.rawValue
+  }
+  return role
+}
+
+/// Encodes a single chat message into its prompt form: a role header, an optional
+/// `<|python_tag|>` marker, the message content, and a terminator token.
+///
+/// - Parameter message: The message to encode.
+/// - Returns: The encoded text, ending in `<|eom_id|>` for a completion message whose
+///   stop reason is `end_of_message`, and in `<|eot_id|>` otherwise.
+func encodeMessage(message: Components.Schemas.ChatCompletionRequest.messagesPayloadPayload) -> String {
+  var prompt = encodeHeader(role: getRole(message: message))
+
+  // Assistant messages that carry tool calls are prefixed with the python tag.
+  switch (message) {
+  case .CompletionMessage(let m):
+    if (m.tool_calls.count > 0) {
+      prompt += "<|python_tag|>"
+    }
+  default:
+    break
+  }
+
+  // Appends `content` to the captured `prompt` as a side effect when it is a String,
+  // or each String element when it is an array. It always returns "", so the
+  // `prompt += _processContent(...)` call sites add nothing beyond that side effect.
+  func _processContent(_ content: Any) -> String {
+    func _process(_ c: Any) {
+      if let str = c as? String {
+        prompt += str
+      }
+    }
+
+    if let str = content as? String {
+      _process(str)
+    } else if let list = content as? [Any] {
+      for c in list {
+        _process(c)
+      }
+    }
+
+    return ""
+  }
+
+  // NOTE(review): `m.content` is passed here without unwrapping. If it is a payload
+  // enum (as the `.case1`/`.case2` switch below suggests for user messages) rather
+  // than a String or array, both casts in `_processContent` fail and nothing is
+  // appended, meaning system, tool-response and completion content would never reach
+  // the prompt. Confirm against the generated types.
+  switch (message) {
+  case .UserMessage(let m):
+    prompt += _processContent(m.content)
+  case .SystemMessage(let m):
+    prompt += _processContent(m.content)
+  case .ToolResponseMessage(let m):
+    prompt += _processContent(m.content)
+  case .CompletionMessage(let m):
+    prompt += _processContent(m.content)
+  }
+
+  // Selects the terminator token appended at the end.
+  var eom = false
+
+  switch (message) {
+  case .UserMessage(let m):
+    // User content is unwrapped from its payload enum before being appended.
+    switch (m.content) {
+    case .case1(let c):
+      prompt += _processContent(c)
+    case .case2(let c):
+      prompt += _processContent(c)
+    }
+  case .CompletionMessage(let m):
+    // TODO: Support encoding past tool call history
+    // for t in m.tool_calls {
+    //  _processContent(t.)
+    //}
+    eom = m.stop_reason == Components.Schemas.StopReason.end_of_message
+  case .SystemMessage(_):
+    break
+  case .ToolResponseMessage(_):
+    break
+  }
+
+  if (eom) {
+    prompt += "<|eom_id|>"
+  } else {
+    prompt += "<|eot_id|>"
+  }
+
+  return prompt
+}
+
+/// Builds the full message list sent to the model for a chat completion request.
+///
+/// The result is, in order: a generated default system message, an optional user
+/// message describing the request's tools (all currently treated as custom tools),
+/// and then the request's own messages unchanged.
+///
+/// - Parameter request: The incoming chat completion request.
+/// - Returns: The messages to encode into the dialog prompt.
+/// - Throws: Any error raised while generating or rendering the prompt templates.
+func prepareMessages(request: Components.Schemas.ChatCompletionRequest) throws -> [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload] {
+  // TODO: Existing system message
+
+  var messages: [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload] = []
+
+  // TODO: Built-in tools
+  let sysContent = try SystemDefaultGenerator().gen().render()
+
+  messages.append(.SystemMessage(Components.Schemas.SystemMessage(
+    content: .case1(sysContent),
+    role: .system))
+  )
+
+  // Bind the optional tool list once instead of testing it and then force-unwrapping.
+  if let tools = request.tools, !tools.isEmpty {
+    // TODO: Separate built-ins and custom tools (right now everything treated as custom)
+    let toolPrompt = try FunctionTagCustomToolGenerator().gen(customTools: tools).render()
+    messages.append(.UserMessage(Components.Schemas.UserMessage(
+      content: .case1(toolPrompt),
+      role: .user)
+    ))
+  }
+
+  messages.append(contentsOf: request.messages)
+
+  return messages
+}
+
+/// A function invocation: the function's name together with its parameters.
+struct FunctionCall {
+  /// Name of the function being called.
+  let name: String
+  /// Parameter values keyed by parameter name.
+  let params: [String: Any]
+}
+
+/// Parses model output of the form `[name({...}), other({...})]` into tool calls.
+///
+/// Each call must consist of a name, an opening parenthesis, and a JSON object whose
+/// values are strings, numbers or booleans. Parsing is all-or-nothing: if any call is
+/// malformed or holds an unsupported value type, no tool calls are returned.
+///
+/// - Parameter input: The decoded assistant text.
+/// - Returns: The parsed tool calls, or an empty array when `input` is not a valid
+///   bracketed call list.
+public func maybeExtractCustomToolCalls(input: String) -> [Components.Schemas.ToolCall] {
+  guard input.hasPrefix("[") && input.hasSuffix("]") else {
+    return []
+  }
+
+  let trimmed = input.trimmingCharacters(in: CharacterSet(charactersIn: "[]"))
+  // Splitting on ")," consumes the closing parenthesis of every call but the last,
+  // so it is restored here.
+  let calls = trimmed.components(separatedBy: "),").map { $0.hasSuffix(")") ? $0 : $0 + ")" }
+
+  var result: [Components.Schemas.ToolCall] = []
+
+  for call in calls {
+    guard let nameEndIndex = call.firstIndex(of: "("),
+          let paramsStartIndex = call.firstIndex(of: "{"),
+          let paramsEndIndex = call.lastIndex(of: "}"),
+          // A "}" before the first "{" would make the closed range below trap.
+          paramsStartIndex <= paramsEndIndex else {
+      return []
+    }
+
+    let name = String(call[..<nameEndIndex]).trimmingCharacters(in: .whitespacesAndNewlines)
+    let paramsString = String(call[paramsStartIndex...paramsEndIndex])
+
+    guard let data = paramsString.data(using: .utf8),
+          let params = try? JSONSerialization.jsonObject(with: data, options: []) as? [String: Any] else {
+      return []
+    }
+
+    var props: [String : Components.Schemas.ToolCall.argumentsPayload.additionalPropertiesPayload] = [:]
+    for (param_name, param) in params {
+      switch (param) {
+      // JSONSerialization yields NSNumber for both numbers and booleans, and a boolean
+      // NSNumber also casts to Int. Booleans must therefore be recognised by their
+      // CoreFoundation type before the numeric cases are tried.
+      case let value as NSNumber where CFGetTypeID(value) == CFBooleanGetTypeID():
+        props[param_name] = .case4(value.boolValue)
+      case let value as String:
+        props[param_name] = .case1(value)
+      case let value as Int:
+        props[param_name] = .case2(value)
+      case let value as Double:
+        props[param_name] = .case3(value)
+      default:
+        return []
+      }
+    }
+
+    result.append(
+      Components.Schemas.ToolCall(
+        arguments: .init(additionalProperties: props),
+        call_id: UUID().uuidString,
+        tool_name: .case2(name) // custom_tool
+      )
+    )
+  }
+
+  return result
+}
+
+/// Turns raw generated text into a completion message.
+///
+/// Leading role headers and a leading `<|python_tag|>` are removed, as is a trailing
+/// `<|eot_id|>` or `<|eom_id|>` terminator when one is present. The cleaned text
+/// becomes the message content and is also scanned for custom tool calls.
+///
+/// - Parameters:
+///   - tokens: The concatenated generated text.
+///   - stopReason: Why generation stopped; stored on the message unchanged.
+/// - Returns: An assistant completion message.
+func decodeAssistantMessage(tokens: String, stopReason: Components.Schemas.StopReason) -> Components.Schemas.CompletionMessage {
+  var content = tokens
+
+  let roles = ["user", "system", "assistant"]
+  for role in roles {
+    let headerStr = encodeHeader(role: role)
+    if content.hasPrefix(headerStr) {
+      content = String(content.dropFirst(headerStr.count))
+    }
+  }
+
+  if content.hasPrefix("<|python_tag|>") {
+    content = String(content.dropFirst("<|python_tag|>".count))
+  }
+
+  // Only strip a terminator that is actually there. Generation that ends without one
+  // (for example when it runs out of tokens) must keep its last characters.
+  if content.hasSuffix("<|eot_id|>") {
+    content = String(content.dropLast("<|eot_id|>".count))
+  } else if content.hasSuffix("<|eom_id|>") {
+    content = String(content.dropLast("<|eom_id|>".count))
+  }
+
+  return Components.Schemas.CompletionMessage(
+    content: .case1(content),
+    role: .assistant,
+    stop_reason: stopReason,
+    tool_calls: maybeExtractCustomToolCalls(input: content)
+  )
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/PromptTemplate.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/PromptTemplate.swift
@ -0,0 +1,12 @@
+import Foundation
+import Stencil
+
+/// A template string paired with the context values used to render it.
+public struct PromptTemplate {
+  /// Template source text.
+  let template: String
+  /// Values made available to the template when rendering.
+  let data: [String: Any]
+
+  /// Renders `template` with `data`.
+  ///
+  /// - Returns: The rendered text.
+  /// - Throws: Any error raised by the template engine while rendering.
+  public func render() throws -> String {
+    return try Template(templateString: template).render(data)
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/SystemPrompts.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInference/SystemPrompts.swift
@ -0,0 +1,91 @@
+import Foundation
+
+import LlamaStackClient
+
+/// Recursively converts a Foundation JSON value into plain Swift values.
+///
+/// Boolean numbers become `Bool`, numbers with no fractional part become `Int`, and
+/// other numbers become `Double`. Arrays and string-keyed dictionaries are converted
+/// element by element; strings, `NSNull` and anything unrecognised are returned as-is.
+///
+/// - Parameter value: A value such as one produced by `JSONSerialization`.
+/// - Returns: The converted value.
+func convertToNativeSwiftType(_ value: Any) -> Any {
+    if let number = value as? NSNumber {
+        // Booleans are NSNumbers too; only the CoreFoundation type tells them apart.
+        let isBoolean = CFGetTypeID(number) == CFBooleanGetTypeID()
+        if isBoolean {
+            return number.boolValue
+        }
+        let asDouble = number.doubleValue
+        let isWhole = floor(asDouble) == asDouble
+        return isWhole ? number.intValue as Any : asDouble as Any
+    }
+    if let text = value as? String {
+        return text
+    }
+    if let items = value as? [Any] {
+        return items.map { convertToNativeSwiftType($0) }
+    }
+    if let table = value as? [String: Any] {
+        return table.mapValues { convertToNativeSwiftType($0) }
+    }
+    if value is NSNull {
+        return NSNull()
+    }
+    return value
+}
+
+/// Generates the default system prompt, which states the knowledge cutoff and today's date.
+public class SystemDefaultGenerator {
+  public init() {}
+
+  /// Builds the default system prompt template.
+  ///
+  /// - Returns: A template whose `today` value is the current date formatted as
+  ///   `dd MMMM yyyy` with English month names.
+  public func gen() -> PromptTemplate {
+    let templateStr = """
+            Cutting Knowledge Date: December 2023
+            Today Date: {{ today }}
+            """
+
+    let dateFormatter = DateFormatter()
+    // Pin the locale so the fixed-format date matches the English prompt text
+    // regardless of the device's region and language settings.
+    dateFormatter.locale = Locale(identifier: "en_US_POSIX")
+    dateFormatter.dateFormat = "dd MMMM yyyy"
+
+    return PromptTemplate(
+      template: templateStr,
+      data: ["today": dateFormatter.string(from: Date())]
+    )
+  }
+}
+
+
+/// Generates the prompt that lists the available custom tools and tells the model
+/// how to format function calls.
+public class FunctionTagCustomToolGenerator {
+  public init() {}
+
+  /// Builds the tool-description prompt template for `customTools`.
+  ///
+  /// Each tool definition is JSON-encoded, parsed back into Foundation objects, passed
+  /// through `convertToNativeSwiftType`, and exposed to the template as `custom_tools`.
+  ///
+  /// - Parameter customTools: Tool definitions to describe in the prompt.
+  /// - Returns: The template together with its `custom_tools` data.
+  /// - Throws: Any error raised while encoding a tool definition or parsing it back.
+  public func gen(customTools: [Components.Schemas.ToolDefinition]) throws -> PromptTemplate {
+    // TODO: required params
+    // TODO: {{#unless @last}},{{/unless}}
+
+    // NOTE(review): `{{/let}}` below looks like leftover Handlebars syntax rather than
+    // a Stencil tag, and the per-tool JSON object appears to be missing one closing
+    // brace (the tool object opens, "parameters" opens, but only one `}` follows).
+    // Confirm how Stencil renders this before relying on the output being valid JSON.
+    // NOTE(review): the instructions describe calls as `func(name=value, ...)`, while
+    // `maybeExtractCustomToolCalls` looks for a `{...}` JSON object inside the
+    // parentheses - confirm the two agree.
+    let templateStr = """
+            You are an expert in composing functions. You are given a question and a set of possible functions.
+            Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+            If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
+            also point it out. You should only return the function call in tools call sections.
+
+            If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
+            You SHOULD NOT include any other text in the response.
+
+            Here is a list of functions in JSON format that you can invoke.
+
+            [
+            {% for t in custom_tools %}
+            {
+                "name": "{{t.tool_name}}",
+                "description": "{{t.description}}",
+                "parameters": {
+                    "type": "dict",
+                    "properties": { {{t.parameters}} }
+            }
+
+            {{/let}}
+            {% endfor -%}
+            ]
+            """
+
+    let encoder = JSONEncoder()
+    return PromptTemplate(
+      template: templateStr,
+      // Round-trip each definition through JSON so the template receives plain values.
+      data: ["custom_tools": try customTools.map {
+        let data = try encoder.encode($0)
+        let obj = try JSONSerialization.jsonObject(with: data)
+        return convertToNativeSwiftType(obj)
+      }]
+    )
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.pbxproj
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.pbxproj
@ -0,0 +1,541 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 60;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		5CADC71A2CA471CC007662D2 /* LlamaStackClient in Frameworks */ = {isa = PBXBuildFile; productRef = 5CADC7192CA471CC007662D2 /* LlamaStackClient */; };
+		5CCBC60C2CA1F04A00E958D0 /* LocalInference.h in Headers */ = {isa = PBXBuildFile; fileRef = 5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		5CCBC6752CA1F45800E958D0 /* executorch_debug in Frameworks */ = {isa = PBXBuildFile; productRef = 5CCBC6742CA1F45800E958D0 /* executorch_debug */; };
+		5CCBC6862CA1F64A00E958D0 /* LLaMARunner.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */; platformFilter = ios; };
+		5CCBC6872CA1F64A00E958D0 /* LLaMARunner.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */; platformFilter = ios; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
+		5CCBC68D2CA1F7A100E958D0 /* PromptTemplate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */; };
+		5CCBC68E2CA1F7A100E958D0 /* LocalInference.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */; };
+		5CCBC68F2CA1F7A100E958D0 /* Parsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */; };
+		5CCBC6902CA1F7A100E958D0 /* SystemPrompts.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */; };
+		5CCBC6932CA1F7D000E958D0 /* Stencil in Frameworks */ = {isa = PBXBuildFile; productRef = 5CCBC6922CA1F7D000E958D0 /* Stencil */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXContainerItemProxy section */
+		5CCBC67D2CA1F63F00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 036CAF9D2BB1444500D6C2D5;
+			remoteInfo = LLaMA;
+		};
+		5CCBC67F2CA1F63F00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 03729ED52BB1F8DE00152F2E;
+			remoteInfo = LLaMARunner;
+		};
+		5CCBC69E2CA2036B00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 5CCBC6982CA2036A00E958D0;
+			remoteInfo = LLaMAPerfBenchmark;
+		};
+		5CCBC6A02CA2036B00E958D0 /* PBXContainerItemProxy */ = {
+			isa = PBXContainerItemProxy;
+			containerPortal = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+			proxyType = 2;
+			remoteGlobalIDString = 5CCBC6992CA2036A00E958D0;
+			remoteInfo = LLaMAPerfBenchmarkTests;
+		};
+/* End PBXContainerItemProxy section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		5CCBC6882CA1F64A00E958D0 /* Embed Frameworks */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 10;
+			files = (
+				5CCBC6872CA1F64A00E958D0 /* LLaMARunner.framework in Embed Frameworks */,
+			);
+			name = "Embed Frameworks";
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		5CCBC6082CA1F04A00E958D0 /* LocalInferenceImpl.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LocalInferenceImpl.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LocalInference.h; sourceTree = "<group>"; };
+		5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = LLaMA.xcodeproj; path = "executorch/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj"; sourceTree = "<group>"; };
+		5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PromptTemplate.swift; sourceTree = "<group>"; };
+		5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = LocalInference.swift; sourceTree = "<group>"; };
+		5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Parsing.swift; sourceTree = "<group>"; };
+		5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SystemPrompts.swift; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		5CCBC6052CA1F04A00E958D0 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5CADC71A2CA471CC007662D2 /* LlamaStackClient in Frameworks */,
+				5CCBC6932CA1F7D000E958D0 /* Stencil in Frameworks */,
+				5CCBC6862CA1F64A00E958D0 /* LLaMARunner.framework in Frameworks */,
+				5CCBC6752CA1F45800E958D0 /* executorch_debug in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		5CCBC5FE2CA1F04A00E958D0 = {
+			isa = PBXGroup;
+			children = (
+				5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */,
+				5CCBC60A2CA1F04A00E958D0 /* LocalInferenceImpl */,
+				5CCBC6092CA1F04A00E958D0 /* Products */,
+				5CCBC6852CA1F64A00E958D0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5CCBC6092CA1F04A00E958D0 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC6082CA1F04A00E958D0 /* LocalInferenceImpl.framework */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		5CCBC60A2CA1F04A00E958D0 /* LocalInferenceImpl */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC68A2CA1F7A000E958D0 /* LocalInference.swift */,
+				5CCBC68B2CA1F7A000E958D0 /* Parsing.swift */,
+				5CCBC6892CA1F7A000E958D0 /* PromptTemplate.swift */,
+				5CCBC68C2CA1F7A100E958D0 /* SystemPrompts.swift */,
+				5CCBC60B2CA1F04A00E958D0 /* LocalInference.h */,
+			);
+			path = LocalInferenceImpl;
+			sourceTree = "<group>";
+		};
+		5CCBC6772CA1F63F00E958D0 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				5CCBC67E2CA1F63F00E958D0 /* LLaMA.app */,
+				5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */,
+				5CCBC69F2CA2036B00E958D0 /* LLaMAPerfBenchmark.app */,
+				5CCBC6A12CA2036B00E958D0 /* LLaMAPerfBenchmarkTests.xctest */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		5CCBC6852CA1F64A00E958D0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXHeadersBuildPhase section */
+		5CCBC6032CA1F04A00E958D0 /* Headers */ = {
+			isa = PBXHeadersBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5CCBC60C2CA1F04A00E958D0 /* LocalInference.h in Headers */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXHeadersBuildPhase section */
+
+/* Begin PBXNativeTarget section */
+		5CCBC6072CA1F04A00E958D0 /* LocalInferenceImpl */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 5CCBC60F2CA1F04A00E958D0 /* Build configuration list for PBXNativeTarget "LocalInferenceImpl" */;
+			buildPhases = (
+				5CCBC6032CA1F04A00E958D0 /* Headers */,
+				5CCBC6042CA1F04A00E958D0 /* Sources */,
+				5CCBC6052CA1F04A00E958D0 /* Frameworks */,
+				5CCBC6062CA1F04A00E958D0 /* Resources */,
+				5CCBC6882CA1F64A00E958D0 /* Embed Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = LocalInferenceImpl;
+			packageProductDependencies = (
+				5CCBC6742CA1F45800E958D0 /* executorch_debug */,
+				5CCBC6922CA1F7D000E958D0 /* Stencil */,
+				5CADC7192CA471CC007662D2 /* LlamaStackClient */,
+			);
+			productName = LocalInferenceProvider;
+			productReference = 5CCBC6082CA1F04A00E958D0 /* LocalInferenceImpl.framework */;
+			productType = "com.apple.product-type.framework";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		5CCBC5FF2CA1F04A00E958D0 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastUpgradeCheck = 1540;
+				TargetAttributes = {
+					5CCBC6072CA1F04A00E958D0 = {
+						CreatedOnToolsVersion = 15.4;
+						LastSwiftMigration = 1540;
+					};
+				};
+			};
+			buildConfigurationList = 5CCBC6022CA1F04A00E958D0 /* Build configuration list for PBXProject "LocalInferenceImpl" */;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 5CCBC5FE2CA1F04A00E958D0;
+			packageReferences = (
+				5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */,
+				5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */,
+				5CADC7182CA471CC007662D2 /* XCLocalSwiftPackageReference "internal-llama-stack-client-swift" */,
+			);
+			productRefGroup = 5CCBC6092CA1F04A00E958D0 /* Products */;
+			projectDirPath = "";
+			projectReferences = (
+				{
+					ProductGroup = 5CCBC6772CA1F63F00E958D0 /* Products */;
+					ProjectRef = 5CCBC6762CA1F63F00E958D0 /* LLaMA.xcodeproj */;
+				},
+			);
+			projectRoot = "";
+			targets = (
+				5CCBC6072CA1F04A00E958D0 /* LocalInferenceImpl */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXReferenceProxy section */
+		5CCBC67E2CA1F63F00E958D0 /* LLaMA.app */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.application;
+			path = LLaMA.app;
+			remoteRef = 5CCBC67D2CA1F63F00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC6802CA1F63F00E958D0 /* LLaMARunner.framework */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.framework;
+			path = LLaMARunner.framework;
+			remoteRef = 5CCBC67F2CA1F63F00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC69F2CA2036B00E958D0 /* LLaMAPerfBenchmark.app */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.application;
+			path = LLaMAPerfBenchmark.app;
+			remoteRef = 5CCBC69E2CA2036B00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+		5CCBC6A12CA2036B00E958D0 /* LLaMAPerfBenchmarkTests.xctest */ = {
+			isa = PBXReferenceProxy;
+			fileType = wrapper.cfbundle;
+			path = LLaMAPerfBenchmarkTests.xctest;
+			remoteRef = 5CCBC6A02CA2036B00E958D0 /* PBXContainerItemProxy */;
+			sourceTree = BUILT_PRODUCTS_DIR;
+		};
+/* End PBXReferenceProxy section */
+
+/* Begin PBXResourcesBuildPhase section */
+		5CCBC6062CA1F04A00E958D0 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		5CCBC6042CA1F04A00E958D0 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				5CCBC6902CA1F7A100E958D0 /* SystemPrompts.swift in Sources */,
+				5CCBC68D2CA1F7A100E958D0 /* PromptTemplate.swift in Sources */,
+				5CCBC68F2CA1F7A100E958D0 /* Parsing.swift in Sources */,
+				5CCBC68E2CA1F7A100E958D0 /* LocalInference.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		5CCBC60D2CA1F04A00E958D0 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				CURRENT_PROJECT_VERSION = 1;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.5;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_FAST_MATH = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				VERSIONING_SYSTEM = "apple-generic";
+				VERSION_INFO_PREFIX = "";
+			};
+			name = Debug;
+		};
+		5CCBC60E2CA1F04A00E958D0 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				CURRENT_PROJECT_VERSION = 1;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.5;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_FAST_MATH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				VALIDATE_PRODUCT = YES;
+				VERSIONING_SYSTEM = "apple-generic";
+				VERSION_INFO_PREFIX = "";
+			};
+			name = Release;
+		};
+		5CCBC6102CA1F04A00E958D0 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				BUILD_LIBRARY_FOR_DISTRIBUTION = YES;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEFINES_MODULE = YES;
+				DYLIB_COMPATIBILITY_VERSION = 1;
+				DYLIB_CURRENT_VERSION = 1;
+				DYLIB_INSTALL_NAME_BASE = "@rpath";
+				ENABLE_MODULE_VERIFIER = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				HEADER_SEARCH_PATHS = "";
+				INFOPLIST_KEY_NSHumanReadableCopyright = "";
+				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+					"@loader_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++";
+				MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu17 gnu++20";
+				OTHER_LDFLAGS = "";
+				PRODUCT_BUNDLE_IDENTIFIER = meta.llamatsack.LocalInferenceProvider;
+				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
+				SKIP_INSTALL = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_INSTALL_OBJC_HEADER = NO;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		5CCBC6112CA1F04A00E958D0 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				BUILD_LIBRARY_FOR_DISTRIBUTION = YES;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEFINES_MODULE = YES;
+				DYLIB_COMPATIBILITY_VERSION = 1;
+				DYLIB_CURRENT_VERSION = 1;
+				DYLIB_INSTALL_NAME_BASE = "@rpath";
+				ENABLE_MODULE_VERIFIER = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				HEADER_SEARCH_PATHS = "";
+				INFOPLIST_KEY_NSHumanReadableCopyright = "";
+				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+					"@loader_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				MODULE_VERIFIER_SUPPORTED_LANGUAGES = "objective-c objective-c++";
+				MODULE_VERIFIER_SUPPORTED_LANGUAGE_STANDARDS = "gnu17 gnu++20";
+				OTHER_LDFLAGS = "";
+				PRODUCT_BUNDLE_IDENTIFIER = meta.llamatsack.LocalInferenceProvider;
+				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
+				SKIP_INSTALL = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_INSTALL_OBJC_HEADER = NO;
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		5CCBC6022CA1F04A00E958D0 /* Build configuration list for PBXProject "LocalInferenceImpl" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				5CCBC60D2CA1F04A00E958D0 /* Debug */,
+				5CCBC60E2CA1F04A00E958D0 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		5CCBC60F2CA1F04A00E958D0 /* Build configuration list for PBXNativeTarget "LocalInferenceImpl" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				5CCBC6102CA1F04A00E958D0 /* Debug */,
+				5CCBC6112CA1F04A00E958D0 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+
+/* Begin XCLocalSwiftPackageReference section */
+		5CADC7182CA471CC007662D2 /* XCLocalSwiftPackageReference "internal-llama-stack-client-swift" */ = {
+			isa = XCLocalSwiftPackageReference;
+			relativePath = "internal-llama-stack-client-swift";
+		};
+/* End XCLocalSwiftPackageReference section */
+
+/* Begin XCRemoteSwiftPackageReference section */
+		5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */ = {
+			isa = XCRemoteSwiftPackageReference;
+			repositoryURL = "https://github.com/pytorch/executorch";
+			requirement = {
+				branch = latest;
+				kind = branch;
+			};
+		};
+		5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */ = {
+			isa = XCRemoteSwiftPackageReference;
+			repositoryURL = "https://github.com/stencilproject/Stencil";
+			requirement = {
+				kind = upToNextMajorVersion;
+				minimumVersion = 0.15.1;
+			};
+		};
+/* End XCRemoteSwiftPackageReference section */
+
+/* Begin XCSwiftPackageProductDependency section */
+		5CADC7192CA471CC007662D2 /* LlamaStackClient */ = {
+			isa = XCSwiftPackageProductDependency;
+			productName = LlamaStackClient;
+		};
+		5CCBC6742CA1F45800E958D0 /* executorch_debug */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 5CCBC6732CA1F45800E958D0 /* XCRemoteSwiftPackageReference "executorch" */;
+			productName = executorch_debug;
+		};
+		5CCBC6922CA1F7D000E958D0 /* Stencil */ = {
+			isa = XCSwiftPackageProductDependency;
+			package = 5CCBC6912CA1F7D000E958D0 /* XCRemoteSwiftPackageReference "Stencil" */;
+			productName = Stencil;
+		};
+/* End XCSwiftPackageProductDependency section */
+	};
+	rootObject = 5CCBC5FF2CA1F04A00E958D0 /* Project object */;
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:">
+   </FileRef>
+</Workspace>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/LocalInference.h
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/LocalInference.h
@ -0,0 +1,16 @@
+//
+//  LocalInference.h
+//  LocalInference
+//
+//  Created by Dalton Flanagan on 9/23/24.
+//
+
+#import <Foundation/Foundation.h>
+
+//! Project version number for LocalInference.
+FOUNDATION_EXPORT double LocalInferenceVersionNumber;
+
+//! Project version string for LocalInference.
+FOUNDATION_EXPORT const unsigned char LocalInferenceVersionString[];
+
+// In this header, you should import all the public headers of your framework using statements like #import <LocalInference/PublicHeader.h>
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/LocalInference.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/LocalInference.swift
@ -0,0 +1,167 @@
+import Foundation
+
+import LLaMARunner
+import LlamaStackClient
+
+/// Reference-type box around an optional `Runner`.
+/// `runner` stays `nil` until something assigns it.
+class RunnerHolder: ObservableObject {
+  var runner: Runner?
+}
+
+/// `Inference` implementation that runs generation through a `Runner` on a
+/// caller-supplied dispatch queue.
+public class LocalInference: Inference {
+  private var runnerHolder = RunnerHolder()
+  private let runnerQueue: DispatchQueue
+
+  /// - Parameter queue: Queue on which model loading and generation are executed.
+  public init (queue: DispatchQueue) {
+    runnerQueue = queue
+  }
+
+  /// Creates the runner on first use and loads it asynchronously on the runner queue.
+  ///
+  /// - Parameters:
+  ///   - modelPath: Model path handed to `Runner` when it is first created.
+  ///   - tokenizerPath: Tokenizer path handed to `Runner` when it is first created.
+  ///   - completion: Invoked from the runner queue with `.success` after `load()`
+  ///     returns, or `.failure` carrying the error it threw.
+  public func loadModel(modelPath: String, tokenizerPath: String, completion: @escaping (Result<Void, Error>) -> Void) {
+    // An existing runner is reused as-is; the paths only matter for the first call.
+    runnerHolder.runner = runnerHolder.runner ?? Runner(
+      modelPath: modelPath,
+      tokenizerPath: tokenizerPath
+    )
+
+
+    runnerQueue.async {
+      let runner = self.runnerHolder.runner
+      do {
+        try runner!.load()
+        completion(.success(()))
+      } catch let loadError {
+        print("error: " + loadError.localizedDescription)
+        completion(.failure(loadError))
+      }
+    }
+  }
+
+  /// Streams a chat completion for `request`.
+  ///
+  /// The prompt is built from the request messages and generated on the runner
+  /// queue. Each generated token is yielded as a `.progress` chunk (as a tool-call
+  /// delta once the output starts with `<|python_tag|>` or `[`), followed by any
+  /// parsed tool calls and a final `.complete` chunk. The stream is always
+  /// finished afterwards, including when generation throws, so `for await`
+  /// consumers terminate.
+  ///
+  /// - Parameter request: The chat completion request to run.
+  /// - Returns: An `AsyncStream` of response chunks.
+  public func chatCompletion(request: Components.Schemas.ChatCompletionRequest) -> AsyncStream<Components.Schemas.ChatCompletionResponseStreamChunk> {
+    return AsyncStream { continuation in
+      runnerQueue.async {
+        // Without finish() the stream never ends and consumers iterating with
+        // `for await` would wait forever after the last chunk (or after an error).
+        defer { continuation.finish() }
+
+        do {
+          var tokens: [String] = []
+
+          let prompt = try encodeDialogPrompt(messages: prepareMessages(request: request))
+          var stopReason: Components.Schemas.StopReason? = nil
+          var buffer = ""
+          var ipython = false
+          var echoDropped = false
+
+          // If no runner has been created yet, the optional chain skips generation entirely.
+          try self.runnerHolder.runner?.generate(prompt, sequenceLength: 4096) { token in
+            buffer += token
+
+            // HACK: Workaround until LlamaRunner exposes echo param
+            if (!echoDropped) {
+              if (buffer.hasPrefix(prompt)) {
+                buffer = String(buffer.dropFirst(prompt.count))
+                echoDropped = true
+              }
+              return
+            }
+
+            tokens.append(token)
+
+            // First sign of a tool call: announce it once with a `started` delta.
+            if !ipython && (buffer.starts(with: "<|python_tag|>") || buffer.starts(with: "[") ) {
+              ipython = true
+              continuation.yield(
+                Components.Schemas.ChatCompletionResponseStreamChunk(
+                  event: Components.Schemas.ChatCompletionResponseEvent(
+                    delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                      content: .case1(""),
+                      parse_status: Components.Schemas.ToolCallParseStatus.started
+                      )
+                    ),
+                    event_type: .progress
+                  )
+                )
+              )
+
+              if (buffer.starts(with: "<|python_tag|>")) {
+                buffer = String(buffer.dropFirst("<|python_tag|>".count))
+              }
+            }
+
+            // TODO: Non-streaming logprobs
+
+            // Stop tokens set the stop reason and are not forwarded as text.
+            var text = ""
+            if token == "<|eot_id|>" {
+              stopReason = Components.Schemas.StopReason.end_of_turn
+            } else if token == "<|eom_id|>" {
+              stopReason = Components.Schemas.StopReason.end_of_message
+            } else {
+              text = token
+            }
+
+            var delta: Components.Schemas.ChatCompletionResponseEvent.deltaPayload
+            if ipython {
+              delta = .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                content: .case1(text),
+                parse_status: .in_progress
+              ))
+            } else {
+              delta = .case1(text)
+            }
+
+            if stopReason == nil {
+              continuation.yield(
+                Components.Schemas.ChatCompletionResponseStreamChunk(
+                  event: Components.Schemas.ChatCompletionResponseEvent(
+                    delta: delta,
+                    event_type: .progress
+                  )
+                )
+              )
+            }
+          }
+
+          // Generation ended without a stop token.
+          if stopReason == nil {
+            stopReason = Components.Schemas.StopReason.out_of_tokens
+          }
+
+          let message = decodeAssistantMessage(tokens: tokens.joined(), stopReason: stopReason!)
+          // TODO: non-streaming support
+
+          // A tool call was announced but nothing could be parsed from the output.
+          let didParseToolCalls = message.tool_calls.count > 0
+          if ipython && !didParseToolCalls {
+            continuation.yield(
+              Components.Schemas.ChatCompletionResponseStreamChunk(
+                event: Components.Schemas.ChatCompletionResponseEvent(
+                  delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(content: .case1(""), parse_status: .failure)),
+                  event_type: .progress
+                )
+                // TODO: stopReason
+              )
+            )
+          }
+
+          for toolCall in message.tool_calls {
+            continuation.yield(
+              Components.Schemas.ChatCompletionResponseStreamChunk(
+                event: Components.Schemas.ChatCompletionResponseEvent(
+                  delta: .ToolCallDelta(Components.Schemas.ToolCallDelta(
+                    content: .ToolCall(toolCall),
+                    parse_status: .success
+                  )),
+                  event_type: .progress
+                )
+                // TODO: stopReason
+              )
+            )
+          }
+
+          continuation.yield(
+            Components.Schemas.ChatCompletionResponseStreamChunk(
+              event: Components.Schemas.ChatCompletionResponseEvent(
+                delta: .case1(""),
+                event_type: .complete
+              )
+              // TODO: stopReason
+            )
+          )
+        }
+        catch (let error) {
+          // AsyncStream cannot carry an error; log it and let the deferred finish() end the stream.
+          print("Inference error: " + error.localizedDescription)
+        }
+      }
+    }
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/Parsing.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/Parsing.swift
@ -0,0 +1,235 @@
+import Foundation
+
+import LlamaStackClient
+
+/// Builds the header that introduces a message from `role` in the prompt:
+/// the role name wrapped in header markers, followed by a blank line.
+func encodeHeader(role: String) -> String {
+  let opening = "<|start_header_id|>"
+  let closing = "<|end_header_id|>\n\n"
+  return opening + role + closing
+}
+
+/// Encodes a whole dialog into one prompt string: the begin-of-text marker,
+/// every message encoded in order, and finally an open assistant header for
+/// the model to continue from.
+func encodeDialogPrompt(messages: [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload]) -> String {
+  let encodedTurns = messages.map { encodeMessage(message: $0) }.joined()
+  return "<|begin_of_text|>" + encodedTurns + encodeHeader(role: "assistant")
+}
+
+/// Returns the raw role string of whichever message variant `message` wraps.
+func getRole(message: Components.Schemas.ChatCompletionRequest.messagesPayloadPayload) -> String {
+  let role: String
+  switch message {
+  case .UserMessage(let user):
+    role = user.role.rawValue
+  case .SystemMessage(let system):
+    role = system.role.rawValue
+  case .ToolResponseMessage(let toolResponse):
+    role = toolResponse.role.rawValue
+  case .CompletionMessage(let completion):
+    role = completion.role.rawValue
+  }
+  return role
+}
+
+/// Encodes one message as: role header, optional `<|python_tag|>` (completion
+/// messages that carry tool calls), the message text, and a terminator.
+///
+/// The terminator is `<|eom_id|>` for a completion message whose stop reason is
+/// `end_of_message`, and `<|eot_id|>` otherwise.
+///
+/// Previously the content payload enum itself was cast to `String`, which never
+/// matches, so only user messages contributed text and system, tool-response
+/// and completion content was left out of the prompt. The payload is now
+/// unwrapped explicitly for every message kind.
+///
+/// - Parameter message: The message to encode.
+/// - Returns: The encoded prompt fragment for this message.
+func encodeMessage(message: Components.Schemas.ChatCompletionRequest.messagesPayloadPayload) -> String {
+  var prompt = encodeHeader(role: getRole(message: message))
+  var eom = false
+
+  // Appends a plain string, or every string element of a list; anything else is skipped.
+  func appendText(_ content: Any) {
+    if let str = content as? String {
+      prompt += str
+    } else if let list = content as? [Any] {
+      for case let str as String in list {
+        prompt += str
+      }
+    }
+  }
+
+  switch (message) {
+  case .UserMessage(let m):
+    switch (m.content) {
+    case .case1(let c):
+      appendText(c)
+    case .case2(let c):
+      appendText(c)
+    }
+  case .SystemMessage(let m):
+    // NOTE(review): only the plain-string payload is handled here; confirm whether
+    // a list payload can occur for system messages.
+    if case .case1(let text) = m.content {
+      prompt += text
+    }
+  case .ToolResponseMessage(let m):
+    // NOTE(review): assumes the payload exposes `.case1(String)` like the other
+    // message kinds - confirm against the generated client.
+    if case .case1(let text) = m.content {
+      prompt += text
+    }
+  case .CompletionMessage(let m):
+    if (m.tool_calls.count > 0) {
+      prompt += "<|python_tag|>"
+    }
+    // TODO: Support encoding past tool call history from `m.tool_calls`
+    if case .case1(let text) = m.content {
+      prompt += text
+    }
+    eom = m.stop_reason == Components.Schemas.StopReason.end_of_message
+  }
+
+  if (eom) {
+    prompt += "<|eom_id|>"
+  } else {
+    prompt += "<|eot_id|>"
+  }
+
+  return prompt
+}
+
+/// Builds the message list that is actually encoded for `request`.
+///
+/// The result starts with a generated default system message. When the request
+/// carries at least one tool, a user message describing the tools follows.
+/// The request's own messages come last, in their original order.
+///
+/// - Parameter request: The incoming chat completion request.
+/// - Returns: The augmented message list.
+/// - Throws: Any error raised while generating or rendering the prompt templates.
+func prepareMessages(request: Components.Schemas.ChatCompletionRequest) throws -> [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload] {
+  var messages: [Components.Schemas.ChatCompletionRequest.messagesPayloadPayload] = []
+
+  // TODO: Existing system message
+  // TODO: Built-in tools
+  let sysContent = try SystemDefaultGenerator().gen().render()
+
+  messages.append(.SystemMessage(Components.Schemas.SystemMessage(
+    content: .case1(sysContent),
+    role: .system))
+  )
+
+  if let customTools = request.tools, !customTools.isEmpty {
+    // TODO: Separate built-ins and custom tools (right now everything treated as custom)
+    let toolTemplate = try FunctionTagCustomToolGenerator().gen(customTools: customTools)
+    let tools = try toolTemplate.render()
+    messages.append(.UserMessage(Components.Schemas.UserMessage(
+      content: .case1(tools),
+      role: .user)
+    ))
+  }
+
+  messages.append(contentsOf: request.messages)
+
+  return messages
+}
+
+/// A function name paired with its parameters keyed by name.
+// NOTE(review): nothing in the visible part of this file references this type - confirm it is still needed.
+struct FunctionCall {
+    let name: String
+    let params: [String: Any]
+}
+
+/// Parses model output of the form `[name({...json...}), other({...})]` into tool calls.
+///
+/// Parsing is all-or-nothing: if the input is not wrapped in `[` ... `]`, or any
+/// call is malformed, has parameters that are not a JSON object, or has a
+/// parameter value that is not a string, number or boolean, the result is empty.
+///
+/// - Parameter input: The decoded assistant text.
+/// - Returns: One `ToolCall` per parsed call (each with a fresh UUID as `call_id`),
+///   or an empty array when nothing could be parsed.
+public func maybeExtractCustomToolCalls(input: String) -> [Components.Schemas.ToolCall] {
+  guard input.hasPrefix("[") && input.hasSuffix("]") else {
+    return []
+  }
+
+  let trimmed = input.trimmingCharacters(in: CharacterSet(charactersIn: "[]"))
+  // Splitting on ")," consumes the closing parenthesis of every call but the last; put it back.
+  let calls = trimmed.components(separatedBy: "),").map { $0.hasSuffix(")") ? $0 : $0 + ")" }
+
+  var result: [Components.Schemas.ToolCall] = []
+
+  for call in calls {
+    guard let nameEndIndex = call.firstIndex(of: "("),
+          let paramsStartIndex = call.firstIndex(of: "{"),
+          let paramsEndIndex = call.lastIndex(of: "}"),
+          // A "}" that precedes the first "{" would make the closed range below trap.
+          paramsStartIndex < paramsEndIndex else {
+      return []
+    }
+
+    let name = String(call[..<nameEndIndex]).trimmingCharacters(in: .whitespacesAndNewlines)
+    let paramsString = String(call[paramsStartIndex...paramsEndIndex])
+
+    guard let data = paramsString.data(using: .utf8),
+          let params = try? JSONSerialization.jsonObject(with: data, options: []) as? [String: Any] else {
+      return []
+    }
+
+    var props: [String : Components.Schemas.ToolCall.argumentsPayload.additionalPropertiesPayload] = [:]
+    for (param_name, param) in params {
+      switch (param) {
+      case let value as String:
+        props[param_name] = .case1(value)
+      // JSON booleans arrive as NSNumber and would also match `as Int` (true -> 1),
+      // so identify them by their CoreFoundation type before the numeric cases.
+      case let number as NSNumber where CFGetTypeID(number) == CFBooleanGetTypeID():
+        props[param_name] = .case4(number.boolValue)
+      case let value as Int:
+        props[param_name] = .case2(value)
+      case let value as Double:
+        props[param_name] = .case3(value)
+      default:
+        return []
+      }
+    }
+
+    result.append(
+      Components.Schemas.ToolCall(
+        arguments: .init(additionalProperties: props),
+        call_id: UUID().uuidString,
+        tool_name: .case2(name) // custom_tool
+      )
+    )
+  }
+
+  return result
+}
+
+/// Turns the raw generated text into a `CompletionMessage`.
+///
+/// A leading role header (user, system or assistant), a leading `<|python_tag|>`
+/// and a trailing `<|eot_id|>` / `<|eom_id|>` are stripped when present. The
+/// remaining text becomes the message content and is also scanned for custom
+/// tool calls.
+///
+/// - Parameters:
+///   - tokens: The concatenated generated tokens.
+///   - stopReason: Why generation stopped; stored on the message unchanged.
+/// - Returns: The assistant message with content and any parsed tool calls.
+func decodeAssistantMessage(tokens: String, stopReason: Components.Schemas.StopReason) -> Components.Schemas.CompletionMessage {
+  var content = tokens
+
+  let roles = ["user", "system", "assistant"]
+  for role in roles {
+    let headerStr = encodeHeader(role: role)
+    if content.hasPrefix(headerStr) {
+      content = String(content.dropFirst(headerStr.count))
+    }
+  }
+
+  if content.hasPrefix("<|python_tag|>") {
+    content = String(content.dropFirst("<|python_tag|>".count))
+  }
+
+  // Only strip a terminator that is actually there. Output that ran out of
+  // tokens has none, and dropping unconditionally would cut off real text.
+  if content.hasSuffix("<|eot_id|>") {
+    content = String(content.dropLast("<|eot_id|>".count))
+  } else if content.hasSuffix("<|eom_id|>") {
+    content = String(content.dropLast("<|eom_id|>".count))
+  }
+
+  return Components.Schemas.CompletionMessage(
+    content: .case1(content),
+    role: .assistant,
+    stop_reason: stopReason,
+    tool_calls: maybeExtractCustomToolCalls(input: content)
+  )
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/PromptTemplate.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/PromptTemplate.swift
@ -0,0 +1,12 @@
+import Foundation
+import Stencil
+
+/// A Stencil template string together with the context values used to render it.
+public struct PromptTemplate {
+  let template: String
+  let data: [String: Any]
+
+  /// Renders `template` with `data` as the context.
+  /// - Throws: Any error raised by Stencil while rendering.
+  public func render() throws -> String {
+    let stencil = Template(templateString: template)
+    let rendered = try stencil.render(data)
+    return rendered
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/SystemPrompts.swift
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/LocalInferenceImpl/SystemPrompts.swift
@ -0,0 +1,91 @@
+import Foundation
+
+import LlamaStackClient
+
+/// Recursively converts a JSON-style value into plain Swift values.
+///
+/// Numbers become `Bool` (when backed by a CFBoolean), `Int` (when they have no
+/// fractional part) or `Double`. Arrays and string-keyed dictionaries are
+/// converted element by element. Strings, `NSNull` and anything unrecognised
+/// are returned as they are.
+func convertToNativeSwiftType(_ value: Any) -> Any {
+    if let number = value as? NSNumber {
+        // Booleans are NSNumbers too; tell them apart by their CoreFoundation type.
+        if CFGetTypeID(number) == CFBooleanGetTypeID() {
+            return number.boolValue
+        }
+        let asDouble = number.doubleValue
+        if floor(asDouble) == asDouble {
+            return number.intValue
+        }
+        return asDouble
+    }
+    if let string = value as? String {
+        return string
+    }
+    if let array = value as? [Any] {
+        return array.map(convertToNativeSwiftType)
+    }
+    if let dict = value as? [String: Any] {
+        return dict.mapValues(convertToNativeSwiftType)
+    }
+    if value is NSNull {
+        return NSNull()
+    }
+    return value
+}
+
+/// Generates the default system prompt: a fixed knowledge cut-off line plus today's date.
+public class SystemDefaultGenerator {
+  public init() {}
+
+  /// Builds the template and its context.
+  /// - Returns: A `PromptTemplate` whose `today` value is the current date formatted
+  ///   as `dd MMMM yyyy` with English month names.
+  public func gen() -> PromptTemplate {
+    let templateStr = """
+            Cutting Knowledge Date: December 2023
+            Today Date: {{ today }}
+            """
+
+    let dateFormatter = DateFormatter()
+    // Pin the locale so the month name does not change with the device's language settings.
+    dateFormatter.locale = Locale(identifier: "en_US_POSIX")
+    dateFormatter.dateFormat = "dd MMMM yyyy"
+
+    return PromptTemplate(
+      template: templateStr,
+      data: ["today": dateFormatter.string(from: Date())]
+    )
+  }
+}
+
+
+/// Generates the prompt that lists the available custom tools and tells the model
+/// how to format its function calls.
+public class FunctionTagCustomToolGenerator {
+  public init() {}
+
+  /// Builds the tool prompt template for `customTools`.
+  ///
+  /// Each tool definition is JSON-encoded, decoded back into Foundation objects and
+  /// converted to plain Swift values before being exposed to the template as `custom_tools`.
+  ///
+  /// - Parameter customTools: The tool definitions to describe in the prompt.
+  /// - Returns: The template plus its context.
+  /// - Throws: Any error raised while encoding or decoding a tool definition.
+  public func gen(customTools: [Components.Schemas.ToolDefinition]) throws -> PromptTemplate {
+    // TODO: required params
+    // TODO: {{#unless @last}},{{/unless}}
+
+    // NOTE(review): `{{/let}}` below looks like Handlebars syntax rather than Stencil - confirm how Stencil renders it.
+    // NOTE(review): each tool entry opens three braces but closes only two, and entries are not
+    // comma-separated, so the rendered list is not valid JSON - confirm whether that is intended.
+    let templateStr = """
+            You are an expert in composing functions. You are given a question and a set of possible functions.
+            Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+            If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
+            also point it out. You should only return the function call in tools call sections.
+
+            If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
+            You SHOULD NOT include any other text in the response.
+
+            Here is a list of functions in JSON format that you can invoke.
+
+            [
+            {% for t in custom_tools %}
+            {
+                "name": "{{t.tool_name}}",
+                "description": "{{t.description}}",
+                "parameters": {
+                    "type": "dict",
+                    "properties": { {{t.parameters}} }
+            }
+
+            {{/let}}
+            {% endfor -%}
+            ]
+            """
+
+    let encoder = JSONEncoder()
+    return PromptTemplate(
+      template: templateStr,
+      // Round-trip each definition through JSON to obtain plain dictionaries and arrays for the template.
+      data: ["custom_tools": try customTools.map {
+        let data = try encoder.encode($0)
+        let obj = try JSONSerialization.jsonObject(with: data)
+        return convertToNativeSwiftType(obj)
+      }]
+    )
+  }
+}
--- a/llama_stack/providers/impls/ios/inference/LocalInference/README.md
+++ b/llama_stack/providers/impls/ios/inference/LocalInference/README.md
@ -0,0 +1,109 @@
+# LocalInference
+
+LocalInference provides a local inference implementation powered by [executorch](https://github.com/pytorch/executorch/).
+
+Llama Stack currently supports on-device inference for iOS with Android coming soon. You can run on-device inference on Android today using [executorch](https://github.com/pytorch/executorch/tree/main/examples/demo-apps/android/LlamaDemo), PyTorch’s on-device inference library.
+
+## Installation
+
+We're working on making LocalInference easier to set up. For now, you'll need to import it via `.xcframework`:
+
+1. Clone the executorch submodule in this repo and its dependencies: `git submodule update --init --recursive`
+1. Install [CMake](https://cmake.org/) for the executorch build
+1. Drag `LocalInference.xcodeproj` into your project
+1. Add `LocalInference` as a framework in your app target
+1. Add a package dependency on https://github.com/pytorch/executorch (branch latest)
+1. Add all the kernels / backends from executorch (but not executorch itself!) as frameworks in your app target:
+    - backend_coreml
+    - backend_mps
+    - backend_xnnpack
+    - kernels_custom
+    - kernels_optimized
+    - kernels_portable
+    - kernels_quantized
+1. In "Build Settings" > "Other Linker Flags" > "Any iOS Simulator SDK", add:
+    ```
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a
+    ```
+
+1. In "Build Settings" > "Other Linker Flags" > "Any iOS SDK", add:
+
+    ```
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_optimized-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_custom-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libkernels_quantized-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_coreml-simulator-release.a
+    -force_load
+    $(BUILT_PRODUCTS_DIR)/libbackend_mps-simulator-release.a
+    ```
+
+## Preparing a model
+
+1. Prepare a `.pte` file [following the executorch docs](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md#step-2-prepare-model)
+2. Bundle the `.pte` and `tokenizer.model` file into your app
+
+## Using LocalInference
+
+1. Instantiate LocalInference with a DispatchQueue. Optionally, pass it into your agents service:
+
+```swift
+  init () {
+    runnerQueue = DispatchQueue(label: "org.meta.llamastack")
+    inferenceService = LocalInferenceService(queue: runnerQueue)
+    agentsService = LocalAgentsService(inference: inferenceService)
+  }
+```
+
+2. Before making any inference calls, load your model from your bundle:
+
+```swift
+let mainBundle = Bundle.main
+inferenceService.loadModel(
+    modelPath: mainBundle.url(forResource: "llama32_1b_spinquant", withExtension: "pte"),
+    tokenizerPath: mainBundle.url(forResource: "tokenizer", withExtension: "model"),
+    completion: {_ in } // use to handle load failures
+)
+```
+
+3. Make inference calls (or agents calls) as you normally would with LlamaStack:
+
+```swift
+for await chunk in try await agentsService.initAndCreateTurn(
+    messages: [
+    .UserMessage(Components.Schemas.UserMessage(
+        content: .case1("Call functions as needed to handle any actions in the following text:\n\n" + text),
+        role: .user))
+    ]
+) {
+```
+
+## Troubleshooting
+
+If you receive errors like "missing package product" or "invalid checksum", try cleaning the build folder and resetting the Swift package cache:
+
+(Opt+Click) Product > Clean Build Folder Immediately
+
+```
+rm -rf \
+  ~/Library/org.swift.swiftpm \
+  ~/Library/Caches/org.swift.swiftpm \
+  ~/Library/Caches/com.apple.dt.Xcode \
+  ~/Library/Developer/Xcode/DerivedData
+```
--- a/llama_stack/providers/impls/meta_reference/agents/agent_instance.py
+++ b/llama_stack/providers/impls/meta_reference/agents/agent_instance.py
@ -398,7 +398,11 @@ class ChatAgent(ShieldRunnerMixin):
                color = "yellow"
            else:
                color = None
-            cprint(f"{str(msg)}", color=color)
+            if len(str(msg)) > 1000:
+                msg_str = f"{str(msg)[:500]}...<more>...{str(msg)[-500:]}"
+            else:
+                msg_str = str(msg)
+            cprint(f"{msg_str}", color=color)

            step_id = str(uuid.uuid4())
            yield AgentTurnResponseStreamChunk(
@ -466,6 +470,13 @@ class ChatAgent(ShieldRunnerMixin):
                        stop_reason = event.stop_reason

            stop_reason = stop_reason or StopReason.out_of_tokens
+
+            # If tool calls are parsed successfully,
+            # if content is not made null the tool call str will also be in the content
+            # and tokens will have tool call syntax included twice
+            if tool_calls:
+                content = ""
+
            message = CompletionMessage(
                content=content,
                stop_reason=stop_reason,
--- a/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
+++ b/llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py
@ -10,13 +10,14 @@ from jinja2 import Template
 from llama_models.llama3.api import *  # noqa: F403


+from termcolor import cprint  # noqa: F401
+
 from llama_stack.apis.agents import (
    DefaultMemoryQueryGeneratorConfig,
    LLMMemoryQueryGeneratorConfig,
    MemoryQueryGenerator,
    MemoryQueryGeneratorConfig,
 )
-from termcolor import cprint  # noqa: F401
 from llama_stack.apis.inference import *  # noqa: F403


--- a/llama_stack/providers/impls/meta_reference/inference/config.py
+++ b/llama_stack/providers/impls/meta_reference/inference/config.py
@ -16,7 +16,7 @@ from pydantic import BaseModel, Field, field_validator

 class MetaReferenceImplConfig(BaseModel):
    model: str = Field(
-        default="Meta-Llama3.1-8B-Instruct",
+        default="Llama3.1-8B-Instruct",
        description="Model descriptor from `llama model list`",
    )
    quantization: Optional[QuantizationConfig] = None
@ -30,7 +30,7 @@ class MetaReferenceImplConfig(BaseModel):
        permitted_models = [
            m.descriptor()
            for m in all_registered_models()
-            if m.model_family == ModelFamily.llama3_1
+            if m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2}
            or m.core_model_id == CoreModelId.llama_guard_3_8b
        ]
        if model not in permitted_models:
@ -42,14 +42,9 @@ class MetaReferenceImplConfig(BaseModel):

    @property
    def model_parallel_size(self) -> int:
-        # HUGE HACK ALERT: this will be fixed when we move inference configuration
+        # HACK ALERT: this will be fixed when we move inference configuration
        # to ModelsRegistry and we can explicitly ask for `model_parallel_size`
        # as configuration there
-        gpu_count = 1
        resolved = resolve_model(self.model)
        assert resolved is not None
-        descriptor = resolved.descriptor().lower()
-        if "-70b" in descriptor or "-405b" in descriptor:
-            gpu_count = 8
-
-        return gpu_count
+        return resolved.pth_file_count
--- a/llama_stack/providers/impls/meta_reference/inference/generation.py
+++ b/llama_stack/providers/impls/meta_reference/inference/generation.py
@ -24,21 +24,31 @@ from fairscale.nn.model_parallel.initialize import (
 )
 from llama_models.llama3.api.args import ModelArgs
 from llama_models.llama3.api.chat_format import ChatFormat, ModelInput
-from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
+from llama_models.llama3.api.datatypes import (
+    InterleavedTextMedia,
+    Message,
+    ToolPromptFormat,
+)
 from llama_models.llama3.api.tokenizer import Tokenizer
 from llama_models.llama3.reference_impl.model import Transformer
+from llama_models.llama3.reference_impl.multimodal.model import (
+    CrossAttentionTransformer,
+)
 from llama_models.sku_list import resolve_model
+from termcolor import cprint
+
 from llama_stack.apis.inference import QuantizationType

 from llama_stack.distribution.utils.model_utils import model_local_dir
-from termcolor import cprint

 from .config import MetaReferenceImplConfig


 def model_checkpoint_dir(model) -> str:
    checkpoint_dir = Path(model_local_dir(model.descriptor()))
-    if not Path(checkpoint_dir / "consolidated.00.pth").exists():
+
+    paths = [Path(checkpoint_dir / f"consolidated.{ext}") for ext in ["pth", "00.pth"]]
+    if not any(p.exists() for p in paths):
        checkpoint_dir = checkpoint_dir / "original"

    assert checkpoint_dir.exists(), (
@ -134,7 +144,11 @@ class Llama:
            # load on CPU in bf16 so that fp8 conversion does not find an
            # unexpected (fp32, e.g.) datatype
            torch.set_default_tensor_type(torch.BFloat16Tensor)
-            model = Transformer(model_args)
+            if model_args.vision_chunk_size > 0:
+                model = CrossAttentionTransformer(model_args)
+                model.setup_cache(model_args.max_batch_size, torch.bfloat16)
+            else:
+                model = Transformer(model_args)
            model.load_state_dict(state_dict, strict=False)
            model = convert_to_quantized_model(model, config)
        else:
@ -142,7 +156,11 @@ class Llama:
                torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
            else:
                torch.set_default_tensor_type(torch.cuda.HalfTensor)
-            model = Transformer(model_args)
+            if model_args.vision_chunk_size > 0:
+                model = CrossAttentionTransformer(model_args)
+                model.setup_cache(model_args.max_batch_size, torch.bfloat16)
+            else:
+                model = Transformer(model_args)
            model.load_state_dict(state_dict, strict=False)

        print(f"Loaded in {time.time() - start_time:.2f} seconds")
@ -167,7 +185,11 @@ class Llama:
    ) -> Generator:
        params = self.model.params

-        # cprint("Input to model -> " + self.tokenizer.decode(model_input.tokens), "red")
+        # input_tokens = [
+        #     self.formatter.vision_token if t == 128256 else t
+        #     for t in model_input.tokens
+        # ]
+        # cprint("Input to model -> " + self.tokenizer.decode(input_tokens), "red")
        prompt_tokens = [model_input.tokens]

        bsz = 1
@ -183,6 +205,21 @@ class Llama:
            return

        total_len = min(max_gen_len + max_prompt_len, params.max_seq_len)
+
+        is_vision = isinstance(self.model, CrossAttentionTransformer)
+        if is_vision:
+            images = model_input.vision.images if model_input.vision is not None else []
+            mask = model_input.vision.mask if model_input.vision is not None else []
+
+            # the method works for bsz > 1 so add a batch dimension
+            xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = (
+                self.model.compute_vision_tokens_masks(
+                    batch_images=[images],
+                    batch_masks=[mask],
+                    total_len=total_len,
+                )
+            )
+
        pad_id = self.tokenizer.pad_id
        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
        for k, t in enumerate(prompt_tokens):
@ -206,7 +243,19 @@ class Llama:
        stop_tokens = torch.tensor(self.tokenizer.stop_tokens)

        for cur_pos in range(min_prompt_len, total_len):
-            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
+            if is_vision:
+                position_ids = torch.arange(
+                    prev_pos, cur_pos, dtype=torch.long, device="cuda"
+                )
+                logits = self.model.forward(
+                    position_ids,
+                    tokens,
+                    cross_attention_masks,
+                    full_text_row_masked_out_mask,
+                    xattn_caches,
+                )
+            else:
+                logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)

            if temperature > 0:
                probs = torch.softmax(logits[:, -1] / temperature, dim=-1)
@ -222,6 +271,18 @@ class Llama:
            tokens[:, cur_pos] = next_token

            target = tokens[:, prev_pos + 1 : cur_pos + 1]
+            if is_vision:
+                # the logits space (num_classes) is designed to never contain a media_token
+                # however our input token stream does contain them. we need to nuke them here
+                # or else the CUDA kernels will crash with an illegal memory access
+                vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256]
+                masks = [target.eq(t) for t in vision_tokens]
+                if len(masks) > 1:
+                    mask = torch.logical_or(*masks)
+                else:
+                    mask = masks[0]
+                target[mask] = 0
+
            if logprobs:
                token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy(
                    input=logits.transpose(1, 2),
@ -248,7 +309,7 @@ class Llama:

    def text_completion(
        self,
-        prompt: str,
+        content: InterleavedTextMedia,
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_gen_len: Optional[int] = None,
@ -262,10 +323,10 @@ class Llama:
        ):
            max_gen_len = self.model.params.max_seq_len - 1

-        prompt_tokens = self.tokenizer.encode(prompt, bos=True, eos=False)
+        model_input = self.formatter.encode_content(content)

        yield from self.generate(
-            model_input=ModelInput(tokens=prompt_tokens),
+            model_input=model_input,
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
--- a/llama_stack/providers/impls/meta_reference/inference/inference.py
+++ b/llama_stack/providers/impls/meta_reference/inference/inference.py
@ -21,7 +21,9 @@ from llama_stack.apis.inference import (
    ToolCallDelta,
    ToolCallParseStatus,
 )
-from llama_stack.providers.utils.inference.prepare_messages import prepare_messages
+from llama_stack.providers.utils.inference.augment_messages import (
+    augment_messages_for_tools,
+)

 from .config import MetaReferenceImplConfig
 from .model_parallel import LlamaModelParallelGenerator
@ -57,7 +59,7 @@ class MetaReferenceInferenceImpl(Inference):
        model: str,
        messages: List[Message],
        sampling_params: Optional[SamplingParams] = SamplingParams(),
-        tools: Optional[List[ToolDefinition]] = [],
+        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
        stream: Optional[bool] = False,
@ -70,14 +72,14 @@ class MetaReferenceInferenceImpl(Inference):
            model=model,
            messages=messages,
            sampling_params=sampling_params,
-            tools=tools,
+            tools=tools or [],
            tool_choice=tool_choice,
            tool_prompt_format=tool_prompt_format,
            stream=stream,
            logprobs=logprobs,
        )

-        messages = prepare_messages(request)
+        messages = augment_messages_for_tools(request)
        model = resolve_model(request.model)
        if model is None:
            raise RuntimeError(
--- a/llama_stack/providers/impls/meta_reference/safety/init.py
+++ b/llama_stack/providers/impls/meta_reference/safety/init.py
@ -7,11 +7,11 @@
 from .config import SafetyConfig


-async def get_provider_impl(config: SafetyConfig, _deps):
+async def get_provider_impl(config: SafetyConfig, deps):
    from .safety import MetaReferenceSafetyImpl

    assert isinstance(config, SafetyConfig), f"Unexpected config type: {type(config)}"

-    impl = MetaReferenceSafetyImpl(config)
+    impl = MetaReferenceSafetyImpl(config, deps)
    await impl.initialize()
    return impl
--- a/llama_stack/providers/impls/meta_reference/safety/config.py
+++ b/llama_stack/providers/impls/meta_reference/safety/config.py
@ -31,7 +31,10 @@ class LlamaGuardShieldConfig(BaseModel):
        permitted_models = [
            m.descriptor()
            for m in safety_models()
-            if m.core_model_id == CoreModelId.llama_guard_3_8b
+            if (
+                m.core_model_id
+                in {CoreModelId.llama_guard_3_8b, CoreModelId.llama_guard_3_11b_vision}
+            )
        ]
        if model not in permitted_models:
            raise ValueError(
--- a/llama_stack/providers/impls/meta_reference/safety/safety.py
+++ b/llama_stack/providers/impls/meta_reference/safety/safety.py
@ -7,8 +7,10 @@
 from llama_models.sku_list import resolve_model

 from llama_stack.distribution.utils.model_utils import model_local_dir
+from llama_stack.apis.inference import *  # noqa: F403
 from llama_stack.apis.safety import *  # noqa: F403
 from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_stack.distribution.datatypes import Api

 from llama_stack.providers.impls.meta_reference.safety.shields.base import (
    OnViolationAction,
@ -34,20 +36,11 @@ def resolve_and_get_path(model_name: str) -> str:


 class MetaReferenceSafetyImpl(Safety):
-    def __init__(self, config: SafetyConfig) -> None:
+    def __init__(self, config: SafetyConfig, deps) -> None:
        self.config = config
+        self.inference_api = deps[Api.inference]

    async def initialize(self) -> None:
-        shield_cfg = self.config.llama_guard_shield
-        if shield_cfg is not None:
-            model_dir = resolve_and_get_path(shield_cfg.model)
-            _ = LlamaGuardShield.instance(
-                model_dir=model_dir,
-                excluded_categories=shield_cfg.excluded_categories,
-                disable_input_check=shield_cfg.disable_input_check,
-                disable_output_check=shield_cfg.disable_output_check,
-            )
-
        shield_cfg = self.config.prompt_guard_shield
        if shield_cfg is not None:
            model_dir = resolve_and_get_path(shield_cfg.model)
@ -91,11 +84,18 @@ class MetaReferenceSafetyImpl(Safety):
    def get_shield_impl(self, typ: MetaReferenceShieldType) -> ShieldBase:
        cfg = self.config
        if typ == MetaReferenceShieldType.llama_guard:
+            cfg = cfg.llama_guard_shield
            assert (
-                cfg.llama_guard_shield is not None
+                cfg is not None
            ), "Cannot use LlamaGuardShield since not present in config"
-            model_dir = resolve_and_get_path(cfg.llama_guard_shield.model)
-            return LlamaGuardShield.instance(model_dir=model_dir)
+
+            return LlamaGuardShield(
+                model=cfg.model,
+                inference_api=self.inference_api,
+                excluded_categories=cfg.excluded_categories,
+                disable_input_check=cfg.disable_input_check,
+                disable_output_check=cfg.disable_output_check,
+            )
        elif typ == MetaReferenceShieldType.jailbreak_shield:
            assert (
                cfg.prompt_guard_shield is not None
--- a/llama_stack/providers/impls/meta_reference/safety/shields/llama_guard.py
+++ b/llama_stack/providers/impls/meta_reference/safety/shields/llama_guard.py
@ -9,9 +9,8 @@ import re
 from string import Template
 from typing import List, Optional

-import torch
 from llama_models.llama3.api.datatypes import Message, Role
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from llama_stack.apis.inference import *  # noqa: F403

 from .base import CANNED_RESPONSE_TEXT, OnViolationAction, ShieldBase, ShieldResponse

@ -100,39 +99,17 @@ PROMPT_TEMPLATE = Template(


 class LlamaGuardShield(ShieldBase):
-    @staticmethod
-    def instance(
-        on_violation_action=OnViolationAction.RAISE,
-        model_dir: str = None,
-        excluded_categories: List[str] = None,
-        disable_input_check: bool = False,
-        disable_output_check: bool = False,
-    ) -> "LlamaGuardShield":
-        global _INSTANCE
-        if _INSTANCE is None:
-            _INSTANCE = LlamaGuardShield(
-                on_violation_action,
-                model_dir,
-                excluded_categories,
-                disable_input_check,
-                disable_output_check,
-            )
-        return _INSTANCE
-
    def __init__(
        self,
-        on_violation_action: OnViolationAction = OnViolationAction.RAISE,
-        model_dir: str = None,
+        model: str,
+        inference_api: Inference,
        excluded_categories: List[str] = None,
        disable_input_check: bool = False,
        disable_output_check: bool = False,
+        on_violation_action: OnViolationAction = OnViolationAction.RAISE,
    ):
        super().__init__(on_violation_action)

-        dtype = torch.bfloat16
-
-        assert model_dir is not None, "Llama Guard model_dir is None"
-
        if excluded_categories is None:
            excluded_categories = []

@ -140,18 +117,12 @@ class LlamaGuardShield(ShieldBase):
            x in SAFETY_CATEGORIES_TO_CODE_MAP.values() for x in excluded_categories
        ), "Invalid categories in excluded categories. Expected format is ['S1', 'S2', ..]"

-        self.device = "cuda"
+        self.model = model
+        self.inference_api = inference_api
        self.excluded_categories = excluded_categories
        self.disable_input_check = disable_input_check
        self.disable_output_check = disable_output_check

-        # load model
-        torch_dtype = torch.bfloat16
-        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_dir, torch_dtype=torch_dtype, device_map=self.device
-        )
-
    def check_unsafe_response(self, response: str) -> Optional[str]:
        match = re.match(r"^unsafe\n(.*)$", response)
        if match:
@ -212,26 +183,21 @@ class LlamaGuardShield(ShieldBase):
            )
        else:
            prompt = self.build_prompt(messages)
-            llama_guard_input = {
-                "role": "user",
-                "content": prompt,
-            }
-            input_ids = self.tokenizer.apply_chat_template(
-                [llama_guard_input], return_tensors="pt", tokenize=True
-            ).to(self.device)
-            prompt_len = input_ids.shape[1]
-            output = self.model.generate(
-                input_ids=input_ids,
-                max_new_tokens=20,
-                output_scores=True,
-                return_dict_in_generate=True,
-                pad_token_id=0,
-            )
-            generated_tokens = output.sequences[:, prompt_len:]

-            response = self.tokenizer.decode(
-                generated_tokens[0], skip_special_tokens=True
-            )
-            response = response.strip()
-            shield_response = self.get_shield_response(response)
+            # TODO: llama-stack inference protocol has issues with non-streaming inference code
+            content = ""
+            async for chunk in self.inference_api.chat_completion(
+                model=self.model,
+                messages=[
+                    UserMessage(content=prompt),
+                ],
+                stream=True,
+            ):
+                event = chunk.event
+                if event.event_type == ChatCompletionResponseEventType.progress:
+                    assert isinstance(event.delta, str)
+                    content += event.delta
+
+            content = content.strip()
+            shield_response = self.get_shield_response(content)
            return shield_response
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@ -20,6 +20,7 @@ def available_providers() -> List[ProviderSpec]:
                "fairscale",
                "fbgemm-gpu==0.8.0",
                "torch",
+                "torchvision",
                "transformers",
                "zmq",
            ],
@ -75,15 +76,4 @@ def available_providers() -> List[ProviderSpec]:
                header_extractor_class="llama_stack.providers.adapters.inference.together.TogetherHeaderExtractor",
            ),
        ),
-        remote_provider_spec(
-            api=Api.inference,
-            adapter=AdapterSpec(
-                adapter_id="bedrock",
-                pip_packages=[
-                    "boto3",
-                ],
-                module="llama_stack.providers.adapters.inference.bedrock",
-                config_class="llama_stack.providers.adapters.inference.bedrock.BedrockConfig",
-            ),
-        ),
    ]
--- a/llama_stack/providers/registry/safety.py
+++ b/llama_stack/providers/registry/safety.py
@ -21,13 +21,15 @@ def available_providers() -> List[ProviderSpec]:
            api=Api.safety,
            provider_id="meta-reference",
            pip_packages=[
-                "accelerate",
                "codeshield",
-                "torch",
                "transformers",
+                "torch --index-url https://download.pytorch.org/whl/cpu",
            ],
            module="llama_stack.providers.impls.meta_reference.safety",
            config_class="llama_stack.providers.impls.meta_reference.safety.SafetyConfig",
+            api_dependencies=[
+                Api.inference,
+            ],
        ),
        remote_provider_spec(
            api=Api.safety,
--- a/llama_stack/providers/utils/inference/augment_messages.py
+++ b/llama_stack/providers/utils/inference/augment_messages.py
@ -0,0 +1,170 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from termcolor import cprint
+from llama_models.llama3.api.datatypes import *  # noqa: F403
+from llama_stack.apis.inference import *  # noqa: F403
+from llama_models.datatypes import ModelFamily
+from llama_models.llama3.prompt_templates import (
+    BuiltinToolGenerator,
+    FunctionTagCustomToolGenerator,
+    JsonCustomToolGenerator,
+    PythonListCustomToolGenerator,
+    SystemDefaultGenerator,
+)
+from llama_models.sku_list import resolve_model
+
+
+def augment_messages_for_tools(request: ChatCompletionRequest) -> List[Message]:
+    """Augment the messages of a chat completion request to handle tools.
+
+    For example, for llama_3_1 this adds a system message describing the
+    appropriate tools and a user message for custom tools.
+
+    Llama 3.1 models and multimodal Llama 3.2 models share one tool prompt
+    format; the remaining Llama 3.2 models use a separate one.
+
+    Args:
+        request: The chat completion request whose messages are augmented.
+
+    Returns:
+        The augmented list of messages. `request.messages` is returned as-is
+        (after printing a warning) when the model cannot be resolved or does
+        not belong to the Llama 3.1 / 3.2 families.
+    """
+    model = resolve_model(request.model)
+    if model is None:
+        cprint(f"Could not resolve model {request.model}", color="red")
+        return request.messages
+
+    if model.model_family not in [ModelFamily.llama3_1, ModelFamily.llama3_2]:
+        cprint(f"Model family {model.model_family} not llama 3_1 or 3_2", color="red")
+        return request.messages
+
+    # Only the two supported families reach this point, so a single check
+    # separates the non-multimodal llama3.2 models from everything else.
+    if model.model_family == ModelFamily.llama3_2 and not is_multimodal(model):
+        return augment_messages_for_tools_llama_3_2(request)
+
+    # llama3.1 and llama3.2 multimodal models follow the same tool prompt format
+    return augment_messages_for_tools_llama_3_1(request)
+
+
+def augment_messages_for_tools_llama_3_1(
+    request: ChatCompletionRequest,
+) -> List[Message]:
+    """Build the message list for the Llama 3.1 tool prompt format.
+
+    The result starts with one system message made of the builtin tool
+    prompt (only when tools are present), the default system prompt, and the
+    text of any system message already in the request. If the request
+    defines custom tools (tools whose `tool_name` is a plain string), a user
+    message describing them follows. The remaining request messages are
+    appended unchanged.
+
+    Args:
+        request: The chat completion request. It is not modified.
+
+    Returns:
+        A new list of messages with the tool prompts added.
+
+    Raises:
+        AssertionError: If `tool_choice` is not `ToolChoice.auto`, or if the
+            request starts with more than one system message.
+        ValueError: If custom tools are present and `tool_prompt_format` is
+            neither `json` nor `function_tag`.
+    """
+    assert request.tool_choice == ToolChoice.auto, "Only `ToolChoice.auto` supported"
+
+    # Work on a copy: popping from `request.messages` directly would strip the
+    # system message from the caller's request as a side effect.
+    existing_messages = list(request.messages)
+    existing_system_message = None
+    if existing_messages and existing_messages[0].role == Role.system.value:
+        existing_system_message = existing_messages.pop(0)
+
+    # The emptiness guard keeps a request holding nothing but a system message
+    # (or no messages at all) from failing with an unrelated IndexError.
+    assert (
+        not existing_messages or existing_messages[0].role != Role.system.value
+    ), "Should only have 1 system message"
+
+    # Treat a missing tool list the same as an empty one.
+    tools = request.tools or []
+
+    default_template = SystemDefaultGenerator().gen()
+
+    sys_content = ""
+    if tools:
+        tool_template = BuiltinToolGenerator().gen(tools)
+        sys_content += tool_template.render()
+        sys_content += "\n"
+
+    sys_content += default_template.render()
+
+    if existing_system_message:
+        sys_content += "\n"
+
+        content = existing_system_message.content
+        if isinstance(content, str):
+            sys_content += content
+        elif isinstance(content, list):
+            # Non-text items are represented by a "<media>" placeholder.
+            sys_content += "\n".join(
+                c if isinstance(c, str) else "<media>" for c in content
+            )
+
+    messages = [SystemMessage(content=sys_content)]
+
+    custom_tools = [t for t in tools if isinstance(t.tool_name, str)]
+    if custom_tools:
+        if request.tool_prompt_format == ToolPromptFormat.json:
+            tool_gen = JsonCustomToolGenerator()
+        elif request.tool_prompt_format == ToolPromptFormat.function_tag:
+            tool_gen = FunctionTagCustomToolGenerator()
+        else:
+            raise ValueError(
+                f"Non supported ToolPromptFormat {request.tool_prompt_format}"
+            )
+
+        custom_template = tool_gen.gen(custom_tools)
+        messages.append(UserMessage(content=custom_template.render()))
+
+    # Add back existing messages from the request
+    messages += existing_messages
+
+    return messages
+
+
+def augment_messages_for_tools_llama_3_2(
+    request: ChatCompletionRequest,
+) -> List[Message]:
+    """Build the message list for the Llama 3.2 tool prompt format.
+
+    All tool prompts go into a single system message: first the builtin tool
+    prompt (when builtin tools are present), then the custom tool prompt in
+    python-list format (when custom tools are present), then the text of any
+    system message already in the request. The remaining request messages
+    are appended unchanged.
+
+    Args:
+        request: The chat completion request. It is not modified.
+
+    Returns:
+        A new list of messages with the tool prompts added.
+
+    Raises:
+        AssertionError: If `tool_choice` is not `ToolChoice.auto`, or if the
+            request starts with more than one system message.
+        ValueError: If custom tools are present and `tool_prompt_format` is
+            not `python_list`.
+    """
+    assert request.tool_choice == ToolChoice.auto, "Only `ToolChoice.auto` supported"
+
+    # Work on a copy: popping from `request.messages` directly would strip the
+    # system message from the caller's request as a side effect.
+    existing_messages = list(request.messages)
+    existing_system_message = None
+    if existing_messages and existing_messages[0].role == Role.system.value:
+        existing_system_message = existing_messages.pop(0)
+
+    # The emptiness guard keeps a request holding nothing but a system message
+    # (or no messages at all) from failing with an unrelated IndexError.
+    assert (
+        not existing_messages or existing_messages[0].role != Role.system.value
+    ), "Should only have 1 system message"
+
+    # Split the tools once; custom tools are those named by a plain string.
+    custom_tools, builtin_tools = [], []
+    for t in request.tools or []:
+        if isinstance(t.tool_name, str):
+            custom_tools.append(t)
+        else:
+            builtin_tools.append(t)
+
+    sys_content = ""
+    if builtin_tools:
+        tool_template = BuiltinToolGenerator().gen(builtin_tools)
+        sys_content += tool_template.render()
+        sys_content += "\n"
+
+    if custom_tools:
+        if request.tool_prompt_format != ToolPromptFormat.python_list:
+            raise ValueError(
+                f"Non supported ToolPromptFormat {request.tool_prompt_format}"
+            )
+
+        tool_template = PythonListCustomToolGenerator().gen(custom_tools)
+        sys_content += tool_template.render()
+        sys_content += "\n"
+
+    if existing_system_message:
+        sys_content += interleaved_text_media_as_str(
+            existing_system_message.content, sep="\n"
+        )
+
+    messages = [SystemMessage(content=sys_content)]
+
+    # Add back existing messages from the request
+    messages += existing_messages
+    return messages
--- a/llama_stack/providers/utils/inference/prepare_messages.py
+++ b/llama_stack/providers/utils/inference/prepare_messages.py
@ -1,84 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_models.llama3.api.datatypes import *  # noqa: F403
-from llama_stack.apis.inference import *  # noqa: F403
-from llama_models.llama3.prompt_templates import (
-    BuiltinToolGenerator,
-    FunctionTagCustomToolGenerator,
-    JsonCustomToolGenerator,
-    SystemDefaultGenerator,
-)
-
-
-def prepare_messages(request: ChatCompletionRequest) -> List[Message]:
-
-    assert request.tool_choice == ToolChoice.auto, "Only `ToolChoice.auto` supported"
-
-    existing_messages = request.messages
-    existing_system_message = None
-    if existing_messages[0].role == Role.system.value:
-        existing_system_message = existing_messages.pop(0)
-
-    assert (
-        existing_messages[0].role != Role.system.value
-    ), "Should only have 1 system message"
-
-    messages = []
-
-    default_gen = SystemDefaultGenerator()
-    default_template = default_gen.gen()
-
-    sys_content = ""
-
-    tool_template = None
-    if request.tools:
-        tool_gen = BuiltinToolGenerator()
-        tool_template = tool_gen.gen(request.tools)
-
-        sys_content += tool_template.render()
-        sys_content += "\n"
-
-    sys_content += default_template.render()
-
-    if existing_system_message:
-        # TODO: this fn is needed in many places
-        def _process(c):
-            if isinstance(c, str):
-                return c
-            else:
-                return "<media>"
-
-        sys_content += "\n"
-
-        if isinstance(existing_system_message.content, str):
-            sys_content += _process(existing_system_message.content)
-        elif isinstance(existing_system_message.content, list):
-            sys_content += "\n".join(
-                [_process(c) for c in existing_system_message.content]
-            )
-
-    messages.append(SystemMessage(content=sys_content))
-
-    has_custom_tools = any(isinstance(dfn.tool_name, str) for dfn in request.tools)
-    if has_custom_tools:
-        if request.tool_prompt_format == ToolPromptFormat.json:
-            tool_gen = JsonCustomToolGenerator()
-        elif request.tool_prompt_format == ToolPromptFormat.function_tag:
-            tool_gen = FunctionTagCustomToolGenerator()
-        else:
-            raise ValueError(
-                f"Non supported ToolPromptFormat {request.tool_prompt_format}"
-            )
-
-        custom_tools = [t for t in request.tools if isinstance(t.tool_name, str)]
-        custom_template = tool_gen.gen(custom_tools)
-        messages.append(UserMessage(content=custom_template.render()))
-
-    # Add back existing messages from the request
-    messages += existing_messages
-
-    return messages