Merge branch 'main' into evals_6

Xi Yan 2024-10-25 12:55:28 -07:00
commit d95bef7f2e
38 changed files with 352 additions and 346 deletions

View file

@ -1,4 +1,4 @@
-exclude: 'build'
+exclude: 'build/'
default_language_version:
python: python3

View file

@ -1,4 +1,4 @@
include requirements.txt
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
-include distributions/*/build.yaml
+include llama_stack/templates/*/build.yaml

View file

@ -1,10 +0,0 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

View file

@ -1,10 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/databricks/build.yaml

View file

@ -49,7 +49,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template fireworks --image-type conda
# -- modify run.yaml to a valid Fireworks server endpoint
llama stack run ./run.yaml
```

View file

@ -1,10 +0,0 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/build.yaml

View file

@ -1,10 +0,0 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/hf-endpoint/build.yaml

View file

@ -1,10 +0,0 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/hf-serverless/build.yaml

View file

@ -1,14 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

View file

@ -1,14 +0,0 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

View file

@ -86,6 +86,6 @@ inference:
**Via Conda**
```
-llama stack build --config ./build.yaml
+llama stack build --template ollama --image-type conda
llama stack run ./gpu/run.yaml
```

View file

@ -1,13 +0,0 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -88,7 +88,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template tgi --image-type conda
# -- start a TGI server endpoint
llama stack run ./gpu/run.yaml
```

View file

@ -1,13 +0,0 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/build.yaml

View file

@ -62,7 +62,7 @@ memory:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```

View file

@ -1,10 +0,0 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/together/build.yaml

View file

@ -1,10 +0,0 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/vllm/build.yaml

View file

@ -279,11 +279,11 @@ llama stack build --list-templates
You may then pick a template to build your distribution with providers fitted to your liking.
```
-llama stack build --template local-tgi --name my-tgi-stack
+llama stack build --template local-tgi --name my-tgi-stack --image-type conda
```
```
-$ llama stack build --template local-tgi --name my-tgi-stack
+$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~
#### Building from config file
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
-- The config file will be of contents like the ones in `llama_stack/distributions/templates/`.
+- The config file will be of contents like the ones in `llama_stack/templates/`.
```
-$ cat llama_stack/distribution/templates/local-ollama-build.yaml
+$ cat build.yaml
name: local-ollama
distribution_spec:
@ -311,7 +311,7 @@ image_type: conda
```
```
-llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
+llama stack build --config build.yaml
```
#### How to build distribution with Docker image

View file

@ -35,11 +35,7 @@ You have two ways to start up Llama stack server:
1. **Starting up server via docker**:
-We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links.
-- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
-- This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
-- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
-- This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU.
+We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder.
> [!NOTE]
> For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.

View file

@ -12,9 +12,7 @@ import os
from functools import lru_cache
from pathlib import Path
-TEMPLATES_PATH = (
-Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions"
-)
+TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates"
@lru_cache()
@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]:
with open(p, "r") as f:
build_config = BuildConfig(**yaml.safe_load(f))
template_specs.append(build_config)
return template_specs
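For context, a rough standalone sketch of what template discovery looks like after this change: every `llama_stack/templates/<name>/build.yaml` gets parsed into a build spec. The `TemplateInfo` shape and the glob pattern below are illustrative assumptions, not the real `BuildConfig`.

```python
from dataclasses import dataclass
from pathlib import Path

import yaml


@dataclass
class TemplateInfo:  # illustrative stand-in, not the real BuildConfig
    name: str
    description: str


def load_templates(templates_root: Path) -> list:
    """Parse every <template>/build.yaml under templates_root."""
    specs = []
    for p in sorted(templates_root.glob("*/build.yaml")):
        with open(p, "r") as f:
            data = yaml.safe_load(f)
        specs.append(
            TemplateInfo(
                name=data["name"],
                description=data["distribution_spec"]["description"],
            )
        )
    return specs


for spec in load_templates(Path("llama_stack/templates")):
    print(f"{spec.name}: {spec.description}")
```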
@ -78,112 +75,17 @@ class StackBuild(Subcommand):
choices=["conda", "docker"],
)
def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
if os.getenv("CONDA_PREFIX", ""):
conda_dir = (
Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
)
else:
cprint(
"Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
color="green",
)
conda_dir = (
Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
)
build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
if build_config_file.exists():
return build_config_file
return None
def _run_stack_build_command_from_build_config(
self, build_config: BuildConfig
) -> None:
import json
import os
import yaml
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.serialize import EnumEncoder
from termcolor import cprint
# save build.yaml spec for building same distribution again
if build_config.image_type == ImageType.docker.value:
# docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
llama_stack_path = Path(
os.path.abspath(__file__)
).parent.parent.parent.parent
build_dir = llama_stack_path / "tmp/configs/"
else:
build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
os.makedirs(build_dir, exist_ok=True)
build_file_path = build_dir / f"{build_config.name}-build.yaml"
with open(build_file_path, "w") as f:
to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
return_code = build_image(build_config, build_file_path)
if return_code != 0:
return
configure_name = (
build_config.name
if build_config.image_type == "conda"
else (f"llamastack-{build_config.name}")
)
if build_config.image_type == "conda":
cprint(
f"You can now run `llama stack configure {configure_name}`",
color="green",
)
else:
cprint(
f"You can now run `llama stack run {build_config.name}`",
color="green",
)
def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
import json
from llama_stack.cli.table import print_table
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Template Name",
"Providers",
"Description",
]
rows = []
for spec in available_templates_specs():
rows.append(
[
spec.name,
json.dumps(spec.distribution_spec.providers, indent=2),
spec.distribution_spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
import textwrap
import yaml
-from llama_stack.distribution.distribution import get_provider_registry
from prompt_toolkit import prompt
from prompt_toolkit.completion import WordCompleter
from prompt_toolkit.validation import Validator
from termcolor import cprint
+from llama_stack.distribution.distribution import get_provider_registry
if args.list_templates:
self._run_template_list_cmd(args)
return
@ -194,19 +96,22 @@ class StackBuild(Subcommand):
"You must specify a name for the build using --name when using a template" "You must specify a name for the build using --name when using a template"
) )
return return
build_path = TEMPLATES_PATH / f"{args.template}-build.yaml" available_templates = available_templates_specs()
if not build_path.exists(): for build_config in available_templates:
self.parser.error( if build_config.name == args.template:
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates" build_config.name = args.name
) if args.image_type:
return build_config.image_type = args.image_type
with open(build_path, "r") as f: else:
build_config = BuildConfig(**yaml.safe_load(f)) self.parser.error(
build_config.name = args.name f"Please specify a image-type (docker | conda) for {args.template}"
if args.image_type: )
build_config.image_type = args.image_type self._run_stack_build_command_from_build_config(build_config)
self._run_stack_build_command_from_build_config(build_config) return
self.parser.error(
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
)
return return
# try to see if we can find a pre-existing build config file through name # try to see if we can find a pre-existing build config file through name
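A minimal sketch of the new selection flow above, using stand-in types rather than the real CLI classes: match the requested template by name, rename it to the user-supplied build name, and require an explicit image type.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class TemplateSpec:  # stand-in for the objects returned by available_templates_specs()
    name: str
    image_type: Optional[str] = None


def resolve_template(
    templates: List[TemplateSpec], template: str, name: str, image_type: Optional[str]
) -> TemplateSpec:
    for spec in templates:
        if spec.name == template:
            spec.name = name  # build under the user-chosen name
            if image_type:
                spec.image_type = image_type  # "docker" or "conda"
            else:
                raise ValueError(f"Please specify an image-type (docker | conda) for {template}")
            return spec
    raise ValueError(
        f"Could not find template {template}. Run `llama stack build --list-templates` to see what is available."
    )


print(resolve_template([TemplateSpec("tgi")], "tgi", "my-tgi-stack", "conda"))
```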
@ -297,3 +202,99 @@ class StackBuild(Subcommand):
self.parser.error(f"Could not parse config file {args.config}: {e}") self.parser.error(f"Could not parse config file {args.config}: {e}")
return return
self._run_stack_build_command_from_build_config(build_config) self._run_stack_build_command_from_build_config(build_config)
def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
if os.getenv("CONDA_PREFIX", ""):
conda_dir = (
Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
)
else:
cprint(
"Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
color="green",
)
conda_dir = (
Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
)
build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
if build_config_file.exists():
return build_config_file
return None
def _run_stack_build_command_from_build_config(
self, build_config: BuildConfig
) -> None:
import json
import os
import yaml
from termcolor import cprint
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.serialize import EnumEncoder
# save build.yaml spec for building same distribution again
if build_config.image_type == ImageType.docker.value:
# docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
llama_stack_path = Path(
os.path.abspath(__file__)
).parent.parent.parent.parent
build_dir = llama_stack_path / "tmp/configs/"
else:
build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
os.makedirs(build_dir, exist_ok=True)
build_file_path = build_dir / f"{build_config.name}-build.yaml"
with open(build_file_path, "w") as f:
to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
return_code = build_image(build_config, build_file_path)
if return_code != 0:
return
configure_name = (
build_config.name
if build_config.image_type == "conda"
else (f"llamastack-{build_config.name}")
)
if build_config.image_type == "conda":
cprint(
f"You can now run `llama stack configure {configure_name}`",
color="green",
)
else:
cprint(
f"You can now edit your run.yaml file and run `docker run -it -p 5000:5000 {build_config.name}`. See full command in llama-stack/distributions/",
color="green",
)
def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
import json
from llama_stack.cli.table import print_table
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Template Name",
"Providers",
"Description",
]
rows = []
for spec in available_templates_specs():
rows.append(
[
spec.name,
json.dumps(spec.distribution_spec.providers, indent=2),
spec.distribution_spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
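The build-file write path above round-trips the config through JSON before dumping YAML. A small self-contained illustration of why, with a local `EnumEncoder` standing in for the helper imported from `llama_stack.distribution.utils.serialize`:

```python
import json
from enum import Enum

import yaml


class ImageType(Enum):
    conda = "conda"
    docker = "docker"


class EnumEncoder(json.JSONEncoder):
    """Local stand-in for llama_stack.distribution.utils.serialize.EnumEncoder."""

    def default(self, obj):
        if isinstance(obj, Enum):
            return obj.value
        return super().default(obj)


build_config = {"name": "my-stack", "image_type": ImageType.conda}

# Dumping the raw dict would embed a Python-object tag for the Enum (or fail with
# safe_dump); the JSON round-trip normalizes it to the plain string "conda" first.
to_write = json.loads(json.dumps(build_config, cls=EnumEncoder))
print(yaml.dump(to_write, sort_keys=False))
```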

View file

@ -8,18 +8,19 @@ from enum import Enum
from typing import List, Optional
import pkg_resources
-from llama_stack.distribution.utils.exec import run_with_pty
from pydantic import BaseModel
from termcolor import cprint
+from llama_stack.distribution.utils.exec import run_with_pty
from llama_stack.distribution.datatypes import * # noqa: F403
from pathlib import Path
-from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
from llama_stack.distribution.distribution import get_provider_registry
+from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.

View file

@ -1,5 +1,11 @@
#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
@ -34,9 +40,6 @@ REPO_CONFIGS_DIR="$REPO_DIR/tmp/configs"
TEMP_DIR=$(mktemp -d)
-llama stack configure $build_file_path
-cp $host_build_dir/$build_name-run.yaml $REPO_CONFIGS_DIR
add_to_docker() {
local input
output_file="$TEMP_DIR/Dockerfile"
@ -113,7 +116,6 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
EOF
add_to_docker "ADD tmp/configs/$(basename "$build_file_path") ./llamastack-build.yaml"
-add_to_docker "ADD tmp/configs/$build_name-run.yaml ./llamastack-run.yaml"
printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile"
cat $TEMP_DIR/Dockerfile

View file

@ -29,7 +29,7 @@ if [ $# -lt 3 ]; then
fi
build_name="$1"
-docker_image="llamastack-$build_name"
+docker_image="distribution-$build_name"
shift
yaml_config="$1"

View file

@ -169,7 +169,7 @@ class MetaReferenceAgentsImpl(Agents):
turn_ids: Optional[List[str]] = None,
) -> Session:
session = await self.persistence_store.get(f"session:{agent_id}:{session_id}")
-session = Session(**json.loads(session))
+session = Session(**json.loads(session), turns=[])
turns = []
if turn_ids:
for turn_id in turn_ids:
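A minimal illustration of why `turns=[]` is now passed when rehydrating the session: if the persisted JSON omits a required list field (as the simplified, hypothetical model below assumes), validation fails unless the field is supplied explicitly.

```python
import json
from typing import List

from pydantic import BaseModel


class StoredSession(BaseModel):
    """Hypothetical, simplified stand-in for the real Session datatype."""

    session_id: str
    session_name: str
    turns: List[dict]  # required field that is not present in the persisted JSON


raw = json.dumps({"session_id": "abc", "session_name": "demo"})

try:
    StoredSession(**json.loads(raw))  # missing `turns` -> ValidationError
except Exception as e:
    print(type(e).__name__)

session = StoredSession(**json.loads(raw), turns=[])  # mirrors the fix above
print(session.turns)
```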

View file

@ -1,5 +1,11 @@
#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
if [[ $# -ne 1 ]]; then
echo "Error: Please provide the name of CONDA environment you wish to create"
exit 1

View file

@ -15,13 +15,24 @@ class VLLMConfig(BaseModel):
"""Configuration for the vLLM inference provider.""" """Configuration for the vLLM inference provider."""
model: str = Field( model: str = Field(
default="Llama3.1-8B-Instruct", default="Llama3.2-3B-Instruct",
description="Model descriptor from `llama model list`", description="Model descriptor from `llama model list`",
) )
tensor_parallel_size: int = Field( tensor_parallel_size: int = Field(
default=1, default=1,
description="Number of tensor parallel replicas (number of GPUs to use).", description="Number of tensor parallel replicas (number of GPUs to use).",
) )
max_tokens: int = Field(
default=4096,
description="Maximum number of tokens to generate.",
)
enforce_eager: bool = Field(
default=False,
description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
)
gpu_memory_utilization: float = Field(
default=0.3,
)
@field_validator("model") @field_validator("model")
@classmethod @classmethod
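A stand-in mirroring the fields added above (not the real `VLLMConfig`), just to show the new knobs a provider config would expose:

```python
from pydantic import BaseModel, Field


class VLLMConfigSketch(BaseModel):
    """Stand-in mirroring the fields above; the real class lives in the provider's config module."""

    model: str = Field(default="Llama3.2-3B-Instruct")
    tensor_parallel_size: int = Field(default=1)
    max_tokens: int = Field(default=4096)
    enforce_eager: bool = Field(default=False)
    gpu_memory_utilization: float = Field(default=0.3)


# e.g. a single-GPU dev setup that favors lower memory use over throughput
cfg = VLLMConfigSketch(enforce_eager=True, gpu_memory_utilization=0.5)
print(cfg)
```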

View file

@ -7,11 +7,12 @@
import logging
import os
import uuid
-from typing import Any, AsyncGenerator
+from typing import AsyncGenerator, Optional
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_models.sku_list import resolve_model
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams
from llama_stack.apis.inference import * # noqa: F403
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
@ -40,74 +41,15 @@ def _random_uuid() -> str:
return str(uuid.uuid4().hex)
-def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams:
-"""Convert sampling params to vLLM sampling params."""
-if sampling_params is None:
-return VLLMSamplingParams()
-# TODO convert what I saw in my first test ... but surely there's more to do here
-kwargs = {
-"temperature": sampling_params.temperature,
-}
-if sampling_params.top_k >= 1:
-kwargs["top_k"] = sampling_params.top_k
-if sampling_params.top_p:
-kwargs["top_p"] = sampling_params.top_p
-if sampling_params.max_tokens >= 1:
-kwargs["max_tokens"] = sampling_params.max_tokens
-if sampling_params.repetition_penalty > 0:
-kwargs["repetition_penalty"] = sampling_params.repetition_penalty
-return VLLMSamplingParams(**kwargs)
-class VLLMInferenceImpl(ModelRegistryHelper, Inference):
+class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
"""Inference implementation for vLLM."""
-HF_MODEL_MAPPINGS = {
-# TODO: seems like we should be able to build this table dynamically ...
-"Llama3.1-8B": "meta-llama/Llama-3.1-8B",
-"Llama3.1-70B": "meta-llama/Llama-3.1-70B",
-"Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
-"Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
-"Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
-"Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
-"Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
-"Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
-"Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
-"Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
-"Llama3.2-1B": "meta-llama/Llama-3.2-1B",
-"Llama3.2-3B": "meta-llama/Llama-3.2-3B",
-"Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
-"Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
-"Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
-"Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
-"Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
-"Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
-"Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
-"Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
-"Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
-"Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
-"Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
-"Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
-"Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
-}
def __init__(self, config: VLLMConfig):
-Inference.__init__(self)
-ModelRegistryHelper.__init__(
-self,
-stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
-)
self.config = config
self.engine = None
-self.formatter = ChatFormat(Tokenizer.get_instance())
+tokenizer = Tokenizer.get_instance()
+self.formatter = ChatFormat(tokenizer)
async def initialize(self):
-"""Initialize the vLLM inference adapter."""
log.info("Initializing vLLM inference adapter")
# Disable usage stats reporting. This would be a surprising thing for most
@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if "VLLM_NO_USAGE_STATS" not in os.environ: if "VLLM_NO_USAGE_STATS" not in os.environ:
os.environ["VLLM_NO_USAGE_STATS"] = "1" os.environ["VLLM_NO_USAGE_STATS"] = "1"
hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model) model = resolve_model(self.config.model)
if model is None:
raise ValueError(f"Unknown model {self.config.model}")
if model.huggingface_repo is None:
raise ValueError(f"Model {self.config.model} needs a huggingface repo")
# TODO -- there are a ton of options supported here ... # TODO -- there are a ton of options supported here ...
engine_args = AsyncEngineArgs() engine_args = AsyncEngineArgs(
engine_args.model = hf_model model=model.huggingface_repo,
# We will need a new config item for this in the future if model support is more broad tokenizer=model.huggingface_repo,
# than it is today (llama only) tensor_parallel_size=self.config.tensor_parallel_size,
engine_args.tokenizer = hf_model enforce_eager=self.config.enforce_eager,
engine_args.tensor_parallel_size = self.config.tensor_parallel_size gpu_memory_utilization=self.config.gpu_memory_utilization,
guided_decoding_backend="lm-format-enforcer",
)
self.engine = AsyncLLMEngine.from_engine_args(engine_args) self.engine = AsyncLLMEngine.from_engine_args(engine_args)
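The hard-coded HF mapping table is replaced by `resolve_model` from `llama_models.sku_list`. A quick sketch of the same lookup outside the provider, assuming the `llama_models` package is installed and using a hypothetical descriptor:

```python
from llama_models.sku_list import resolve_model

descriptor = "Llama3.2-3B-Instruct"  # example descriptor, as listed by `llama model list`

model = resolve_model(descriptor)
if model is None:
    raise ValueError(f"Unknown model {descriptor}")
if model.huggingface_repo is None:
    raise ValueError(f"Model {descriptor} needs a huggingface repo")

print(model.huggingface_repo)  # the repo the vLLM engine would be pointed at
```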
@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if self.engine:
self.engine.shutdown_background_loop()
async def register_model(self, model: ModelDef) -> None:
raise ValueError(
"You cannot dynamically add a model to a running vllm instance"
)
async def list_models(self) -> List[ModelDef]:
return [
ModelDef(
identifier=self.config.model,
llama_model=self.config.model,
)
]
def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams:
if sampling_params is None:
return VLLMSamplingParams(max_tokens=self.config.max_tokens)
# TODO convert what I saw in my first test ... but surely there's more to do here
kwargs = {
"temperature": sampling_params.temperature,
"max_tokens": self.config.max_tokens,
}
if sampling_params.top_k:
kwargs["top_k"] = sampling_params.top_k
if sampling_params.top_p:
kwargs["top_p"] = sampling_params.top_p
if sampling_params.max_tokens:
kwargs["max_tokens"] = sampling_params.max_tokens
if sampling_params.repetition_penalty > 0:
kwargs["repetition_penalty"] = sampling_params.repetition_penalty
return VLLMSamplingParams(**kwargs)
async def completion(
self,
model: str,
content: InterleavedTextMedia,
-sampling_params: Any | None = ...,
-stream: bool | None = False,
-logprobs: LogProbConfig | None = None,
+sampling_params: Optional[SamplingParams] = SamplingParams(),
+response_format: Optional[ResponseFormat] = None,
+stream: Optional[bool] = False,
+logprobs: Optional[LogProbConfig] = None,
) -> CompletionResponse | CompletionResponseStreamChunk:
log.info("vLLM completion")
messages = [UserMessage(content=content)]
@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
async def chat_completion(
self,
model: str,
-messages: list[Message],
-sampling_params: Any | None = ...,
-tools: list[ToolDefinition] | None = ...,
-tool_choice: ToolChoice | None = ...,
-tool_prompt_format: ToolPromptFormat | None = ...,
-stream: bool | None = False,
-logprobs: LogProbConfig | None = None,
+messages: List[Message],
+sampling_params: Optional[SamplingParams] = SamplingParams(),
+tools: Optional[List[ToolDefinition]] = None,
+tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+response_format: Optional[ResponseFormat] = None,
+stream: Optional[bool] = False,
+logprobs: Optional[LogProbConfig] = None,
) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
log.info("vLLM chat completion")
@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
request_id = _random_uuid()
prompt = chat_completion_request_to_prompt(request, self.formatter)
-vllm_sampling_params = _vllm_sampling_params(request.sampling_params)
+vllm_sampling_params = self._sampling_params(request.sampling_params)
results_generator = self.engine.generate(
prompt, vllm_sampling_params, request_id
)
@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
self, request: ChatCompletionRequest, results_generator: AsyncGenerator
) -> AsyncGenerator:
async def _generate_and_convert_to_openai_compat():
+cur = []
async for chunk in results_generator:
if not chunk.outputs:
log.warning("Empty chunk received")
continue
-text = "".join([output.text for output in chunk.outputs])
+output = chunk.outputs[-1]
+new_tokens = output.token_ids[len(cur) :]
+text = self.formatter.tokenizer.decode(new_tokens)
+cur.extend(new_tokens)
choice = OpenAICompatCompletionChoice(
-finish_reason=chunk.outputs[-1].stop_reason,
+finish_reason=output.finish_reason,
text=text,
)
yield OpenAICompatCompletionResponse(
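The streaming path now decodes only the newly generated token ids per chunk instead of re-joining the full output text. A self-contained sketch of that delta-decoding pattern, with a toy tokenizer standing in for the real one:

```python
class ToyTokenizer:
    """Toy stand-in for the real tokenizer; one token id maps to one character."""

    def decode(self, token_ids):
        return "".join(chr(t) for t in token_ids)


def stream_text(cumulative_token_ids, tokenizer):
    """Yield only the newly generated text per engine step.

    `cumulative_token_ids` mimics vLLM returning the full token id list of the
    best output on every chunk (chunk.outputs[-1].token_ids above).
    """
    cur = []
    for token_ids in cumulative_token_ids:
        new_tokens = token_ids[len(cur):]
        text = tokenizer.decode(new_tokens)
        cur.extend(new_tokens)
        yield text


steps = [[72, 101], [72, 101, 108, 108, 111]]
print(list(stream_text(steps, ToyTokenizer())))  # -> ['He', 'llo']
```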

View file

@ -0,0 +1,9 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,13 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,13 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,12 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,12 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference