diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c85436c4..3707d4671 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -exclude: 'build' +exclude: 'build/' default_language_version: python: python3 diff --git a/MANIFEST.in b/MANIFEST.in index 7426a3abd..0517b86a8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ include requirements.txt include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh -include distributions/*/build.yaml +include llama_stack/templates/*/build.yaml diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml deleted file mode 100644 index ae7b27d49..000000000 --- a/distributions/bedrock/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: bedrock -distribution_spec: - description: Use Amazon Bedrock APIs. - providers: - inference: remote::bedrock - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml new file mode 120000 index 000000000..72402ef8d --- /dev/null +++ b/distributions/bedrock/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/bedrock/build.yaml \ No newline at end of file diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml deleted file mode 100644 index 2188dd0a0..000000000 --- a/distributions/databricks/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: databricks -distribution_spec: - description: Use Databricks for running LLM inference - providers: - inference: remote::databricks - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml new file mode 120000 index 000000000..66342fe6f --- /dev/null +++ b/distributions/databricks/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/databricks/build.yaml \ No newline at end of file diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md index fcf74d809..e3987e1e2 100644 --- a/distributions/fireworks/README.md +++ b/distributions/fireworks/README.md @@ -49,7 +49,7 @@ inference: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template fireworks --image-type conda # -- modify run.yaml to a valid Fireworks server endpoint llama stack run ./run.yaml ``` diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml deleted file mode 100644 index 2e5cf0753..000000000 --- a/distributions/fireworks/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: fireworks -distribution_spec: - description: Use Fireworks.ai for running LLM inference - providers: - inference: remote::fireworks - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml new file mode 120000 index 000000000..32a5bd869 --- /dev/null +++ b/distributions/fireworks/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/fireworks/build.yaml \ No newline at end of file diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml deleted file mode 100644 index 750bebcb5..000000000 --- a/distributions/hf-endpoint/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: hf-endpoint -distribution_spec: - description: "Like local, but use Hugging Face Inference Endpoints for running LLM 
inference.\nSee https://hf.co/docs/api-endpoints." - providers: - inference: remote::hf::endpoint - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml new file mode 120000 index 000000000..a73c70c05 --- /dev/null +++ b/distributions/hf-endpoint/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/hf-endpoint/build.yaml \ No newline at end of file diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml deleted file mode 100644 index f6da3ad4d..000000000 --- a/distributions/hf-serverless/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: hf-serverless -distribution_spec: - description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference." - providers: - inference: remote::hf::serverless - memory: meta-reference - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml new file mode 120000 index 000000000..f2db0fd55 --- /dev/null +++ b/distributions/hf-serverless/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/hf-serverless/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml deleted file mode 100644 index 5b1521a92..000000000 --- a/distributions/meta-reference-gpu/build.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: meta-reference-gpu -distribution_spec: - docker_image: pytorch/pytorch - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: meta-reference - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml new file mode 120000 index 000000000..4418195eb --- /dev/null +++ b/distributions/meta-reference-gpu/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/meta-reference-gpu/build.yaml \ No newline at end of file diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml deleted file mode 100644 index e9ddb4aad..000000000 --- a/distributions/meta-reference-quantized-gpu/build.yaml +++ /dev/null @@ -1,14 +0,0 @@ -name: meta-reference-quantized-gpu -distribution_spec: - docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: meta-reference-quantized - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml new file mode 120000 index 000000000..f3dbe996f --- /dev/null +++ b/distributions/meta-reference-quantized-gpu/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml \ No newline at end of file diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md index d59c3f9e1..70bc27a85 100644 --- a/distributions/ollama/README.md +++ b/distributions/ollama/README.md @@ -86,6 +86,6 @@ inference: 
**Via Conda** ``` -llama stack build --config ./build.yaml +llama stack build --template ollama --image-type conda llama stack run ./gpu/run.yaml ``` diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml deleted file mode 100644 index c27f40929..000000000 --- a/distributions/ollama/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: ollama -distribution_spec: - description: Use ollama for running LLM inference - providers: - inference: remote::ollama - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml new file mode 120000 index 000000000..8772548e0 --- /dev/null +++ b/distributions/ollama/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md index 86d2636d7..886252ecd 100644 --- a/distributions/tgi/README.md +++ b/distributions/tgi/README.md @@ -88,7 +88,7 @@ inference: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template tgi --image-type conda # -- start a TGI server endpoint llama stack run ./gpu/run.yaml ``` diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml deleted file mode 100644 index 2c0ca1d33..000000000 --- a/distributions/tgi/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: tgi -distribution_spec: - description: Use TGI for running LLM inference - providers: - inference: remote::tgi - memory: - - meta-reference - - remote::chromadb - - remote::pgvector - safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml new file mode 120000 index 000000000..73e59ad84 --- /dev/null +++ b/distributions/tgi/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/tgi/build.yaml \ No newline at end of file diff --git a/distributions/together/README.md b/distributions/together/README.md index 227c7a450..b964673e0 100644 --- a/distributions/together/README.md +++ b/distributions/together/README.md @@ -62,7 +62,7 @@ memory: **Via Conda** ```bash -llama stack build --config ./build.yaml +llama stack build --template together --image-type conda # -- modify run.yaml to a valid Together server endpoint llama stack run ./run.yaml ``` diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml deleted file mode 100644 index 49eab859d..000000000 --- a/distributions/together/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: together -distribution_spec: - description: Use Together.ai for running LLM inference - providers: - inference: remote::together - memory: remote::weaviate - safety: remote::together - agents: meta-reference - telemetry: meta-reference -image_type: docker diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml new file mode 120000 index 000000000..3877a9c96 --- /dev/null +++ b/distributions/together/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/together/build.yaml \ No newline at end of file diff --git a/distributions/vllm/build.yaml b/distributions/vllm/build.yaml deleted file mode 100644 index f41352eb1..000000000 --- a/distributions/vllm/build.yaml +++ /dev/null @@ -1,10 +0,0 @@ -name: vllm -distribution_spec: - description: Like local, but use vLLM for running LLM inference - providers: - inference: vllm - memory: meta-reference 
- safety: meta-reference - agents: meta-reference - telemetry: meta-reference -image_type: conda \ No newline at end of file diff --git a/distributions/vllm/build.yaml b/distributions/vllm/build.yaml new file mode 120000 index 000000000..dfc9401b6 --- /dev/null +++ b/distributions/vllm/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/vllm/build.yaml \ No newline at end of file diff --git a/docs/cli_reference.md b/docs/cli_reference.md index f0f67192f..ddc8e6b3e 100644 --- a/docs/cli_reference.md +++ b/docs/cli_reference.md @@ -279,11 +279,11 @@ llama stack build --list-templates You may then pick a template to build your distribution with providers fitted to your liking. ``` -llama stack build --template local-tgi --name my-tgi-stack +llama stack build --template local-tgi --name my-tgi-stack --image-type conda ``` ``` -$ llama stack build --template local-tgi --name my-tgi-stack +$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda ... ... Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml @@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~ #### Building from config file - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command. -- The config file will be of contents like the ones in `llama_stack/distributions/templates/`. +- The config file will be of contents like the ones in `llama_stack/templates/`. ``` -$ cat llama_stack/distribution/templates/local-ollama-build.yaml +$ cat build.yaml name: local-ollama distribution_spec: @@ -311,7 +311,7 @@ image_type: conda ``` ``` -llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml +llama stack build --config build.yaml ``` #### How to build distribution with Docker image diff --git a/docs/getting_started.md b/docs/getting_started.md index 4f06f5d47..2a90301d0 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -35,11 +35,7 @@ You have two ways to start up Llama stack server: 1. **Starting up server via docker**: - We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links. - - [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general) - - This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints. - - [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general) - - This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU. + We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder. > [!NOTE] > For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container. 
diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py index adac34d55..1fd523dcb 100644 --- a/llama_stack/apis/scoring/scoring.py +++ b/llama_stack/apis/scoring/scoring.py @@ -37,7 +37,7 @@ class ScoreResponse(BaseModel): class ScoringFunctionStore(Protocol): - def get_scoring_function(self, name: str) -> ScoringFunctionDefWithProvider: ... + def get_scoring_function(self, name: str) -> ScoringFnDefWithProvider: ... @runtime_checkable diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index a242215c6..fc3584f90 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -29,7 +29,7 @@ class LLMAsJudgeContext(BaseModel): @json_schema_type -class ScoringFunctionDef(BaseModel): +class ScoringFnDef(BaseModel): identifier: str description: Optional[str] = None metadata: Dict[str, Any] = Field( @@ -48,7 +48,7 @@ class ScoringFunctionDef(BaseModel): @json_schema_type -class ScoringFunctionDefWithProvider(ScoringFunctionDef): +class ScoringFnDefWithProvider(ScoringFnDef): provider_id: str = Field( description="ID of the provider which serves this dataset", ) @@ -57,14 +57,14 @@ class ScoringFunctionDefWithProvider(ScoringFunctionDef): @runtime_checkable class ScoringFunctions(Protocol): @webmethod(route="/scoring_functions/list", method="GET") - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: ... + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: ... @webmethod(route="/scoring_functions/get", method="GET") async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: ... + ) -> Optional[ScoringFnDefWithProvider]: ... @webmethod(route="/scoring_functions/register", method="POST") async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: ... diff --git a/llama_stack/cli/stack/build.py b/llama_stack/cli/stack/build.py index 26aa35e16..40fca4c6d 100644 --- a/llama_stack/cli/stack/build.py +++ b/llama_stack/cli/stack/build.py @@ -12,9 +12,7 @@ import os from functools import lru_cache from pathlib import Path -TEMPLATES_PATH = ( - Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions" -) +TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates" @lru_cache() @@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]: with open(p, "r") as f: build_config = BuildConfig(**yaml.safe_load(f)) template_specs.append(build_config) - return template_specs @@ -78,112 +75,17 @@ class StackBuild(Subcommand): choices=["conda", "docker"], ) - def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]: - if os.getenv("CONDA_PREFIX", ""): - conda_dir = ( - Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}" - ) - else: - cprint( - "Cannot find CONDA_PREFIX. 
Trying default conda path ~/.conda/envs...", - color="green", - ) - conda_dir = ( - Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}" - ) - build_config_file = Path(conda_dir) / f"{args.name}-build.yaml" - if build_config_file.exists(): - return build_config_file - - return None - - def _run_stack_build_command_from_build_config( - self, build_config: BuildConfig - ) -> None: - import json - import os - - import yaml - - from llama_stack.distribution.build import build_image, ImageType - from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR - from llama_stack.distribution.utils.serialize import EnumEncoder - from termcolor import cprint - - # save build.yaml spec for building same distribution again - if build_config.image_type == ImageType.docker.value: - # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image - llama_stack_path = Path( - os.path.abspath(__file__) - ).parent.parent.parent.parent - build_dir = llama_stack_path / "tmp/configs/" - else: - build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}" - - os.makedirs(build_dir, exist_ok=True) - build_file_path = build_dir / f"{build_config.name}-build.yaml" - - with open(build_file_path, "w") as f: - to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder)) - f.write(yaml.dump(to_write, sort_keys=False)) - - return_code = build_image(build_config, build_file_path) - if return_code != 0: - return - - configure_name = ( - build_config.name - if build_config.image_type == "conda" - else (f"llamastack-{build_config.name}") - ) - if build_config.image_type == "conda": - cprint( - f"You can now run `llama stack configure {configure_name}`", - color="green", - ) - else: - cprint( - f"You can now run `llama stack run {build_config.name}`", - color="green", - ) - - def _run_template_list_cmd(self, args: argparse.Namespace) -> None: - import json - - from llama_stack.cli.table import print_table - - # eventually, this should query a registry at llama.meta.com/llamastack/distributions - headers = [ - "Template Name", - "Providers", - "Description", - ] - - rows = [] - for spec in available_templates_specs(): - rows.append( - [ - spec.name, - json.dumps(spec.distribution_spec.providers, indent=2), - spec.distribution_spec.description, - ] - ) - print_table( - rows, - headers, - separate_rows=True, - ) - def _run_stack_build_command(self, args: argparse.Namespace) -> None: import textwrap import yaml - from llama_stack.distribution.distribution import get_provider_registry from prompt_toolkit import prompt from prompt_toolkit.completion import WordCompleter from prompt_toolkit.validation import Validator from termcolor import cprint + from llama_stack.distribution.distribution import get_provider_registry + if args.list_templates: self._run_template_list_cmd(args) return @@ -194,19 +96,22 @@ class StackBuild(Subcommand): "You must specify a name for the build using --name when using a template" ) return - build_path = TEMPLATES_PATH / f"{args.template}-build.yaml" - if not build_path.exists(): - self.parser.error( - f"Could not find template {args.template}. 
Please run `llama stack build --list-templates` to check out the available templates" - ) - return - with open(build_path, "r") as f: - build_config = BuildConfig(**yaml.safe_load(f)) - build_config.name = args.name - if args.image_type: - build_config.image_type = args.image_type - self._run_stack_build_command_from_build_config(build_config) + available_templates = available_templates_specs() + for build_config in available_templates: + if build_config.name == args.template: + build_config.name = args.name + if args.image_type: + build_config.image_type = args.image_type + else: + self.parser.error( + f"Please specify a image-type (docker | conda) for {args.template}" + ) + self._run_stack_build_command_from_build_config(build_config) + return + self.parser.error( + f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates" + ) return # try to see if we can find a pre-existing build config file through name @@ -297,3 +202,99 @@ class StackBuild(Subcommand): self.parser.error(f"Could not parse config file {args.config}: {e}") return self._run_stack_build_command_from_build_config(build_config) + + def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]: + if os.getenv("CONDA_PREFIX", ""): + conda_dir = ( + Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}" + ) + else: + cprint( + "Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...", + color="green", + ) + conda_dir = ( + Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}" + ) + build_config_file = Path(conda_dir) / f"{args.name}-build.yaml" + if build_config_file.exists(): + return build_config_file + + return None + + def _run_stack_build_command_from_build_config( + self, build_config: BuildConfig + ) -> None: + import json + import os + + import yaml + from termcolor import cprint + + from llama_stack.distribution.build import build_image, ImageType + from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR + from llama_stack.distribution.utils.serialize import EnumEncoder + + # save build.yaml spec for building same distribution again + if build_config.image_type == ImageType.docker.value: + # docker needs build file to be in the llama-stack repo dir to be able to copy over to the image + llama_stack_path = Path( + os.path.abspath(__file__) + ).parent.parent.parent.parent + build_dir = llama_stack_path / "tmp/configs/" + else: + build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}" + + os.makedirs(build_dir, exist_ok=True) + build_file_path = build_dir / f"{build_config.name}-build.yaml" + + with open(build_file_path, "w") as f: + to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder)) + f.write(yaml.dump(to_write, sort_keys=False)) + + return_code = build_image(build_config, build_file_path) + if return_code != 0: + return + + configure_name = ( + build_config.name + if build_config.image_type == "conda" + else (f"llamastack-{build_config.name}") + ) + if build_config.image_type == "conda": + cprint( + f"You can now run `llama stack configure {configure_name}`", + color="green", + ) + else: + cprint( + f"You can now edit your run.yaml file and run `docker run -it -p 5000:5000 {build_config.name}`. 
See full command in llama-stack/distributions/", + color="green", + ) + + def _run_template_list_cmd(self, args: argparse.Namespace) -> None: + import json + + from llama_stack.cli.table import print_table + + # eventually, this should query a registry at llama.meta.com/llamastack/distributions + headers = [ + "Template Name", + "Providers", + "Description", + ] + + rows = [] + for spec in available_templates_specs(): + rows.append( + [ + spec.name, + json.dumps(spec.distribution_spec.providers, indent=2), + spec.distribution_spec.description, + ] + ) + print_table( + rows, + headers, + separate_rows=True, + ) diff --git a/llama_stack/distribution/build.py b/llama_stack/distribution/build.py index 13c545723..e3a9d9186 100644 --- a/llama_stack/distribution/build.py +++ b/llama_stack/distribution/build.py @@ -8,18 +8,19 @@ from enum import Enum from typing import List, Optional import pkg_resources - -from llama_stack.distribution.utils.exec import run_with_pty from pydantic import BaseModel from termcolor import cprint +from llama_stack.distribution.utils.exec import run_with_pty + from llama_stack.distribution.datatypes import * # noqa: F403 from pathlib import Path -from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR from llama_stack.distribution.distribution import get_provider_registry +from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR + # These are the dependencies needed by the distribution server. # `llama-stack` is automatically installed by the installation script. diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 3bf74edcf..8044dda28 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} @@ -34,9 +40,6 @@ REPO_CONFIGS_DIR="$REPO_DIR/tmp/configs" TEMP_DIR=$(mktemp -d) -llama stack configure $build_file_path -cp $host_build_dir/$build_name-run.yaml $REPO_CONFIGS_DIR - add_to_docker() { local input output_file="$TEMP_DIR/Dockerfile" @@ -113,7 +116,6 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"] EOF add_to_docker "ADD tmp/configs/$(basename "$build_file_path") ./llamastack-build.yaml" -add_to_docker "ADD tmp/configs/$build_name-run.yaml ./llamastack-run.yaml" printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile" cat $TEMP_DIR/Dockerfile diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 318809baf..9ad82cd79 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -34,7 +34,7 @@ RoutableObject = Union[ ShieldDef, MemoryBankDef, DatasetDef, - ScoringFunctionDef, + ScoringFnDef, ] RoutableObjectWithProvider = Union[ @@ -42,7 +42,7 @@ RoutableObjectWithProvider = Union[ ShieldDefWithProvider, MemoryBankDefWithProvider, DatasetDefWithProvider, - ScoringFunctionDefWithProvider, + ScoringFnDefWithProvider, ] RoutedProtocol = Union[ diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index dcd588a9e..3e07b9162 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -100,7 +100,7 @@ class CommonRoutingTableImpl(RoutingTable): scoring_functions = await p.list_scoring_functions() add_objects( [ - ScoringFunctionDefWithProvider(**s.dict(), provider_id=pid) + ScoringFnDefWithProvider(**s.dict(), provider_id=pid) for s in scoring_functions ] ) @@ -239,7 +239,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): - async def list_scoring_functions(self) -> List[ScoringFunctionDefWithProvider]: + async def list_scoring_functions(self) -> List[ScoringFnDefWithProvider]: objects = [] for objs in self.registry.values(): objects.extend(objs) @@ -247,10 +247,10 @@ class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, Scoring): async def get_scoring_function( self, name: str - ) -> Optional[ScoringFunctionDefWithProvider]: + ) -> Optional[ScoringFnDefWithProvider]: return self.get_object_by_identifier(name) async def register_scoring_function( - self, function_def: ScoringFunctionDefWithProvider + self, function_def: ScoringFnDefWithProvider ) -> None: await self.register_object(function_def) diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 8533da7d1..fe1b5051f 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -29,7 +29,7 @@ if [ $# -lt 3 ]; then fi build_name="$1" -docker_image="llamastack-$build_name" +docker_image="distribution-$build_name" shift yaml_config="$1" diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 8d476a509..eace0ea1a 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, Field from llama_stack.apis.datasets import DatasetDef from llama_stack.apis.memory_banks import MemoryBankDef from llama_stack.apis.models import ModelDef -from llama_stack.apis.scoring_functions import 
ScoringFunctionDef +from llama_stack.apis.scoring_functions import ScoringFnDef from llama_stack.apis.shields import ShieldDef @@ -64,11 +64,9 @@ class DatasetsProtocolPrivate(Protocol): class ScoringFunctionsProtocolPrivate(Protocol): - async def list_scoring_functions(self) -> List[ScoringFunctionDef]: ... + async def list_scoring_functions(self) -> List[ScoringFnDef]: ... - async def register_scoring_function( - self, function_def: ScoringFunctionDef - ) -> None: ... + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: ... @json_schema_type diff --git a/llama_stack/providers/impls/meta_reference/agents/agents.py b/llama_stack/providers/impls/meta_reference/agents/agents.py index ca5a00359..13d9044fd 100644 --- a/llama_stack/providers/impls/meta_reference/agents/agents.py +++ b/llama_stack/providers/impls/meta_reference/agents/agents.py @@ -169,7 +169,7 @@ class MetaReferenceAgentsImpl(Agents): turn_ids: Optional[List[str]] = None, ) -> Session: session = await self.persistence_store.get(f"session:{agent_id}:{session_id}") - session = Session(**json.loads(session)) + session = Session(**json.loads(session), turns=[]) turns = [] if turn_ids: for turn_id in turn_ids: diff --git a/llama_stack/providers/impls/meta_reference/eval/eval.py b/llama_stack/providers/impls/meta_reference/eval/eval.py index daa17a89e..d675e40eb 100644 --- a/llama_stack/providers/impls/meta_reference/eval/eval.py +++ b/llama_stack/providers/impls/meta_reference/eval/eval.py @@ -3,6 +3,7 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +from enum import Enum from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_stack.apis.common.type_system import * # noqa: F403 @@ -16,6 +17,13 @@ from llama_stack.apis.scoring import Scoring from .config import MetaReferenceEvalConfig +class ColumnName(Enum): + expected_answer = "expected_answer" + chat_completion_input = "chat_completion_input" + completion_input = "completion_input" + generated_answer = "generated_answer" + + class MetaReferenceEvalImpl(Eval): def __init__( self, @@ -41,18 +49,16 @@ class MetaReferenceEvalImpl(Eval): async def validate_eval_input_dataset_schema(self, dataset_id: str) -> None: dataset_def = await self.datasets_api.get_dataset(dataset_identifier=dataset_id) if not dataset_def.dataset_schema or len(dataset_def.dataset_schema) == 0: - raise ValueError( - f"Dataset {dataset_id} does not have a schema defined. Please define a schema for the dataset." 
- ) + raise ValueError(f"Dataset {dataset_id} does not have a schema defined.") expected_schemas = [ { - "expected_answer": StringType(), - "chat_completion_input": ChatCompletionInputType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.chat_completion_input.value: ChatCompletionInputType(), }, { - "expected_answer": StringType(), - "chat_completion_input": CompletionInputType(), + ColumnName.expected_answer.value: StringType(), + ColumnName.completion_input.value: CompletionInputType(), }, ] @@ -94,27 +100,43 @@ class MetaReferenceEvalImpl(Eval): raise NotImplementedError( "Evaluation with generation has not been implemented for agents" ) + assert ( + candidate.sampling_params.max_tokens is not None + ), "SamplingParams.max_tokens must be provided" + generations = [] for x in input_rows: - if "completion_input" in x: - raise NotImplementedError( - "Evaluation with completion API has not been implemented" + if ColumnName.completion_input.value in x: + input_content = eval(str(x[ColumnName.completion_input.value])) + response = await self.inference_api.completion( + model=candidate.model, + content=input_content, + sampling_params=candidate.sampling_params, ) - - input_messages = eval(str(x["chat_completion_input"])) - input_messages = [UserMessage(**x) for x in input_messages] - messages = [] - if candidate.system_message: - messages.append(candidate.system_message) - messages += input_messages - response = await self.inference_api.chat_completion( - model=candidate.model, - messages=messages, - sampling_params=candidate.sampling_params, - ) - generations.append( - {"generated_answer": response.completion_message.content} - ) + generations.append( + { + ColumnName.generated_answer.value: response.completion_message.content + } + ) + elif ColumnName.chat_completion_input.value in x: + input_messages = eval(str(x[ColumnName.chat_completion_input.value])) + input_messages = [UserMessage(**x) for x in input_messages] + messages = [] + if candidate.system_message: + messages.append(candidate.system_message) + messages += input_messages + response = await self.inference_api.chat_completion( + model=candidate.model, + messages=messages, + sampling_params=candidate.sampling_params, + ) + generations.append( + { + ColumnName.generated_answer.value: response.completion_message.content + } + ) + else: + raise ValueError("Invalid input row") # scoring with generated_answer score_input_rows = [ @@ -132,6 +154,8 @@ class MetaReferenceEvalImpl(Eval): if job_id in self.jobs: return JobStatus.completed + return None + async def job_cancel(self, job_id: str) -> None: raise NotImplementedError("Job cancel is not implemented yet") diff --git a/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh b/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh index d3028f8e8..ae0ed0bac 100644 --- a/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh +++ b/llama_stack/providers/impls/meta_reference/inference/quantization/scripts/build_conda.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ if [[ $# -ne 1 ]]; then echo "Error: Please provide the name of CONDA environment you wish to create" exit 1 diff --git a/llama_stack/providers/impls/meta_reference/scoring/scoring.py b/llama_stack/providers/impls/meta_reference/scoring/scoring.py index 05ace33b4..b1d561533 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scoring.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring.py @@ -13,22 +13,22 @@ from llama_stack.apis.datasetio import * # noqa: F403 from llama_stack.apis.datasets import * # noqa: F403 from llama_stack.providers.datatypes import ScoringFunctionsProtocolPrivate -from llama_stack.providers.impls.meta_reference.scoring.scorer.equality_scorer import ( - EqualityScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.equality_scoring_fn import ( + EqualityScoringFn, ) -from llama_stack.providers.impls.meta_reference.scoring.scorer.subset_of_scorer import ( - SubsetOfScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.subset_of_scoring_fn import ( + SubsetOfScoringFn, ) from .config import MetaReferenceScoringConfig -SUPPORTED_SCORERS = [ - EqualityScorer, - SubsetOfScorer, +SUPPORTED_SCORING_FNS = [ + EqualityScoringFn, + SubsetOfScoringFn, ] -SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORERS} +SCORER_REGISTRY = {x.scoring_function_def.identifier: x for x in SUPPORTED_SCORING_FNS} class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): @@ -46,10 +46,10 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): async def shutdown(self) -> None: ... - async def list_scoring_functions(self) -> List[ScoringFunctionDef]: - return [x.scoring_function_def for x in SUPPORTED_SCORERS] + async def list_scoring_functions(self) -> List[ScoringFnDef]: + return [x.scoring_function_def for x in SUPPORTED_SCORING_FNS] - async def register_scoring_function(self, function_def: ScoringFunctionDef) -> None: + async def register_scoring_function(self, function_def: ScoringFnDef) -> None: raise NotImplementedError( "Dynamically registering scoring functions is not supported" ) @@ -101,9 +101,9 @@ class MetaReferenceScoringImpl(Scoring, ScoringFunctionsProtocolPrivate): for scoring_fn_id in scoring_functions: if scoring_fn_id not in SCORER_REGISTRY: raise ValueError(f"Scoring function {scoring_fn_id} is not supported.") - scorer = SCORER_REGISTRY[scoring_fn_id]() - score_results = scorer.score(input_rows) - agg_results = scorer.aggregate(score_results) + scoring_fn = SCORER_REGISTRY[scoring_fn_id]() + score_results = scoring_fn.score(input_rows) + agg_results = scoring_fn.aggregate(score_results) res[scoring_fn_id] = ScoringResult( score_rows=score_results, aggregated_results=agg_results, diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py similarity index 100% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/__init__.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/__init__.py diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py similarity index 81% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py index ea8a3f063..952d46bb2 100644 --- 
a/llama_stack/providers/impls/meta_reference/scoring/scorer/base_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/base_scoring_fn.py @@ -9,15 +9,15 @@ from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 -class BaseScorer(ABC): +class BaseScoringFn(ABC): """ - Base interface class for all meta-reference scorers. - Each scorer needs to implement the following methods: + Base interface class for all meta-reference scoring_fns. + Each scoring_fn needs to implement the following methods: - score_row(self, row) - - aggregate(self, scorer_results) + - aggregate(self, scoring_fn_results) """ - scoring_function_def: ScoringFunctionDef + scoring_function_def: ScoringFnDef def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/common.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py similarity index 100% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/common.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/common.py diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py similarity index 76% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py index 0c7751f35..cce0f948a 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scorer/equality_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/equality_scoring_fn.py @@ -4,23 +4,23 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import ( - BaseScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import ( + BaseScoringFn, ) from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 from llama_stack.apis.common.type_system import * # noqa: F403 -from llama_stack.providers.impls.meta_reference.scoring.scorer.common import ( +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import ( aggregate_accuracy, ) -class EqualityScorer(BaseScorer): +class EqualityScoringFn(BaseScoringFn): """ - A scorer that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise. + A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise. 
""" - scoring_function_def = ScoringFunctionDef( + scoring_function_def = ScoringFnDef( identifier="equality", description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.", parameters=[], diff --git a/llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py similarity index 76% rename from llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py rename to llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py index e72b5ed0f..c7ee68e26 100644 --- a/llama_stack/providers/impls/meta_reference/scoring/scorer/subset_of_scorer.py +++ b/llama_stack/providers/impls/meta_reference/scoring/scoring_fn/subset_of_scoring_fn.py @@ -4,23 +4,23 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.providers.impls.meta_reference.scoring.scorer.base_scorer import ( - BaseScorer, +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.base_scoring_fn import ( + BaseScoringFn, ) from llama_stack.apis.scoring_functions import * # noqa: F401, F403 from llama_stack.apis.scoring import * # noqa: F401, F403 from llama_stack.apis.common.type_system import * # noqa: F403 -from llama_stack.providers.impls.meta_reference.scoring.scorer.common import ( +from llama_stack.providers.impls.meta_reference.scoring.scoring_fn.common import ( aggregate_accuracy, ) -class SubsetOfScorer(BaseScorer): +class SubsetOfScoringFn(BaseScoringFn): """ - A scorer that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. + A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise. 
""" - scoring_function_def = ScoringFunctionDef( + scoring_function_def = ScoringFnDef( identifier="subset_of", description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.", parameters=[], diff --git a/llama_stack/providers/impls/vllm/config.py b/llama_stack/providers/impls/vllm/config.py index df2526f2e..a7469ebde 100644 --- a/llama_stack/providers/impls/vllm/config.py +++ b/llama_stack/providers/impls/vllm/config.py @@ -15,13 +15,24 @@ class VLLMConfig(BaseModel): """Configuration for the vLLM inference provider.""" model: str = Field( - default="Llama3.1-8B-Instruct", + default="Llama3.2-3B-Instruct", description="Model descriptor from `llama model list`", ) tensor_parallel_size: int = Field( default=1, description="Number of tensor parallel replicas (number of GPUs to use).", ) + max_tokens: int = Field( + default=4096, + description="Maximum number of tokens to generate.", + ) + enforce_eager: bool = Field( + default=False, + description="Whether to use eager mode for inference (otherwise cuda graphs are used).", + ) + gpu_memory_utilization: float = Field( + default=0.3, + ) @field_validator("model") @classmethod diff --git a/llama_stack/providers/impls/vllm/vllm.py b/llama_stack/providers/impls/vllm/vllm.py index ad3ad8fb7..cf5b0572b 100644 --- a/llama_stack/providers/impls/vllm/vllm.py +++ b/llama_stack/providers/impls/vllm/vllm.py @@ -7,11 +7,12 @@ import logging import os import uuid -from typing import Any, AsyncGenerator +from typing import AsyncGenerator, Optional from llama_models.llama3.api.chat_format import ChatFormat from llama_models.llama3.api.datatypes import * # noqa: F403 from llama_models.llama3.api.tokenizer import Tokenizer +from llama_models.sku_list import resolve_model from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams from llama_stack.apis.inference import * # noqa: F403 -from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper +from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate from llama_stack.providers.utils.inference.openai_compat import ( OpenAICompatCompletionChoice, OpenAICompatCompletionResponse, @@ -40,74 +41,15 @@ def _random_uuid() -> str: return str(uuid.uuid4().hex) -def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams: - """Convert sampling params to vLLM sampling params.""" - if sampling_params is None: - return VLLMSamplingParams() - - # TODO convert what I saw in my first test ... but surely there's more to do here - kwargs = { - "temperature": sampling_params.temperature, - } - if sampling_params.top_k >= 1: - kwargs["top_k"] = sampling_params.top_k - if sampling_params.top_p: - kwargs["top_p"] = sampling_params.top_p - if sampling_params.max_tokens >= 1: - kwargs["max_tokens"] = sampling_params.max_tokens - if sampling_params.repetition_penalty > 0: - kwargs["repetition_penalty"] = sampling_params.repetition_penalty - - return VLLMSamplingParams(**kwargs) - - -class VLLMInferenceImpl(ModelRegistryHelper, Inference): +class VLLMInferenceImpl(Inference, ModelsProtocolPrivate): """Inference implementation for vLLM.""" - HF_MODEL_MAPPINGS = { - # TODO: seems like we should be able to build this table dynamically ... 
- "Llama3.1-8B": "meta-llama/Llama-3.1-8B", - "Llama3.1-70B": "meta-llama/Llama-3.1-70B", - "Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B", - "Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8", - "Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B", - "Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct", - "Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct", - "Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct", - "Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8", - "Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct", - "Llama3.2-1B": "meta-llama/Llama-3.2-1B", - "Llama3.2-3B": "meta-llama/Llama-3.2-3B", - "Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision", - "Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision", - "Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct", - "Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct", - "Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct", - "Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct", - "Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision", - "Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4", - "Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B", - "Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B", - "Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8", - "Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M", - "Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B", - } - def __init__(self, config: VLLMConfig): - Inference.__init__(self) - ModelRegistryHelper.__init__( - self, - stack_to_provider_models_map=self.HF_MODEL_MAPPINGS, - ) self.config = config self.engine = None - - tokenizer = Tokenizer.get_instance() - self.formatter = ChatFormat(tokenizer) + self.formatter = ChatFormat(Tokenizer.get_instance()) async def initialize(self): - """Initialize the vLLM inference adapter.""" - log.info("Initializing vLLM inference adapter") # Disable usage stats reporting. This would be a surprising thing for most @@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): if "VLLM_NO_USAGE_STATS" not in os.environ: os.environ["VLLM_NO_USAGE_STATS"] = "1" - hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model) + model = resolve_model(self.config.model) + if model is None: + raise ValueError(f"Unknown model {self.config.model}") + + if model.huggingface_repo is None: + raise ValueError(f"Model {self.config.model} needs a huggingface repo") # TODO -- there are a ton of options supported here ... 
- engine_args = AsyncEngineArgs() - engine_args.model = hf_model - # We will need a new config item for this in the future if model support is more broad - # than it is today (llama only) - engine_args.tokenizer = hf_model - engine_args.tensor_parallel_size = self.config.tensor_parallel_size + engine_args = AsyncEngineArgs( + model=model.huggingface_repo, + tokenizer=model.huggingface_repo, + tensor_parallel_size=self.config.tensor_parallel_size, + enforce_eager=self.config.enforce_eager, + gpu_memory_utilization=self.config.gpu_memory_utilization, + guided_decoding_backend="lm-format-enforcer", + ) self.engine = AsyncLLMEngine.from_engine_args(engine_args) @@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): if self.engine: self.engine.shutdown_background_loop() + async def register_model(self, model: ModelDef) -> None: + raise ValueError( + "You cannot dynamically add a model to a running vllm instance" + ) + + async def list_models(self) -> List[ModelDef]: + return [ + ModelDef( + identifier=self.config.model, + llama_model=self.config.model, + ) + ] + + def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams: + if sampling_params is None: + return VLLMSamplingParams(max_tokens=self.config.max_tokens) + + # TODO convert what I saw in my first test ... but surely there's more to do here + kwargs = { + "temperature": sampling_params.temperature, + "max_tokens": self.config.max_tokens, + } + if sampling_params.top_k: + kwargs["top_k"] = sampling_params.top_k + if sampling_params.top_p: + kwargs["top_p"] = sampling_params.top_p + if sampling_params.max_tokens: + kwargs["max_tokens"] = sampling_params.max_tokens + if sampling_params.repetition_penalty > 0: + kwargs["repetition_penalty"] = sampling_params.repetition_penalty + + return VLLMSamplingParams(**kwargs) + async def completion( self, model: str, content: InterleavedTextMedia, - sampling_params: Any | None = ..., - stream: bool | None = False, - logprobs: LogProbConfig | None = None, + sampling_params: Optional[SamplingParams] = SamplingParams(), + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, ) -> CompletionResponse | CompletionResponseStreamChunk: log.info("vLLM completion") messages = [UserMessage(content=content)] @@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): async def chat_completion( self, model: str, - messages: list[Message], - sampling_params: Any | None = ..., - tools: list[ToolDefinition] | None = ..., - tool_choice: ToolChoice | None = ..., - tool_prompt_format: ToolPromptFormat | None = ..., - stream: bool | None = False, - logprobs: LogProbConfig | None = None, + messages: List[Message], + sampling_params: Optional[SamplingParams] = SamplingParams(), + tools: Optional[List[ToolDefinition]] = None, + tool_choice: Optional[ToolChoice] = ToolChoice.auto, + tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json, + response_format: Optional[ResponseFormat] = None, + stream: Optional[bool] = False, + logprobs: Optional[LogProbConfig] = None, ) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk: log.info("vLLM chat completion") @@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): request_id = _random_uuid() prompt = chat_completion_request_to_prompt(request, self.formatter) - vllm_sampling_params = _vllm_sampling_params(request.sampling_params) + vllm_sampling_params = self._sampling_params(request.sampling_params) 
results_generator = self.engine.generate( prompt, vllm_sampling_params, request_id ) @@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference): self, request: ChatCompletionRequest, results_generator: AsyncGenerator ) -> AsyncGenerator: async def _generate_and_convert_to_openai_compat(): + cur = [] async for chunk in results_generator: if not chunk.outputs: log.warning("Empty chunk received") continue - text = "".join([output.text for output in chunk.outputs]) + output = chunk.outputs[-1] + + new_tokens = output.token_ids[len(cur) :] + text = self.formatter.tokenizer.decode(new_tokens) + cur.extend(new_tokens) choice = OpenAICompatCompletionChoice( - finish_reason=chunk.outputs[-1].stop_reason, + finish_reason=output.finish_reason, text=text, ) yield OpenAICompatCompletionResponse( diff --git a/llama_stack/providers/tests/eval/test_eval.py b/llama_stack/providers/tests/eval/test_eval.py index 4632cdd96..6b0d99a22 100644 --- a/llama_stack/providers/tests/eval/test_eval.py +++ b/llama_stack/providers/tests/eval/test_eval.py @@ -62,7 +62,7 @@ async def test_eval(eval_settings): response = await eval_impl.evaluate_batch( dataset_id=response[0].identifier, candidate=ModelCandidate( - model="Llama3.1-8B-Instruct", + model="Llama3.2-1B-Instruct", sampling_params=SamplingParams(), ), scoring_functions=["subset_of"], diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml new file mode 100644 index 000000000..a3ff27949 --- /dev/null +++ b/llama_stack/templates/bedrock/build.yaml @@ -0,0 +1,9 @@ +name: bedrock +distribution_spec: + description: Use Amazon Bedrock APIs. + providers: + inference: remote::bedrock + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/databricks/build.yaml b/llama_stack/templates/databricks/build.yaml new file mode 100644 index 000000000..f6c8b50a1 --- /dev/null +++ b/llama_stack/templates/databricks/build.yaml @@ -0,0 +1,9 @@ +name: databricks +distribution_spec: + description: Use Databricks for running LLM inference + providers: + inference: remote::databricks + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml new file mode 100644 index 000000000..37129bef0 --- /dev/null +++ b/llama_stack/templates/fireworks/build.yaml @@ -0,0 +1,9 @@ +name: fireworks +distribution_spec: + description: Use Fireworks.ai for running LLM inference + providers: + inference: remote::fireworks + memory: meta-reference + safety: meta-reference + agents: meta-reference + telemetry: meta-reference diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml new file mode 100644 index 000000000..6c84e5ccf --- /dev/null +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -0,0 +1,9 @@ +name: hf-endpoint +distribution_spec: + description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints." 
+  providers:
+    inference: remote::hf::endpoint
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml
new file mode 100644
index 000000000..32561c1fa
--- /dev/null
+++ b/llama_stack/templates/hf-serverless/build.yaml
@@ -0,0 +1,9 @@
+name: hf-serverless
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  providers:
+    inference: remote::hf::serverless
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml
new file mode 100644
index 000000000..d0fe93aa3
--- /dev/null
+++ b/llama_stack/templates/meta-reference-gpu/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
new file mode 100644
index 000000000..20500ea5a
--- /dev/null
+++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-quantized-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference-quantized
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml
new file mode 100644
index 000000000..06de2fc3c
--- /dev/null
+++ b/llama_stack/templates/ollama/build.yaml
@@ -0,0 +1,12 @@
+name: ollama
+distribution_spec:
+  description: Use ollama for running LLM inference
+  providers:
+    inference: remote::ollama
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml
new file mode 100644
index 000000000..c5e618bb6
--- /dev/null
+++ b/llama_stack/templates/tgi/build.yaml
@@ -0,0 +1,12 @@
+name: tgi
+distribution_spec:
+  description: Use TGI for running LLM inference
+  providers:
+    inference: remote::tgi
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml
new file mode 100644
index 000000000..5232aeb93
--- /dev/null
+++ b/llama_stack/templates/together/build.yaml
@@ -0,0 +1,9 @@
+name: together
+distribution_spec:
+  description: Use Together.ai for running LLM inference
+  providers:
+    inference: remote::together
+    memory: remote::weaviate
+    safety: remote::together
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/vllm/build.yaml b/llama_stack/templates/vllm/build.yaml
new file mode 100644
index 000000000..d842896db
--- /dev/null
+++ b/llama_stack/templates/vllm/build.yaml
@@ -0,0 +1,9 @@
+name: vllm
+distribution_spec:
+  description: Like local, but use vLLM for running LLM inference
+  providers:
+    inference: vllm
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference