Merge branch 'main' into evals_6

Xi Yan 2024-10-25 12:55:28 -07:00
commit d95bef7f2e
38 changed files with 352 additions and 346 deletions

View file

@ -1,4 +1,4 @@
-exclude: 'build'
+exclude: 'build/'
default_language_version:
python: python3

View file

@ -1,4 +1,4 @@
include requirements.txt
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
-include distributions/*/build.yaml
+include llama_stack/templates/*/build.yaml

View file

@ -1,10 +0,0 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

View file

@ -1,10 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/databricks/build.yaml

View file

@ -49,7 +49,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template fireworks --image-type conda
# -- modify run.yaml to a valid Fireworks server endpoint
llama stack run ./run.yaml
```

View file

@ -1,10 +0,0 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/build.yaml

View file

@ -1,10 +0,0 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/hf-endpoint/build.yaml

View file

@ -1,10 +0,0 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/hf-serverless/build.yaml

View file

@ -1,14 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

View file

@ -1,14 +0,0 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

View file

@ -86,6 +86,6 @@ inference:
**Via Conda**
```
-llama stack build --config ./build.yaml
+llama stack build --template ollama --image-type conda
llama stack run ./gpu/run.yaml
```

View file

@ -1,13 +0,0 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -88,7 +88,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template tgi --image-type conda
# -- start a TGI server endpoint
llama stack run ./gpu/run.yaml
```

View file

@ -1,13 +0,0 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/build.yaml

View file

@ -62,7 +62,7 @@ memory:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```

View file

@ -1,10 +0,0 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference
image_type: docker

View file

@ -0,0 +1 @@
../../llama_stack/templates/together/build.yaml

View file

@ -1,10 +0,0 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

View file

@ -0,0 +1 @@
../../llama_stack/templates/vllm/build.yaml

View file

@ -279,11 +279,11 @@ llama stack build --list-templates
You may then pick a template to build your distribution with providers fitted to your liking.
```
-llama stack build --template local-tgi --name my-tgi-stack
+llama stack build --template local-tgi --name my-tgi-stack --image-type conda
```
```
-$ llama stack build --template local-tgi --name my-tgi-stack
+$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~
#### Building from config file
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
-- The config file will be of contents like the ones in `llama_stack/distributions/templates/`.
+- The config file will be of contents like the ones in `llama_stack/templates/`.
```
-$ cat llama_stack/distribution/templates/local-ollama-build.yaml
+$ cat build.yaml
name: local-ollama
distribution_spec:
@ -311,7 +311,7 @@ image_type: conda
```
```
-llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
+llama stack build --config build.yaml
```
#### How to build distribution with Docker image

View file

@ -35,11 +35,7 @@ You have two ways to start up Llama stack server:
1. **Starting up server via docker**:
-We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links.
-- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
-- This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
-- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
-- This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU.
+We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder.
> [!NOTE]
> For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.

View file

@ -12,9 +12,7 @@ import os
from functools import lru_cache
from pathlib import Path
-TEMPLATES_PATH = (
-Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions"
-)
+TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates"
@lru_cache()
@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]:
with open(p, "r") as f:
build_config = BuildConfig(**yaml.safe_load(f))
template_specs.append(build_config)
return template_specs
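For context, a rough standalone sketch of what template discovery looks like after this change: every `llama_stack/templates/<name>/build.yaml` gets parsed into a build spec. The `TemplateInfo` shape and the glob pattern below are illustrative assumptions, not the real `BuildConfig`.

```python
from dataclasses import dataclass
from pathlib import Path

import yaml


@dataclass
class TemplateInfo:  # illustrative stand-in, not the real BuildConfig
    name: str
    description: str


def load_templates(templates_root: Path) -> list:
    """Parse every <template>/build.yaml under templates_root."""
    specs = []
    for p in sorted(templates_root.glob("*/build.yaml")):
        with open(p, "r") as f:
            data = yaml.safe_load(f)
        specs.append(
            TemplateInfo(
                name=data["name"],
                description=data["distribution_spec"]["description"],
            )
        )
    return specs


for spec in load_templates(Path("llama_stack/templates")):
    print(f"{spec.name}: {spec.description}")
```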
@ -78,112 +75,17 @@ class StackBuild(Subcommand):
choices=["conda", "docker"],
)
def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
if os.getenv("CONDA_PREFIX", ""):
conda_dir = (
Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
)
else:
cprint(
"Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
color="green",
)
conda_dir = (
Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
)
build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
if build_config_file.exists():
return build_config_file
return None
def _run_stack_build_command_from_build_config(
self, build_config: BuildConfig
) -> None:
import json
import os
import yaml
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.serialize import EnumEncoder
from termcolor import cprint
# save build.yaml spec for building same distribution again
if build_config.image_type == ImageType.docker.value:
# docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
llama_stack_path = Path(
os.path.abspath(__file__)
).parent.parent.parent.parent
build_dir = llama_stack_path / "tmp/configs/"
else:
build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
os.makedirs(build_dir, exist_ok=True)
build_file_path = build_dir / f"{build_config.name}-build.yaml"
with open(build_file_path, "w") as f:
to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
return_code = build_image(build_config, build_file_path)
if return_code != 0:
return
configure_name = (
build_config.name
if build_config.image_type == "conda"
else (f"llamastack-{build_config.name}")
)
if build_config.image_type == "conda":
cprint(
f"You can now run `llama stack configure {configure_name}`",
color="green",
)
else:
cprint(
f"You can now run `llama stack run {build_config.name}`",
color="green",
)
def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
import json
from llama_stack.cli.table import print_table
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Template Name",
"Providers",
"Description",
]
rows = []
for spec in available_templates_specs():
rows.append(
[
spec.name,
json.dumps(spec.distribution_spec.providers, indent=2),
spec.distribution_spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
import textwrap
import yaml
-from llama_stack.distribution.distribution import get_provider_registry
from prompt_toolkit import prompt
from prompt_toolkit.completion import WordCompleter
from prompt_toolkit.validation import Validator
from termcolor import cprint
+from llama_stack.distribution.distribution import get_provider_registry
if args.list_templates:
self._run_template_list_cmd(args)
return
@ -194,19 +96,22 @@ class StackBuild(Subcommand):
"You must specify a name for the build using --name when using a template" "You must specify a name for the build using --name when using a template"
) )
return return
build_path = TEMPLATES_PATH / f"{args.template}-build.yaml" available_templates = available_templates_specs()
if not build_path.exists(): for build_config in available_templates:
self.parser.error( if build_config.name == args.template:
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates" build_config.name = args.name
) if args.image_type:
return build_config.image_type = args.image_type
with open(build_path, "r") as f: else:
build_config = BuildConfig(**yaml.safe_load(f)) self.parser.error(
build_config.name = args.name f"Please specify a image-type (docker | conda) for {args.template}"
if args.image_type: )
build_config.image_type = args.image_type self._run_stack_build_command_from_build_config(build_config)
self._run_stack_build_command_from_build_config(build_config) return
self.parser.error(
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
)
return return
# try to see if we can find a pre-existing build config file through name # try to see if we can find a pre-existing build config file through name
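A minimal sketch of the new selection flow above, using stand-in types rather than the real CLI classes: match the requested template by name, rename it to the user-supplied build name, and require an explicit image type.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class TemplateSpec:  # stand-in for the objects returned by available_templates_specs()
    name: str
    image_type: Optional[str] = None


def resolve_template(
    templates: List[TemplateSpec], template: str, name: str, image_type: Optional[str]
) -> TemplateSpec:
    for spec in templates:
        if spec.name == template:
            spec.name = name  # build under the user-chosen name
            if image_type:
                spec.image_type = image_type  # "docker" or "conda"
            else:
                raise ValueError(f"Please specify an image-type (docker | conda) for {template}")
            return spec
    raise ValueError(
        f"Could not find template {template}. Run `llama stack build --list-templates` to see what is available."
    )


print(resolve_template([TemplateSpec("tgi")], "tgi", "my-tgi-stack", "conda"))
```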
@ -297,3 +202,99 @@ class StackBuild(Subcommand):
self.parser.error(f"Could not parse config file {args.config}: {e}") self.parser.error(f"Could not parse config file {args.config}: {e}")
return return
self._run_stack_build_command_from_build_config(build_config) self._run_stack_build_command_from_build_config(build_config)
def _get_build_config_from_name(self, args: argparse.Namespace) -> Optional[Path]:
if os.getenv("CONDA_PREFIX", ""):
conda_dir = (
Path(os.getenv("CONDA_PREFIX")).parent / f"llamastack-{args.name}"
)
else:
cprint(
"Cannot find CONDA_PREFIX. Trying default conda path ~/.conda/envs...",
color="green",
)
conda_dir = (
Path(os.path.expanduser("~/.conda/envs")) / f"llamastack-{args.name}"
)
build_config_file = Path(conda_dir) / f"{args.name}-build.yaml"
if build_config_file.exists():
return build_config_file
return None
def _run_stack_build_command_from_build_config(
self, build_config: BuildConfig
) -> None:
import json
import os
import yaml
from termcolor import cprint
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.serialize import EnumEncoder
# save build.yaml spec for building same distribution again
if build_config.image_type == ImageType.docker.value:
# docker needs build file to be in the llama-stack repo dir to be able to copy over to the image
llama_stack_path = Path(
os.path.abspath(__file__)
).parent.parent.parent.parent
build_dir = llama_stack_path / "tmp/configs/"
else:
build_dir = DISTRIBS_BASE_DIR / f"llamastack-{build_config.name}"
os.makedirs(build_dir, exist_ok=True)
build_file_path = build_dir / f"{build_config.name}-build.yaml"
with open(build_file_path, "w") as f:
to_write = json.loads(json.dumps(build_config.dict(), cls=EnumEncoder))
f.write(yaml.dump(to_write, sort_keys=False))
return_code = build_image(build_config, build_file_path)
if return_code != 0:
return
configure_name = (
build_config.name
if build_config.image_type == "conda"
else (f"llamastack-{build_config.name}")
)
if build_config.image_type == "conda":
cprint(
f"You can now run `llama stack configure {configure_name}`",
color="green",
)
else:
cprint(
f"You can now edit your run.yaml file and run `docker run -it -p 5000:5000 {build_config.name}`. See full command in llama-stack/distributions/",
color="green",
)
def _run_template_list_cmd(self, args: argparse.Namespace) -> None:
import json
from llama_stack.cli.table import print_table
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"Template Name",
"Providers",
"Description",
]
rows = []
for spec in available_templates_specs():
rows.append(
[
spec.name,
json.dumps(spec.distribution_spec.providers, indent=2),
spec.distribution_spec.description,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
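The build-file write path above round-trips the config through JSON before dumping YAML. A small self-contained illustration of why, with a local `EnumEncoder` standing in for the helper imported from `llama_stack.distribution.utils.serialize`:

```python
import json
from enum import Enum

import yaml


class ImageType(Enum):
    conda = "conda"
    docker = "docker"


class EnumEncoder(json.JSONEncoder):
    """Local stand-in for llama_stack.distribution.utils.serialize.EnumEncoder."""

    def default(self, obj):
        if isinstance(obj, Enum):
            return obj.value
        return super().default(obj)


build_config = {"name": "my-stack", "image_type": ImageType.conda}

# Dumping the raw dict would embed a Python-object tag for the Enum (or fail with
# safe_dump); the JSON round-trip normalizes it to the plain string "conda" first.
to_write = json.loads(json.dumps(build_config, cls=EnumEncoder))
print(yaml.dump(to_write, sort_keys=False))
```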

View file

@ -8,18 +8,19 @@ from enum import Enum
from typing import List, Optional
import pkg_resources
-from llama_stack.distribution.utils.exec import run_with_pty
from pydantic import BaseModel
from termcolor import cprint
+from llama_stack.distribution.utils.exec import run_with_pty
from llama_stack.distribution.datatypes import * # noqa: F403
from pathlib import Path
-from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
from llama_stack.distribution.distribution import get_provider_registry
+from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.

View file

@ -1,5 +1,11 @@
#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
@ -34,9 +40,6 @@ REPO_CONFIGS_DIR="$REPO_DIR/tmp/configs"
TEMP_DIR=$(mktemp -d)
-llama stack configure $build_file_path
-cp $host_build_dir/$build_name-run.yaml $REPO_CONFIGS_DIR
add_to_docker() {
local input
output_file="$TEMP_DIR/Dockerfile"
@ -113,7 +116,6 @@ ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
EOF
add_to_docker "ADD tmp/configs/$(basename "$build_file_path") ./llamastack-build.yaml"
-add_to_docker "ADD tmp/configs/$build_name-run.yaml ./llamastack-run.yaml"
printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile"
cat $TEMP_DIR/Dockerfile

View file

@ -29,7 +29,7 @@ if [ $# -lt 3 ]; then
fi
build_name="$1"
-docker_image="llamastack-$build_name"
+docker_image="distribution-$build_name"
shift
yaml_config="$1"

View file

@ -169,7 +169,7 @@ class MetaReferenceAgentsImpl(Agents):
turn_ids: Optional[List[str]] = None,
) -> Session:
session = await self.persistence_store.get(f"session:{agent_id}:{session_id}")
-session = Session(**json.loads(session))
+session = Session(**json.loads(session), turns=[])
turns = []
if turn_ids:
for turn_id in turn_ids:
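A minimal illustration of why `turns=[]` is now passed when rehydrating the session: if the persisted JSON omits a required list field (as the simplified, hypothetical model below assumes), validation fails unless the field is supplied explicitly.

```python
import json
from typing import List

from pydantic import BaseModel


class StoredSession(BaseModel):
    """Hypothetical, simplified stand-in for the real Session datatype."""

    session_id: str
    session_name: str
    turns: List[dict]  # required field that is not present in the persisted JSON


raw = json.dumps({"session_id": "abc", "session_name": "demo"})

try:
    StoredSession(**json.loads(raw))  # missing `turns` -> ValidationError
except Exception as e:
    print(type(e).__name__)

session = StoredSession(**json.loads(raw), turns=[])  # mirrors the fix above
print(session.turns)
```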

View file

@ -1,5 +1,11 @@
#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
if [[ $# -ne 1 ]]; then
echo "Error: Please provide the name of CONDA environment you wish to create"
exit 1

View file

@ -15,13 +15,24 @@ class VLLMConfig(BaseModel):
"""Configuration for the vLLM inference provider.""" """Configuration for the vLLM inference provider."""
model: str = Field( model: str = Field(
default="Llama3.1-8B-Instruct", default="Llama3.2-3B-Instruct",
description="Model descriptor from `llama model list`", description="Model descriptor from `llama model list`",
) )
tensor_parallel_size: int = Field( tensor_parallel_size: int = Field(
default=1, default=1,
description="Number of tensor parallel replicas (number of GPUs to use).", description="Number of tensor parallel replicas (number of GPUs to use).",
) )
max_tokens: int = Field(
default=4096,
description="Maximum number of tokens to generate.",
)
enforce_eager: bool = Field(
default=False,
description="Whether to use eager mode for inference (otherwise cuda graphs are used).",
)
gpu_memory_utilization: float = Field(
default=0.3,
)
@field_validator("model") @field_validator("model")
@classmethod @classmethod
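A stand-in mirroring the fields added above (not the real `VLLMConfig`), just to show the new knobs a provider config would expose:

```python
from pydantic import BaseModel, Field


class VLLMConfigSketch(BaseModel):
    """Stand-in mirroring the fields above; the real class lives in the provider's config module."""

    model: str = Field(default="Llama3.2-3B-Instruct")
    tensor_parallel_size: int = Field(default=1)
    max_tokens: int = Field(default=4096)
    enforce_eager: bool = Field(default=False)
    gpu_memory_utilization: float = Field(default=0.3)


# e.g. a single-GPU dev setup that favors lower memory use over throughput
cfg = VLLMConfigSketch(enforce_eager=True, gpu_memory_utilization=0.5)
print(cfg)
```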

View file

@ -7,11 +7,12 @@
import logging
import os
import uuid
-from typing import Any, AsyncGenerator
+from typing import AsyncGenerator, Optional
from llama_models.llama3.api.chat_format import ChatFormat
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_models.llama3.api.tokenizer import Tokenizer
+from llama_models.sku_list import resolve_model
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
@ -19,7 +20,7 @@ from vllm.sampling_params import SamplingParams as VLLMSamplingParams
from llama_stack.apis.inference import * # noqa: F403
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack.providers.datatypes import ModelDef, ModelsProtocolPrivate
from llama_stack.providers.utils.inference.openai_compat import (
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
@ -40,74 +41,15 @@ def _random_uuid() -> str:
return str(uuid.uuid4().hex)
-def _vllm_sampling_params(sampling_params: Any) -> VLLMSamplingParams:
-"""Convert sampling params to vLLM sampling params."""
-if sampling_params is None:
-return VLLMSamplingParams()
-# TODO convert what I saw in my first test ... but surely there's more to do here
-kwargs = {
-"temperature": sampling_params.temperature,
-}
-if sampling_params.top_k >= 1:
-kwargs["top_k"] = sampling_params.top_k
-if sampling_params.top_p:
-kwargs["top_p"] = sampling_params.top_p
-if sampling_params.max_tokens >= 1:
-kwargs["max_tokens"] = sampling_params.max_tokens
-if sampling_params.repetition_penalty > 0:
-kwargs["repetition_penalty"] = sampling_params.repetition_penalty
-return VLLMSamplingParams(**kwargs)
-class VLLMInferenceImpl(ModelRegistryHelper, Inference):
+class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
"""Inference implementation for vLLM."""
-HF_MODEL_MAPPINGS = {
-# TODO: seems like we should be able to build this table dynamically ...
-"Llama3.1-8B": "meta-llama/Llama-3.1-8B",
-"Llama3.1-70B": "meta-llama/Llama-3.1-70B",
-"Llama3.1-405B:bf16-mp8": "meta-llama/Llama-3.1-405B",
-"Llama3.1-405B": "meta-llama/Llama-3.1-405B-FP8",
-"Llama3.1-405B:bf16-mp16": "meta-llama/Llama-3.1-405B",
-"Llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
-"Llama3.1-70B-Instruct": "meta-llama/Llama-3.1-70B-Instruct",
-"Llama3.1-405B-Instruct:bf16-mp8": "meta-llama/Llama-3.1-405B-Instruct",
-"Llama3.1-405B-Instruct": "meta-llama/Llama-3.1-405B-Instruct-FP8",
-"Llama3.1-405B-Instruct:bf16-mp16": "meta-llama/Llama-3.1-405B-Instruct",
-"Llama3.2-1B": "meta-llama/Llama-3.2-1B",
-"Llama3.2-3B": "meta-llama/Llama-3.2-3B",
-"Llama3.2-11B-Vision": "meta-llama/Llama-3.2-11B-Vision",
-"Llama3.2-90B-Vision": "meta-llama/Llama-3.2-90B-Vision",
-"Llama3.2-1B-Instruct": "meta-llama/Llama-3.2-1B-Instruct",
-"Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
-"Llama3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
-"Llama3.2-90B-Vision-Instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
-"Llama-Guard-3-11B-Vision": "meta-llama/Llama-Guard-3-11B-Vision",
-"Llama-Guard-3-1B:int4-mp1": "meta-llama/Llama-Guard-3-1B-INT4",
-"Llama-Guard-3-1B": "meta-llama/Llama-Guard-3-1B",
-"Llama-Guard-3-8B": "meta-llama/Llama-Guard-3-8B",
-"Llama-Guard-3-8B:int8-mp1": "meta-llama/Llama-Guard-3-8B-INT8",
-"Prompt-Guard-86M": "meta-llama/Prompt-Guard-86M",
-"Llama-Guard-2-8B": "meta-llama/Llama-Guard-2-8B",
-}
def __init__(self, config: VLLMConfig):
-Inference.__init__(self)
-ModelRegistryHelper.__init__(
-self,
-stack_to_provider_models_map=self.HF_MODEL_MAPPINGS,
-)
self.config = config
self.engine = None
-self.formatter = ChatFormat(Tokenizer.get_instance())
+tokenizer = Tokenizer.get_instance()
+self.formatter = ChatFormat(tokenizer)
async def initialize(self):
-"""Initialize the vLLM inference adapter."""
log.info("Initializing vLLM inference adapter")
# Disable usage stats reporting. This would be a surprising thing for most
@ -116,15 +58,22 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if "VLLM_NO_USAGE_STATS" not in os.environ: if "VLLM_NO_USAGE_STATS" not in os.environ:
os.environ["VLLM_NO_USAGE_STATS"] = "1" os.environ["VLLM_NO_USAGE_STATS"] = "1"
hf_model = self.HF_MODEL_MAPPINGS.get(self.config.model) model = resolve_model(self.config.model)
if model is None:
raise ValueError(f"Unknown model {self.config.model}")
if model.huggingface_repo is None:
raise ValueError(f"Model {self.config.model} needs a huggingface repo")
# TODO -- there are a ton of options supported here ... # TODO -- there are a ton of options supported here ...
engine_args = AsyncEngineArgs() engine_args = AsyncEngineArgs(
engine_args.model = hf_model model=model.huggingface_repo,
# We will need a new config item for this in the future if model support is more broad tokenizer=model.huggingface_repo,
# than it is today (llama only) tensor_parallel_size=self.config.tensor_parallel_size,
engine_args.tokenizer = hf_model enforce_eager=self.config.enforce_eager,
engine_args.tensor_parallel_size = self.config.tensor_parallel_size gpu_memory_utilization=self.config.gpu_memory_utilization,
guided_decoding_backend="lm-format-enforcer",
)
self.engine = AsyncLLMEngine.from_engine_args(engine_args) self.engine = AsyncLLMEngine.from_engine_args(engine_args)
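The hard-coded HF mapping table is replaced by `resolve_model` from `llama_models.sku_list`. A quick sketch of the same lookup outside the provider, assuming the `llama_models` package is installed and using a hypothetical descriptor:

```python
from llama_models.sku_list import resolve_model

descriptor = "Llama3.2-3B-Instruct"  # example descriptor, as listed by `llama model list`

model = resolve_model(descriptor)
if model is None:
    raise ValueError(f"Unknown model {descriptor}")
if model.huggingface_repo is None:
    raise ValueError(f"Model {descriptor} needs a huggingface repo")

print(model.huggingface_repo)  # the repo the vLLM engine would be pointed at
```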
@ -134,13 +83,47 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
if self.engine:
self.engine.shutdown_background_loop()
async def register_model(self, model: ModelDef) -> None:
raise ValueError(
"You cannot dynamically add a model to a running vllm instance"
)
async def list_models(self) -> List[ModelDef]:
return [
ModelDef(
identifier=self.config.model,
llama_model=self.config.model,
)
]
def _sampling_params(self, sampling_params: SamplingParams) -> VLLMSamplingParams:
if sampling_params is None:
return VLLMSamplingParams(max_tokens=self.config.max_tokens)
# TODO convert what I saw in my first test ... but surely there's more to do here
kwargs = {
"temperature": sampling_params.temperature,
"max_tokens": self.config.max_tokens,
}
if sampling_params.top_k:
kwargs["top_k"] = sampling_params.top_k
if sampling_params.top_p:
kwargs["top_p"] = sampling_params.top_p
if sampling_params.max_tokens:
kwargs["max_tokens"] = sampling_params.max_tokens
if sampling_params.repetition_penalty > 0:
kwargs["repetition_penalty"] = sampling_params.repetition_penalty
return VLLMSamplingParams(**kwargs)
async def completion(
self,
model: str,
content: InterleavedTextMedia,
-sampling_params: Any | None = ...,
-stream: bool | None = False,
-logprobs: LogProbConfig | None = None,
+sampling_params: Optional[SamplingParams] = SamplingParams(),
+response_format: Optional[ResponseFormat] = None,
+stream: Optional[bool] = False,
+logprobs: Optional[LogProbConfig] = None,
) -> CompletionResponse | CompletionResponseStreamChunk:
log.info("vLLM completion")
messages = [UserMessage(content=content)]
@ -155,13 +138,14 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
async def chat_completion(
self,
model: str,
-messages: list[Message],
-sampling_params: Any | None = ...,
-tools: list[ToolDefinition] | None = ...,
-tool_choice: ToolChoice | None = ...,
-tool_prompt_format: ToolPromptFormat | None = ...,
-stream: bool | None = False,
-logprobs: LogProbConfig | None = None,
+messages: List[Message],
+sampling_params: Optional[SamplingParams] = SamplingParams(),
+tools: Optional[List[ToolDefinition]] = None,
+tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
+response_format: Optional[ResponseFormat] = None,
+stream: Optional[bool] = False,
+logprobs: Optional[LogProbConfig] = None,
) -> ChatCompletionResponse | ChatCompletionResponseStreamChunk:
log.info("vLLM chat completion")
@ -182,7 +166,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
request_id = _random_uuid()
prompt = chat_completion_request_to_prompt(request, self.formatter)
-vllm_sampling_params = _vllm_sampling_params(request.sampling_params)
+vllm_sampling_params = self._sampling_params(request.sampling_params)
results_generator = self.engine.generate(
prompt, vllm_sampling_params, request_id
)
@ -213,14 +197,19 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
self, request: ChatCompletionRequest, results_generator: AsyncGenerator
) -> AsyncGenerator:
async def _generate_and_convert_to_openai_compat():
+cur = []
async for chunk in results_generator:
if not chunk.outputs:
log.warning("Empty chunk received")
continue
-text = "".join([output.text for output in chunk.outputs])
+output = chunk.outputs[-1]
+new_tokens = output.token_ids[len(cur) :]
+text = self.formatter.tokenizer.decode(new_tokens)
+cur.extend(new_tokens)
choice = OpenAICompatCompletionChoice(
-finish_reason=chunk.outputs[-1].stop_reason,
+finish_reason=output.finish_reason,
text=text,
)
yield OpenAICompatCompletionResponse(
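The streaming path now decodes only the newly generated token ids per chunk instead of re-joining the full output text. A self-contained sketch of that delta-decoding pattern, with a toy tokenizer standing in for the real one:

```python
class ToyTokenizer:
    """Toy stand-in for the real tokenizer; one token id maps to one character."""

    def decode(self, token_ids):
        return "".join(chr(t) for t in token_ids)


def stream_text(cumulative_token_ids, tokenizer):
    """Yield only the newly generated text per engine step.

    `cumulative_token_ids` mimics vLLM returning the full token id list of the
    best output on every chunk (chunk.outputs[-1].token_ids above).
    """
    cur = []
    for token_ids in cumulative_token_ids:
        new_tokens = token_ids[len(cur):]
        text = tokenizer.decode(new_tokens)
        cur.extend(new_tokens)
        yield text


steps = [[72, 101], [72, 101, 108, 108, 111]]
print(list(stream_text(steps, ToyTokenizer())))  # -> ['He', 'llo']
```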

View file

@ -0,0 +1,9 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,13 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,13 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,12 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,12 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,9 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference