Merge branch 'main' into add-nvidia-inference-adapter

Matthew Farrellee 2024-11-21 06:49:13 -05:00
commit 5fbfb9d854
92 changed files with 2145 additions and 678 deletions

View file

@ -8,7 +8,6 @@ import argparse
from llama_stack.cli.subcommand import Subcommand
from llama_stack.distribution.datatypes import * # noqa: F403
import importlib
import os
import shutil
from functools import lru_cache
@ -258,6 +257,7 @@ class StackBuild(Subcommand):
) -> None:
import json
import os
import re
import yaml
from termcolor import cprint
@ -286,17 +286,19 @@ class StackBuild(Subcommand):
os.makedirs(build_dir, exist_ok=True)
run_config_file = build_dir / f"{build_config.name}-run.yaml"
shutil.copy(template_path, run_config_file)
module_name = f"llama_stack.templates.{template_name}"
module = importlib.import_module(module_name)
distribution_template = module.get_distribution_template()
with open(template_path, "r") as f:
yaml_content = f.read()
# Find all ${env.VARIABLE} patterns
env_vars = set(re.findall(r"\${env\.([A-Za-z0-9_]+)}", yaml_content))
cprint("Build Successful! Next steps: ", color="green")
env_vars = ", ".join(distribution_template.run_config_env_vars.keys())
cprint(
f" 1. Set the environment variables: {env_vars}",
f" 1. Set the environment variables: {list(env_vars)}",
color="green",
)
cprint(
f" 2. `llama stack run {run_config_file}`",
f" 2. Run: `llama stack run {template_name}`",
color="green",
)
else:
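As a side note, the `${env.VARIABLE}` scan that the old code path performed (and that the template-module lookup above replaces) boils down to a one-line regex. A minimal standalone sketch with illustrative YAML content:

```python
import re

# Illustrative YAML content; the old code path read this from the copied template.
yaml_content = "api_token: ${env.HF_API_TOKEN}\nurl: ${env.TGI_URL}"

# Same pattern as above; it matches plain ${env.VAR} references but not the
# ${env.VAR:default} form used elsewhere in this commit.
env_vars = set(re.findall(r"\${env\.([A-Za-z0-9_]+)}", yaml_content))
print(sorted(env_vars))  # ['HF_API_TOKEN', 'TGI_URL']
```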

View file

@ -5,9 +5,12 @@
# the root directory of this source tree.
import argparse
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand
REPO_ROOT = Path(__file__).parent.parent.parent.parent
class StackRun(Subcommand):
def __init__(self, subparsers: argparse._SubParsersAction):
@ -48,8 +51,6 @@ class StackRun(Subcommand):
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
from pathlib import Path
import pkg_resources
import yaml
@ -66,19 +67,27 @@ class StackRun(Subcommand):
return
config_file = Path(args.config)
if not config_file.exists() and not args.config.endswith(".yaml"):
has_yaml_suffix = args.config.endswith(".yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
)
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not args.config.endswith(".yaml"):
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to docker dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not args.config.endswith(".yaml"):
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(
DISTRIBS_BASE_DIR
@ -92,6 +101,7 @@ class StackRun(Subcommand):
)
return
print(f"Using config file: {config_file}")
config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict)

View file

@ -122,7 +122,7 @@ add_to_docker <<EOF
# This would be good in production but for debugging flexibility let's not add it right now
# We need a more solid production-ready entrypoint.sh anyway
#
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$build_name"]
EOF

View file

@ -170,13 +170,6 @@ class CommonRoutingTableImpl(RoutingTable):
# Get existing objects from registry
existing_obj = await self.dist_registry.get(obj.type, obj.identifier)
# Check for existing registration
if existing_obj and existing_obj.provider_id == obj.provider_id:
print(
f"`{obj.identifier}` already registered with `{existing_obj.provider_id}`"
)
return existing_obj
# if provider_id is not specified, pick an arbitrary one from existing entries
if not obj.provider_id and len(self.impls_by_provider_id) > 0:
obj.provider_id = list(self.impls_by_provider_id.keys())[0]

View file

@ -16,6 +16,7 @@ import traceback
import warnings
from contextlib import asynccontextmanager
from pathlib import Path
from ssl import SSLError
from typing import Any, Dict, Optional
@ -49,6 +50,9 @@ from llama_stack.distribution.stack import (
from .endpoints import get_all_api_endpoints
REPO_ROOT = Path(__file__).parent.parent.parent.parent
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
log = file if hasattr(file, "write") else sys.stderr
traceback.print_stack(file=log)
@ -279,9 +283,12 @@ def main():
parser = argparse.ArgumentParser(description="Start the LlamaStack server.")
parser.add_argument(
"--yaml-config",
default="llamastack-run.yaml",
help="Path to YAML configuration file",
)
parser.add_argument(
"--template",
help="One of the template names in llama_stack/templates (e.g., tgi, fireworks, remote-vllm, etc.)",
)
parser.add_argument("--port", type=int, default=5000, help="Port to listen on")
parser.add_argument(
"--disable-ipv6", action="store_true", help="Whether to disable IPv6 support"
@ -303,10 +310,29 @@ def main():
print(f"Error: {str(e)}")
sys.exit(1)
with open(args.yaml_config, "r") as fp:
if args.yaml_config:
# if the user provided a config file, use it, even if a template was specified
config_file = Path(args.yaml_config)
if not config_file.exists():
raise ValueError(f"Config file {config_file} does not exist")
print(f"Using config file: {config_file}")
elif args.template:
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml"
)
if not config_file.exists():
raise ValueError(f"Template {args.template} does not exist")
print(f"Using template {args.template} config file: {config_file}")
else:
raise ValueError("Either --yaml-config or --template must be provided")
with open(config_file, "r") as fp:
config = replace_env_vars(yaml.safe_load(fp))
config = StackRunConfig(**config)
print("Run configuration:")
print(yaml.dump(config.model_dump(), indent=2))
app = FastAPI()
try:
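Condensed, the precedence the new `main()` implements is: an explicit `--yaml-config` always wins, `--template` is the fallback, and passing neither is an error. A hedged standalone sketch of that resolution logic:

```python
from pathlib import Path
from typing import Optional

REPO_ROOT = Path(__file__).parent.parent.parent.parent  # as in the hunk above

def resolve_config_file(yaml_config: Optional[str], template: Optional[str]) -> Path:
    if yaml_config:
        # An explicit config file wins, even if a template was also given.
        config_file = Path(yaml_config)
        if not config_file.exists():
            raise ValueError(f"Config file {config_file} does not exist")
        return config_file
    if template:
        config_file = REPO_ROOT / "llama_stack" / "templates" / template / "run.yaml"
        if not config_file.exists():
            raise ValueError(f"Template {template} does not exist")
        return config_file
    raise ValueError("Either --yaml-config or --template must be provided")
```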

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from typing import Any, Dict, Optional
from llama_models.datatypes import * # noqa: F403
from llama_models.sku_list import resolve_model
@ -37,8 +37,10 @@ class MetaReferenceInferenceConfig(BaseModel):
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
descriptors = [m.descriptor() for m in permitted_models]
repos = [m.huggingface_repo for m in permitted_models]
if model not in (descriptors + repos):
model_list = "\n\t".join(repos)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)
@ -54,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel):
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
return {
"model": model,
@ -64,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel):
class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
quantization: QuantizationConfig
@classmethod
def sample_run_config(
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
config = super().sample_run_config(model, checkpoint_dir, **kwargs)
config["quantization"] = {
"type": "fp8",
}
return config
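A hedged usage sketch of the new classmethod: the quantized config reuses the base sample and layers the fp8 quantization block on top. The exact output depends on the base class fields, so the dict below is indicative only:

```python
from llama_stack.providers.inline.inference.meta_reference import (
    MetaReferenceQuantizedInferenceConfig,
)

config = MetaReferenceQuantizedInferenceConfig.sample_run_config()
# Assuming the base class contributes the fields shown earlier, roughly:
# {"model": "Llama3.2-3B-Instruct",
#  "checkpoint_dir": "${env.CHECKPOINT_DIR:null}",
#  "quantization": {"type": "fp8"}}
```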

View file

@ -37,19 +37,22 @@ class VLLMConfig(BaseModel):
@classmethod
def sample_run_config(cls):
return {
"model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
"enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
"model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.MAX_TOKENS:4096}",
"enforce_eager": "${env.ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
}
@field_validator("model")
@classmethod
def validate_model(cls, model: str) -> str:
permitted_models = supported_inference_models()
if model not in permitted_models:
model_list = "\n\t".join(permitted_models)
descriptors = [m.descriptor() for m in permitted_models]
repos = [m.huggingface_repo for m in permitted_models]
if model not in (descriptors + repos):
model_list = "\n\t".join(repos)
raise ValueError(
f"Unknown model: `{model}`. Choose from [\n\t{model_list}\n]"
)
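Both this provider and the meta-reference one above now relax the model check to the same membership test. A minimal sketch, with the import path assumed for illustration:

```python
# The import path is an assumption for illustration; the helper is the one
# changed later in this commit to return Model objects.
from llama_stack.providers.utils.inference import supported_inference_models

def is_permitted(model: str) -> bool:
    permitted = supported_inference_models()
    descriptors = [m.descriptor() for m in permitted]
    repos = [m.huggingface_repo for m in permitted]
    # Accept either the Llama descriptor (e.g. "Llama3.2-3B-Instruct") or the
    # Hugging Face repo (e.g. "meta-llama/Llama-3.2-3B-Instruct").
    return model in (descriptors + repos)
```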

View file

@ -4,11 +4,8 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_models.schema_utils import json_schema_type
from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig
@json_schema_type
class BedrockConfig(BedrockBaseConfig):
pass

View file

@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"endpoint_name": endpoint_name,
"api_token": api_token,
}
@json_schema_type
class InferenceAPIImplConfig(BaseModel):
@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
default=None,
description="Your Hugging Face user access token (will default to locally saved token if not provided)",
)
@classmethod
def sample_run_config(
cls,
repo: str = "${env.INFERENCE_MODEL}",
api_token: str = "${env.HF_API_TOKEN}",
**kwargs,
):
return {
"huggingface_repo": repo,
"api_token": api_token,
}
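A hedged usage sketch of the serverless variant: the hf-serverless template later in this commit passes a `repo` override for its safety provider, yielding roughly:

```python
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig

config = InferenceAPIImplConfig.sample_run_config(repo="${env.SAFETY_MODEL}")
# => {"huggingface_repo": "${env.SAFETY_MODEL}", "api_token": "${env.HF_API_TOKEN}"}
```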

View file

@ -147,9 +147,7 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
documents: List[MemoryBankDocument],
ttl_seconds: Optional[int] = None,
) -> None:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
await index.insert_documents(documents)
@ -159,8 +157,20 @@ class ChromaMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
query: InterleavedTextMedia,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
return await index.query_documents(query, params)
async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex:
if bank_id in self.cache:
return self.cache[bank_id]
bank = await self.memory_bank_store.get_memory_bank(bank_id)
if not bank:
raise ValueError(f"Bank {bank_id} not found in Llama Stack")
collection = await self.client.get_collection(bank_id)
if not collection:
raise ValueError(f"Bank {bank_id} not found in Chroma")
index = BankWithIndex(bank=bank, index=ChromaIndex(self.client, collection))
self.cache[bank_id] = index
return index

View file

@ -201,10 +201,7 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
documents: List[MemoryBankDocument],
ttl_seconds: Optional[int] = None,
) -> None:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
await index.insert_documents(documents)
async def query_documents(
@ -213,8 +210,17 @@ class PGVectorMemoryAdapter(Memory, MemoryBanksProtocolPrivate):
query: InterleavedTextMedia,
params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse:
index = self.cache.get(bank_id, None)
if not index:
raise ValueError(f"Bank {bank_id} not found")
index = await self._get_and_cache_bank_index(bank_id)
return await index.query_documents(query, params)
async def _get_and_cache_bank_index(self, bank_id: str) -> BankWithIndex:
if bank_id in self.cache:
return self.cache[bank_id]
bank = await self.memory_bank_store.get_memory_bank(bank_id)
index = BankWithIndex(
bank=bank,
index=PGVectorIndex(bank, ALL_MINILM_L6_V2_DIMENSION, self.cursor),
)
self.cache[bank_id] = index
return index

View file

@ -11,7 +11,6 @@ import pytest
#
# pytest -v -s llama_stack/providers/tests/inference/test_model_registration.py
# -m "meta_reference"
# --env TOGETHER_API_KEY=<your_api_key>
class TestModelRegistration:

View file

@ -5,11 +5,9 @@
# the root directory of this source tree.
from typing import Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class BedrockBaseConfig(BaseModel):
aws_access_key_id: Optional[str] = Field(
default=None,
@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
default=3600,
description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
)
@classmethod
def sample_run_config(cls, **kwargs):
return {}

View file

@ -22,9 +22,9 @@ def is_supported_safety_model(model: Model) -> bool:
]
def supported_inference_models() -> List[str]:
def supported_inference_models() -> List[Model]:
return [
m.descriptor()
m
for m in all_registered_models()
if (
m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2}

View file

@ -178,7 +178,9 @@ def chat_completion_request_to_messages(
cprint(f"Could not resolve model {llama_model}", color="red")
return request.messages
if model.descriptor() not in supported_inference_models():
allowed_models = supported_inference_models()
descriptors = [m.descriptor() for m in allowed_models]
if model.descriptor() not in descriptors:
cprint(f"Unsupported inference model? {model.descriptor()}", color="red")
return request.messages

View file

@ -50,7 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
template.save_distribution(
yaml_output_dir=REPO_ROOT / "llama_stack" / "templates" / template.name,
doc_output_dir=REPO_ROOT
/ "docs/source/getting_started/distributions"
/ "docs/source/distributions"
/ f"{template.distro_type}_distro",
)
else:
@ -103,7 +103,7 @@ def generate_dependencies_file():
deps_file = REPO_ROOT / "distributions" / "dependencies.json"
with open(deps_file, "w") as f:
json.dump(distribution_deps, f, indent=2)
f.write(json.dumps(distribution_deps, indent=2) + "\n")
def main():
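A one-line change worth spelling out: `json.dump` does not newline-terminate the output file, while the replacement does. A minimal sketch with an illustrative payload:

```python
import json

distribution_deps = {"bedrock": ["boto3"]}  # illustrative payload

with open("dependencies.json", "w") as f:
    # json.dump(distribution_deps, f, indent=2) would end the file without a
    # trailing newline; writing the string plus "\n" newline-terminates it.
    f.write(json.dumps(distribution_deps, indent=2) + "\n")
```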

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .bedrock import get_distribution_template # noqa: F401

View file

@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::bedrock"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["remote::bedrock"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
return DistributionTemplate(
name="bedrock",
distro_type="self_hosted",
description="Use AWS Bedrock for running LLM inference and safety",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[],
run_configs={
"run.yaml": RunConfigSettings(),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
},
)
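This is the module the updated build command consumes: it imports `llama_stack.templates.<name>` and reads `run_config_env_vars` off the returned template. A hedged sketch of that round trip for the bedrock template:

```python
import importlib

# Mirrors the lookup the build command now performs for any template name.
module = importlib.import_module("llama_stack.templates.bedrock")
distribution_template = module.get_distribution_template()
print(", ".join(distribution_template.run_config_env_vars.keys()))
# LLAMASTACK_PORT
```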

View file

@ -1,9 +1,19 @@
version: '2'
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
description: Use AWS Bedrock for running LLM inference and safety
docker_image: null
providers:
inference: remote::bedrock
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::bedrock
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- remote::bedrock
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,70 @@
# Bedrock Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to an AWS Bedrock API key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
## Running Llama Stack with AWS Bedrock
You can do this via Conda (build the code) or Docker, which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
### Via Conda
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
--env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
--env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

View file

@ -0,0 +1,49 @@
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
safety:
- provider_id: bedrock
provider_type: remote::bedrock
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,9 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: inline::faiss
safety: inline::llama-guard
agents: meta-reference
telemetry: meta-reference

View file

@ -1,5 +1,12 @@
# Fireworks Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -43,9 +50,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -55,6 +60,6 @@ docker run \
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_endpoint import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
docker_image: null
providers:
inference: remote::hf::endpoint
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::hf::endpoint
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,97 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::endpoint"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-endpoint",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-endpoint",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-endpoint-safety",
)
return DistributionTemplate(
name="hf-endpoint",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-endpoint-safety",
provider_type="remote::hf::endpoint",
config=InferenceEndpointImplConfig.sample_run_config(
endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint name for the main inference model",
),
"SAFETY_INFERENCE_ENDPOINT_NAME": (
"",
"HF Inference endpoint for the safety model",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model served by the HF Inference Endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model served by the HF Inference Endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-endpoint-safety
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-endpoint-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-endpoint
provider_type: remote::hf::endpoint
config:
endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-endpoint
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .hf_serverless import get_distribution_template # noqa: F401

View file

@ -1,9 +1,19 @@
version: '2'
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
docker_image: null
providers:
inference: remote::hf::serverless
memory: inline::faiss
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
inference:
- remote::hf::serverless
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::hf::serverless"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="hf-serverless",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="hf-serverless",
)
safety_model = ModelInput(
model_id="${env.SAFETY_MODEL}",
provider_id="hf-serverless-safety",
)
return DistributionTemplate(
name="hf-serverless",
distro_type="self_hosted",
description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model, safety_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
"run-with-safety.yaml": RunConfigSettings(
provider_overrides={
"inference": [
inference_provider,
Provider(
provider_id="hf-serverless-safety",
provider_type="remote::hf::serverless",
config=InferenceAPIImplConfig.sample_run_config(
repo="${env.SAFETY_MODEL}",
),
),
]
},
default_models=[
inference_model,
safety_model,
],
default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"HF_API_TOKEN": (
"hf_...",
"Hugging Face API token",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model to be served by the HF Serverless endpoint",
),
"SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B",
"Safety model to be served by the HF Serverless endpoint",
),
},
)

View file

@ -0,0 +1,68 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
- provider_id: hf-serverless-safety
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.SAFETY_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: hf-serverless-safety
provider_model_id: null
shields:
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -0,0 +1,55 @@
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: hf-serverless
provider_type: remote::hf::serverless
config:
huggingface_repo: ${env.INFERENCE_MODEL}
api_token: ${env.HF_API_TOKEN}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: hf-serverless
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,13 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: inline::meta-reference
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference

View file

@ -1,5 +1,12 @@
# Meta Reference Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
@ -40,9 +47,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -53,9 +58,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@ -66,8 +69,8 @@ docker run \
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-gpu --image-type conda
llama stack run ./run.yaml \
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
@ -75,7 +78,7 @@ llama stack run ./run.yaml \
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
llama stack run distributions/{{ name }}/run-with-safety.yaml \
--port 5001 \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .meta_reference import get_distribution_template # noqa: F401

View file

@ -1,13 +1,19 @@
version: '2'
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
description: Use Meta Reference with fp8, int4 quantization for running LLM inference
docker_image: null
providers:
inference: meta-reference-quantized
inference:
- inline::meta-reference-quantized
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,87 @@
# Meta Reference Quantized Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
{{ providers_table }}
The only difference from the `meta-reference-gpu` distribution is that it supports more efficient inference, with fp8 and int4 quantization.
Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
## Prerequisite: Downloading Models
Please make sure you have Llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
```
$ ls ~/.llama/checkpoints
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M
```
## Running the Distribution
You can do this via Conda (build the code) or Docker, which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-{{ name }} \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run distributions/{{ name }}/run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run distributions/{{ name }}/run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
--env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
```

View file

@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.meta_reference import (
MetaReferenceQuantizedInferenceConfig,
)
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::meta-reference-quantized"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="meta-reference-inference",
provider_type="inline::meta-reference-quantized",
config=MetaReferenceQuantizedInferenceConfig.sample_run_config(
model="${env.INFERENCE_MODEL}",
checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="meta-reference-inference",
)
return DistributionTemplate(
name="meta-reference-quantized-gpu",
distro_type="self_hosted",
description="Use Meta Reference with fp8, int4 quantization for running LLM inference",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the Meta Reference server",
),
"INFERENCE_CHECKPOINT_DIR": (
"null",
"Directory containing the Meta Reference model checkpoint",
),
},
)

View file

@ -0,0 +1,58 @@
version: '2'
image_name: meta-reference-quantized-gpu
docker_image: null
conda_env: meta-reference-quantized-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: meta-reference-inference
provider_type: inline::meta-reference-quantized
config:
model: ${env.INFERENCE_MODEL}
max_seq_len: 4096
checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null}
quantization:
type: fp8
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: meta-reference-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@ -1,5 +1,12 @@
# Ollama Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -55,9 +62,7 @@ docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
@ -86,7 +91,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
export LLAMA_STACK_PORT=5001
llama stack build --template ollama --image-type conda
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \

View file

@ -1,4 +1,10 @@
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:

View file

@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
class RunConfigSettings(BaseModel):
provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
default_models: List[ModelInput]
default_models: Optional[List[ModelInput]] = None
default_shields: Optional[List[ShieldInput]] = None
def run_config(
@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
__distro_dir__=f"distributions/{name}",
db_name="registry.db",
),
models=self.default_models,
models=self.default_models or [],
shields=self.default_shields or [],
)
@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):
providers: Dict[str, List[str]]
run_configs: Dict[str, RunConfigSettings]
template_path: Path
template_path: Optional[Path] = None
# Optional configuration
run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel):
with open(yaml_output_dir / yaml_pth, "w") as f:
yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)
docs = self.generate_markdown_docs()
with open(doc_output_dir / f"{self.name}.md", "w") as f:
f.write(docs)
if self.template_path:
docs = self.generate_markdown_docs()
with open(doc_output_dir / f"{self.name}.md", "w") as f:
f.write(docs if docs.endswith("\n") else docs + "\n")
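Taken together, `default_models` and `template_path` are now optional, which is what lets the bedrock template above use a bare `RunConfigSettings()` and skip doc generation. A small sketch of the now-valid construction:

```python
from llama_stack.templates.template import RunConfigSettings

# Previously default_models was required; after this change a bare settings
# object validates, and run_config() falls back to empty model/shield lists.
settings = RunConfigSettings()
assert settings.default_models is None
assert settings.default_shields is None
```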

View file

@ -1,5 +1,12 @@
# TGI Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
@ -71,9 +78,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@ -102,18 +107,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a
```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml
--port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
--env SAFETY_MODEL=$SAFETY_MODEL
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

View file

@ -1,4 +1,11 @@
# Fireworks Distribution
# Together Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
@ -43,9 +50,7 @@ LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
@ -53,8 +58,8 @@ docker run \
### Via Conda
```bash
llama stack build --template together --image-type conda
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

View file

@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .vllm import get_distribution_template # noqa: F401

View file

@ -0,0 +1,19 @@
version: '2'
name: vllm-gpu
distribution_spec:
description: Use a built-in vLLM engine for running LLM inference
docker_image: null
providers:
inference:
- inline::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@ -0,0 +1,58 @@
version: '2'
image_name: vllm-gpu
docker_image: null
conda_env: vllm-gpu
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
inference:
- provider_id: vllm
provider_type: inline::vllm
config:
model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
max_tokens: ${env.MAX_TOKENS:4096}
enforce_eager: ${env.ENFORCE_EAGER:False}
gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: sqlite
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
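These run.yaml files lean on the `${env.VAR:default}` syntax that `replace_env_vars` resolves at server startup. A hedged approximation of that substitution for plain string values:

```python
import os
import re

# Hedged approximation of replace_env_vars (used in server.py earlier in this
# commit); the real implementation walks the loaded YAML structure and may
# treat "null" defaults specially.
def substitute(text: str) -> str:
    def repl(m: re.Match) -> str:
        var, default = m.group(1), m.group(2)
        if var in os.environ:
            return os.environ[var]
        if default is None:
            raise ValueError(f"Environment variable {var} is not set")  # assumed behavior
        return default
    return re.sub(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}", repl, text)

print(substitute("gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}"))
# -> gpu_memory_utilization: 0.7 (when the variable is unset)
```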

View file

@ -0,0 +1,74 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from llama_stack.distribution.datatypes import ModelInput, Provider
from llama_stack.providers.inline.inference.vllm import VLLMConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["inline::vllm"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="vllm",
provider_type="inline::vllm",
config=VLLMConfig.sample_run_config(),
)
inference_model = ModelInput(
model_id="${env.INFERENCE_MODEL}",
provider_id="vllm",
)
return DistributionTemplate(
name="vllm-gpu",
distro_type="self_hosted",
description="Use a built-in vLLM engine for running LLM inference",
docker_image=None,
template_path=None,
providers=providers,
default_models=[inference_model],
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=[inference_model],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the vLLM engine",
),
"TENSOR_PARALLEL_SIZE": (
"1",
"Number of tensor parallel replicas (number of GPUs to use).",
),
"MAX_TOKENS": (
"4096",
"Maximum number of tokens to generate.",
),
"ENFORCE_EAGER": (
"False",
"Whether to use eager mode for inference (otherwise cuda graphs are used).",
),
"GPU_MEMORY_UTILIZATION": (
"0.7",
"GPU memory utilization for the vLLM engine.",
),
},
)