From e4509cb5686b7131ceb7221d24d873052b683a50 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Fri, 15 Nov 2024 09:35:38 -0800
Subject: [PATCH] more progress on auto-generation

---
 distributions/ollama/run.yaml                 |   2 +-
 .../inline/agents/meta_reference/config.py    |   4 +-
 .../providers/inline/inference/vllm/config.py |   2 +-
 .../inline/safety/llama_guard/config.py       |   6 -
 .../remote/inference/ollama/__init__.py       |  25 +-
 .../providers/remote/inference/vllm/config.py |  51 +++-
 .../providers/utils/docker/__init__.py        |   5 +
 .../providers/utils/docker/service_config.py  |  29 ++
 llama_stack/providers/utils/kvstore/config.py |   6 +-
 llama_stack/templates/template.py             | 252 ++++++++++++++----
 10 files changed, 309 insertions(+), 73 deletions(-)
 create mode 100644 llama_stack/providers/utils/docker/__init__.py
 create mode 100644 llama_stack/providers/utils/docker/service_config.py

diff --git a/distributions/ollama/run.yaml b/distributions/ollama/run.yaml
index 461f64609..d09fa0e05 100644
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@@ -33,7 +33,7 @@ providers:
       persistence_store:
         namespace: null
         type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/runtime}/kvstore.db
   telemetry:
   - provider_id: meta0
     provider_type: inline::meta-reference
diff --git a/llama_stack/providers/inline/agents/meta_reference/config.py b/llama_stack/providers/inline/agents/meta_reference/config.py
index 44628758a..c5617fcc7 100644
--- a/llama_stack/providers/inline/agents/meta_reference/config.py
+++ b/llama_stack/providers/inline/agents/meta_reference/config.py
@@ -14,9 +14,9 @@ class MetaReferenceAgentsImplConfig(BaseModel):
     persistence_store: KVStoreConfig = Field(default=SqliteKVStoreConfig())
 
     @classmethod
-    def sample_dict(cls):
+    def sample_run_config(cls):
         return {
-            "persistence_store": SqliteKVStoreConfig.sample_dict(
+            "persistence_store": SqliteKVStoreConfig.sample_run_config(
                 db_name="agents_store.db"
             ),
         }
diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py
index a633dffb6..e5516673c 100644
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -35,7 +35,7 @@ class VLLMConfig(BaseModel):
     )
 
     @classmethod
-    def sample_dict(cls):
+    def sample_run_config(cls):
         return {
             "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
             "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
diff --git a/llama_stack/providers/inline/safety/llama_guard/config.py b/llama_stack/providers/inline/safety/llama_guard/config.py
index 4d9e2b969..72036fd1c 100644
--- a/llama_stack/providers/inline/safety/llama_guard/config.py
+++ b/llama_stack/providers/inline/safety/llama_guard/config.py
@@ -11,9 +11,3 @@ from pydantic import BaseModel
 
 class LlamaGuardConfig(BaseModel):
     excluded_categories: List[str] = []
-
-    @classmethod
-    def sample_dict(cls):
-        return {
-            "excluded_categories": [],
-        }
diff --git a/llama_stack/providers/remote/inference/ollama/__init__.py b/llama_stack/providers/remote/inference/ollama/__init__.py
index 7763af8d1..adc4845d1 100644
--- a/llama_stack/providers/remote/inference/ollama/__init__.py
+++ b/llama_stack/providers/remote/inference/ollama/__init__.py
@@ -4,11 +4,34 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+from typing import Optional + from llama_stack.distribution.datatypes import RemoteProviderConfig +from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig + + +DEFAULT_OLLAMA_PORT = 11434 class OllamaImplConfig(RemoteProviderConfig): - port: int = 11434 + port: int = DEFAULT_OLLAMA_PORT + + @classmethod + def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]: + return DockerComposeServiceConfig( + image="ollama/ollama:latest", + volumes=["$HOME/.ollama:/root/.ollama"], + devices=["nvidia.com/gpu=all"], + deploy={ + "resources": { + "reservations": { + "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}] + } + } + }, + runtime="nvidia", + ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"], + ) async def get_adapter_impl(config: RemoteProviderConfig, _deps): diff --git a/llama_stack/providers/remote/inference/vllm/config.py b/llama_stack/providers/remote/inference/vllm/config.py index 8aa7af4f0..6a3419cd2 100644 --- a/llama_stack/providers/remote/inference/vllm/config.py +++ b/llama_stack/providers/remote/inference/vllm/config.py @@ -9,6 +9,11 @@ from typing import Optional from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field +from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig + + +DEFAULT_VLLM_PORT = 8000 + @json_schema_type class VLLMInferenceAdapterConfig(BaseModel): @@ -26,10 +31,50 @@ class VLLMInferenceAdapterConfig(BaseModel): ) @classmethod - def sample_dict(cls): - # TODO: we may need two modes, one for conda and one for docker + def sample_run_config( + cls, + url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}", + ): return { - "url": "${env.VLLM_URL:http://host.docker.internal:5100/v1}", + "url": url, "max_tokens": "${env.VLLM_MAX_TOKENS:4096}", "api_token": "${env.VLLM_API_TOKEN:fake}", } + + @classmethod + def sample_docker_compose_config( + cls, + port: int = DEFAULT_VLLM_PORT, + cuda_visible_devices: str = "0", + model: str = "meta-llama/Llama-3.2-3B-Instruct", + ) -> Optional[DockerComposeServiceConfig]: + return DockerComposeServiceConfig( + image="vllm/vllm-openai:latest", + volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"], + devices=["nvidia.com/gpu=all"], + deploy={ + "resources": { + "reservations": { + "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}] + } + } + }, + runtime="nvidia", + ports=[f"{port}:{port}"], + environment={ + "CUDA_VISIBLE_DEVICES": cuda_visible_devices, + "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN", + }, + command=( + " ".join( + [ + "--gpu-memory-utilization 0.75", + f"--model {model}", + "--enforce-eager", + "--max-model-len 8192", + "--max-num-seqs 16", + f"--port {port}", + ] + ) + ), + ) diff --git a/llama_stack/providers/utils/docker/__init__.py b/llama_stack/providers/utils/docker/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/providers/utils/docker/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/llama_stack/providers/utils/docker/service_config.py b/llama_stack/providers/utils/docker/service_config.py new file mode 100644 index 000000000..b1f88eb5f --- /dev/null +++ b/llama_stack/providers/utils/docker/service_config.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel + + +class LiteralString(str): + pass # Marker class for strings we want to format with > + + +class DockerComposeServiceConfig(BaseModel): + """Configuration for a single service in docker-compose.""" + + image: str + volumes: Optional[List[str]] = None + network_mode: str = "bridge" + ports: Optional[List[str]] = None + devices: Optional[List[str]] = None + environment: Optional[Dict[str, str]] = None + command: Optional[str] = None + depends_on: Optional[List[str]] = None + deploy: Optional[Dict[str, Any]] = None + runtime: Optional[str] = None + entrypoint: Optional[str] = None diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py index 5559a99f2..de9f6d79b 100644 --- a/llama_stack/providers/utils/kvstore/config.py +++ b/llama_stack/providers/utils/kvstore/config.py @@ -37,7 +37,7 @@ class RedisKVStoreConfig(CommonConfig): return f"redis://{self.host}:{self.port}" @classmethod - def sample_dict(cls): + def sample_run_config(cls): return { "type": "redis", "namespace": None, @@ -54,7 +54,7 @@ class SqliteKVStoreConfig(CommonConfig): ) @classmethod - def sample_dict(cls, db_name: str = "kvstore.db"): + def sample_run_config(cls, db_name: str = "kvstore.db"): return { "type": "sqlite", "namespace": None, @@ -72,7 +72,7 @@ class PostgresKVStoreConfig(CommonConfig): table_name: str = "llamastack_kvstore" @classmethod - def sample_dict(cls, table_name: str = "llamastack_kvstore"): + def sample_run_config(cls, table_name: str = "llamastack_kvstore"): return { "type": "postgres", "namespace": None, diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index 57fcbe962..c8ca05c6b 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -4,19 +4,22 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+from datetime import datetime + from io import StringIO from pathlib import Path -from typing import Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, Tuple import jinja2 import yaml -from pydantic import BaseModel +from pydantic import BaseModel, Field from rich.console import Console from rich.table import Table from llama_stack.distribution.datatypes import ( + Api, BuildConfig, DistributionSpec, KVStoreConfig, @@ -25,6 +28,12 @@ from llama_stack.distribution.datatypes import ( ShieldInput, StackRunConfig, ) +from llama_stack.distribution.distribution import get_provider_registry +from llama_stack.distribution.utils.dynamic import instantiate_class_type +from llama_stack.providers.remote.inference.vllm.config import ( + VLLMInferenceAdapterConfig, +) +from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig class DistributionTemplate(BaseModel): @@ -36,12 +45,17 @@ class DistributionTemplate(BaseModel): name: str description: str providers: Dict[str, List[str]] + run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict) + compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field( + default_factory=dict + ) + default_models: List[ModelInput] default_shields: Optional[List[ShieldInput]] = None # Optional configuration metadata_store: Optional[KVStoreConfig] = None - env_vars: Optional[Dict[str, str]] = None + docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None docker_image: Optional[str] = None @property @@ -59,8 +73,42 @@ class DistributionTemplate(BaseModel): image_type="conda", # default to conda, can be overridden ) - def run_config(self, provider_configs: Dict[str, List[Provider]]) -> StackRunConfig: - from datetime import datetime + def run_config(self) -> StackRunConfig: + provider_registry = get_provider_registry() + + provider_configs = {} + for api_str, provider_types in self.providers.items(): + if providers := self.run_config_overrides.get(api_str): + provider_configs[api_str] = providers + continue + + provider_type = provider_types[0] + provider_id = provider_type.split("::")[-1] + + api = Api(api_str) + if provider_type not in provider_registry[api]: + raise ValueError( + f"Unknown provider type: {provider_type} for API: {api_str}" + ) + + config_class = provider_registry[api][provider_type].config_class + assert ( + config_class is not None + ), f"No config class for provider type: {provider_type} for API: {api_str}" + + config_class = instantiate_class_type(config_class) + if hasattr(config_class, "sample_run_config"): + config = config_class.sample_run_config() + else: + config = {} + + provider_configs[api_str] = [ + Provider( + provider_id=provider_id, + provider_type=provider_type, + config=config, + ) + ] # Get unique set of APIs from providers apis: Set[str] = set(self.providers.keys()) @@ -76,6 +124,70 @@ class DistributionTemplate(BaseModel): shields=self.default_shields or [], ) + def docker_compose_config(self) -> Dict[str, Any]: + services = {} + provider_registry = get_provider_registry() + + # Add provider services based on their sample_compose_config + for api_str, api_providers in self.providers.items(): + if overrides := self.compose_config_overrides.get(api_str): + services |= overrides + continue + + # only look at the first provider to get the compose config for now + # we may want to use `docker compose profiles` in the future + provider_type = api_providers[0] + provider_id = provider_type.split("::")[-1] + api = Api(api_str) + if 
provider_type not in provider_registry[api]: + raise ValueError( + f"Unknown provider type: {provider_type} for API: {api_str}" + ) + + config_class = provider_registry[api][provider_type].config_class + assert ( + config_class is not None + ), f"No config class for provider type: {provider_type} for API: {api_str}" + + config_class = instantiate_class_type(config_class) + if not hasattr(config_class, "sample_docker_compose_config"): + continue + + compose_config = config_class.sample_docker_compose_config() + services[provider_id] = compose_config + + port = "${LLAMASTACK_PORT:-5001}" + # Add main llamastack service + llamastack_config = DockerComposeServiceConfig( + image=f"llamastack/distribution-{self.name}:latest", + depends_on=list(services.keys()), + volumes=[ + "~/.llama:/root/.llama", + f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml", + ], + ports=[f"{port}:{port}"], + environment={ + k: v[0] for k, v in (self.docker_compose_env_vars or {}).items() + }, + entrypoint=( + f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"' + ), + deploy={ + "restart_policy": { + "condition": "on-failure", + "delay": "3s", + "max_attempts": 5, + "window": "60s", + } + }, + ) + + services["llamastack"] = llamastack_config + return { + "services": {k: v.model_dump() for k, v in services.items()}, + "volumes": {service_name: None for service_name in services.keys()}, + } + def generate_markdown_docs(self) -> str: """Generate markdown documentation using both Jinja2 templates and rich tables.""" # First generate the providers table using rich @@ -108,7 +220,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following The following environment variables can be configured: -{% for var, description in env_vars.items() %} +{% for var, (value, description) in docker_compose_env_vars.items() %} - `{{ var }}`: {{ description }} {% endfor %} {%- endif %} @@ -122,29 +234,6 @@ $ cd distributions/{{ name }} $ docker compose up ``` -### Manual Configuration - -You can also configure the distribution manually by creating a `run.yaml` file: - -```yaml -version: '2' -image_name: {{ name }} -apis: -{% for api in providers.keys() %} - - {{ api }} -{% endfor %} - -providers: -{% for api, provider_list in providers.items() %} - {{ api }}: - {% for provider in provider_list %} - - provider_id: {{ provider.lower() }}-0 - provider_type: {{ provider }} - config: {} - {% endfor %} -{% endfor %} -``` - ## Models The following models are configured by default: @@ -170,7 +259,7 @@ The following safety shields are configured: description=self.description, providers=self.providers, providers_table=providers_table, - env_vars=self.env_vars, + docker_compose_env_vars=self.docker_compose_env_vars, default_models=self.default_models, default_shields=self.default_shields, ) @@ -178,29 +267,24 @@ The following safety shields are configured: def save_distribution(self, output_dir: Path) -> None: output_dir.mkdir(parents=True, exist_ok=True) - # Save build.yaml build_config = self.build_config() with open(output_dir / "build.yaml", "w") as f: yaml.safe_dump(build_config.model_dump(), f, sort_keys=False) - # Save run.yaml template - # Create a minimal provider config for the template - provider_configs = { - api: [ - Provider( - provider_id=f"{provider.lower()}-0", - provider_type=provider, - config={}, - ) - for provider in providers - ] - for api, providers in self.providers.items() - 
        }
-        run_config = self.run_config(provider_configs)
+        run_config = self.run_config()
+        serialized = run_config.model_dump()
         with open(output_dir / "run.yaml", "w") as f:
-            yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)
+            yaml.safe_dump(serialized, f, sort_keys=False)
+
+        # serialized_str = yaml.dump(serialized, sort_keys=False)
+        # env_vars = set()
+        # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str):
+        #     env_vars.add(match.group(1))
+
+        docker_compose = self.docker_compose_config()
+        with open(output_dir / "compose.yaml", "w") as f:
+            yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False)
 
-        # Save documentation
         docs = self.generate_markdown_docs()
         with open(output_dir / f"{self.name}.md", "w") as f:
             f.write(docs)
@@ -217,21 +301,77 @@ The following safety shields are configured:
             "agents": ["inline::meta-reference"],
             "telemetry": ["inline::meta-reference"],
         },
+        run_config_overrides={
+            "inference": [
+                Provider(
+                    provider_id="vllm-0",
+                    provider_type="remote::vllm",
+                    config=VLLMInferenceAdapterConfig.sample_run_config(
+                        url="${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+                    ),
+                ),
+                Provider(
+                    provider_id="vllm-1",
+                    provider_type="remote::vllm",
+                    config=VLLMInferenceAdapterConfig.sample_run_config(
+                        url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}",
+                    ),
+                ),
+            ]
+        },
+        compose_config_overrides={
+            "inference": {
+                "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config(
+                    port=5100,
+                    cuda_visible_devices="0",
+                    model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
+                ),
+                "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config(
+                    port=5101,
+                    cuda_visible_devices="1",
+                    model="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
+                ),
+            }
+        },
         default_models=[
             ModelInput(
-                model_id="${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}"
+                model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
+                provider_id="vllm-0",
+            ),
+            ModelInput(
+                model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
+                provider_id="vllm-1",
             ),
-            ModelInput(model_id="${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}"),
         ],
         default_shields=[
-            ShieldInput(shield_id="${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}")
+            ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}")
         ],
-        env_vars={
-            "LLAMA_INFERENCE_VLLM_URL": "URL of the vLLM inference server",
-            "LLAMA_SAFETY_VLLM_URL": "URL of the vLLM safety server",
-            "MAX_TOKENS": "Maximum number of tokens for generation",
-            "LLAMA_INFERENCE_MODEL": "Name of the inference model to use",
-            "LLAMA_SAFETY_MODEL": "Name of the safety model to use",
+        docker_compose_env_vars={
+            # these defaults are for the Docker Compose configuration
+            "VLLM_URL": (
+                "http://host.docker.internal:${VLLM_PORT:-5100}/v1",
+                "URL of the vLLM server with the main inference model",
+            ),
+            "SAFETY_VLLM_URL": (
+                "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1",
+                "URL of the vLLM server with the safety model",
+            ),
+            "MAX_TOKENS": (
+                "${MAX_TOKENS:-4096}",
+                "Maximum number of tokens for generation",
+            ),
+            "INFERENCE_MODEL": (
+                "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}",
+                "Name of the inference model to use",
+            ),
+            "SAFETY_MODEL": (
+                "${SAFETY_MODEL:-Llama-Guard-3-1B}",
+                "Name of the safety (Llama-Guard) model to use",
+            ),
+            "LLAMASTACK_PORT": (
+                "${LLAMASTACK_PORT:-5001}",
+                "Port for the Llama Stack distribution server",
+            ),
         },
     )
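
Illustrative note (not part of the patch): the new sample_docker_compose_config() hooks can be previewed directly. The sketch below, which assumes the patch above is applied, dumps the generated vLLM service block as YAML; the "vllm-0" key and the exclude_none dump are illustrative choices, not something the diff prescribes.

# Sketch: preview the compose service block for the remote vLLM adapter.
import yaml

from llama_stack.providers.remote.inference.vllm.config import (
    VLLMInferenceAdapterConfig,
)

# Arguments match the defaults added in this patch.
service = VLLMInferenceAdapterConfig.sample_docker_compose_config(
    port=5100,
    cuda_visible_devices="0",
    model="meta-llama/Llama-3.2-3B-Instruct",
)

# Drop unset optional fields so the emitted YAML stays close to a hand-written
# compose file.
print(
    yaml.safe_dump(
        {"services": {"vllm-0": service.model_dump(exclude_none=True)}},
        sort_keys=False,
    )
)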
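
A second, hypothetical usage sketch of the generation flow itself: build a small DistributionTemplate and let save_distribution() emit build.yaml, run.yaml and compose.yaml. Field names follow the model in this diff; the template name, provider type strings for memory/safety, and the output directory are assumptions for illustration and may not match the repository's actual distribution modules.

# Sketch only: end-to-end template generation under the assumptions above.
from pathlib import Path

from llama_stack.distribution.datatypes import ModelInput, ShieldInput
from llama_stack.templates.template import DistributionTemplate

template = DistributionTemplate(
    name="remote-vllm",
    description="Llama Stack with remote vLLM servers for inference and safety",
    providers={
        "inference": ["remote::vllm"],
        "memory": ["inline::faiss"],        # assumed provider type string
        "safety": ["inline::llama-guard"],  # assumed provider type string
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    },
    default_models=[
        ModelInput(model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}"),
    ],
    default_shields=[
        ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}"),
    ],
)

# run_config() fills provider configs from each config class's
# sample_run_config() (or {} when absent), and docker_compose_config()
# collects sample_docker_compose_config() blocks plus the llamastack service.
template.save_distribution(Path("distributions/remote-vllm"))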