diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml
index 35f87c0bb..7d401322b 100644
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@@ -1,68 +1,71 @@
 version: '2'
-built_at: '2024-11-11T20:09:45.988375'
+built_at: 2024-11-17 14:07:24.568750
 image_name: remote-vllm
-docker_image: remote-vllm
+docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- inference
-- memory
-- safety
 - agents
 - telemetry
+- safety
+- inference
+- memory
 providers:
   inference:
-  # serves main inference model
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
       url: ${env.VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   - provider_id: vllm-safety
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
       url: ${env.SAFETY_VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
     config: {}
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
    provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
 metadata_store:
   namespace: null
   type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
 models:
-  - model_id: ${env.INFERENCE_MODEL}
-    provider_id: vllm-inference
-  - model_id: ${env.SAFETY_MODEL}
-    provider_id: vllm-safety
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  provider_model_id: null
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL}
+  provider_id: vllm-safety
+  provider_model_id: null
 shields:
-  - shield_id: ${env.SAFETY_MODEL}
+- params: null
+  shield_id: ${env.SAFETY_MODEL}
+  provider_id: null
+  provider_shield_id: null
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml
index 847dc1dd1..18f27cb20 100644
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@@ -1,50 +1,57 @@
 version: '2'
-built_at: '2024-11-11T20:09:45.988375'
+built_at: 2024-11-17 14:07:24.563541
 image_name: remote-vllm
-docker_image: remote-vllm
+docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
 conda_env: null
 apis:
-- inference
-- memory
 - agents
 - telemetry
+- safety
+- inference
+- memory
 providers:
   inference:
-  # serves main inference model
   - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
       url: ${env.VLLM_URL}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
+      max_tokens: ${env.VLLM_MAX_TOKENS:4096}
+      api_token: ${env.VLLM_API_TOKEN:fake}
   memory:
   - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
-        namespace: null
         type: sqlite
-        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
 metadata_store:
   namespace: null
   type: sqlite
-  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
 models:
-  - model_id: ${env.INFERENCE_MODEL}
-    provider_id: vllm-inference
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
diff --git a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md
index 2ab8df7b7..0ecfafaea 100644
--- a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md
@@ -1,20 +1,39 @@
 # Remote vLLM Distribution

-The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations.
+The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:

-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
-| **Provider(s)** | remote::vllm | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
+ Provider Configuration
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ API       ┃ Provider(s)                                             ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ agents    │ `inline::meta-reference`                                │
+│ inference │ `remote::vllm`                                          │
+│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
+│ safety    │ `inline::llama-guard`                                   │
+│ telemetry │ `inline::meta-reference`                                │
+└───────────┴─────────────────────────────────────────────────────────┘

-You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
+
+You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
+- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
+- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
+- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
+- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
+
+### Models
+
+The following models are configured by default:
+- `${env.INFERENCE_MODEL}`
+- `${env.SAFETY_MODEL}`

 ## Using Docker Compose

 You can use `docker compose` to start a vLLM container and Llama Stack server container together.
-
-> [!NOTE]
-> This assumes you have access to GPU to start a vLLM server with access to your GPU.
-
 ```bash
 $ cd distributions/remote-vllm; docker compose up
 ```
@@ -31,8 +50,7 @@ docker compose down

 ## Starting vLLM and Llama Stack separately

-You may want to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
-
+You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.

 #### Start vLLM server.

@@ -43,7 +61,7 @@ docker run --runtime nvidia --gpus all \
     -p 8000:8000 \
     --ipc=host \
     vllm/vllm-openai:latest \
-    --model meta-llama/Llama-3.1-8B-Instruct
+    --model meta-llama/Llama-3.2-3B-Instruct
 ```

 Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
@@ -66,7 +84,7 @@ inference:
 If you are using Conda, you can build and run the Llama Stack server with the following commands:
 ```bash
 cd distributions/remote-vllm
-llama stack build --template remote_vllm --image-type conda
+llama stack build --template remote-vllm --image-type conda
 llama stack run run.yaml
 ```

diff --git a/llama_stack/providers/inline/agents/meta_reference/config.py b/llama_stack/providers/inline/agents/meta_reference/config.py
index 6e09bace4..ff34e5d5f 100644
--- a/llama_stack/providers/inline/agents/meta_reference/config.py
+++ b/llama_stack/providers/inline/agents/meta_reference/config.py
@@ -21,5 +21,5 @@ class MetaReferenceAgentsImplConfig(BaseModel):
             "persistence_store": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
                 db_name="agents_store.db",
-            ).model_dump(),
+            )
         }
diff --git a/llama_stack/providers/inline/memory/faiss/config.py b/llama_stack/providers/inline/memory/faiss/config.py
index 13de60e9d..d82104477 100644
--- a/llama_stack/providers/inline/memory/faiss/config.py
+++ b/llama_stack/providers/inline/memory/faiss/config.py
@@ -25,5 +25,5 @@ class FaissImplConfig(BaseModel):
             "kvstore": SqliteKVStoreConfig.sample_run_config(
                 __distro_dir__=__distro_dir__,
                 db_name="faiss_store.db",
-            ).model_dump(),
+            )
         }
diff --git a/llama_stack/providers/utils/kvstore/config.py b/llama_stack/providers/utils/kvstore/config.py
index a7cfc5c7e..ed400efae 100644
--- a/llama_stack/providers/utils/kvstore/config.py
+++ b/llama_stack/providers/utils/kvstore/config.py
@@ -61,8 +61,9 @@ class SqliteKVStoreConfig(CommonConfig):
             "type": "sqlite",
             "namespace": None,
             "db_path": "${env.SQLITE_STORE_DIR:~/.llama/"
-            + f"{__distro_dir__}/{db_name}"
-            + "}",
+            + __distro_dir__
+            + "}/"
+            + db_name,
         }
diff --git a/llama_stack/scripts/save_distributions.py b/llama_stack/scripts/distro_codegen.py
similarity index 99%
rename from llama_stack/scripts/save_distributions.py
rename to llama_stack/scripts/distro_codegen.py
index 0f2cdeeb3..bff39c57d 100644
--- a/llama_stack/scripts/save_distributions.py
+++ b/llama_stack/scripts/distro_codegen.py
@@ -50,6 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
     except Exception as e:
         progress.print(f"[red]Error processing {template_dir.name}: {str(e)}")
+        raise e


 def main():
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index 39abb10af..1efa5dc7a 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -1,12 +1,19 @@
+version: '2'
 name: remote-vllm
 distribution_spec:
   description: Use (an external) vLLM server for running LLM inference
+  docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
   providers:
-    inference: remote::vllm
+    inference:
+    - remote::vllm
     memory:
     - inline::faiss
     - remote::chromadb
     - remote::pgvector
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index ad3c1d8e2..00073a856 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -41,6 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
         name="remote-vllm",
         distro_type="self_hosted",
         description="Use (an external) vLLM server for running LLM inference",
+        docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         default_models=[inference_model, safety_model],
diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py
index 227dd2c0c..2074f19c3 100644
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -91,7 +91,7 @@ class RunConfigSettings(BaseModel):
             apis=list(apis),
             providers=provider_configs,
             metadata_store=SqliteKVStoreConfig.sample_run_config(
-                dir=f"distributions/{name}",
+                __distro_dir__=f"distributions/{name}",
                 db_name="registry.db",
             ),
             models=self.default_models,