Run the script to produce vllm outputs

Ashwin Bharambe 2024-11-17 14:09:36 -08:00
parent 0218e68849
commit 9bb07ce298
10 changed files with 109 additions and 71 deletions

View file

@@ -1,68 +1,71 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
built_at: 2024-11-17 14:07:24.568750
image_name: remote-vllm
docker_image: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
- safety
- inference
- memory
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
# serves safety llama_guard model
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
- provider_id: vllm-safety
provider_type: remote::vllm
config:
# NOTE: replace with "localhost" if you are running in "host" network mode
url: ${env.SAFETY_VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
memory:
- provider_id: faiss-0
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
provider_model_id: null
- metadata: {}
model_id: ${env.SAFETY_MODEL}
provider_id: vllm-safety
provider_model_id: null
shields:
- shield_id: ${env.SAFETY_MODEL}
- params: null
shield_id: ${env.SAFETY_MODEL}
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@@ -1,50 +1,57 @@
version: '2'
built_at: '2024-11-11T20:09:45.988375'
built_at: 2024-11-17 14:07:24.563541
image_name: remote-vllm
docker_image: remote-vllm
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
conda_env: null
apis:
- inference
- memory
- agents
- telemetry
- safety
- inference
- memory
providers:
inference:
# serves main inference model
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL}
max_tokens: ${env.MAX_TOKENS:4096}
api_token: fake
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
memory:
- provider_id: faiss
provider_type: inline::faiss
config:
kvstore:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
memory:
- provider_id: meta0
provider_type: inline::faiss
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config: {}
agents:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
namespace: null
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
telemetry:
- provider_id: meta0
- provider_id: meta-reference
provider_type: inline::meta-reference
config: {}
metadata_store:
namespace: null
type: sqlite
db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db
models:
- model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

View file

@@ -1,20 +1,39 @@
# Remote vLLM Distribution
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations.
The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations:
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |---------------- |---------------- |------------------------------------ |---------------- |---------------- |
| **Provider(s)** | remote::vllm | meta-reference | remote::pgvector, remote::chromadb | meta-reference | meta-reference |
Provider Configuration
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ API       ┃ Provider(s)                                             ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ agents    │ `inline::meta-reference`                                │
│ inference │ `remote::vllm`                                          │
│ memory    │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │
│ safety    │ `inline::llama-guard`                                   │
│ telemetry │ `inline::meta-reference`                                │
└───────────┴─────────────────────────────────────────────────────────┘
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.

### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
- `VLLM_MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
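For example, you could export these variables in your shell before starting the stack so the `${env.VAR:default}` references in `run.yaml` resolve to your deployment (a sketch that simply reuses the default values listed above; adjust them for your setup):

```bash
# Sketch only: the values below are the documented defaults, not requirements.
export LLAMASTACK_PORT=5001
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_URL=http://host.docker.internal:5100/v1
export VLLM_MAX_TOKENS=4096
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export SAFETY_VLLM_URL=http://host.docker.internal:5101/v1
```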
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
> [!NOTE]
> This assumes you have access to a machine with GPUs for running the vLLM server.
```bash
$ cd distributions/remote-vllm; docker compose up
```
@@ -31,8 +50,7 @@ docker compose down
## Starting vLLM and Llama Stack separately
You may want to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start the vLLM server
@@ -43,7 +61,7 @@ docker run --runtime nvidia --gpus all \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model meta-llama/Llama-3.1-8B-Instruct
--model meta-llama/Llama-3.2-3B-Instruct
```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
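Once the container is up, a quick way to confirm the server is reachable (assuming the `-p 8000:8000` port mapping from the command above) is to query its OpenAI-compatible models endpoint:

```bash
# Should return a JSON response listing the model passed via --model above.
curl http://localhost:8000/v1/models
```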
@@ -66,7 +84,7 @@ inference:
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote_vllm --image-type conda
llama stack build --template remote-vllm --image-type conda
llama stack run run.yaml
```
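Since `run.yaml` reads its settings from `${env.VAR:default}` placeholders, one way to point the Conda-built server at your own vLLM instance is to export the variables before running it (a sketch, assuming vLLM is serving at `http://localhost:8000/v1` as in the Docker example above):

```bash
# Sketch: these exports feed the ${env.VAR:default} placeholders in run.yaml.
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export VLLM_URL=http://localhost:8000/v1
llama stack run run.yaml
```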

View file

@@ -21,5 +21,5 @@ class MetaReferenceAgentsImplConfig(BaseModel):
"persistence_store": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="agents_store.db",
).model_dump(),
)
}

View file

@@ -25,5 +25,5 @@ class FaissImplConfig(BaseModel):
"kvstore": SqliteKVStoreConfig.sample_run_config(
__distro_dir__=__distro_dir__,
db_name="faiss_store.db",
).model_dump(),
)
}

View file

@@ -61,8 +61,9 @@ class SqliteKVStoreConfig(CommonConfig):
"type": "sqlite",
"namespace": None,
"db_path": "${env.SQLITE_STORE_DIR:~/.llama/"
+ f"{__distro_dir__}/{db_name}"
+ "}",
+ __distro_dir__
+ "}/"
+ db_name,
}

View file

@@ -50,6 +50,7 @@ def process_template(template_dir: Path, progress) -> None:
except Exception as e:
progress.print(f"[red]Error processing {template_dir.name}: {str(e)}")
raise e
def main():

View file

@@ -1,12 +1,19 @@
version: '2'
name: remote-vllm
distribution_spec:
description: Use (an external) vLLM server for running LLM inference
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
providers:
inference: remote::vllm
inference:
- remote::vllm
memory:
- inline::faiss
- remote::chromadb
- remote::pgvector
safety: inline::llama-guard
agents: inline::meta-reference
telemetry: inline::meta-reference
safety:
- inline::llama-guard
agents:
- inline::meta-reference
telemetry:
- inline::meta-reference
image_type: conda

View file

@@ -41,6 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
name="remote-vllm",
distro_type="self_hosted",
description="Use (an external) vLLM server for running LLM inference",
docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=[inference_model, safety_model],

View file

@@ -91,7 +91,7 @@ class RunConfigSettings(BaseModel):
apis=list(apis),
providers=provider_configs,
metadata_store=SqliteKVStoreConfig.sample_run_config(
dir=f"distributions/{name}",
__distro_dir__=f"distributions/{name}",
db_name="registry.db",
),
models=self.default_models,