Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-31 16:01:46 +00:00)

commit 38cdbdec5a (parent 02c66b49fc)
add inline-vllm details, fix things

12 changed files with 142 additions and 101 deletions

build.yaml symlink (retargeted to the inline-vllm template)
@@ -1 +1 @@
-../../llama_stack/templates/vllm/build.yaml
+../../llama_stack/templates/inline-vllm/build.yaml

distributions/inline-vllm/compose.yaml (new file)
@@ -0,0 +1,35 @@
services:
  llamastack:
    image: llamastack/distribution-inline-vllm
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
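
A quick way to try this compose file — a sketch, assuming Docker with the NVIDIA container toolkit is set up, the llamastack/distribution-inline-vllm image is available, and the run.yaml shown next is saved alongside it:

    # from the directory holding this compose.yaml and its run.yaml
    docker compose up
    # tail the stack server logs; the API is published on port 5000 per the mapping above
    docker compose logs -f llamastack

Note that the llamastack service defines deploy: twice (once for the GPU reservation, once for the restart policy); depending on the YAML/compose parser, the second mapping may override the first, so only one of the two settings may take effect.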

distributions/inline-vllm/run.yaml (new file)
@@ -0,0 +1,66 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: inline::vllm
    config:
      model: Llama3.2-3B-Instruct
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.4
      enforce_eager: true
      max_tokens: 4096
  - provider_id: vllm-safety
    provider_type: inline::vllm
    config:
      model: Llama-Guard-3-1B
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.2
      enforce_eager: true
      max_tokens: 4096
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
      # Uncomment to use prompt guard
      # prompt_guard_shield:
      #   model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  # Uncomment to use pgvector
  # - provider_id: pgvector
  #   provider_type: remote::pgvector
  #   config:
  #     host: 127.0.0.1
  #     port: 5432
  #     db: postgres
  #     user: postgres
  #     password: mysecretpassword
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/agents_store.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
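
The two inline vLLM providers are sized to share one GPU: the inference model is allotted 40% of GPU memory (gpu_memory_utilization: 0.4) and the Llama Guard safety model 20%, so both fit on a single device. Since the compose entrypoint simply runs the stack server module against this file, the same config can be exercised without Docker — a sketch, assuming llama_stack and vllm are installed in the current environment and the referenced models have been downloaded locally (the compose file mounts ~/.llama for this):

    # run the stack server directly against this config, as the container entrypoint does
    python -m llama_stack.distribution.server.server --yaml_config ./run.yaml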

build.yaml symlink (retargeted to the remote-vllm template)
@@ -1 +1 @@
-../../llama_stack/templates/ollama/build.yaml
+../../llama_stack/templates/remote-vllm/build.yaml

compose.yaml (ollama service replaced by a remote vLLM server)
@@ -1,11 +1,11 @@
 services:
-  ollama:
-    image: ollama/ollama:latest
+  vllm:
+    image: vllm/vllm-openai:latest
     network_mode: "host"
     volumes:
-      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
     ports:
-      - "11434:11434"
+      - "8000:8000"
     devices:
       - nvidia.com/gpu=all
     environment:
@@ -27,17 +27,17 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - ollama
-    image: llamastack/distribution-ollama
+      - vllm
+    image: llamastack/distribution-remote-vllm
     network_mode: "host"
     volumes:
       - ~/.llama:/root/.llama
       # Link to ollama run.yaml file
-      - ./run.yaml:/root/llamastack-run-ollama.yaml
+      - ./run.yaml:/root/llamastack-run-remote-vllm.yaml
     ports:
       - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
+    # Hack: wait for vllm server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml"
     deploy:
       restart_policy:
         condition: on-failure
@@ -45,4 +45,4 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  ollama:
+  vllm:
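
With this change the stack container no longer talks to Ollama on port 11434 but to an OpenAI-compatible vLLM server on port 8000, and it still just sleeps 60 seconds before starting rather than actually waiting for its dependency. A sanity-check sketch (the /v1/models route comes from vLLM's OpenAI-compatible server, not from this diff):

    # confirm the vllm service is up before expecting the stack on port 5000 to respond
    curl http://127.0.0.1:8000/v1/models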

compose.yaml (file removed)
@@ -1,30 +0,0 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: "host"
    volumes:
      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
    ports:
      - "11434:11434"
    command: []
  llamastack:
    depends_on:
      - ollama
    image: llamastack/distribution-ollama
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    # Hack: wait for ollama server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  ollama:

run.yaml (file removed)
@@ -1,46 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: ollama0
    provider_type: remote::ollama
    config:
      url: http://127.0.0.1:14343
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}

run.yaml (inference provider switched from ollama to remote vLLM)
@@ -13,10 +13,10 @@ apis:
 - safety
 providers:
   inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
+  - provider_id: vllm0
+    provider_type: remote::vllm
     config:
-      url: http://127.0.0.1:14343
+      url: http://127.0.0.1:8000
   safety:
   - provider_id: meta0
     provider_type: meta-reference

inference provider registry
@@ -45,7 +45,7 @@ def available_providers() -> List[ProviderSpec]:
         ),
         InlineProviderSpec(
             api=Api.inference,
-            provider_type="vllm",
+            provider_type="inline::vllm",
             pip_packages=[
                 "vllm",
             ],
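
Renaming the inline provider type from "vllm" to "inline::vllm" matches the provider_type entries in the new distributions/inline-vllm/run.yaml above, but any other config still declaring the bare name will no longer resolve. A sketch for spotting stale references (the searched paths are illustrative):

    # find configs that still use the old provider_type spelling
    grep -rn 'provider_type: vllm$' distributions/ llama_stack/templates/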

llama_stack/templates/inline-vllm/build.yaml (new file)
@@ -0,0 +1,13 @@
name: meta-reference-gpu
distribution_spec:
  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
  description: Use code from `llama_stack` itself to serve all llama stack APIs
  providers:
    inference: meta-reference
    memory:
    - meta-reference
    - remote::chromadb
    - remote::pgvector
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference

llama_stack/templates/remote-vllm/build.yaml (new file)
@@ -0,0 +1,12 @@
name: remote-vllm
distribution_spec:
  description: Use (an external) vLLM server for running LLM inference
  providers:
    inference: remote::vllm
    memory:
    - meta-reference
    - remote::chromadb
    - remote::pgvector
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
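
These template build.yaml files feed the distribution build tooling; with the repo's llama CLI installed, something along these lines should produce the corresponding images for the two new templates (flag names follow the llama-stack CLI of this period and may differ across versions — treat as a sketch, not the commit's own instructions):

    llama stack build --template remote-vllm --image-type docker
    llama stack build --template inline-vllm --image-type docker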

llama_stack/templates/vllm/build.yaml (file removed)
@@ -1,9 +0,0 @@
name: vllm
distribution_spec:
  description: Like local, but use vLLM for running LLM inference
  providers:
    inference: vllm
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference