Mirror of https://github.com/meta-llama/llama-stack.git

Commit 560b3b5461: Merge branch 'main' into dell_tgi

18 changed files with 101 additions and 102 deletions
@@ -152,27 +152,29 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
         parser.error("Please provide a model id")
         return
 
-    prompt_guard = prompt_guard_model_sku()
-    if args.model_id == prompt_guard.model_id:
-        model = prompt_guard
-        info = prompt_guard_download_info()
-    else:
-        model = resolve_model(args.model_id)
-        if model is None:
-            parser.error(f"Model {args.model_id} not found")
-            return
-        info = llama_meta_net_info(model)
+    # Check if model_id is a comma-separated list
+    model_ids = [model_id.strip() for model_id in args.model_id.split(",")]
 
-    if args.source == "huggingface":
-        _hf_download(model, args.hf_token, args.ignore_patterns, parser)
-    else:
-        meta_url = args.meta_url
-        if not meta_url:
-            meta_url = input(
-                "Please provide the signed URL you received via email after visiting https://www.llama.com/llama-downloads/ (e.g., https://llama3-1.llamameta.net/*?Policy...): "
-            )
-            assert meta_url is not None and "llamameta.net" in meta_url
-        _meta_download(model, meta_url, info)
+    prompt_guard = prompt_guard_model_sku()
+    for model_id in model_ids:
+        if model_id == prompt_guard.model_id:
+            model = prompt_guard
+            info = prompt_guard_download_info()
+        else:
+            model = resolve_model(model_id)
+            if model is None:
+                parser.error(f"Model {model_id} not found")
+                continue
+            info = llama_meta_net_info(model)
+
+        if args.source == "huggingface":
+            _hf_download(model, args.hf_token, args.ignore_patterns, parser)
+        else:
+            meta_url = args.meta_url or input(
+                f"Please provide the signed URL for model {model_id} you received via email after visiting https://www.llama.com/llama-downloads/ (e.g., https://llama3-1.llamameta.net/*?Policy...): "
+            )
+            assert "llamameta.net" in meta_url
+            _meta_download(model, meta_url, info)
 
 
 class ModelEntry(BaseModel):
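For reference, the reworked command now accepts a comma-separated --model-id value and strips whitespace around each entry before resolving it. A minimal standalone sketch of just that parsing step (the ids below are illustrative placeholders, not a statement about available SKUs):

    # Illustrative input; .strip() tolerates whitespace after each comma
    model_id_arg = "Llama3.1-8B, Prompt-Guard-86M"
    model_ids = [model_id.strip() for model_id in model_id_arg.split(",")]
    assert model_ids == ["Llama3.1-8B", "Prompt-Guard-86M"]
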
@@ -1,4 +1,4 @@
-name: local
+name: local-gpu
 distribution_spec:
   description: Use code from `llama_stack` itself to serve all llama stack APIs
   providers:
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
@@ -1,11 +1,11 @@
-name: local-gpu
+name: local-tgi-chroma
 distribution_spec:
-  description: local meta reference
+  description: remote tgi inference + chromadb memory
   docker_image: null
   providers:
-    inference: meta-reference
+    inference: remote::tgi
     safety: meta-reference
     agents: meta-reference
-    memory: meta-reference
+    memory: remote::chromadb
     telemetry: meta-reference
 image_type: docker
@@ -1,16 +1,16 @@
 version: '2'
-built_at: '2024-10-08T17:42:33.690666'
-image_name: local-gpu
-docker_image: local-gpu
-conda_env: null
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
 apis:
-- memory
-- inference
-- agents
 - shields
-- safety
+- agents
 - models
+- memory
 - memory_banks
+- inference
+- safety
 providers:
   inference:
   - provider_id: meta-reference
@@ -25,8 +25,13 @@ providers:
   - provider_id: meta-reference
     provider_type: meta-reference
     config:
-      llama_guard_shield: null
-      prompt_guard_shield: null
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
   - provider_id: meta-reference
     provider_type: meta-reference
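For illustration, the expanded shield settings map onto a typed structure like the following. These dataclasses are a hypothetical mirror of the YAML fields in this hunk, not the repo's actual config classes:

    from dataclasses import dataclass, field

    @dataclass
    class LlamaGuardShieldConfig:
        # Field names and defaults taken from the YAML above
        model: str = "Llama-Guard-3-1B"
        excluded_categories: list[str] = field(default_factory=list)
        disable_input_check: bool = False
        disable_output_check: bool = False

    @dataclass
    class PromptGuardShieldConfig:
        model: str = "Prompt-Guard-86M"
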
@@ -1,29 +1,33 @@
 version: '2'
-built_at: '2024-10-08T17:42:07.505267'
-image_name: local-cpu
-docker_image: local-cpu
-conda_env: null
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
 apis:
-- shields
 - agents
-- inference
 - models
 - memory
-- safety
+- shields
 - memory_banks
+- inference
+- safety
 providers:
   inference:
-  - provider_id: remote::ollama
-    provider_type: remote::ollama
+  - provider_id: tgi0
+    provider_type: remote::tgi
     config:
-      host: localhost
-      port: 6000
+      url: http://127.0.0.1:5009
   safety:
   - provider_id: meta-reference
     provider_type: meta-reference
     config:
-      llama_guard_shield: null
-      prompt_guard_shield: null
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
   memory:
   - provider_id: meta-reference
     provider_type: meta-reference
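The run config above now points inference at a TGI server on http://127.0.0.1:5009. Assuming a standard text-generation-inference server is listening there (an assumption; the diff only records the URL), a quick reachability check could look like:

    import urllib.request

    # /info is text-generation-inference's model-metadata endpoint
    with urllib.request.urlopen("http://127.0.0.1:5009/info") as resp:
        print(resp.read().decode())
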
@@ -5,6 +5,7 @@
 # the root directory of this source tree.
 
 import itertools
+import os
 
 import pytest
 import pytest_asyncio
@@ -50,14 +51,17 @@ def get_expected_stop_reason(model: str):
     return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn
 
 
+if "MODEL_IDS" not in os.environ:
+    MODEL_IDS = [Llama_8B, Llama_3B]
+else:
+    MODEL_IDS = os.environ["MODEL_IDS"].split(",")
+
+
 # This is going to create multiple Stack impls without tearing down the previous one
 # Fix that!
 @pytest_asyncio.fixture(
     scope="session",
-    params=[
-        {"model": Llama_8B},
-        {"model": Llama_3B},
-    ],
+    params=[{"model": m} for m in MODEL_IDS],
     ids=lambda d: d["model"],
 )
 async def inference_settings(request):
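With this change the set of models under test can be overridden via a comma-separated MODEL_IDS environment variable, and each id becomes one session-scoped fixture parameter named after the model. A self-contained sketch of the same pattern (a hypothetical test file; the default ids are placeholders, not the repo's Llama_8B/Llama_3B constants):

    import os
    import pytest

    # Same override pattern as the diff: environment variable wins, else a default list
    MODEL_IDS = os.environ.get("MODEL_IDS", "Llama3.1-8B,Llama3.2-3B").split(",")

    @pytest.fixture(
        scope="session",
        params=[{"model": m} for m in MODEL_IDS],
        ids=lambda d: d["model"],
    )
    def inference_settings(request):
        return request.param

    def test_model_id_selected(inference_settings):
        # pytest runs this once per entry in MODEL_IDS
        assert inference_settings["model"] in MODEL_IDS

Running e.g. MODEL_IDS="Llama3.1-8B" pytest then exercises only that model.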