Merge branch 'meta-llama:main' into main

2025-12-17 09:22:36 +00:00 · 2024-11-21 10:21:49 -05:00 · 2024-11-21 10:21:49 -05:00 · 09302347d3
commit 09302347d3
parent 490c5fb730 cf079a22a0
62 changed files with 1427 additions and 339 deletions
--- a/llama_stack/providers/inline/inference/meta_reference/config.py
+++ b/llama_stack/providers/inline/inference/meta_reference/config.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
+from typing import Any, Dict, Optional

 from llama_models.datatypes import *  # noqa: F403
 from llama_models.sku_list import resolve_model
@@ -56,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel):
        cls,
        model: str = "Llama3.2-3B-Instruct",
        checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
+        **kwargs,
    ) -> Dict[str, Any]:
        return {
            "model": model,
@@ -66,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel):

 class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
    quantization: QuantizationConfig
+
+    @classmethod
+    def sample_run_config(
+        cls,
+        model: str = "Llama3.2-3B-Instruct",
+        checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
+        **kwargs,
+    ) -> Dict[str, Any]:
+        config = super().sample_run_config(model, checkpoint_dir, **kwargs)
+        config["quantization"] = {
+            "type": "fp8",
+        }
+        return config
--- a/llama_stack/providers/inline/inference/vllm/config.py
+++ b/llama_stack/providers/inline/inference/vllm/config.py
@@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
    @classmethod
    def sample_run_config(cls):
        return {
-            "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-            "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
-            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
-            "enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
-            "gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
+            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
+            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
+            "max_tokens": "${env.MAX_TOKENS:4096}",
+            "enforce_eager": "${env.ENFORCE_EAGER:False}",
+            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
        }

    @field_validator("model")