mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 09:22:36 +00:00
Merge branch 'meta-llama:main' into main
This commit is contained in:
commit
09302347d3
62 changed files with 1427 additions and 339 deletions
|
|
@ -4,7 +4,7 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from llama_models.datatypes import * # noqa: F403
|
||||
from llama_models.sku_list import resolve_model
|
||||
|
|
@ -56,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel):
|
|||
cls,
|
||||
model: str = "Llama3.2-3B-Instruct",
|
||||
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
return {
|
||||
"model": model,
|
||||
|
|
@ -66,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel):
|
|||
|
||||
class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
|
||||
quantization: QuantizationConfig
|
||||
|
||||
@classmethod
|
||||
def sample_run_config(
|
||||
cls,
|
||||
model: str = "Llama3.2-3B-Instruct",
|
||||
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
config = super().sample_run_config(model, checkpoint_dir, **kwargs)
|
||||
config["quantization"] = {
|
||||
"type": "fp8",
|
||||
}
|
||||
return config
|
||||
|
|
|
|||
|
|
@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
|
|||
@classmethod
|
||||
def sample_run_config(cls):
|
||||
return {
|
||||
"model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
|
||||
"tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
|
||||
"max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
|
||||
"enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
|
||||
"gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
|
||||
"model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
|
||||
"tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
|
||||
"max_tokens": "${env.MAX_TOKENS:4096}",
|
||||
"enforce_eager": "${env.ENFORCE_EAGER:False}",
|
||||
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
|
||||
}
|
||||
|
||||
@field_validator("model")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue