Merge branch 'meta-llama:main' into main

This commit is contained in:
Chacksu 2024-11-21 10:21:49 -05:00 committed by GitHub
commit 09302347d3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
62 changed files with 1427 additions and 339 deletions

View file

@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import Optional
from typing import Any, Dict, Optional
from llama_models.datatypes import * # noqa: F403
from llama_models.sku_list import resolve_model
@ -56,6 +56,7 @@ class MetaReferenceInferenceConfig(BaseModel):
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
return {
"model": model,
@ -66,3 +67,16 @@ class MetaReferenceInferenceConfig(BaseModel):
class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
quantization: QuantizationConfig
@classmethod
def sample_run_config(
cls,
model: str = "Llama3.2-3B-Instruct",
checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}",
**kwargs,
) -> Dict[str, Any]:
config = super().sample_run_config(model, checkpoint_dir, **kwargs)
config["quantization"] = {
"type": "fp8",
}
return config

View file

@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
@classmethod
def sample_run_config(cls):
return {
"model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
"enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
"model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
"tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
"max_tokens": "${env.MAX_TOKENS:4096}",
"enforce_eager": "${env.ENFORCE_EAGER:False}",
"gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
}
@field_validator("model")