llama-stack/llama_stack/models/llama/sku_list.py
Ashwin Bharambe 530d4bdfe1
refactor: move all llama code to models/llama out of meta reference (#1887)
# What does this PR do?

Move around bits. This makes the copies from llama-models _much_ easier
to maintain and ensures we don't entangle meta-reference-specific
tidbits into llama-models code even by accident.

Also, kills the meta-reference-quantized-gpu distro and rolls
quantization deps into meta-reference-gpu.

## Test Plan

```
LLAMA_MODELS_DEBUG=1 \
  with-proxy llama stack run meta-reference-gpu \
  --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --env INFERENCE_CHECKPOINT_DIR=<DIR> \
  --env MODEL_PARALLEL_SIZE=4 \
  --env QUANTIZATION_TYPE=fp8_mixed
```

Start a server with and without quantization. Point integration tests to
it using:

```
pytest -s -v  tests/integration/inference/test_text_inference.py \
   --stack-config http://localhost:8321 --text-model meta-llama/Llama-4-Scout-17B-16E-Instruct
```
2025-04-07 15:03:58 -07:00


# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from dataclasses import dataclass
from functools import lru_cache
from typing import List, Optional

from .sku_types import (
    CheckpointQuantizationFormat,
    CoreModelId,
    Model,
    ModelFamily,
)

LLAMA2_VOCAB_SIZE = 32000
LLAMA3_VOCAB_SIZE = 128256


def resolve_model(descriptor: str) -> Optional[Model]:
    for m in all_registered_models():
        if descriptor in (m.descriptor(), m.huggingface_repo):
            return m
    return None
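

# Usage sketch (illustrative comment, not part of the original module): a model
# can be looked up either by its descriptor (as produced by Model.descriptor()
# in sku_types) or by its HuggingFace repo name, e.g.:
#
#     model = resolve_model("meta-llama/Llama-3.1-8B-Instruct")
#     if model is not None:
#         print(model.core_model_id, model.pth_file_count)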


def all_registered_models() -> List[Model]:
    return (
        llama2_family()
        + llama3_family()
        + llama3_1_family()
        + llama3_2_family()
        + llama3_3_family()
        + llama4_family()
        + safety_models()
    )


def llama2_family() -> List[Model]:
    return [
        *llama2_base_models(),
        *llama2_instruct_models(),
    ]


def llama3_family() -> List[Model]:
    return [
        *llama3_base_models(),
        *llama3_instruct_models(),
    ]


def llama3_1_family() -> List[Model]:
    return [
        *llama3_1_base_models(),
        *llama3_1_instruct_models(),
    ]


def llama3_2_family() -> List[Model]:
    return [
        *llama3_2_base_models(),
        *llama3_2_instruct_models(),
    ]


def llama3_3_family() -> List[Model]:
    return [
        *llama3_3_instruct_models(),
    ]


def llama4_family() -> List[Model]:
    return [
        *llama4_base_models(),
        *llama4_instruct_models(),
    ]


def llama4_base_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama4_scout_17b_16e,
            description="Llama 4 Scout (17b 16 experts model)",
            huggingface_repo="meta-llama/Llama-4-Scout-17B-16E",
            pth_file_count=8,
            arch_args={},
        ),
        Model(
            core_model_id=CoreModelId.llama4_maverick_17b_128e,
            description="Llama 4 Maverick (17b 128 experts model)",
            huggingface_repo="meta-llama/Llama-4-Maverick-17B-128E",
            pth_file_count=8,
            arch_args={},
        ),
    ]


def llama4_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama4_scout_17b_16e_instruct,
            description="Llama 4 Scout (17b 16 experts instruct model)",
            huggingface_repo="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            pth_file_count=8,
            arch_args={},
        ),
        Model(
            core_model_id=CoreModelId.llama4_maverick_17b_128e_instruct,
            description="Llama 4 Maverick (17b 128 experts instruct model)",
            huggingface_repo="meta-llama/Llama-4-Maverick-17B-128E-Instruct",
            pth_file_count=8,
            arch_args={},
        ),
        Model(
            core_model_id=CoreModelId.llama4_maverick_17b_128e_instruct,
            description="Llama 4 Maverick (FP8 quantized)",
            huggingface_repo="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
            quantization_format=CheckpointQuantizationFormat.fp8_mixed,
            pth_file_count=8,
            variant="fp8",
            arch_args={},
        ),
    ]


def llama2_base_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama2_7b,
            description="Llama 2 7b model",
            huggingface_repo="meta-llama/Llama-2-7b",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama2_13b,
            description="Llama 2 13b model",
            huggingface_repo="meta-llama/Llama-2-13b",
            arch_args={
                "dim": 5120,
                "n_layers": 40,
                "n_heads": 40,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama2_70b,
            description="Llama 2 70b model",
            huggingface_repo="meta-llama/Llama-2-70b",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=8,
        ),
    ]


def llama3_base_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_8b,
            description="Llama 3 8b model",
            huggingface_repo="meta-llama/Llama-3-8B",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_70b,
            description="Llama 3 70b model",
            huggingface_repo="meta-llama/Llama-3-70B",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=8,
        ),
    ]


def llama3_1_base_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_1_8b,
            description="Llama 3.1 8b model",
            huggingface_repo="meta-llama/Llama-3.1-8B",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_70b,
            description="Llama 3.1 70b model",
            huggingface_repo="meta-llama/Llama-3.1-70B",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b,
            variant="bf16-mp8",
            description="Llama 3.1 405b model (BF16 weights)",
            huggingface_repo="meta-llama/Llama-3.1-405B",
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b,
            description="Llama 3.1 405b model (FP8 quantized)",
            huggingface_repo="meta-llama/Llama-3.1-405B-FP8",
            quantization_format=CheckpointQuantizationFormat.fp8_mixed,
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b,
            variant="bf16-mp16",
            description="Llama 3.1 405b model (BF16 weights for mp16)",
            huggingface_repo="meta-llama/Llama-3.1-405B",
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 16,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=16,
        ),
    ]


def llama3_2_base_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_2_1b,
            description="Llama 3.2 1b model",
            huggingface_repo="meta-llama/Llama-3.2-1B",
            arch_args={
                "dim": 2048,
                "n_layers": 16,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.5,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_3b,
            description="Llama 3.2 3b model",
            huggingface_repo="meta-llama/Llama-3.2-3B",
            arch_args={
                "dim": 3072,
                "n_layers": 28,
                "n_heads": 24,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.0,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_11b_vision,
            description="Llama 3.2 11b vision model",
            huggingface_repo="meta-llama/Llama-3.2-11B-Vision",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
                "vision_chunk_size": 448,
                "vision_max_num_chunks": 4,
                "vision_num_cross_attention_layers": 8,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_90b_vision,
            description="Llama 3.2 90b vision model",
            huggingface_repo="meta-llama/Llama-3.2-90B-Vision",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
                "vision_chunk_size": 560,
                "vision_max_num_chunks": 4,
                "vision_num_cross_attention_layers": 20,
            },
            pth_file_count=8,
        ),
    ]


def llama2_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama2_7b_chat,
            description="Llama 2 7b chat model",
            huggingface_repo="meta-llama/Llama-2-7b-chat",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama2_13b_chat,
            description="Llama 2 13b chat model",
            huggingface_repo="meta-llama/Llama-2-13b-chat",
            arch_args={
                "dim": 5120,
                "n_layers": 40,
                "n_heads": 40,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama2_70b_chat,
            description="Llama 2 70b chat model",
            huggingface_repo="meta-llama/Llama-2-70b-chat",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=8,
        ),
    ]


def llama3_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_8b_instruct,
            description="Llama 3 8b instruct model",
            huggingface_repo="meta-llama/Llama-3-8B-Instruct",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_70b_instruct,
            description="Llama 3 70b instruct model",
            huggingface_repo="meta-llama/Llama-3-70B-Instruct",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=8,
        ),
    ]


def llama3_1_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_1_8b_instruct,
            description="Llama 3.1 8b instruct model",
            huggingface_repo="meta-llama/Llama-3.1-8B-Instruct",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_70b_instruct,
            description="Llama 3.1 70b instruct model",
            huggingface_repo="meta-llama/Llama-3.1-70B-Instruct",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b_instruct,
            variant="bf16-mp8",
            description="Llama 3.1 405b instruct model (BF16 weights)",
            huggingface_repo="meta-llama/Llama-3.1-405B-Instruct",
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b_instruct,
            description="Llama 3.1 405b instruct model (FP8 quantized)",
            huggingface_repo="meta-llama/Llama-3.1-405B-Instruct-FP8",
            quantization_format=CheckpointQuantizationFormat.fp8_mixed,
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
        Model(
            core_model_id=CoreModelId.llama3_1_405b_instruct,
            variant="bf16-mp16",
            description="Llama 3.1 405b instruct model (BF16 weights for mp16)",
            huggingface_repo="meta-llama/Llama-3.1-405B-Instruct",
            arch_args={
                "dim": 16384,
                "n_layers": 126,
                "n_heads": 128,
                "n_kv_heads": 16,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.2,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=16,
        ),
    ]


def arch_args_1b() -> dict:
    return {
        "dim": 2048,
        "n_layers": 16,
        "n_heads": 32,
        "n_kv_heads": 8,
        "vocab_size": LLAMA3_VOCAB_SIZE,
        "ffn_dim_multiplier": 1.5,
        "multiple_of": 256,
        "norm_eps": 1e-05,
        "rope_theta": 500000.0,
        "use_scaled_rope": True,
    }


def arch_args_3b() -> dict:
    return {
        "dim": 3072,
        "n_layers": 28,
        "n_heads": 24,
        "n_kv_heads": 8,
        "vocab_size": LLAMA3_VOCAB_SIZE,
        "ffn_dim_multiplier": 1.0,
        "multiple_of": 256,
        "norm_eps": 1e-05,
        "rope_theta": 500000.0,
        "use_scaled_rope": True,
    }


def llama3_2_quantized_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_2_1b_instruct,
            variant="int4-qlora-eo8",
            quantization_format=CheckpointQuantizationFormat.int4,
            description="Llama 3.2 1b INT4 quantized LoRA",
            huggingface_repo="meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8",
            arch_args={
                **arch_args_1b(),
                "quantization_args": {
                    "group_size": 256,
                },
                "lora_args": {
                    "rank": 16,
                    "scale": 2.0,
                },
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_1b_instruct,
            variant="int4-spinquant-eo8",
            quantization_format=CheckpointQuantizationFormat.int4,
            description="Llama 3.2 1b INT4 quantized SpinQuant",
            huggingface_repo="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8",
            arch_args={
                **arch_args_1b(),
                "quantization_args": {
                    "group_size": 256,
                },
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_3b_instruct,
            variant="int4-qlora-eo8",
            quantization_format=CheckpointQuantizationFormat.int4,
            description="Llama 3.2 3b INT4 quantized LoRA",
            huggingface_repo="meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8",
            arch_args={
                **arch_args_3b(),
                "quantization_args": {
                    "group_size": 256,
                },
                "lora_args": {
                    "rank": 16,
                    "scale": 2.0,
                },
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_3b_instruct,
            variant="int4-spinquant-eo8",
            quantization_format=CheckpointQuantizationFormat.int4,
            description="Llama 3.2 3b INT4 quantized SpinQuant",
            huggingface_repo="meta-llama/Llama-3.2-3B-Instruct-SpinQuant_INT4_EO8",
            arch_args={
                **arch_args_3b(),
                "quantization_args": {
                    "group_size": 256,
                },
            },
            pth_file_count=1,
        ),
    ]


def llama3_2_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_2_1b_instruct,
            description="Llama 3.2 1b instruct model",
            huggingface_repo="meta-llama/Llama-3.2-1B-Instruct",
            arch_args=arch_args_1b(),
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_3b_instruct,
            description="Llama 3.2 3b instruct model",
            huggingface_repo="meta-llama/Llama-3.2-3B-Instruct",
            arch_args=arch_args_3b(),
            pth_file_count=1,
        ),
        *llama3_2_quantized_models(),
        Model(
            core_model_id=CoreModelId.llama3_2_11b_vision_instruct,
            description="Llama 3.2 11b vision instruct model",
            huggingface_repo="meta-llama/Llama-3.2-11B-Vision-Instruct",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
                "vision_chunk_size": 560,
                "vision_max_num_chunks": 4,
                "vision_num_cross_attention_layers": 8,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama3_2_90b_vision_instruct,
            description="Llama 3.2 90b vision instruct model",
            huggingface_repo="meta-llama/Llama-3.2-90B-Vision-Instruct",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
                "vision_chunk_size": 560,
                "vision_max_num_chunks": 4,
                "vision_num_cross_attention_layers": 20,
            },
            pth_file_count=8,
        ),
    ]


def llama3_3_instruct_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama3_3_70b_instruct,
            description="Llama 3.3 70b instruct",
            huggingface_repo="meta-llama/Llama-3.3-70B-Instruct",
            arch_args={
                "dim": 8192,
                "n_layers": 80,
                "n_heads": 64,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 4096,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=8,
        ),
    ]


@lru_cache
def safety_models() -> List[Model]:
    return [
        Model(
            core_model_id=CoreModelId.llama_guard_3_11b_vision,
            description="Llama Guard v3 11b vision system safety model",
            huggingface_repo="meta-llama/Llama-Guard-3-11B-Vision",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
                "vision_chunk_size": 560,
                "vision_max_num_chunks": 4,
                "vision_num_cross_attention_layers": 8,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_3_1b,
            variant="int4",
            description="Llama Guard v3 1b 'int4' quantized system safety model",
            huggingface_repo="meta-llama/Llama-Guard-3-1B-INT4",
            quantization_format=CheckpointQuantizationFormat.int4,
            arch_args={
                "dim": 2048,
                "n_layers": 12,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "rope_freq_base": 500000.0,
                "norm_eps": 1e-05,
                "hidden_dim": 6400,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_3_1b,
            description="Llama Guard v3 1b system safety model",
            huggingface_repo="meta-llama/Llama-Guard-3-1B",
            arch_args={
                "dim": 2048,
                "n_layers": 16,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA3_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.5,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": True,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_3_8b,
            description="Llama Guard v3 8b system safety model",
            huggingface_repo="meta-llama/Llama-Guard-3-8B",
            arch_args={
                "dim": 4096,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "n_heads": 32,
                "n_kv_heads": 8,
                "n_layers": 32,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
                "vocab_size": LLAMA3_VOCAB_SIZE,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_3_8b,
            variant="int8",
            description="Llama Guard v3 8b system safety model",
            huggingface_repo="meta-llama/Llama-Guard-3-8B-INT8",
            quantization_format=CheckpointQuantizationFormat.int8,
            arch_args={
                "dim": 4096,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 1024,
                "n_heads": 32,
                "n_kv_heads": 8,
                "n_layers": 32,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
                "vocab_size": LLAMA3_VOCAB_SIZE,
            },
            pth_file_count=1,
        ),
        Model(
            core_model_id=CoreModelId.llama_guard_2_8b,
            description="Llama Guard v2 8b system safety model",
            huggingface_repo="meta-llama/Llama-Guard-2-8B",
            arch_args={
                "dim": 4096,
                "n_layers": 32,
                "n_heads": 32,
                "n_kv_heads": 8,
                "vocab_size": LLAMA2_VOCAB_SIZE,
                "ffn_dim_multiplier": 1.3,
                "multiple_of": 256,
                "norm_eps": 1e-05,
                "rope_theta": 500000.0,
                "use_scaled_rope": False,
            },
            pth_file_count=1,
        ),
    ]


@dataclass
class LlamaDownloadInfo:
    folder: str
    files: List[str]
    pth_size: int


def llama_meta_net_info(model: Model) -> LlamaDownloadInfo:
    """Information needed to download model from llamameta.net"""
    pth_count = model.pth_file_count

    if model.core_model_id == CoreModelId.llama3_1_405b:
        if pth_count == 16:
            folder = "Llama-3.1-405B-MP16"
        elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
            folder = "Llama-3.1-405B"
        else:
            folder = "Llama-3.1-405B-MP8"
    elif model.core_model_id == CoreModelId.llama3_1_405b_instruct:
        if pth_count == 16:
            folder = "Llama-3.1-405B-Instruct-MP16"
        elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
            folder = "Llama-3.1-405B-Instruct"
        else:
            folder = "Llama-3.1-405B-Instruct-MP8"
    elif model.core_model_id == CoreModelId.llama_guard_3_8b:
        if model.quantization_format == CheckpointQuantizationFormat.int8:
            folder = "Llama-Guard-3-8B-INT8-HF"
        else:
            folder = "Llama-Guard-3-8B"
    elif model.core_model_id == CoreModelId.llama_guard_2_8b:
        folder = "llama-guard-2"
    else:
        folder = model.huggingface_repo.split("/")[-1]
        if "Llama-2" in folder:
            folder = folder.lower()

    files = ["checklist.chk"]
    if (
        model.core_model_id == CoreModelId.llama_guard_3_8b
        and model.quantization_format == CheckpointQuantizationFormat.int8
    ):
        files.extend(
            [
                "generation_config.json",
                "model-00001-of-00002.safetensors",
                "model-00002-of-00002.safetensors",
                "special_tokens_map.json",
                "tokenizer.json",
                "tokenizer_config.json",
                "model.safetensors.index.json",
            ]
        )
    elif (
        model.core_model_id == CoreModelId.llama_guard_3_1b
        and model.quantization_format == CheckpointQuantizationFormat.int4
    ):
        files.extend(
            [
                "llama_guard_3_1b_pruned_xnnpack.pte",
                "example-prompt.txt",
                "params.json",
                "tokenizer.model",
            ]
        )
    else:
        files.extend(
            [
                "tokenizer.model",
                "params.json",
            ]
        )
        if model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
            files.extend([f"fp8_scales_{i}.pt" for i in range(pth_count)])
        files.extend([f"consolidated.{i:02d}.pth" for i in range(pth_count)])

    return LlamaDownloadInfo(
        folder=folder,
        files=files,
        pth_size=llama_meta_pth_size(model),
    )
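

# Illustrative note (comment added for clarity, not in the original source):
# pth_size is the approximate size in bytes of a single consolidated.XX.pth
# shard as reported by llama_meta_pth_size() below, and 0 for models whose
# sizes are not hard-coded there. A rough total-download estimate could be
# sketched as:
#
#     info = llama_meta_net_info(resolve_model("meta-llama/Llama-3.1-405B-Instruct-FP8"))
#     shards = [f for f in info.files if f.startswith("consolidated.")]
#     approx_total_bytes = info.pth_size * len(shards)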


# Sadness because Cloudfront rejects our HEAD requests to find Content-Length
def llama_meta_pth_size(model: Model) -> int:
    if model.core_model_id not in (
        CoreModelId.llama3_1_405b,
        CoreModelId.llama3_1_405b_instruct,
        CoreModelId.llama4_maverick_17b_128e,
        CoreModelId.llama4_maverick_17b_128e_instruct,
    ):
        return 0

    if model.model_family == ModelFamily.llama3_1:
        if model.pth_file_count == 16:
            return 51268302389
        elif model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
            return 60903742309
        else:
            return 101470976045

    if model.model_family == ModelFamily.llama4:
        if model.core_model_id == CoreModelId.llama4_maverick_17b_128e:
            return 100458118386
        elif model.core_model_id == CoreModelId.llama4_maverick_17b_128e_instruct:
            if model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
                return 54121549657
            else:
                return 100426653046