diff --git a/distributions/meta-reference-quantized-gpu/run.yaml b/distributions/meta-reference-quantized-gpu/run.yaml
index 6e8be2b6d..f162502c5 100644
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@@ -16,9 +16,9 @@ providers:
   - provider_id: meta0
     provider_type: meta-reference-quantized
     config:
-      model: Llama3.2-3B-Instruct
+      model: Llama3.2-3B-Instruct:int4-qlora-eo8
       quantization:
-        type: fp8
+        type: int4
       torch_seed: null
       max_seq_len: 2048
       max_batch_size: 1
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index d1ff047b0..24b7bdc33 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -41,7 +41,7 @@ class Bf16QuantizationConfig(BaseModel):
 @json_schema_type
 class Int4QuantizationConfig(BaseModel):
     type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
-    scheme: Optional[str] = None
+    scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
 
 
 QuantizationConfig = Annotated[
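
A minimal sketch (not part of the patch) of what the inference.py change does in practice: an Int4QuantizationConfig constructed without arguments now carries the int4-weight / int8-dynamic-activation scheme instead of None, so callers no longer need to spell the scheme out. The model below mirrors the patched class; QuantizationType is stubbed as a plain Enum purely to keep the snippet self-contained.

from enum import Enum
from typing import Literal, Optional

from pydantic import BaseModel


# Stub standing in for llama_stack's QuantizationType (assumption: only
# the int4 member is needed here).
class QuantizationType(Enum):
    int4 = "int4"


# Mirror of the patched class from llama_stack/apis/inference/inference.py.
class Int4QuantizationConfig(BaseModel):
    type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
    scheme: Optional[str] = "int4_weight_int8_dynamic_activation"


# Before the patch, cfg.scheme would have defaulted to None.
cfg = Int4QuantizationConfig()
assert cfg.scheme == "int4_weight_int8_dynamic_activation"

This default also matches the run.yaml change above: the distribution now loads the int4-qlora checkpoint with `type: int4`, and the provider picks up the weight/activation scheme from the config's default rather than requiring it in the YAML.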