Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-10-04 12:07:34 +00:00)
make inference server load checkpoints for fp8 inference

- introduce quantization-related args for the inference config
- also kill GeneratorArgs
parent: 7d2c0b14b8
commit: ad62e2e1f3

10 changed files with 249 additions and 155 deletions
@@ -18,6 +18,12 @@ from fp8.fp8_impls import ffn_swiglu
 from torch import nn
 
 
+@dataclass
+class QuantizationArgs:
+    fp8_rowwise: bool = False
+    convert_from_bf16: bool = False
+
+
 @dataclass
 class ModelArgs:
     dim: int = 4096
@@ -31,6 +37,8 @@ class ModelArgs:
     rope_theta: float = 500000
     use_scaled_rope: bool = False
 
+    quantization: Optional[QuantizationArgs] = None
+
     max_batch_size: int = 32
     max_seq_len: int = 2048
 
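For illustration only (not part of the diff), a minimal sketch of how the new quantization field might be set when configuring the model for fp8 inference. The import path "model" is an assumption about where the patched ModelArgs / QuantizationArgs live; this commit does not specify how callers construct them.

    # Hypothetical usage sketch; "model" is an assumed module path.
    from model import ModelArgs, QuantizationArgs

    args = ModelArgs(
        dim=4096,
        max_batch_size=32,
        max_seq_len=2048,
        # New in this commit: quantization settings live on the model config
        # instead of a separate GeneratorArgs.
        quantization=QuantizationArgs(fp8_rowwise=True, convert_from_bf16=True),
    )

    # Downstream inference code can branch on the quantization config.
    if args.quantization is not None and args.quantization.fp8_rowwise:
        print("loading checkpoint with fp8 rowwise weights")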