Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-10-11 21:48:36 +00:00
make inference server load checkpoints for fp8 inference
- introduce quantization-related args for the inference config
- also kill GeneratorArgs
This commit is contained in:
parent 7d2c0b14b8
commit ad62e2e1f3
10 changed files with 249 additions and 155 deletions
@@ -7,3 +7,5 @@ model_inference_config:
   model_parallel_size: 1
   max_seq_len: 2048
   max_batch_size: 1
+  quantization:
+    type: "fp8"
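The hunk above shows only the YAML side of the change. As a rough sketch of what the "quantization related args" replacing GeneratorArgs might look like in Python, with all class and field names inferred from the YAML keys in this diff rather than taken from the actual llama-stack source:

# Hypothetical sketch of quantization-related inference config args.
# Names are inferred from the YAML keys in the diff above, not from
# the actual commit.
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class QuantizationType(Enum):
    fp8 = "fp8"


@dataclass
class QuantizationConfig:
    type: QuantizationType = QuantizationType.fp8


@dataclass
class ModelInferenceConfig:
    model_parallel_size: int = 1
    max_seq_len: int = 2048
    max_batch_size: int = 1
    # When set, the inference server would load fp8 checkpoints
    # instead of the default-precision ones.
    quantization: Optional[QuantizationConfig] = None

Keeping quantization as an optional nested block means existing configs without it keep working unchanged, and new quantization schemes can later be added as further QuantizationType members.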