make inference server load checkpoints for fp8 inference

- introduce quantization-related args for the inference config (see the sketch below)
- also kill GeneratorArgs
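
The new args would plausibly be modeled as a small config object. A minimal sketch, assuming hypothetical names (`CheckpointQuantizationFormat`, `QuantizationConfig`, and `ModelInferenceConfig` are illustrative, not necessarily the classes this commit adds); the field names mirror the YAML keys in the diff below:

from dataclasses import dataclass
from enum import Enum
from typing import Optional


class CheckpointQuantizationFormat(Enum):
    # hypothetical enum; "fp8" matches the value used in the YAML diff below
    bf16 = "bf16"
    fp8 = "fp8"


@dataclass
class QuantizationConfig:
    # hypothetical container for the new quantization-related args
    type: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16


@dataclass
class ModelInferenceConfig:
    # field names mirror the YAML keys shown in the diff below
    model_parallel_size: int = 1
    max_seq_len: int = 2048
    max_batch_size: int = 1
    quantization: Optional[QuantizationConfig] = None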
Ashwin Bharambe 2024-07-20 21:10:17 -07:00
parent 7d2c0b14b8
commit ad62e2e1f3
10 changed files with 249 additions and 155 deletions


@@ -7,3 +7,5 @@ model_inference_config:
   model_parallel_size: 1
   max_seq_len: 2048
   max_batch_size: 1
+  quantization:
+    type: "fp8"
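
Per the commit title, the server would consult this config when loading checkpoint weights. A rough sketch of that branch under loose assumptions (the function name and the blanket dtype cast are illustrative, not the commit's actual code; real fp8 loading also needs per-tensor scales and fp8-capable kernels):

import torch


def load_checkpoint_for_inference(state_dict: dict, quantization_type: str | None) -> dict:
    # Illustrative only: pick a target dtype from the configured quantization
    # type and cast checkpoint tensors accordingly.
    if quantization_type == "fp8":
        # float8_e4m3fn is the fp8 format commonly used for inference weights
        # (available in recent PyTorch releases)
        target_dtype = torch.float8_e4m3fn
    else:
        target_dtype = torch.bfloat16
    return {name: t.to(target_dtype) for name, t in state_dict.items()}

A server start-up path would then call this with the `type` value parsed from the YAML above.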