make inference server load checkpoints for fp8 inference

- introduce quantization-related args for the inference config (see the sketch below)
- also kill GeneratorArgs
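
The new args would plausibly be modeled as a small config object. A minimal sketch, assuming hypothetical names (`CheckpointQuantizationFormat`, `QuantizationConfig`, and `ModelInferenceConfig` are illustrative, not necessarily the classes this commit adds); the field names mirror the YAML keys in the diff below:

from dataclasses import dataclass
from enum import Enum
from typing import Optional


class CheckpointQuantizationFormat(Enum):
    # hypothetical enum; "fp8" matches the value used in the YAML diff below
    bf16 = "bf16"
    fp8 = "fp8"


@dataclass
class QuantizationConfig:
    # hypothetical container for the new quantization-related args
    type: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16


@dataclass
class ModelInferenceConfig:
    # field names mirror the YAML keys shown in the diff below
    model_parallel_size: int = 1
    max_seq_len: int = 2048
    max_batch_size: int = 1
    quantization: Optional[QuantizationConfig] = None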
Ashwin Bharambe 2024-07-20 21:10:17 -07:00
parent 7d2c0b14b8
commit ad62e2e1f3
10 changed files with 249 additions and 155 deletions


@@ -7,3 +7,5 @@ model_inference_config:
   model_parallel_size: 1
   max_seq_len: 2048
   max_batch_size: 1
+  quantization:
+    type: "fp8"
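
Per the commit title, the server would consult this config when loading checkpoint weights. A rough sketch of that branch under loose assumptions (the function name and the blanket dtype cast are illustrative, not the commit's actual code; real fp8 loading also needs per-tensor scales and fp8-capable kernels):

import torch


def load_checkpoint_for_inference(state_dict: dict, quantization_type: str | None) -> dict:
    # Illustrative only: pick a target dtype from the configured quantization
    # type and cast checkpoint tensors accordingly.
    if quantization_type == "fp8":
        # float8_e4m3fn is the fp8 format commonly used for inference weights
        # (available in recent PyTorch releases)
        target_dtype = torch.float8_e4m3fn
    else:
        target_dtype = torch.bfloat16
    return {name: t.to(target_dtype) for name, t in state_dict.items()}

A server start-up path would then call this with the `type` value parsed from the YAML above.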