New quantized models (#301)

Ashwin Bharambe authored on 2024-10-24 08:38:56 -07:00; committed by GitHub
parent 05a8d47b98
commit 7afe51c84d
6 changed files with 292 additions and 21 deletions

@@ -25,6 +25,7 @@ class LogProbConfig(BaseModel):
 class QuantizationType(Enum):
     bf16 = "bf16"
     fp8 = "fp8"
+    int4 = "int4"
 
 
 @json_schema_type
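Since the discriminator fields in the next hunk reuse these enum values as plain strings, a quick self-contained check of how the new member behaves at runtime (nothing here is assumed beyond the diff above):

from enum import Enum

class QuantizationType(Enum):
    bf16 = "bf16"
    fp8 = "fp8"
    int4 = "int4"

# The enum round-trips through its string value, which is what the
# Literal[...] discriminator fields below rely on.
assert QuantizationType("int4") is QuantizationType.int4
assert QuantizationType.int4.value == "int4"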
@@ -37,8 +38,14 @@ class Bf16QuantizationConfig(BaseModel):
     type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value
 
 
+@json_schema_type
+class Int4QuantizationConfig(BaseModel):
+    type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
+    scheme: Optional[str] = None
+
+
 QuantizationConfig = Annotated[
-    Union[Bf16QuantizationConfig, Fp8QuantizationConfig],
+    Union[Bf16QuantizationConfig, Fp8QuantizationConfig, Int4QuantizationConfig],
     Field(discriminator="type"),
 ]
 
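Field(discriminator="type") makes QuantizationConfig a tagged union: during validation, Pydantic reads the "type" key and instantiates the matching config class, so an int4 payload now parses into Int4QuantizationConfig. A minimal sketch of that behavior, assuming Pydantic v2's TypeAdapter; the llama-stack-specific @json_schema_type decorator is omitted, and the scheme string is purely illustrative, not a name from this commit:

from enum import Enum
from typing import Annotated, Literal, Optional, Union

from pydantic import BaseModel, Field, TypeAdapter

class QuantizationType(Enum):
    bf16 = "bf16"
    fp8 = "fp8"
    int4 = "int4"

class Bf16QuantizationConfig(BaseModel):
    type: Literal[QuantizationType.bf16.value] = QuantizationType.bf16.value

class Fp8QuantizationConfig(BaseModel):
    type: Literal[QuantizationType.fp8.value] = QuantizationType.fp8.value

class Int4QuantizationConfig(BaseModel):
    type: Literal[QuantizationType.int4.value] = QuantizationType.int4.value
    scheme: Optional[str] = None

QuantizationConfig = Annotated[
    Union[Bf16QuantizationConfig, Fp8QuantizationConfig, Int4QuantizationConfig],
    Field(discriminator="type"),
]

adapter = TypeAdapter(QuantizationConfig)

# The "type" key selects the concrete class; "some-int4-scheme" is a
# placeholder value for illustration only.
cfg = adapter.validate_python({"type": "int4", "scheme": "some-int4-scheme"})
assert isinstance(cfg, Int4QuantizationConfig)

# scheme is optional and defaults to None.
assert adapter.validate_python({"type": "int4"}).scheme is None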
@@ -219,8 +226,6 @@ class Inference(Protocol):
         logprobs: Optional[LogProbConfig] = None,
     ) -> Union[CompletionResponse, CompletionResponseStreamChunk]: ...
 
-    # This method is not `async def` because it can result in either an
-    # `AsyncGenerator` or a `ChatCompletionResponse` depending on the value of `stream`.
     @webmethod(route="/inference/chat_completion")
     async def chat_completion(
         self,
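The deleted comment predates the switch to a uniform async def: the awaited result still depends on stream, with stream=False producing a single complete response and stream=True producing chunks. A caller-side sketch of handling both cases, under assumptions not in this diff (the inference object and the exact parameter names beyond stream are illustrative):

async def demo(inference, messages):
    # Non-streaming: the awaited call returns one complete response.
    response = await inference.chat_completion(messages=messages, stream=False)
    print(response)

    # Streaming: the awaited call is assumed to return an async
    # iterator of response chunks, consumed with async for.
    stream = await inference.chat_completion(messages=messages, stream=True)
    async for chunk in stream:
        print(chunk)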