mirror of https://github.com/meta-llama/llama-stack.git
synced 2025-06-28 02:53:30 +00:00
Split off meta-reference-quantized provider
This commit is contained in:
parent 7ff5800dea
commit 1ff0476002
10 changed files with 54 additions and 58 deletions
@@ -15,12 +15,11 @@ from pydantic import BaseModel, Field, field_validator
 from llama_stack.providers.utils.inference import supported_inference_models


-class MetaReferenceImplConfig(BaseModel):
+class MetaReferenceInferenceConfig(BaseModel):
     model: str = Field(
         default="Llama3.1-8B-Instruct",
         description="Model descriptor from `llama model list`",
     )
-    quantization: Optional[QuantizationConfig] = None
     torch_seed: Optional[int] = None
     max_seq_len: int = 4096
     max_batch_size: int = 1
@@ -38,9 +37,9 @@ class MetaReferenceImplConfig(BaseModel):

     @property
     def model_parallel_size(self) -> int:
         # HACK ALERT: this will be fixed when we move inference configuration
         # to ModelsRegistry and we can explicitly ask for `model_parallel_size`
         # as configuration there
         resolved = resolve_model(self.model)
         assert resolved is not None
         return resolved.pth_file_count
+
+
+class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
+    quantization: QuantizationConfig
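The net effect of the split is that the plain meta-reference config drops its quantization settings, while the new MetaReferenceQuantizedInferenceConfig subclass makes them required. Below is a minimal, self-contained sketch of that pattern; the QuantizationConfig stand-in and its fields are illustrative assumptions, not the real llama-stack types, whose import paths are not shown in this diff.

from typing import Optional

from pydantic import BaseModel, Field


class QuantizationConfig(BaseModel):
    # Stand-in for llama-stack's QuantizationConfig (not shown in this diff).
    type: str = "fp8"


class MetaReferenceInferenceConfig(BaseModel):
    # Mirrors the base config after the split: no quantization field here.
    model: str = Field(
        default="Llama3.1-8B-Instruct",
        description="Model descriptor from `llama model list`",
    )
    torch_seed: Optional[int] = None
    max_seq_len: int = 4096
    max_batch_size: int = 1


class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig):
    # The quantized provider's config makes quantization mandatory.
    quantization: QuantizationConfig


# The base config validates with defaults alone...
base = MetaReferenceInferenceConfig()

# ...while the quantized config fails validation unless quantization is given.
quantized = MetaReferenceQuantizedInferenceConfig(
    quantization=QuantizationConfig(type="fp8"),
)
print(base.max_seq_len, quantized.quantization.type)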