Remove quantization_config from the APIs for now

This commit is contained in:
Ashwin Bharambe 2024-08-21 14:17:05 -07:00
parent ab0a24f333
commit 863bb915e1
5 changed files with 18 additions and 157 deletions

View file

@ -151,8 +151,6 @@ class AgenticSystemInstanceConfig(BaseModel):
input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list) input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list) output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
quantization_config: Optional[QuantizationConfig] = None
# if you completely want to replace the messages prefixed by the system, # if you completely want to replace the messages prefixed by the system,
# this is debug only # this is debug only
debug_prefix_messages: Optional[List[Message]] = Field(default_factory=list) debug_prefix_messages: Optional[List[Message]] = Field(default_factory=list)

View file

@ -135,7 +135,6 @@ async def run_main(host: str, port: int):
available_tools=tool_definitions, available_tools=tool_definitions,
input_shields=[], input_shields=[],
output_shields=[], output_shields=[],
quantization_config=None,
debug_prefix_messages=[], debug_prefix_messages=[],
tool_prompt_format=ToolPromptFormat.json, tool_prompt_format=ToolPromptFormat.json,
), ),

View file

@ -19,7 +19,6 @@ class CompletionRequest(BaseModel):
stream: Optional[bool] = False stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
quantization_config: Optional[QuantizationConfig] = None
@json_schema_type @json_schema_type
@ -43,7 +42,6 @@ class BatchCompletionRequest(BaseModel):
content_batch: List[InterleavedTextAttachment] content_batch: List[InterleavedTextAttachment]
sampling_params: Optional[SamplingParams] = SamplingParams() sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
quantization_config: Optional[QuantizationConfig] = None
@json_schema_type @json_schema_type
@ -62,7 +60,6 @@ class ChatCompletionRequest(BaseModel):
stream: Optional[bool] = False stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
quantization_config: Optional[QuantizationConfig] = None
@json_schema_type @json_schema_type
@ -88,7 +85,6 @@ class BatchChatCompletionRequest(BaseModel):
available_tools: Optional[List[ToolDefinition]] = Field(default_factory=list) available_tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
quantization_config: Optional[QuantizationConfig] = None
@json_schema_type @json_schema_type

View file

@ -21,7 +21,7 @@
"info": { "info": {
"title": "[DRAFT] Llama Stack Specification", "title": "[DRAFT] Llama Stack Specification",
"version": "0.0.1", "version": "0.0.1",
"description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138" "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950"
}, },
"servers": [ "servers": [
{ {
@ -1760,16 +1760,6 @@
} }
}, },
"additionalProperties": false "additionalProperties": false
},
"quantization_config": {
"oneOf": [
{
"$ref": "#/components/schemas/Bf16QuantizationConfig"
},
{
"$ref": "#/components/schemas/Fp8QuantizationConfig"
}
]
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -1778,19 +1768,6 @@
"messages_batch" "messages_batch"
] ]
}, },
"Bf16QuantizationConfig": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "bf16"
}
},
"additionalProperties": false,
"required": [
"type"
]
},
"BuiltinTool": { "BuiltinTool": {
"type": "string", "type": "string",
"enum": [ "enum": [
@ -1848,19 +1825,6 @@
"tool_calls" "tool_calls"
] ]
}, },
"Fp8QuantizationConfig": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "fp8"
}
},
"additionalProperties": false,
"required": [
"type"
]
},
"SamplingParams": { "SamplingParams": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -2229,16 +2193,6 @@
} }
}, },
"additionalProperties": false "additionalProperties": false
},
"quantization_config": {
"oneOf": [
{
"$ref": "#/components/schemas/Bf16QuantizationConfig"
},
{
"$ref": "#/components/schemas/Fp8QuantizationConfig"
}
]
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -2307,16 +2261,6 @@
} }
}, },
"additionalProperties": false "additionalProperties": false
},
"quantization_config": {
"oneOf": [
{
"$ref": "#/components/schemas/Bf16QuantizationConfig"
},
{
"$ref": "#/components/schemas/Fp8QuantizationConfig"
}
]
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -2469,16 +2413,6 @@
} }
}, },
"additionalProperties": false "additionalProperties": false
},
"quantization_config": {
"oneOf": [
{
"$ref": "#/components/schemas/Bf16QuantizationConfig"
},
{
"$ref": "#/components/schemas/Fp8QuantizationConfig"
}
]
} }
}, },
"additionalProperties": false, "additionalProperties": false,
@ -2552,16 +2486,6 @@
"$ref": "#/components/schemas/ShieldDefinition" "$ref": "#/components/schemas/ShieldDefinition"
} }
}, },
"quantization_config": {
"oneOf": [
{
"$ref": "#/components/schemas/Bf16QuantizationConfig"
},
{
"$ref": "#/components/schemas/Fp8QuantizationConfig"
}
]
},
"debug_prefix_messages": { "debug_prefix_messages": {
"type": "array", "type": "array",
"items": { "items": {
@ -4782,30 +4706,30 @@
{ {
"name": "RewardScoring" "name": "RewardScoring"
}, },
{
"name": "AgenticSystem"
},
{
"name": "SyntheticDataGeneration"
},
{
"name": "Inference"
},
{ {
"name": "Datasets" "name": "Datasets"
}, },
{ {
"name": "Observability" "name": "Observability"
}, },
{
"name": "AgenticSystem"
},
{
"name": "Inference"
},
{
"name": "Evaluations"
},
{
"name": "SyntheticDataGeneration"
},
{ {
"name": "PostTraining" "name": "PostTraining"
}, },
{ {
"name": "MemoryBanks" "name": "MemoryBanks"
}, },
{
"name": "Evaluations"
},
{ {
"name": "Attachment", "name": "Attachment",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/Attachment\" />" "description": "<SchemaDefinition schemaRef=\"#/components/schemas/Attachment\" />"
@ -4814,10 +4738,6 @@
"name": "BatchChatCompletionRequest", "name": "BatchChatCompletionRequest",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/BatchChatCompletionRequest\" />" "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BatchChatCompletionRequest\" />"
}, },
{
"name": "Bf16QuantizationConfig",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/Bf16QuantizationConfig\" />"
},
{ {
"name": "BuiltinTool", "name": "BuiltinTool",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/BuiltinTool\" />" "description": "<SchemaDefinition schemaRef=\"#/components/schemas/BuiltinTool\" />"
@ -4826,10 +4746,6 @@
"name": "CompletionMessage", "name": "CompletionMessage",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/CompletionMessage\" />" "description": "<SchemaDefinition schemaRef=\"#/components/schemas/CompletionMessage\" />"
}, },
{
"name": "Fp8QuantizationConfig",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/Fp8QuantizationConfig\" />"
},
{ {
"name": "SamplingParams", "name": "SamplingParams",
"description": "<SchemaDefinition schemaRef=\"#/components/schemas/SamplingParams\" />" "description": "<SchemaDefinition schemaRef=\"#/components/schemas/SamplingParams\" />"
@ -5245,7 +5161,6 @@
"BatchChatCompletionResponse", "BatchChatCompletionResponse",
"BatchCompletionRequest", "BatchCompletionRequest",
"BatchCompletionResponse", "BatchCompletionResponse",
"Bf16QuantizationConfig",
"BuiltinShield", "BuiltinShield",
"BuiltinTool", "BuiltinTool",
"ChatCompletionRequest", "ChatCompletionRequest",
@ -5272,7 +5187,6 @@
"Experiment", "Experiment",
"ExperimentStatus", "ExperimentStatus",
"FinetuningAlgorithm", "FinetuningAlgorithm",
"Fp8QuantizationConfig",
"InferenceStep", "InferenceStep",
"Log", "Log",
"LogMessagesRequest", "LogMessagesRequest",

View file

@ -45,10 +45,6 @@ components:
items: items:
$ref: '#/components/schemas/ShieldDefinition' $ref: '#/components/schemas/ShieldDefinition'
type: array type: array
quantization_config:
oneOf:
- $ref: '#/components/schemas/Bf16QuantizationConfig'
- $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
tool_prompt_format: tool_prompt_format:
@ -216,10 +212,6 @@ components:
type: array type: array
model: model:
type: string type: string
quantization_config:
oneOf:
- $ref: '#/components/schemas/Bf16QuantizationConfig'
- $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
required: required:
@ -258,10 +250,6 @@ components:
type: object type: object
model: model:
type: string type: string
quantization_config:
oneOf:
- $ref: '#/components/schemas/Bf16QuantizationConfig'
- $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
required: required:
@ -278,15 +266,6 @@ components:
required: required:
- completion_message_batch - completion_message_batch
type: object type: object
Bf16QuantizationConfig:
additionalProperties: false
properties:
type:
const: bf16
type: string
required:
- type
type: object
BuiltinShield: BuiltinShield:
enum: enum:
- llama_guard - llama_guard
@ -325,10 +304,6 @@ components:
type: array type: array
model: model:
type: string type: string
quantization_config:
oneOf:
- $ref: '#/components/schemas/Bf16QuantizationConfig'
- $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
stream: stream:
@ -421,10 +396,6 @@ components:
type: object type: object
model: model:
type: string type: string
quantization_config:
oneOf:
- $ref: '#/components/schemas/Bf16QuantizationConfig'
- $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params: sampling_params:
$ref: '#/components/schemas/SamplingParams' $ref: '#/components/schemas/SamplingParams'
stream: stream:
@ -717,15 +688,6 @@ components:
- qlora - qlora
- dora - dora
type: string type: string
Fp8QuantizationConfig:
additionalProperties: false
properties:
type:
const: fp8
type: string
required:
- type
type: object
InferenceStep: InferenceStep:
additionalProperties: false additionalProperties: false
properties: properties:
@ -1867,7 +1829,7 @@ info:
description: "This is the specification of the llama stack that provides\n \ description: "This is the specification of the llama stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored\ \ a set of endpoints and their corresponding interfaces that are tailored\
\ to\n best leverage Llama Models. The specification is still in\ \ to\n best leverage Llama Models. The specification is still in\
\ draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138" \ draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950"
title: '[DRAFT] Llama Stack Specification' title: '[DRAFT] Llama Stack Specification'
version: 0.0.1 version: 0.0.1
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@ -2871,30 +2833,24 @@ servers:
- url: http://any-hosted-llama-stack.com - url: http://any-hosted-llama-stack.com
tags: tags:
- name: RewardScoring - name: RewardScoring
- name: AgenticSystem
- name: SyntheticDataGeneration
- name: Inference
- name: Datasets - name: Datasets
- name: Observability - name: Observability
- name: AgenticSystem
- name: Inference
- name: Evaluations
- name: SyntheticDataGeneration
- name: PostTraining - name: PostTraining
- name: MemoryBanks - name: MemoryBanks
- name: Evaluations
- description: <SchemaDefinition schemaRef="#/components/schemas/Attachment" /> - description: <SchemaDefinition schemaRef="#/components/schemas/Attachment" />
name: Attachment name: Attachment
- description: <SchemaDefinition schemaRef="#/components/schemas/BatchChatCompletionRequest" - description: <SchemaDefinition schemaRef="#/components/schemas/BatchChatCompletionRequest"
/> />
name: BatchChatCompletionRequest name: BatchChatCompletionRequest
- description: <SchemaDefinition schemaRef="#/components/schemas/Bf16QuantizationConfig"
/>
name: Bf16QuantizationConfig
- description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" /> - description: <SchemaDefinition schemaRef="#/components/schemas/BuiltinTool" />
name: BuiltinTool name: BuiltinTool
- description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage" - description: <SchemaDefinition schemaRef="#/components/schemas/CompletionMessage"
/> />
name: CompletionMessage name: CompletionMessage
- description: <SchemaDefinition schemaRef="#/components/schemas/Fp8QuantizationConfig"
/>
name: Fp8QuantizationConfig
- description: <SchemaDefinition schemaRef="#/components/schemas/SamplingParams" /> - description: <SchemaDefinition schemaRef="#/components/schemas/SamplingParams" />
name: SamplingParams name: SamplingParams
- description: <SchemaDefinition schemaRef="#/components/schemas/SamplingStrategy" - description: <SchemaDefinition schemaRef="#/components/schemas/SamplingStrategy"
@ -3252,7 +3208,6 @@ x-tagGroups:
- BatchChatCompletionResponse - BatchChatCompletionResponse
- BatchCompletionRequest - BatchCompletionRequest
- BatchCompletionResponse - BatchCompletionResponse
- Bf16QuantizationConfig
- BuiltinShield - BuiltinShield
- BuiltinTool - BuiltinTool
- ChatCompletionRequest - ChatCompletionRequest
@ -3279,7 +3234,6 @@ x-tagGroups:
- Experiment - Experiment
- ExperimentStatus - ExperimentStatus
- FinetuningAlgorithm - FinetuningAlgorithm
- Fp8QuantizationConfig
- InferenceStep - InferenceStep
- Log - Log
- LogMessagesRequest - LogMessagesRequest