diff --git a/llama_toolchain/agentic_system/api/datatypes.py b/llama_toolchain/agentic_system/api/datatypes.py
index db4e40c4b..648aed698 100644
--- a/llama_toolchain/agentic_system/api/datatypes.py
+++ b/llama_toolchain/agentic_system/api/datatypes.py
@@ -151,8 +151,6 @@ class AgenticSystemInstanceConfig(BaseModel):
input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list)
- quantization_config: Optional[QuantizationConfig] = None
-
# if you completely want to replace the messages prefixed by the system,
# this is debug only
debug_prefix_messages: Optional[List[Message]] = Field(default_factory=list)
diff --git a/llama_toolchain/agentic_system/client.py b/llama_toolchain/agentic_system/client.py
index 154bca614..56428c425 100644
--- a/llama_toolchain/agentic_system/client.py
+++ b/llama_toolchain/agentic_system/client.py
@@ -135,7 +135,6 @@ async def run_main(host: str, port: int):
available_tools=tool_definitions,
input_shields=[],
output_shields=[],
- quantization_config=None,
debug_prefix_messages=[],
tool_prompt_format=ToolPromptFormat.json,
),
diff --git a/llama_toolchain/inference/api/endpoints.py b/llama_toolchain/inference/api/endpoints.py
index a3ec18c95..ef1c7b159 100644
--- a/llama_toolchain/inference/api/endpoints.py
+++ b/llama_toolchain/inference/api/endpoints.py
@@ -19,7 +19,6 @@ class CompletionRequest(BaseModel):
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
- quantization_config: Optional[QuantizationConfig] = None
@json_schema_type
@@ -43,7 +42,6 @@ class BatchCompletionRequest(BaseModel):
content_batch: List[InterleavedTextAttachment]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
- quantization_config: Optional[QuantizationConfig] = None
@json_schema_type
@@ -62,7 +60,6 @@ class ChatCompletionRequest(BaseModel):
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
- quantization_config: Optional[QuantizationConfig] = None
@json_schema_type
@@ -88,7 +85,6 @@ class BatchChatCompletionRequest(BaseModel):
available_tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
logprobs: Optional[LogProbConfig] = None
- quantization_config: Optional[QuantizationConfig] = None
@json_schema_type
diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
index f59653edc..f8dab9ec3 100644
--- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
+++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html
@@ -21,7 +21,7 @@
"info": {
"title": "[DRAFT] Llama Stack Specification",
"version": "0.0.1",
- "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138"
+ "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950"
},
"servers": [
{
@@ -1760,16 +1760,6 @@
}
},
"additionalProperties": false
- },
- "quantization_config": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Bf16QuantizationConfig"
- },
- {
- "$ref": "#/components/schemas/Fp8QuantizationConfig"
- }
- ]
}
},
"additionalProperties": false,
@@ -1778,19 +1768,6 @@
"messages_batch"
]
},
- "Bf16QuantizationConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "bf16"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
"BuiltinTool": {
"type": "string",
"enum": [
@@ -1848,19 +1825,6 @@
"tool_calls"
]
},
- "Fp8QuantizationConfig": {
- "type": "object",
- "properties": {
- "type": {
- "type": "string",
- "const": "fp8"
- }
- },
- "additionalProperties": false,
- "required": [
- "type"
- ]
- },
"SamplingParams": {
"type": "object",
"properties": {
@@ -2229,16 +2193,6 @@
}
},
"additionalProperties": false
- },
- "quantization_config": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Bf16QuantizationConfig"
- },
- {
- "$ref": "#/components/schemas/Fp8QuantizationConfig"
- }
- ]
}
},
"additionalProperties": false,
@@ -2307,16 +2261,6 @@
}
},
"additionalProperties": false
- },
- "quantization_config": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Bf16QuantizationConfig"
- },
- {
- "$ref": "#/components/schemas/Fp8QuantizationConfig"
- }
- ]
}
},
"additionalProperties": false,
@@ -2469,16 +2413,6 @@
}
},
"additionalProperties": false
- },
- "quantization_config": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Bf16QuantizationConfig"
- },
- {
- "$ref": "#/components/schemas/Fp8QuantizationConfig"
- }
- ]
}
},
"additionalProperties": false,
@@ -2552,16 +2486,6 @@
"$ref": "#/components/schemas/ShieldDefinition"
}
},
- "quantization_config": {
- "oneOf": [
- {
- "$ref": "#/components/schemas/Bf16QuantizationConfig"
- },
- {
- "$ref": "#/components/schemas/Fp8QuantizationConfig"
- }
- ]
- },
"debug_prefix_messages": {
"type": "array",
"items": {
@@ -4782,30 +4706,30 @@
{
"name": "RewardScoring"
},
- {
- "name": "AgenticSystem"
- },
- {
- "name": "SyntheticDataGeneration"
- },
- {
- "name": "Inference"
- },
{
"name": "Datasets"
},
{
"name": "Observability"
},
+ {
+ "name": "AgenticSystem"
+ },
+ {
+ "name": "Inference"
+ },
+ {
+ "name": "Evaluations"
+ },
+ {
+ "name": "SyntheticDataGeneration"
+ },
{
"name": "PostTraining"
},
{
"name": "MemoryBanks"
},
- {
- "name": "Evaluations"
- },
{
"name": "Attachment",
"description": ""
@@ -4814,10 +4738,6 @@
"name": "BatchChatCompletionRequest",
"description": ""
},
- {
- "name": "Bf16QuantizationConfig",
- "description": ""
- },
{
"name": "BuiltinTool",
"description": ""
@@ -4826,10 +4746,6 @@
"name": "CompletionMessage",
"description": ""
},
- {
- "name": "Fp8QuantizationConfig",
- "description": ""
- },
{
"name": "SamplingParams",
"description": ""
@@ -5245,7 +5161,6 @@
"BatchChatCompletionResponse",
"BatchCompletionRequest",
"BatchCompletionResponse",
- "Bf16QuantizationConfig",
"BuiltinShield",
"BuiltinTool",
"ChatCompletionRequest",
@@ -5272,7 +5187,6 @@
"Experiment",
"ExperimentStatus",
"FinetuningAlgorithm",
- "Fp8QuantizationConfig",
"InferenceStep",
"Log",
"LogMessagesRequest",
diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
index 837036811..7cfb22669 100644
--- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
+++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml
@@ -45,10 +45,6 @@ components:
items:
$ref: '#/components/schemas/ShieldDefinition'
type: array
- quantization_config:
- oneOf:
- - $ref: '#/components/schemas/Bf16QuantizationConfig'
- - $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
tool_prompt_format:
@@ -216,10 +212,6 @@ components:
type: array
model:
type: string
- quantization_config:
- oneOf:
- - $ref: '#/components/schemas/Bf16QuantizationConfig'
- - $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
required:
@@ -258,10 +250,6 @@ components:
type: object
model:
type: string
- quantization_config:
- oneOf:
- - $ref: '#/components/schemas/Bf16QuantizationConfig'
- - $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
required:
@@ -278,15 +266,6 @@ components:
required:
- completion_message_batch
type: object
- Bf16QuantizationConfig:
- additionalProperties: false
- properties:
- type:
- const: bf16
- type: string
- required:
- - type
- type: object
BuiltinShield:
enum:
- llama_guard
@@ -325,10 +304,6 @@ components:
type: array
model:
type: string
- quantization_config:
- oneOf:
- - $ref: '#/components/schemas/Bf16QuantizationConfig'
- - $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
stream:
@@ -421,10 +396,6 @@ components:
type: object
model:
type: string
- quantization_config:
- oneOf:
- - $ref: '#/components/schemas/Bf16QuantizationConfig'
- - $ref: '#/components/schemas/Fp8QuantizationConfig'
sampling_params:
$ref: '#/components/schemas/SamplingParams'
stream:
@@ -717,15 +688,6 @@ components:
- qlora
- dora
type: string
- Fp8QuantizationConfig:
- additionalProperties: false
- properties:
- type:
- const: fp8
- type: string
- required:
- - type
- type: object
InferenceStep:
additionalProperties: false
properties:
@@ -1867,7 +1829,7 @@ info:
description: "This is the specification of the llama stack that provides\n \
\ a set of endpoints and their corresponding interfaces that are tailored\
\ to\n best leverage Llama Models. The specification is still in\
- \ draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138"
+ \ draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950"
title: '[DRAFT] Llama Stack Specification'
version: 0.0.1
jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
@@ -2871,30 +2833,24 @@ servers:
- url: http://any-hosted-llama-stack.com
tags:
- name: RewardScoring
-- name: AgenticSystem
-- name: SyntheticDataGeneration
-- name: Inference
- name: Datasets
- name: Observability
+- name: AgenticSystem
+- name: Inference
+- name: Evaluations
+- name: SyntheticDataGeneration
- name: PostTraining
- name: MemoryBanks
-- name: Evaluations
- description:
name: Attachment
- description:
name: BatchChatCompletionRequest
-- description:
- name: Bf16QuantizationConfig
- description:
name: BuiltinTool
- description:
name: CompletionMessage
-- description:
- name: Fp8QuantizationConfig
- description:
name: SamplingParams
- description: