diff --git a/llama_toolchain/agentic_system/api/datatypes.py b/llama_toolchain/agentic_system/api/datatypes.py index db4e40c4b..648aed698 100644 --- a/llama_toolchain/agentic_system/api/datatypes.py +++ b/llama_toolchain/agentic_system/api/datatypes.py @@ -151,8 +151,6 @@ class AgenticSystemInstanceConfig(BaseModel): input_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list) output_shields: Optional[List[ShieldDefinition]] = Field(default_factory=list) - quantization_config: Optional[QuantizationConfig] = None - # if you completely want to replace the messages prefixed by the system, # this is debug only debug_prefix_messages: Optional[List[Message]] = Field(default_factory=list) diff --git a/llama_toolchain/agentic_system/client.py b/llama_toolchain/agentic_system/client.py index 154bca614..56428c425 100644 --- a/llama_toolchain/agentic_system/client.py +++ b/llama_toolchain/agentic_system/client.py @@ -135,7 +135,6 @@ async def run_main(host: str, port: int): available_tools=tool_definitions, input_shields=[], output_shields=[], - quantization_config=None, debug_prefix_messages=[], tool_prompt_format=ToolPromptFormat.json, ), diff --git a/llama_toolchain/inference/api/endpoints.py b/llama_toolchain/inference/api/endpoints.py index a3ec18c95..ef1c7b159 100644 --- a/llama_toolchain/inference/api/endpoints.py +++ b/llama_toolchain/inference/api/endpoints.py @@ -19,7 +19,6 @@ class CompletionRequest(BaseModel): stream: Optional[bool] = False logprobs: Optional[LogProbConfig] = None - quantization_config: Optional[QuantizationConfig] = None @json_schema_type @@ -43,7 +42,6 @@ class BatchCompletionRequest(BaseModel): content_batch: List[InterleavedTextAttachment] sampling_params: Optional[SamplingParams] = SamplingParams() logprobs: Optional[LogProbConfig] = None - quantization_config: Optional[QuantizationConfig] = None @json_schema_type @@ -62,7 +60,6 @@ class ChatCompletionRequest(BaseModel): stream: Optional[bool] = False logprobs: Optional[LogProbConfig] = None - quantization_config: Optional[QuantizationConfig] = None @json_schema_type @@ -88,7 +85,6 @@ class BatchChatCompletionRequest(BaseModel): available_tools: Optional[List[ToolDefinition]] = Field(default_factory=list) logprobs: Optional[LogProbConfig] = None - quantization_config: Optional[QuantizationConfig] = None @json_schema_type diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html index f59653edc..f8dab9ec3 100644 --- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html +++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "0.0.1", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950" }, "servers": [ { @@ -1760,16 +1760,6 @@ } }, "additionalProperties": false - }, - "quantization_config": { - "oneOf": [ - { - "$ref": "#/components/schemas/Bf16QuantizationConfig" - }, - { - "$ref": "#/components/schemas/Fp8QuantizationConfig" - } - ] } }, "additionalProperties": false, @@ -1778,19 +1768,6 @@ "messages_batch" ] }, - "Bf16QuantizationConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "bf16" - } - }, - "additionalProperties": false, - "required": [ - "type" - ] - }, "BuiltinTool": { "type": "string", "enum": [ @@ -1848,19 +1825,6 @@ "tool_calls" ] }, - "Fp8QuantizationConfig": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "fp8" - } - }, - "additionalProperties": false, - "required": [ - "type" - ] - }, "SamplingParams": { "type": "object", "properties": { @@ -2229,16 +2193,6 @@ } }, "additionalProperties": false - }, - "quantization_config": { - "oneOf": [ - { - "$ref": "#/components/schemas/Bf16QuantizationConfig" - }, - { - "$ref": "#/components/schemas/Fp8QuantizationConfig" - } - ] } }, "additionalProperties": false, @@ -2307,16 +2261,6 @@ } }, "additionalProperties": false - }, - "quantization_config": { - "oneOf": [ - { - "$ref": "#/components/schemas/Bf16QuantizationConfig" - }, - { - "$ref": "#/components/schemas/Fp8QuantizationConfig" - } - ] } }, "additionalProperties": false, @@ -2469,16 +2413,6 @@ } }, "additionalProperties": false - }, - "quantization_config": { - "oneOf": [ - { - "$ref": "#/components/schemas/Bf16QuantizationConfig" - }, - { - "$ref": "#/components/schemas/Fp8QuantizationConfig" - } - ] } }, "additionalProperties": false, @@ -2552,16 +2486,6 @@ "$ref": "#/components/schemas/ShieldDefinition" } }, - "quantization_config": { - "oneOf": [ - { - "$ref": "#/components/schemas/Bf16QuantizationConfig" - }, - { - "$ref": "#/components/schemas/Fp8QuantizationConfig" - } - ] - }, "debug_prefix_messages": { "type": "array", "items": { @@ -4782,30 +4706,30 @@ { "name": "RewardScoring" }, - { - "name": "AgenticSystem" - }, - { - "name": "SyntheticDataGeneration" - }, - { - "name": "Inference" - }, { "name": "Datasets" }, { "name": "Observability" }, + { + "name": "AgenticSystem" + }, + { + "name": "Inference" + }, + { + "name": "Evaluations" + }, + { + "name": "SyntheticDataGeneration" + }, { "name": "PostTraining" }, { "name": "MemoryBanks" }, - { - "name": "Evaluations" - }, { "name": "Attachment", "description": "" @@ -4814,10 +4738,6 @@ "name": "BatchChatCompletionRequest", "description": "" }, - { - "name": "Bf16QuantizationConfig", - "description": "" - }, { "name": "BuiltinTool", "description": "" @@ -4826,10 +4746,6 @@ "name": "CompletionMessage", "description": "" }, - { - "name": "Fp8QuantizationConfig", - "description": "" - }, { "name": "SamplingParams", "description": "" @@ -5245,7 +5161,6 @@ "BatchChatCompletionResponse", "BatchCompletionRequest", "BatchCompletionResponse", - "Bf16QuantizationConfig", "BuiltinShield", "BuiltinTool", "ChatCompletionRequest", @@ -5272,7 +5187,6 @@ "Experiment", "ExperimentStatus", "FinetuningAlgorithm", - "Fp8QuantizationConfig", "InferenceStep", "Log", "LogMessagesRequest", diff --git a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml index 837036811..7cfb22669 100644 --- a/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml +++ b/rfcs/RFC-0001-llama-stack-assets/llama-stack-spec.yaml @@ -45,10 +45,6 @@ components: items: $ref: '#/components/schemas/ShieldDefinition' type: array - quantization_config: - oneOf: - - $ref: '#/components/schemas/Bf16QuantizationConfig' - - $ref: '#/components/schemas/Fp8QuantizationConfig' sampling_params: $ref: '#/components/schemas/SamplingParams' tool_prompt_format: @@ -216,10 +212,6 @@ components: type: array model: type: string - quantization_config: - oneOf: - - $ref: '#/components/schemas/Bf16QuantizationConfig' - - $ref: '#/components/schemas/Fp8QuantizationConfig' sampling_params: $ref: '#/components/schemas/SamplingParams' required: @@ -258,10 +250,6 @@ components: type: object model: type: string - quantization_config: - oneOf: - - $ref: '#/components/schemas/Bf16QuantizationConfig' - - $ref: '#/components/schemas/Fp8QuantizationConfig' sampling_params: $ref: '#/components/schemas/SamplingParams' required: @@ -278,15 +266,6 @@ components: required: - completion_message_batch type: object - Bf16QuantizationConfig: - additionalProperties: false - properties: - type: - const: bf16 - type: string - required: - - type - type: object BuiltinShield: enum: - llama_guard @@ -325,10 +304,6 @@ components: type: array model: type: string - quantization_config: - oneOf: - - $ref: '#/components/schemas/Bf16QuantizationConfig' - - $ref: '#/components/schemas/Fp8QuantizationConfig' sampling_params: $ref: '#/components/schemas/SamplingParams' stream: @@ -421,10 +396,6 @@ components: type: object model: type: string - quantization_config: - oneOf: - - $ref: '#/components/schemas/Bf16QuantizationConfig' - - $ref: '#/components/schemas/Fp8QuantizationConfig' sampling_params: $ref: '#/components/schemas/SamplingParams' stream: @@ -717,15 +688,6 @@ components: - qlora - dora type: string - Fp8QuantizationConfig: - additionalProperties: false - properties: - type: - const: fp8 - type: string - required: - - type - type: object InferenceStep: additionalProperties: false properties: @@ -1867,7 +1829,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-08-20 19:00:39.110138" + \ draft and subject to change.\n Generated at 2024-08-21 14:16:38.313950" title: '[DRAFT] Llama Stack Specification' version: 0.0.1 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema @@ -2871,30 +2833,24 @@ servers: - url: http://any-hosted-llama-stack.com tags: - name: RewardScoring -- name: AgenticSystem -- name: SyntheticDataGeneration -- name: Inference - name: Datasets - name: Observability +- name: AgenticSystem +- name: Inference +- name: Evaluations +- name: SyntheticDataGeneration - name: PostTraining - name: MemoryBanks -- name: Evaluations - description: name: Attachment - description: name: BatchChatCompletionRequest -- description: - name: Bf16QuantizationConfig - description: name: BuiltinTool - description: name: CompletionMessage -- description: - name: Fp8QuantizationConfig - description: name: SamplingParams - description: