Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-03 09:21:45 +00:00)
updates
parent 0cfb2e2473
commit 73d927850e
4 changed files with 43 additions and 316 deletions
docs/_static/llama-stack-spec.html (vendored): 182 lines changed
@@ -128,49 +128,6 @@
                 }
             }
         },
-        "/v1/batch-inference/chat-completion-inline": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/BatchChatCompletionResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "BatchInference (Coming Soon)"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/BatchChatCompletionInlineRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/inference/batch-completion": {
             "post": {
                 "responses": {
@@ -214,49 +171,6 @@
                 }
             }
         },
-        "/v1/batch-inference/completion-inline": {
-            "post": {
-                "responses": {
-                    "200": {
-                        "description": "OK",
-                        "content": {
-                            "application/json": {
-                                "schema": {
-                                    "$ref": "#/components/schemas/BatchCompletionResponse"
-                                }
-                            }
-                        }
-                    },
-                    "400": {
-                        "$ref": "#/components/responses/BadRequest400"
-                    },
-                    "429": {
-                        "$ref": "#/components/responses/TooManyRequests429"
-                    },
-                    "500": {
-                        "$ref": "#/components/responses/InternalServerError500"
-                    },
-                    "default": {
-                        "$ref": "#/components/responses/DefaultError"
-                    }
-                },
-                "tags": [
-                    "BatchInference (Coming Soon)"
-                ],
-                "description": "",
-                "parameters": [],
-                "requestBody": {
-                    "content": {
-                        "application/json": {
-                            "schema": {
-                                "$ref": "#/components/schemas/BatchCompletionInlineRequest"
-                            }
-                        }
-                    },
-                    "required": true
-                }
-            }
-        },
         "/v1/post-training/job/cancel": {
             "post": {
                 "responses": {
@@ -325,7 +239,7 @@
                     }
                 },
                 "tags": [
-                    "Inference"
+                    "BatchInference (Coming Soon)"
                 ],
                 "description": "Generate a chat completion for the given messages using the specified model.",
                 "parameters": [],
@@ -373,7 +287,7 @@
                     }
                 },
                 "tags": [
-                    "Inference"
+                    "BatchInference (Coming Soon)"
                ],
                 "description": "Generate a completion for the given content using the specified model.",
                 "parameters": [],
@@ -4821,56 +4735,6 @@
                 "title": "TokenLogProbs",
                 "description": "Log probabilities for generated tokens."
             },
-            "BatchChatCompletionInlineRequest": {
-                "type": "object",
-                "properties": {
-                    "model": {
-                        "type": "string"
-                    },
-                    "messages_batch": {
-                        "type": "array",
-                        "items": {
-                            "type": "array",
-                            "items": {
-                                "$ref": "#/components/schemas/Message"
-                            }
-                        }
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams"
-                    },
-                    "tools": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/ToolDefinition"
-                        }
-                    },
-                    "tool_config": {
-                        "$ref": "#/components/schemas/ToolConfig"
-                    },
-                    "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat"
-                    },
-                    "logprobs": {
-                        "type": "object",
-                        "properties": {
-                            "top_k": {
-                                "type": "integer",
-                                "default": 0,
-                                "description": "How many tokens (for each position) to return log probabilities for."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "LogProbConfig"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "model",
-                    "messages_batch"
-                ],
-                "title": "BatchChatCompletionInlineRequest"
-            },
             "BatchCompletionRequest": {
                 "type": "object",
                 "properties": {
@@ -4963,44 +4827,6 @@
                 "title": "CompletionResponse",
                 "description": "Response from a completion request."
             },
-            "BatchCompletionInlineRequest": {
-                "type": "object",
-                "properties": {
-                    "model": {
-                        "type": "string"
-                    },
-                    "content_batch": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/InterleavedContent"
-                        }
-                    },
-                    "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams"
-                    },
-                    "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat"
-                    },
-                    "logprobs": {
-                        "type": "object",
-                        "properties": {
-                            "top_k": {
-                                "type": "integer",
-                                "default": 0,
-                                "description": "How many tokens (for each position) to return log probabilities for."
-                            }
-                        },
-                        "additionalProperties": false,
-                        "title": "LogProbConfig"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "model",
-                    "content_batch"
-                ],
-                "title": "BatchCompletionInlineRequest"
-            },
             "CancelTrainingJobRequest": {
                 "type": "object",
                 "properties": {
@@ -11331,7 +11157,9 @@
             "x-displayName": "Agents API for creating and interacting with agentic systems."
         },
         {
-            "name": "BatchInference (Coming Soon)"
+            "name": "BatchInference (Coming Soon)",
+            "description": "This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.\n\nNOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs\nincluding (post-training, evals, etc).",
+            "x-displayName": "Batch inference API for generating completions and chat completions."
         },
         {
             "name": "Benchmarks"
docs/_static/llama-stack-spec.yaml (vendored): 137 lines changed
@@ -69,35 +69,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchChatCompletionRequest'
         required: true
-  /v1/batch-inference/chat-completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchChatCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchChatCompletionInlineRequest'
-        required: true
   /v1/inference/batch-completion:
     post:
       responses:
@@ -127,35 +98,6 @@ paths:
             schema:
               $ref: '#/components/schemas/BatchCompletionRequest'
         required: true
-  /v1/batch-inference/completion-inline:
-    post:
-      responses:
-        '200':
-          description: OK
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/BatchCompletionResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - BatchInference (Coming Soon)
-      description: ''
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/BatchCompletionInlineRequest'
-        required: true
   /v1/post-training/job/cancel:
     post:
       responses:
@@ -206,7 +148,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
      description: >-
         Generate a chat completion for the given messages using the specified model.
       parameters: []
@@ -241,7 +183,7 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
       tags:
-        - Inference
+        - BatchInference (Coming Soon)
      description: >-
         Generate a completion for the given content using the specified model.
       parameters: []
@@ -3346,42 +3288,6 @@ components:
         - logprobs_by_token
       title: TokenLogProbs
       description: Log probabilities for generated tokens.
-    BatchChatCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        messages_batch:
-          type: array
-          items:
-            type: array
-            items:
-              $ref: '#/components/schemas/Message'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        tools:
-          type: array
-          items:
-            $ref: '#/components/schemas/ToolDefinition'
-        tool_config:
-          $ref: '#/components/schemas/ToolConfig'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - messages_batch
-      title: BatchChatCompletionInlineRequest
     BatchCompletionRequest:
       type: object
       properties:
@@ -3450,34 +3356,6 @@ components:
         - stop_reason
       title: CompletionResponse
       description: Response from a completion request.
-    BatchCompletionInlineRequest:
-      type: object
-      properties:
-        model:
-          type: string
-        content_batch:
-          type: array
-          items:
-            $ref: '#/components/schemas/InterleavedContent'
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          title: LogProbConfig
-      additionalProperties: false
-      required:
-        - model
-        - content_batch
-      title: BatchCompletionInlineRequest
     CancelTrainingJobRequest:
       type: object
       properties:
|
@ -7737,6 +7615,17 @@ tags:
|
||||||
x-displayName: >-
|
x-displayName: >-
|
||||||
Agents API for creating and interacting with agentic systems.
|
Agents API for creating and interacting with agentic systems.
|
||||||
- name: BatchInference (Coming Soon)
|
- name: BatchInference (Coming Soon)
|
||||||
|
description: >-
|
||||||
|
This is an asynchronous API. If the request is successful, the response will
|
||||||
|
be a job which can be polled for completion.
|
||||||
|
|
||||||
|
|
||||||
|
NOTE: This API is not yet implemented and is subject to change in concert with
|
||||||
|
other asynchronous APIs
|
||||||
|
|
||||||
|
including (post-training, evals, etc).
|
||||||
|
x-displayName: >-
|
||||||
|
Batch inference API for generating completions and chat completions.
|
||||||
- name: Benchmarks
|
- name: Benchmarks
|
||||||
- name: DatasetIO
|
- name: DatasetIO
|
||||||
- name: Datasets
|
- name: Datasets
|
||||||
|
|
|
@@ -6,40 +6,50 @@

 from typing import List, Optional, Protocol, runtime_checkable

+from llama_stack.apis.common.job_types import Job
 from llama_stack.apis.inference import (
-    BatchChatCompletionResponse,
-    BatchCompletionResponse,
     InterleavedContent,
     LogProbConfig,
     Message,
     ResponseFormat,
     SamplingParams,
-    ToolConfig,
+    ToolChoice,
     ToolDefinition,
+    ToolPromptFormat,
 )
 from llama_stack.schema_utils import webmethod


 @runtime_checkable
 class BatchInference(Protocol):
-    @webmethod(route="/batch-inference/completion-inline", method="POST")
-    async def batch_completion_inline(
+    """Batch inference API for generating completions and chat completions.
+
+    This is an asynchronous API. If the request is successful, the response will be a job which can be polled for completion.
+
+    NOTE: This API is not yet implemented and is subject to change in concert with other asynchronous APIs
+    including (post-training, evals, etc).
+    """
+
+    @webmethod(route="/batch-inference/completion", method="POST")
+    async def completion(
         self,
         model: str,
         content_batch: List[InterleavedContent],
         sampling_params: Optional[SamplingParams] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchCompletionResponse: ...
+    ) -> Job: ...

-    @webmethod(route="/batch-inference/chat-completion-inline", method="POST")
-    async def batch_chat_completion_inline(
+    @webmethod(route="/batch-inference/chat-completion", method="POST")
+    async def chat_completion(
         self,
         model: str,
         messages_batch: List[List[Message]],
         sampling_params: Optional[SamplingParams] = None,
-        tools: Optional[List[ToolDefinition]] = list,
-        tool_config: Optional[ToolConfig] = None,
+        # zero-shot tool definitions as input to the model
+        tools: Optional[List[ToolDefinition]] = None,
+        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
+        tool_prompt_format: Optional[ToolPromptFormat] = None,
         response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> BatchChatCompletionResponse: ...
+    ) -> Job: ...
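The reworked protocol above submits a batch and returns a Job handle instead of an inline batch response. A minimal, hypothetical usage sketch follows; the model identifier, the wait_for_job helper, and the way job status is queried are assumptions for illustration, not part of this commit:

# Hypothetical usage sketch for the async BatchInference protocol shown above.
# Submitting a batch returns a Job; results are retrieved later. wait_for_job is
# a placeholder, since this commit does not define how job status is queried.
import asyncio

from llama_stack.apis.common.job_types import Job
from llama_stack.apis.inference import SamplingParams, UserMessage


async def wait_for_job(job: Job) -> Job:
    # Placeholder: a real client would poll the jobs API until the job completes.
    await asyncio.sleep(1.0)
    return job


async def run_batch(batch_inference) -> Job:
    # batch_inference is any object implementing the BatchInference protocol.
    # One conversation per inner list; two single-turn conversations here.
    messages_batch = [
        [UserMessage(content="Summarize the Llama Stack APIs.")],
        [UserMessage(content="What is batch inference?")],
    ]
    job = await batch_inference.chat_completion(
        model="example-model-id",  # placeholder model identifier
        messages_batch=messages_batch,
        sampling_params=SamplingParams(max_tokens=128),
    )
    return await wait_for_job(job)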
@@ -5,7 +5,6 @@
 # the root directory of this source tree.

 import asyncio
-import logging
 import os
 from typing import AsyncGenerator, List, Optional, Union

@@ -44,6 +43,7 @@ from llama_stack.apis.inference import (
     UserMessage,
 )
 from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
@@ -72,7 +72,7 @@ from .config import MetaReferenceInferenceConfig
 from .generators import LlamaGenerator
 from .model_parallel import LlamaModelParallelGenerator

-log = logging.getLogger(__name__)
+log = get_logger(__name__, category="inference")
 # there's a single model parallel process running serving the model. for now,
 # we don't support multiple concurrent requests to this process.
 SEMAPHORE = asyncio.Semaphore(1)
@@ -159,7 +159,7 @@ class MetaReferenceInferenceImpl(
         self.model_id = model_id
         self.llama_model = llama_model

-        print("Warming up...")
+        log.info("Warming up...")
         await self.completion(
             model_id=model_id,
             content="Hello, world!",
@@ -170,7 +170,7 @@ class MetaReferenceInferenceImpl(
             messages=[UserMessage(content="Hi how are you?")],
             sampling_params=SamplingParams(max_tokens=20),
         )
-        print("Warmed up!")
+        log.info("Warmed up!")

     def check_model(self, request) -> None:
         if self.model_id is None or self.llama_model is None: