[1/n] migrate inference/chat_completion

This commit is contained in:
Xi Yan 2024-09-11 12:21:19 -07:00
parent 1433aaf9f7
commit 0c7c6b7e02
3 changed files with 35 additions and 7 deletions

View file

@@ -176,7 +176,15 @@ class Inference(Protocol):
@webmethod(route="/inference/chat_completion") @webmethod(route="/inference/chat_completion")
async def chat_completion( async def chat_completion(
self, self,
request: ChatCompletionRequest, model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ... ) -> Union[ChatCompletionResponse, ChatCompletionResponseStreamChunk]: ...
@webmethod(route="/inference/embeddings") @webmethod(route="/inference/embeddings")

View file

@@ -10,10 +10,10 @@ from typing import Any, AsyncGenerator
import fire import fire
import httpx import httpx
from pydantic import BaseModel
from termcolor import cprint
from llama_toolchain.core.datatypes import RemoteProviderConfig from llama_toolchain.core.datatypes import RemoteProviderConfig
from pydantic import BaseModel
from termcolor import cprint
from .api import ( from .api import (
ChatCompletionRequest, ChatCompletionRequest,
@@ -52,9 +52,7 @@ class InferenceClient(Inference):
async with client.stream( async with client.stream(
"POST", "POST",
f"{self.base_url}/inference/chat_completion", f"{self.base_url}/inference/chat_completion",
json={ json=encodable_dict(request),
"request": encodable_dict(request),
},
headers={"Content-Type": "application/json"}, headers={"Content-Type": "application/json"},
timeout=20, timeout=20,
) as response: ) as response:

View file

@@ -22,9 +22,12 @@ from llama_toolchain.inference.api import (
ToolCallParseStatus, ToolCallParseStatus,
) )
from llama_toolchain.inference.prepare_messages import prepare_messages from llama_toolchain.inference.prepare_messages import prepare_messages
from .config import MetaReferenceImplConfig from .config import MetaReferenceImplConfig
from .model_parallel import LlamaModelParallelGenerator from .model_parallel import LlamaModelParallelGenerator
from llama_models.llama3.api.datatypes import * # noqa: F403
from llama_toolchain.inference.api import * # noqa: F403
# there's a single model parallel process running serving the model. for now, # there's a single model parallel process running serving the model. for now,
# we don't support multiple concurrent requests to this process. # we don't support multiple concurrent requests to this process.
@@ -50,10 +53,29 @@ class MetaReferenceInferenceImpl(Inference):
# hm, when stream=False, we should not be doing SSE :/ which is what the # hm, when stream=False, we should not be doing SSE :/ which is what the
# top-level server is going to do. make the typing more specific here # top-level server is going to do. make the typing more specific here
async def chat_completion( async def chat_completion(
self, request: ChatCompletionRequest self,
model: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = ToolPromptFormat.json,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> AsyncIterator[ ) -> AsyncIterator[
Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse] Union[ChatCompletionResponseStreamChunk, ChatCompletionResponse]
]: ]:
request = ChatCompletionRequest(
model=model,
messages=messages,
sampling_params=sampling_params,
tools=tools,
tool_choice=tool_choice,
tool_prompt_format=tool_prompt_format,
stream=stream,
logprobs=logprobs,
)
messages = prepare_messages(request) messages = prepare_messages(request)
model = resolve_model(request.model) model = resolve_model(request.model)
if model is None: if model is None: