mirror of https://github.com/meta-llama/llama-stack.git
Remove request arg from chat completion response processing (#240)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
commit 80ada04f76
parent 209cd3d35e
7 changed files with 14 additions and 18 deletions
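For orientation: this change drops the unused ChatCompletionRequest argument from the two shared response-processing helpers in the OpenAI-compat inference utilities and updates every adapter call site to match. A minimal sketch of the new signatures, assembled from the hunks below (bodies elided):

def process_chat_completion_response(
    response: OpenAICompatCompletionResponse, formatter: ChatFormat
) -> ChatCompletionResponse:
    ...

async def process_chat_completion_stream_response(
    stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
) -> AsyncGenerator:
    ...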
@@ -91,7 +91,7 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: OpenAI
@@ -105,7 +105,7 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
 
         stream = _to_async_generator()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
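The Databricks streaming hunk above and the Together one below build their input with a local _to_async_generator() helper whose definition is outside this diff. As a hedged sketch only (the helper name, parameter, and body here are illustrative, not taken from the repository), such a helper wraps a provider's synchronous, chunk-yielding call in an async generator so that process_chat_completion_stream_response can iterate it with async for:

from typing import Any, AsyncGenerator, Iterable

def to_async_generator(chunks: Iterable[Any]) -> AsyncGenerator[Any, None]:
    # Wrap a blocking iterator of completion chunks as an async stream.
    async def _gen() -> AsyncGenerator[Any, None]:
        for chunk in chunks:  # blocking iteration, surfaced one chunk at a time
            yield chunk

    return _gen()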
@@ -94,7 +94,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = await client.completion.acreate(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: Fireworks
@@ -103,7 +103,7 @@ class FireworksInferenceAdapter(ModelRegistryHelper, Inference):
 
         stream = client.completion.acreate(**params)
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
@@ -143,7 +143,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(request, response, self.formatter)
+        return process_chat_completion_response(response, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest
@@ -163,7 +163,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
 
         stream = _generate_and_convert_to_openai_compat()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
@@ -116,7 +116,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(request, response, self.formatter)
+        return process_chat_completion_response(response, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest
@@ -135,7 +135,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
 
         stream = _generate_and_convert_to_openai_compat()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
@@ -108,7 +108,7 @@ class TogetherInferenceAdapter(
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(request, r, self.formatter)
+        return process_chat_completion_response(r, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, client: Together
@@ -123,7 +123,7 @@ class TogetherInferenceAdapter(
 
         stream = _to_async_generator()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
@@ -207,7 +207,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(request, response, self.formatter)
+        return process_chat_completion_response(response, self.formatter)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, results_generator: AsyncGenerator
@@ -229,7 +229,7 @@ class VLLMInferenceImpl(ModelRegistryHelper, Inference):
 
         stream = _generate_and_convert_to_openai_compat()
         async for chunk in process_chat_completion_stream_response(
-            request, stream, self.formatter
+            stream, self.formatter
         ):
             yield chunk
 
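The Ollama, TGI, and vLLM streaming hunks above rely on a local _generate_and_convert_to_openai_compat() helper that this diff does not show. As a hedged sketch only (the function and parameter names are illustrative, and the import path is an assumption about where the OpenAI-compat utilities live), its role is to turn provider-native chunks into OpenAICompatCompletionResponse objects so the shared stream processor edited below can consume them:

from typing import Any, AsyncGenerator, AsyncIterable, Callable

# Assumed import path for the utilities module edited in the last two hunks.
from llama_stack.providers.utils.inference.openai_compat import (
    OpenAICompatCompletionResponse,
)

async def generate_and_convert(
    native_chunks: AsyncIterable[Any],
    make_choice: Callable[[Any], Any],
) -> AsyncGenerator[OpenAICompatCompletionResponse, None]:
    # make_choice is adapter-specific: it extracts text / finish_reason from a
    # provider-native chunk and returns an OpenAI-compat choice object.
    async for chunk in native_chunks:
        yield OpenAICompatCompletionResponse(choices=[make_choice(chunk)])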
@@ -50,9 +50,7 @@ def text_from_choice(choice) -> str:
 
 
 def process_chat_completion_response(
-    request: ChatCompletionRequest,
-    response: OpenAICompatCompletionResponse,
-    formatter: ChatFormat,
+    response: OpenAICompatCompletionResponse, formatter: ChatFormat
 ) -> ChatCompletionResponse:
     choice = response.choices[0]
 
@@ -78,9 +76,7 @@ def process_chat_completion_response(
 
 
 async def process_chat_completion_stream_response(
-    request: ChatCompletionRequest,
-    stream: AsyncGenerator[OpenAICompatCompletionResponse, None],
-    formatter: ChatFormat,
+    stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat
 ) -> AsyncGenerator:
     yield ChatCompletionResponseStreamChunk(
         event=ChatCompletionResponseEvent(