mirror of https://github.com/meta-llama/llama-stack.git
commit ba0b620532 (parent aa04867d3a)

    all providers

10 changed files with 18 additions and 18 deletions
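All ten files below make the same mechanical change: the shared OpenAI-compat decoding helpers now receive the originating chat completion request as a third argument, after the response (or stream) and the formatter. A rough sketch of the resulting call pattern follows; the import path and the adapter scaffolding are assumptions for illustration, since only the changed call-site lines appear in this commit.

    # Illustrative sketch only. The import path and the ExampleAdapter scaffolding are
    # assumptions; only the two helper calls (with the new `request` argument) come
    # from the hunks below.
    from llama_stack.providers.utils.inference.openai_compat import (  # assumed path
        process_chat_completion_response,
        process_chat_completion_stream_response,
    )


    class ExampleAdapter:  # hypothetical stand-in for the provider adapters in this diff
        def __init__(self, client, formatter):
            self.client = client
            self.formatter = formatter

        async def _nonstream_chat_completion(self, request):
            params = await self._get_params(request)
            r = self.client.completions.create(**params)
            # `request` now rides along with the formatter, presumably so the helper can
            # consult request-level settings (e.g. tool configuration) while decoding.
            return process_chat_completion_response(r, self.formatter, request)

        async def _stream_chat_completion(self, request):
            params = await self._get_params(request)
            stream = await self.client.completions.create(**params)
            async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
                yield chunk

        async def _get_params(self, request):
            # provider-specific translation of the request into client kwargs
            raise NotImplementedError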
@@ -201,7 +201,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)
 
     async def _stream_chat_completion(
         self, request: ChatCompletionRequest, results_generator: AsyncGenerator
@@ -227,7 +227,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             )
 
         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def embeddings(self, model_id: str, contents: List[InterleavedContent]) -> EmbeddingsResponse:

@@ -134,7 +134,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
         )
 
         response = OpenAICompatCompletionResponse(choices=[choice])
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params_for_chat_completion(request)
@@ -152,7 +152,7 @@ class BedrockInferenceAdapter(ModelRegistryHelper, Inference):
                 yield OpenAICompatCompletionResponse(choices=[choice])
 
         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def _get_params_for_chat_completion(self, request: ChatCompletionRequest) -> Dict:

@@ -155,14 +155,14 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
 
         r = await self.client.completions.create(**params)
 
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)
 
     async def _stream_chat_completion(self, request: CompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
 
         stream = await self.client.completions.create(**params)
 
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:

@@ -112,7 +112,7 @@ class DatabricksInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = self._get_params(request)

@@ -304,7 +304,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -330,7 +330,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
             )
 
         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def embeddings(

@@ -99,7 +99,7 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
     ) -> ChatCompletionResponse:
         params = self._get_params(request)
         r = client.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = self._get_params(request)
@@ -110,7 +110,7 @@ class RunpodInferenceAdapter(ModelRegistryHelper, Inference):
                 yield chunk
 
         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     def _get_params(self, request: ChatCompletionRequest) -> dict:

@@ -159,7 +159,7 @@ class SambaNovaInferenceAdapter(ModelRegistryHelper, Inference):
                 yield chunk
 
         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def embeddings(

@@ -236,7 +236,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
         response = OpenAICompatCompletionResponse(
             choices=[choice],
         )
-        return process_chat_completion_response(response, self.formatter)
+        return process_chat_completion_response(response, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -252,7 +252,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
             )
 
         stream = _generate_and_convert_to_openai_compat()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def _get_params(self, request: ChatCompletionRequest) -> dict:

@@ -220,7 +220,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
             r = self._get_client().chat.completions.create(**params)
         else:
             r = self._get_client().completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -235,7 +235,7 @@ class TogetherInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProvi
                 yield chunk
 
         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:

@@ -142,7 +142,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
     ) -> ChatCompletionResponse:
         params = await self._get_params(request)
         r = client.chat.completions.create(**params)
-        return process_chat_completion_response(r, self.formatter)
+        return process_chat_completion_response(r, self.formatter, request)
 
     async def _stream_chat_completion(self, request: ChatCompletionRequest, client: OpenAI) -> AsyncGenerator:
         params = await self._get_params(request)
@@ -155,7 +155,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
                 yield chunk
 
         stream = _to_async_generator()
-        async for chunk in process_chat_completion_stream_response(stream, self.formatter):
+        async for chunk in process_chat_completion_stream_response(stream, self.formatter, request):
             yield chunk
 
     async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
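The hunks above only touch the call sites; the matching helper definitions live outside this commit. Inferring from the calls, the shared helpers would have to grow the same parameter, roughly along these lines (a hedged sketch, not the actual implementation):

    # Hedged sketch of the signatures implied by the call sites above; the real
    # definitions are not part of this diff.
    from typing import Any, AsyncGenerator


    def process_chat_completion_response(response: Any, formatter: Any, request: Any) -> Any:
        # `request` is the newly added parameter; decoding logic otherwise unchanged.
        ...


    async def process_chat_completion_stream_response(
        stream: AsyncGenerator, formatter: Any, request: Any
    ) -> AsyncGenerator:
        # Same addition for the streaming variant; decoded chunks are still yielded.
        async for chunk in stream:
            yield chunk  # the real helper converts chunks before yielding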