Fix conversion to RawMessage everywhere

This commit is contained in:
Ashwin Bharambe 2024-12-17 13:38:01 -08:00
parent fbca51d6da
commit b7a7caa9a8
11 changed files with 87 additions and 78 deletions

View file

@ -94,14 +94,14 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
async def _nonstream_completion(
self, request: CompletionRequest
) -> CompletionResponse:
params = self._get_params(request)
params = await self._get_params(request)
r = await self.client.completions.create(**params)
return process_completion_response(r, self.formatter)
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
params = self._get_params(request)
params = await self._get_params(request)
stream = await self.client.completions.create(**params)
@ -141,7 +141,7 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
async def _nonstream_chat_completion(
self, request: CompletionRequest
) -> CompletionResponse:
params = self._get_params(request)
params = await self._get_params(request)
r = await self.client.completions.create(**params)
@ -150,7 +150,7 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
async def _stream_chat_completion(
self, request: CompletionRequest
) -> AsyncGenerator:
params = self._get_params(request)
params = await self._get_params(request)
stream = await self.client.completions.create(**params)
@ -159,7 +159,7 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
):
yield chunk
def _get_params(
async def _get_params(
self, request: Union[ChatCompletionRequest, CompletionRequest]
) -> dict:
if request.sampling_params and request.sampling_params.top_k:
@ -167,11 +167,11 @@ class CerebrasInferenceAdapter(ModelRegistryHelper, Inference):
prompt = ""
if isinstance(request, ChatCompletionRequest):
prompt = chat_completion_request_to_prompt(
prompt = await chat_completion_request_to_prompt(
request, self.get_llama_model(request.model), self.formatter
)
elif isinstance(request, CompletionRequest):
prompt = completion_request_to_prompt(request, self.formatter)
prompt = await completion_request_to_prompt(request, self.formatter)
else:
raise ValueError(f"Unknown request type {type(request)}")

View file

@ -241,14 +241,16 @@ class FireworksInferenceAdapter(
await convert_message_to_openai_dict(m) for m in request.messages
]
else:
input_dict["prompt"] = chat_completion_request_to_prompt(
input_dict["prompt"] = await chat_completion_request_to_prompt(
request, self.get_llama_model(request.model), self.formatter
)
else:
assert (
not media_present
), "Fireworks does not support media for Completion requests"
input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
input_dict["prompt"] = await completion_request_to_prompt(
request, self.formatter
)
# Fireworks always prepends with BOS
if "prompt" in input_dict:

View file

@ -243,7 +243,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
]
else:
input_dict["raw"] = True
input_dict["prompt"] = chat_completion_request_to_prompt(
input_dict["prompt"] = await chat_completion_request_to_prompt(
request,
self.register_helper.get_llama_model(request.model),
self.formatter,
@ -252,7 +252,9 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
assert (
not media_present
), "Ollama does not support media for Completion requests"
input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
input_dict["prompt"] = await completion_request_to_prompt(
request, self.formatter
)
input_dict["raw"] = True
return {

View file

@ -130,8 +130,8 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
return options
def _get_params_for_completion(self, request: CompletionRequest) -> dict:
prompt, input_tokens = completion_request_to_prompt_model_input_info(
async def _get_params_for_completion(self, request: CompletionRequest) -> dict:
prompt, input_tokens = await completion_request_to_prompt_model_input_info(
request, self.formatter
)
@ -147,7 +147,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
)
async def _stream_completion(self, request: CompletionRequest) -> AsyncGenerator:
params = self._get_params_for_completion(request)
params = await self._get_params_for_completion(request)
async def _generate_and_convert_to_openai_compat():
s = await self.client.text_generation(**params)
@ -169,7 +169,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
yield chunk
async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
params = self._get_params_for_completion(request)
params = await self._get_params_for_completion(request)
r = await self.client.text_generation(**params)
choice = OpenAICompatCompletionChoice(
@ -216,7 +216,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
async def _nonstream_chat_completion(
self, request: ChatCompletionRequest
) -> ChatCompletionResponse:
params = self._get_params(request)
params = await self._get_params(request)
r = await self.client.text_generation(**params)
choice = OpenAICompatCompletionChoice(
@ -231,7 +231,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
async def _stream_chat_completion(
self, request: ChatCompletionRequest
) -> AsyncGenerator:
params = self._get_params(request)
params = await self._get_params(request)
async def _generate_and_convert_to_openai_compat():
s = await self.client.text_generation(**params)
@ -249,8 +249,8 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
):
yield chunk
def _get_params(self, request: ChatCompletionRequest) -> dict:
prompt, input_tokens = chat_completion_request_to_model_input_info(
async def _get_params(self, request: ChatCompletionRequest) -> dict:
prompt, input_tokens = await chat_completion_request_to_model_input_info(
request, self.register_helper.get_llama_model(request.model), self.formatter
)
return dict(

View file

@ -233,14 +233,16 @@ class TogetherInferenceAdapter(
await convert_message_to_openai_dict(m) for m in request.messages
]
else:
input_dict["prompt"] = chat_completion_request_to_prompt(
input_dict["prompt"] = await chat_completion_request_to_prompt(
request, self.get_llama_model(request.model), self.formatter
)
else:
assert (
not media_present
), "Together does not support media for Completion requests"
input_dict["prompt"] = completion_request_to_prompt(request, self.formatter)
input_dict["prompt"] = await completion_request_to_prompt(
request, self.formatter
)
return {
"model": request.model,

View file

@ -77,7 +77,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, CompletionResponseStreamChunk]:
raise NotImplementedError()
raise NotImplementedError("Completion not implemented for vLLM")
async def chat_completion(
self,
@ -167,7 +167,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
for m in request.messages
]
else:
input_dict["prompt"] = chat_completion_request_to_prompt(
input_dict["prompt"] = await chat_completion_request_to_prompt(
request,
self.register_helper.get_llama_model(request.model),
self.formatter,
@ -176,7 +176,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
assert (
not media_present
), "Together does not support media for Completion requests"
input_dict["prompt"] = completion_request_to_prompt(
input_dict["prompt"] = await completion_request_to_prompt(
request,
self.register_helper.get_llama_model(request.model),
self.formatter,