forked from phoenix-oss/llama-stack-mirror
completion() for tgi (#295)
This commit is contained in:
parent
cb84034567
commit
3e1c3fdb3f
9 changed files with 173 additions and 35 deletions
|
@ -110,7 +110,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
|
|||
return await self._nonstream_completion(request)
|
||||
|
||||
def _get_params_for_completion(self, request: CompletionRequest) -> dict:
|
||||
sampling_options = get_sampling_options(request)
|
||||
sampling_options = get_sampling_options(request.sampling_params)
|
||||
# This is needed since the Ollama API expects num_predict to be set
|
||||
# for early truncation instead of max_tokens.
|
||||
if sampling_options["max_tokens"] is not None:
|
||||
|
@ -187,7 +187,7 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate):
|
|||
return {
|
||||
"model": OLLAMA_SUPPORTED_MODELS[request.model],
|
||||
"prompt": chat_completion_request_to_prompt(request, self.formatter),
|
||||
"options": get_sampling_options(request),
|
||||
"options": get_sampling_options(request.sampling_params),
|
||||
"raw": True,
|
||||
"stream": request.stream,
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue