style(test_completion.py): fix merge conflict

Krrish Dholakia 2023-10-05 22:09:38 -07:00
parent 396d9d8e38
commit dd7e397650
22 changed files with 1535 additions and 250 deletions


@ -78,6 +78,6 @@ This list is constantly being updated.
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|Petals| ✅ | ✅ | | ✅ | | | | | | |
|Petals| ✅ | ✅ | | ✅ | | | | | | |
By default, LiteLLM raises an exception if a param being passed in isn't supported by the provider. If you would rather silently drop the param instead of raising an exception, set `litellm.drop_params = True`.
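
A minimal sketch of the `litellm.drop_params` behavior described above. The model name and the unsupported parameter below are illustrative assumptions, and the relevant provider API key is assumed to be set in the environment.

import litellm
from litellm import completion

# By default, passing a param the provider doesn't support raises an exception.
# With drop_params enabled, LiteLLM silently removes it before making the call.
litellm.drop_params = True

response = completion(
    model="command-nightly",  # illustrative Cohere model name
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    functions=[{"name": "get_current_weather", "parameters": {}}],  # not supported by this endpoint
)
print(response)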


@ -311,6 +311,19 @@ from .utils import (
get_llm_provider,
completion_with_config,
)
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.replicate import ReplicateConfig
from .llms.cohere import CohereConfig
from .llms.ai21 import AI21Config
from .llms.together_ai import TogetherAIConfig
from .llms.palm import PalmConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
from .llms.vertex_ai import VertexAIConfig
from .llms.sagemaker import SagemakerConfig
from .llms.bedrock import AmazonConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (


@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm
class AI21Error(Exception):
def __init__(self, status_code, message):
@ -14,6 +15,68 @@ class AI21Error(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class AI21Config():
"""
Reference: https://docs.ai21.com/reference/j2-complete-ref
The class `AI21Config` provides configuration for AI21's API interface. Below are the parameters:
- `numResults` (int32): Number of completions to sample and return. Optional, default is 1. If the temperature is greater than 0 (non-greedy decoding), a value greater than 1 can be meaningful.
- `maxTokens` (int32): The maximum number of tokens to generate per result. Optional, default is 16. If no `stopSequences` are given, generation stops after producing `maxTokens`.
- `minTokens` (int32): The minimum number of tokens to generate per result. Optional, default is 0. If `stopSequences` are given, they are ignored until `minTokens` are generated.
- `temperature` (float): Modifies the distribution from which tokens are sampled. Optional, default is 0.7. A value of 0 essentially disables sampling and results in greedy decoding.
- `topP` (float): Used for sampling tokens from the corresponding top percentile of probability mass. Optional, default is 1. For instance, a value of 0.9 considers only tokens comprising the top 90% probability mass.
- `stopSequences` (array of strings): Stops decoding if any of the input strings is generated. Optional.
- `topKReturn` (int32): Range between 0 and 10, inclusive. Optional, default is 0. Specifies the top-K alternative tokens to return. A non-zero value includes the string representations and log-probabilities for each of the top-K alternatives at each position.
- `frequencyPenalty` (object): Placeholder for frequency penalty object.
- `presencePenalty` (object): Placeholder for presence penalty object.
- `countPenalty` (object): Placeholder for count penalty object.
"""
numResults: Optional[int]=None
maxTokens: Optional[int]=None
minTokens: Optional[int]=None
temperature: Optional[float]=None
topP: Optional[float]=None
stopSequences: Optional[list]=None
topKReturn: Optional[int]=None
frequencyPenalty: Optional[dict]=None
presencePenalty: Optional[dict]=None
countPenalty: Optional[dict]=None
def __init__(self,
numResults: Optional[int]=None,
maxTokens: Optional[int]=None,
minTokens: Optional[int]=None,
temperature: Optional[float]=None,
topP: Optional[float]=None,
stopSequences: Optional[list]=None,
topKReturn: Optional[int]=None,
frequencyPenalty: Optional[dict]=None,
presencePenalty: Optional[dict]=None,
countPenalty: Optional[dict]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
if api_key is None:
raise ValueError(
@ -53,6 +116,13 @@ def completion(
)
else:
prompt += f"{message['content']}"
## Load Config
config = litellm.AI21Config.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > ai21_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
data = {
"prompt": prompt,
# "instruction": prompt, # some baseten models require the prompt to be passed in via the 'instruction' kwarg


@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse
class AlephAlphaError(Exception):
@ -14,6 +15,139 @@ class AlephAlphaError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class AlephAlphaConfig():
"""
Reference: https://docs.aleph-alpha.com/api/complete/
The `AlephAlphaConfig` class represents the configuration for the Aleph Alpha API. Here are the properties:
- `maximum_tokens` (integer, required): The maximum number of tokens to be generated by the completion. The sum of input tokens and maximum tokens may not exceed 2048.
- `minimum_tokens` (integer, optional; default value: 0): Generate at least this number of tokens before an end-of-text token is generated.
- `echo` (boolean, optional; default value: false): Whether to echo the prompt in the completion.
- `temperature` (number, nullable; default value: 0): Adjusts how creatively the model generates outputs. Use combinations of temperature, top_k, and top_p sensibly.
- `top_k` (integer, nullable; default value: 0): Introduces randomness into token generation by considering the top k most likely options.
- `top_p` (number, nullable; default value: 0): Adds randomness by considering the smallest set of tokens whose cumulative probability exceeds top_p.
- `presence_penalty`, `frequency_penalty`, `sequence_penalty` (number, nullable; default value: 0): Various penalties that can reduce repetition.
- `sequence_penalty_min_length` (integer; default value: 2): Minimum number of tokens to be considered as a sequence.
- `repetition_penalties_include_prompt`, `repetition_penalties_include_completion`, `use_multiplicative_presence_penalty`, `use_multiplicative_frequency_penalty`, `use_multiplicative_sequence_penalty` (boolean, nullable; default value: false): Various settings that adjust how the repetition penalties are applied.
- `penalty_bias` (string, nullable): Text used in addition to the penalized tokens for repetition penalties.
- `penalty_exceptions` (string[], nullable): Strings that may be generated without penalty.
- `penalty_exceptions_include_stop_sequences` (boolean, nullable; default value: true): Include all stop_sequences in penalty_exceptions.
- `best_of` (integer, nullable; default value: 1): The number of completions to generate on the server side.
- `n` (integer, nullable; default value: 1): The number of completions to return.
- `logit_bias` (object, nullable): Adjust the logit scores before sampling.
- `log_probs` (integer, nullable): Number of top log probabilities for each token generated.
- `stop_sequences` (string[], nullable): List of strings that will stop generation if they're generated.
- `tokens` (boolean, nullable; default value: false): Flag indicating whether individual tokens of the completion should be returned or not.
- `raw_completion` (boolean; default value: false): if True, the raw completion of the model will be returned.
- `disable_optimizations` (boolean, nullable; default value: false): Disables any applied optimizations to both your prompt and completion.
- `completion_bias_inclusion`, `completion_bias_exclusion` (string[], default value: []): Set of strings to bias the generation of tokens.
- `completion_bias_inclusion_first_token_only`, `completion_bias_exclusion_first_token_only` (boolean; default value: false): Consider only the first token for the completion_bias_inclusion/exclusion.
- `contextual_control_threshold` (number, nullable): Control over how similar tokens are controlled.
- `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
"""
maximum_tokens: Optional[int]=litellm.max_tokens # aleph alpha requires max tokens
minimum_tokens: Optional[int]=None
echo: Optional[bool]=None
temperature: Optional[int]=None
top_k: Optional[int]=None
top_p: Optional[int]=None
presence_penalty: Optional[int]=None
frequency_penalty: Optional[int]=None
sequence_penalty: Optional[int]=None
sequence_penalty_min_length: Optional[int]=None
repetition_penalties_include_prompt: Optional[bool]=None
repetition_penalties_include_completion: Optional[bool]=None
use_multiplicative_presence_penalty: Optional[bool]=None
use_multiplicative_frequency_penalty: Optional[bool]=None
use_multiplicative_sequence_penalty: Optional[bool]=None
penalty_bias: Optional[str]=None
penalty_exceptions_include_stop_sequences: Optional[bool]=None
best_of: Optional[int]=None
n: Optional[int]=None
logit_bias: Optional[dict]=None
log_probs: Optional[int]=None
stop_sequences: Optional[list]=None
tokens: Optional[bool]=None
raw_completion: Optional[bool]=None
disable_optimizations: Optional[bool]=None
completion_bias_inclusion: Optional[list]=None
completion_bias_exclusion: Optional[list]=None
completion_bias_inclusion_first_token_only: Optional[bool]=None
completion_bias_exclusion_first_token_only: Optional[bool]=None
contextual_control_threshold: Optional[int]=None
control_log_additive: Optional[bool]=None
def __init__(self,
maximum_tokens: Optional[int]=None,
minimum_tokens: Optional[int]=None,
echo: Optional[bool]=None,
temperature: Optional[int]=None,
top_k: Optional[int]=None,
top_p: Optional[int]=None,
presence_penalty: Optional[int]=None,
frequency_penalty: Optional[int]=None,
sequence_penalty: Optional[int]=None,
sequence_penalty_min_length: Optional[int]=None,
repetition_penalties_include_prompt: Optional[bool]=None,
repetition_penalties_include_completion: Optional[bool]=None,
use_multiplicative_presence_penalty: Optional[bool]=None,
use_multiplicative_frequency_penalty: Optional[bool]=None,
use_multiplicative_sequence_penalty: Optional[bool]=None,
penalty_bias: Optional[str]=None,
penalty_exceptions_include_stop_sequences: Optional[bool]=None,
best_of: Optional[int]=None,
n: Optional[int]=None,
logit_bias: Optional[dict]=None,
log_probs: Optional[int]=None,
stop_sequences: Optional[list]=None,
tokens: Optional[bool]=None,
raw_completion: Optional[bool]=None,
disable_optimizations: Optional[bool]=None,
completion_bias_inclusion: Optional[list]=None,
completion_bias_exclusion: Optional[list]=None,
completion_bias_inclusion_first_token_only: Optional[bool]=None,
completion_bias_exclusion_first_token_only: Optional[bool]=None,
contextual_control_threshold: Optional[int]=None,
control_log_additive: Optional[bool]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
headers = {
"accept": "application/json",
@ -37,6 +171,13 @@ def completion(
default_max_tokens_to_sample=None,
):
headers = validate_environment(api_key)
## Load Config
config = litellm.AlephAlphaConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > aleph_alpha_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
completion_url = "https://api.aleph-alpha.com/complete"
model = model
prompt = ""
@ -65,7 +206,6 @@ def completion(
data = {
"model": model,
"prompt": prompt,
"maximum_tokens": optional_params["maximum_tokens"] if "maximum_tokens" in optional_params else default_max_tokens_to_sample, # required input
**optional_params,
}
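
Unlike the other configs in this commit, `AlephAlphaConfig.maximum_tokens` is pre-seeded from `litellm.max_tokens` because the /complete endpoint requires a max token value; that is why the explicit `maximum_tokens` entry in the request body above could be dropped. A hedged sketch (the values shown are assumptions):

import litellm

# maximum_tokens is always present in the config, seeded from litellm.max_tokens.
print(litellm.max_tokens)                     # library-wide default (256 at the time of this commit)
print(litellm.AlephAlphaConfig.get_config())  # includes 'maximum_tokens'

# Raise the default for all subsequent Aleph Alpha calls.
litellm.AlephAlphaConfig(maximum_tokens=512)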


@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm
class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman:"
@ -18,11 +19,38 @@ class AnthropicError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class AnthropicConfig():
"""
Reference: https://docs.anthropic.com/claude/reference/complete_post
# contains any default values we need to pass to the provider
AnthropicConfig = {
"max_tokens_to_sample": 256 # override by setting - completion(..,max_tokens=300)
}
To pass metadata to Anthropic, use e.g. {"user_id": "any-relevant-information"}.
"""
max_tokens_to_sample: Optional[int]=256 # anthropic requires a default
stop_sequences: Optional[list[str]]=None
temperature: Optional[int]=None
top_p: Optional[int]=None
top_k: Optional[int]=None
metadata: Optional[dict]=None
def __init__(self,
max_tokens_to_sample: Optional[int]=256, # anthropic requires a default
stop_sequences: Optional[list[str]]=None,
temperature: Optional[int]=None,
top_p: Optional[int]=None,
top_k: Optional[int]=None,
metadata: Optional[dict]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
# makes headers for API call
@ -72,11 +100,11 @@ def completion(
prompt += f"{AnthropicConstants.AI_PROMPT.value}"
## Load Config
for k, v in AnthropicConfig.items():
if k not in optional_params:
config = litellm.AnthropicConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
if optional_params["max_tokens_to_sample"] != 256: # not default - print for testing
print_verbose(f"LiteLLM.Anthropic: Max Tokens Set")
data = {
"model": model,
"prompt": prompt,


@ -1,7 +1,8 @@
import json, copy
import json, copy, types
from enum import Enum
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, get_secret
class BedrockError(Exception):
@ -12,6 +13,38 @@ class BedrockError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class AmazonConfig():
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1
Supported Params for the Amazon Titan models:
- `maxTokenCount` (integer): max tokens
- `stopSequences` (string[]): list of stop sequence strings
- `temperature` (float): temperature for the model
- `topP` (int): top p for the model
"""
maxTokenCount: Optional[int]=None
stopSequences: Optional[list]=None
temperature: Optional[float]=None
topP: Optional[int]=None
def __init__(self,
maxTokenCount: Optional[int]=None,
stopSequences: Optional[list]=None,
temperature: Optional[float]=None,
topP: Optional[int]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
class AnthropicConstants(Enum):
HUMAN_PROMPT = "\n\nHuman:"
@ -100,22 +133,52 @@ def completion(
prompt = convert_messages_to_prompt(messages, provider)
inference_params = copy.deepcopy(optional_params)
stream = inference_params.pop("stream", False)
print(f"bedrock provider: {provider}")
if provider == "anthropic":
## LOAD CONFIG
config = litellm.AnthropicConfig.get_config()
for k, v in config.items():
if k not in inference_params: # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({
"prompt": prompt,
**inference_params
})
elif provider == "ai21":
## LOAD CONFIG
config = litellm.AI21Config.get_config()
for k, v in config.items():
if k not in inference_params: # completion(top_k=3) > ai21_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({
"prompt": prompt,
**inference_params
})
elif provider == "cohere":
## LOAD CONFIG
config = litellm.CohereConfig.get_config()
for k, v in config.items():
if k not in inference_params: # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
data = json.dumps({
"prompt": prompt,
**inference_params
})
elif provider == "amazon": # amazon titan
## LOAD CONFIG
config = litellm.AmazonConfig.get_config()
for k, v in config.items():
if k not in inference_params: # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
else: # amazon titan
data = json.dumps({
"inputText": prompt,
"textGenerationConfig": inference_params,
})
## LOGGING
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
@ -147,7 +210,7 @@ def completion(
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response,
original_response=response_body,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response}")
@ -158,6 +221,8 @@ def completion(
elif provider == "anthropic":
outputText = response_body['completion']
model_response["finish_reason"] = response_body["stop_reason"]
elif provider == "cohere":
outputText = response_body["generations"][0]["text"]
else: # amazon titan
outputText = response_body.get('results')[0].get('outputText')
if "error" in outputText:


@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm
class CohereError(Exception):
def __init__(self, status_code, message):
@ -14,6 +15,79 @@ class CohereError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class CohereConfig():
"""
Reference: https://docs.cohere.com/reference/generate
The class `CohereConfig` provides configuration for Cohere's API interface. Below are the parameters:
- `num_generations` (integer): Maximum number of generations returned. Default is 1, with a minimum value of 1 and a maximum value of 5.
- `max_tokens` (integer): Maximum number of tokens the model will generate as part of the response. Default value is 20.
- `truncate` (string): Specifies how the API handles inputs longer than maximum token length. Options include NONE, START, END. Default is END.
- `temperature` (number): A non-negative float controlling the randomness in generation. Lower temperatures result in less random generations. Default is 0.75.
- `preset` (string): Identifier of a custom preset, a combination of parameters such as prompt, temperature etc.
- `end_sequences` (array of strings): The generated text gets cut at the beginning of the earliest occurrence of an end sequence, which will be excluded from the text.
- `stop_sequences` (array of strings): The generated text gets cut at the end of the earliest occurrence of a stop sequence, which will be included in the text.
- `k` (integer): Limits generation at each step to top `k` most likely tokens. Default is 0.
- `p` (number): Limits generation at each step to most likely tokens with total probability mass of `p`. Default is 0.
- `frequency_penalty` (number): Reduces repetitiveness of generated tokens. Higher values apply stronger penalties to previously occurred tokens.
- `presence_penalty` (number): Reduces repetitiveness of generated tokens. Similar to frequency_penalty, but this penalty applies equally to all tokens that have already appeared.
- `return_likelihoods` (string): Specifies how and if token likelihoods are returned with the response. Options include GENERATION, ALL and NONE.
- `logit_bias` (object): Used to prevent the model from generating unwanted tokens or to incentivize it to include desired tokens. e.g. {"hello_world": 1233}
"""
num_generations: Optional[int]=None
max_tokens: Optional[int]=None
truncate: Optional[str]=None
temperature: Optional[int]=None
preset: Optional[str]=None
end_sequences: Optional[list]=None
stop_sequences: Optional[list]=None
k: Optional[int]=None
p: Optional[int]=None
frequency_penalty: Optional[int]=None
presence_penalty: Optional[int]=None
return_likelihoods: Optional[str]=None
logit_bias: Optional[dict]=None
def __init__(self,
num_generations: Optional[int]=None,
max_tokens: Optional[int]=None,
truncate: Optional[str]=None,
temperature: Optional[int]=None,
preset: Optional[str]=None,
end_sequences: Optional[list]=None,
stop_sequences: Optional[list]=None,
k: Optional[int]=None,
p: Optional[int]=None,
frequency_penalty: Optional[int]=None,
presence_penalty: Optional[int]=None,
return_likelihoods: Optional[str]=None,
logit_bias: Optional[dict]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
headers = {
"accept": "application/json",
@ -39,6 +113,13 @@ def completion(
completion_url = "https://api.cohere.ai/v1/generate"
model = model
prompt = " ".join(message["content"] for message in messages)
## Load Config
config=litellm.CohereConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
data = {
"model": model,
"prompt": prompt,


@ -1,9 +1,10 @@
## Uses the huggingface text generation inference API
import os, copy
import os, copy, types
import json
from enum import Enum
import requests
import time
import litellm
from typing import Callable
from litellm.utils import ModelResponse, Choices, Message
from typing import Optional
@ -17,11 +18,52 @@ class HuggingfaceError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
# contains any default values we need to pass to the provider
HuggingfaceConfig = {
"return_full_text": False, # override by setting - completion(..,return_full_text=True)
"details": True # needed for getting logprobs etc. for tgi models. override by setting - completion(..., details=False)
}
class HuggingfaceConfig():
"""
Reference: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/compat_generate
"""
best_of: Optional[int] = None
decoder_input_details: Optional[bool] = None
details: Optional[bool] = True # enables returning logprobs + best of
max_new_tokens: Optional[int] = None
repetition_penalty: Optional[float] = None
return_full_text: Optional[bool] = False # by default don't return the input as part of the output
seed: Optional[int] = None
temperature: Optional[float] = None
top_k: Optional[int] = None
top_n_tokens: Optional[int] = None
top_p: Optional[int] = None
truncate: Optional[int] = None
typical_p: Optional[float] = None
watermark: Optional[bool] = None
def __init__(self,
best_of: Optional[int] = None,
decoder_input_details: Optional[bool] = None,
details: Optional[bool] = None,
max_new_tokens: Optional[int] = None,
repetition_penalty: Optional[float] = None,
return_full_text: Optional[bool] = None,
seed: Optional[int] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_n_tokens: Optional[int] = None,
top_p: Optional[int] = None,
truncate: Optional[int] = None,
typical_p: Optional[float] = None,
watermark: Optional[bool] = None
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
headers = {
@ -74,8 +116,10 @@ def get_hf_task_for_model(model):
return "text-generation-inference"
elif model in conversational_models:
return "conversational"
else:
elif "roneneldan/TinyStories" in model:
return None
else:
return "text-generation-inference" # default to tgi
def completion(
model: str,
@ -108,8 +152,9 @@ def completion(
completion_url = f"https://api-inference.huggingface.co/models/{model}"
## Load Config
for k, v in HuggingfaceConfig.items():
if k not in optional_params:
config=litellm.HuggingfaceConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
### MAP INPUT PARAMS
@ -149,19 +194,11 @@ def completion(
)
else:
prompt = prompt_factory(model=model, messages=messages)
if "https://api-inference.huggingface.co/models" in completion_url:
inference_params = copy.deepcopy(optional_params)
data = {
"inputs": prompt,
"parameters": inference_params,
"stream": True if "stream" in inference_params and inference_params["stream"] == True else False,
}
else:
data = {
"inputs": prompt,
"parameters": optional_params,
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
}
data = {
"inputs": prompt,
"parameters": optional_params,
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
}
input_text = prompt
else:
# Non TGI and Conversational llms
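
With the dict replaced by `HuggingfaceConfig`, `details=True` and `return_full_text=False` become the defaults for TGI calls, and unrecognized models now fall back to text-generation-inference instead of returning None. A hedged sketch of overriding one default globally (model name assumed, Hugging Face API key assumed to be set in the environment):

import litellm
from litellm import completion

# Echo the prompt back in the completion for all Hugging Face TGI calls.
litellm.HuggingfaceConfig(return_full_text=True)

response = completion(
    model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",  # assumed TGI-hosted model
    messages=[{"role": "user", "content": "Hello"}],
)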


@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse
class NLPCloudError(Exception):
@ -14,6 +15,75 @@ class NLPCloudError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class NLPCloudConfig():
"""
Reference: https://docs.nlpcloud.com/#generation
- `max_length` (int): Optional. The maximum number of tokens that the generated text should contain.
- `length_no_input` (boolean): Optional. Whether `min_length` and `max_length` should not include the length of the input text.
- `end_sequence` (string): Optional. A specific token that should be the end of the generated sequence.
- `remove_end_sequence` (boolean): Optional. Whether to remove the `end_sequence` string from the result.
- `remove_input` (boolean): Optional. Whether to remove the input text from the result.
- `bad_words` (list of strings): Optional. List of tokens that are not allowed to be generated.
- `temperature` (float): Optional. Temperature sampling. It modulates the next token probabilities.
- `top_p` (float): Optional. Top P sampling. Below 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
- `top_k` (int): Optional. Top K sampling. The number of highest probability vocabulary tokens to keep for top k filtering.
- `repetition_penalty` (float): Optional. Prevents the same word from being repeated too many times.
- `num_beams` (int): Optional. Number of beams for beam search.
- `num_return_sequences` (int): Optional. The number of independently computed returned sequences.
"""
max_length: Optional[int]=None
length_no_input: Optional[bool]=None
end_sequence: Optional[str]=None
remove_end_sequence: Optional[bool]=None
remove_input: Optional[bool]=None
bad_words: Optional[list]=None
temperature: Optional[float]=None
top_p: Optional[float]=None
top_k: Optional[int]=None
repetition_penalty: Optional[float]=None
num_beams: Optional[int]=None
num_return_sequences: Optional[int]=None
def __init__(self,
max_length: Optional[int]=None,
length_no_input: Optional[bool]=None,
end_sequence: Optional[str]=None,
remove_end_sequence: Optional[bool]=None,
remove_input: Optional[bool]=None,
bad_words: Optional[list]=None,
temperature: Optional[float]=None,
top_p: Optional[float]=None,
top_k: Optional[int]=None,
repetition_penalty: Optional[float]=None,
num_beams: Optional[int]=None,
num_return_sequences: Optional[int]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
headers = {
"accept": "application/json",
@ -37,6 +107,13 @@ def completion(
default_max_tokens_to_sample=None,
):
headers = validate_environment(api_key)
## Load Config
config = litellm.NLPCloudConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > nlp_cloud_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
completion_url_fragment_1 = "https://api.nlpcloud.io/v1/gpu/"
completion_url_fragment_2 = "/generation"
model = model


@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse, get_secret
import litellm
import sys
class PalmError(Exception):
@ -14,6 +15,57 @@ class PalmError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class PalmConfig():
"""
Reference: https://developers.generativeai.google/api/python/google/generativeai/chat
The class `PalmConfig` provides configuration for Palm's API interface. Here are the parameters:
- `context` (string): Text that should be provided to the model first, to ground the response. This could be a prompt to guide the model's responses.
- `examples` (list): Examples of what the model should generate. They are treated identically to conversation messages except that they take precedence over the history in messages if the total input size exceeds the model's input_token_limit.
- `temperature` (float): Controls the randomness of the output. Must be positive. Higher values produce a more random and varied response. A temperature of zero will be deterministic.
- `candidate_count` (int): Maximum number of generated response messages to return. This value must be between [1, 8], inclusive. Only unique candidates are returned.
- `top_k` (int): The API uses combined nucleus and top-k sampling. `top_k` sets the maximum number of tokens to sample from on each step.
- `top_p` (float): The API uses combined nucleus and top-k sampling. `top_p` configures the nucleus sampling. It sets the maximum cumulative probability of tokens to sample from.
- `maxOutputTokens` (int): Sets the maximum number of tokens to be returned in the output
"""
context: Optional[str]=None
examples: Optional[list]=None
temperature: Optional[float]=None
candidate_count: Optional[int]=None
top_k: Optional[int]=None
top_p: Optional[float]=None
maxOutputTokens: Optional[int]=None
def __init__(self,
context: Optional[str]=None,
examples: Optional[list]=None,
temperature: Optional[float]=None,
candidate_count: Optional[int]=None,
top_k: Optional[int]=None,
top_p: Optional[float]=None,
maxOutputTokens: Optional[int]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def completion(
model: str,
messages: list,
@ -33,6 +85,13 @@ def completion(
palm.configure(api_key=api_key)
model = model
## Load Config
config = litellm.PalmConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > palm_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
prompt = ""
for message in messages:
if "role" in message:


@ -1,10 +1,12 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
class PetalsError(Exception):
def __init__(self, status_code, message):
@ -14,13 +16,59 @@ class PetalsError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
PetalsConfig = {
"max_new_tokens": 256
}
class PetalsConfig():
"""
Reference: https://github.com/petals-infra/chat.petals.dev#post-apiv1generate
The `PetalsConfig` class encapsulates the configuration for the Petals API. The properties of this class are described below:
- `max_length` (integer): This represents the maximum length of the generated text (including the prefix) in tokens.
- `max_new_tokens` (integer): This represents the maximum number of newly generated tokens (excluding the prefix).
The generation parameters are compatible with `.generate()` from Hugging Face's Transformers library:
- `do_sample` (boolean, optional): If set to 0 (default), the API runs greedy generation. If set to 1, the API performs sampling using the parameters below:
- `temperature` (float, optional): This value sets the temperature for sampling.
- `top_k` (integer, optional): This value sets the limit for top-k sampling.
- `top_p` (float, optional): This value sets the limit for top-p (nucleus) sampling.
- `repetition_penalty` (float, optional): This helps apply the repetition penalty during text generation, as discussed in this paper.
"""
max_length: Optional[int]=None
max_new_tokens: Optional[int]=litellm.max_tokens # petals requires max tokens to be set
do_sample: Optional[bool]=None
temperature: Optional[float]=None
top_k: Optional[int]=None
top_p: Optional[float]=None
repetition_penalty: Optional[float]=None
def __init__(self,
max_length: Optional[int]=None,
max_new_tokens: Optional[int]=litellm.max_tokens, # petals requires max tokens to be set
do_sample: Optional[bool]=None,
temperature: Optional[float]=None,
top_k: Optional[int]=None,
top_p: Optional[float]=None,
repetition_penalty: Optional[float]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def completion(
model: str,
messages: list,
api_base: Optional[str],
model_response: ModelResponse,
print_verbose: Callable,
encoding,
@ -30,61 +78,91 @@ def completion(
litellm_params=None,
logger_fn=None,
):
try:
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM
except:
raise Exception(
"Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
)
model = model
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, add_bos_token=False)
model_obj = AutoDistributedModelForCausalLM.from_pretrained(model)
prompt = ""
for message in messages:
if "role" in message:
if message["role"] == "user":
prompt += (
f"{message['content']}"
)
else:
prompt += (
f"{message['content']}"
)
else:
prompt += f"{message['content']}"
## Load Config
for k, v in PetalsConfig.items():
if k not in optional_params:
config = litellm.PetalsConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > petals_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": optional_params},
if model in litellm.custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = litellm.custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages
)
## COMPLETION CALL
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]
# optional params: max_new_tokens=1,temperature=0.9, top_p=0.6
outputs = model_obj.generate(inputs, **optional_params)
else:
prompt = prompt_factory(model=model, messages=messages)
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=outputs,
additional_args={"complete_input_dict": optional_params},
)
## RESPONSE OBJECT
output_text = tokenizer.decode(outputs[0])
if api_base:
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": optional_params, "api_base": api_base},
)
data = {
"model": model,
"inputs": prompt,
**optional_params
}
## COMPLETION CALL
response = requests.post(api_base, data=data)
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response.text,
additional_args={"complete_input_dict": optional_params},
)
## RESPONSE OBJECT
try:
output_text = response.json()["outputs"]
except Exception as e:
raise PetalsError(status_code=response.status_code, message=str(e))
else:
try:
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM
except:
raise Exception(
"Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
)
model = model
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, add_bos_token=False)
model_obj = AutoDistributedModelForCausalLM.from_pretrained(model)
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={"complete_input_dict": optional_params},
)
## COMPLETION CALL
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]
# optional params: max_new_tokens=1,temperature=0.9, top_p=0.6
outputs = model_obj.generate(inputs, **optional_params)
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=outputs,
additional_args={"complete_input_dict": optional_params},
)
## RESPONSE OBJECT
output_text = tokenizer.decode(outputs[0])
model_response["choices"][0]["message"]["content"] = output_text
prompt_tokens = len(
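
The rewritten Petals handler now has two paths: when `api_base` is supplied it POSTs the prompt to a chat.petals.dev-style /api/v1/generate endpoint, otherwise it loads the model locally through torch/transformers/petals. A hedged sketch of the remote path (the model name and endpoint URL are assumptions):

import litellm
from litellm import completion

# Remote path: no local GPU needed, the request goes to the HTTP endpoint.
response = completion(
    model="petals/petals-team/StableBeluga2",            # assumed model name
    messages=[{"role": "user", "content": "Hello, world"}],
    api_base="https://chat.petals.dev/api/v1/generate",  # assumed endpoint
)

# Omitting api_base falls back to the local AutoDistributedModelForCausalLM path.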


@ -1,9 +1,10 @@
import os
import os, types
import json
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm
class ReplicateError(Exception):
def __init__(self, status_code, message):
@ -13,6 +14,65 @@ class ReplicateError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class ReplicateConfig():
"""
Reference: https://replicate.com/meta/llama-2-70b-chat/api
- `prompt` (string): The prompt to send to the model.
- `system_prompt` (string): The system prompt to send to the model. This is prepended to the prompt and helps guide system behavior. Default value: `You are a helpful assistant`.
- `max_new_tokens` (integer): Maximum number of tokens to generate. Typically, a word is made up of 2-3 tokens. Default value: `128`.
- `min_new_tokens` (integer): Minimum number of tokens to generate. To disable, set to `-1`. A word is usually 2-3 tokens. Default value: `-1`.
- `temperature` (number): Adjusts the randomness of outputs. Values greater than 1 increase randomness, 0 is deterministic, and 0.75 is a reasonable starting value. Default value: `0.75`.
- `top_p` (number): During text decoding, it samples from the top `p` percentage of most likely tokens. Reduce this to ignore less probable tokens. Default value: `0.9`.
- `top_k` (integer): During text decoding, samples from the top `k` most likely tokens. Reduce this to ignore less probable tokens. Default value: `50`.
- `stop_sequences` (string): A comma-separated list of sequences to stop generation at. For example, inputting '<end>,<stop>' will cease generation at the first occurrence of either '<end>' or '<stop>'.
- `seed` (integer): This is the seed for the random generator. Leave it blank to randomize the seed.
- `debug` (boolean): If set to `True`, it provides debugging output in logs.
Please note that Replicate's mapping of these parameters can be inconsistent across different models, indicating that not all of these parameters may be available for use with all models.
"""
system_prompt: Optional[str]=None
max_new_tokens: Optional[int]=None
min_new_tokens: Optional[int]=None
temperature: Optional[int]=None
top_p: Optional[int]=None
top_k: Optional[int]=None
stop_sequences: Optional[str]=None
seed: Optional[int]=None
debug: Optional[bool]=None
def __init__(self,
system_prompt: Optional[str]=None,
max_new_tokens: Optional[int]=None,
min_new_tokens: Optional[int]=None,
temperature: Optional[int]=None,
top_p: Optional[int]=None,
top_k: Optional[int]=None,
stop_sequences: Optional[str]=None,
seed: Optional[int]=None,
debug: Optional[bool]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
# Function to start a prediction and get the prediction URL
def start_prediction(version_id, input_data, api_token, logging_obj):
base_url = "https://api.replicate.com/v1"
@ -110,6 +170,13 @@ def completion(
):
# Start a prediction and get the prediction URL
version_id = model_to_version_id(model)
## Load Config
config = litellm.ReplicateConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > replicate_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
if "meta/llama-2-13b-chat" in model:
system_prompt = ""
prompt = ""


@ -1,9 +1,10 @@
import os
import json
import os, types
from enum import Enum
import json
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, get_secret
import sys
from copy import deepcopy
@ -16,6 +17,32 @@ class SagemakerError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class SagemakerConfig():
"""
Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
"""
max_new_tokens: Optional[int]=None
top_p: Optional[float]=None
temperature: Optional[float]=None
return_full_text: Optional[bool]=None
def __init__(self,
max_new_tokens: Optional[int]=None,
top_p: Optional[float]=None,
temperature: Optional[float]=None,
return_full_text: Optional[bool]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
"""
SAGEMAKER AUTH Keys/Vars
os.environ['AWS_ACCESS_KEY_ID'] = ""
@ -47,6 +74,16 @@ def completion(
region_name=region_name
)
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)
## Load Config
config = litellm.SagemakerConfig.get_config()
for k, v in config.items():
if k not in inference_params: # completion(top_k=3) > sagemaker_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
model = model
prompt = ""
for message in messages:
@ -61,9 +98,7 @@ def completion(
)
else:
prompt += f"{message['content']}"
# pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)
data = {
"inputs": prompt,
"parameters": inference_params


@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse
from .prompt_templates.factory import prompt_factory, custom_prompt
@ -15,6 +16,55 @@ class TogetherAIError(Exception):
self.message
) # Call the base class constructor with the parameters it needs
class TogetherAIConfig():
"""
Reference: https://docs.together.ai/reference/inference
The class `TogetherAIConfig` provides configuration for TogetherAI's API interface. Here are the parameters:
- `max_tokens` (int32, required): The maximum number of tokens to generate.
- `stop` (string, optional): A string sequence that will truncate (stop) the inference text output. For example, "\n\n" will stop generation as soon as the model generates two newlines.
- `temperature` (float, optional): A decimal number that determines the degree of randomness in the response. A value of 0 will always yield the same output. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value greater than 1 introduces more randomness in the output.
- `top_p` (float, optional): The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text.
- `top_k` (int32, optional): The `top_k` parameter is used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options.
- `repetition_penalty` (float, optional): A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition.
- `logprobs` (int32, optional): This parameter is not described in the prompt.
"""
max_tokens: Optional[int]=None
stop: Optional[str]=None
temperature: Optional[int]=None
top_p: Optional[float]=None
top_k: Optional[int]=None
repetition_penalty: Optional[float]=None
logprobs: Optional[int]=None
def __init__(self,
max_tokens: Optional[int]=None,
stop: Optional[str]=None,
temperature: Optional[int]=None,
top_p: Optional[float]=None,
top_k: Optional[int]=None,
repetition_penalty: Optional[float]=None,
logprobs: Optional[int]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def validate_environment(api_key):
if api_key is None:
raise ValueError(
@ -41,6 +91,13 @@ def completion(
logger_fn=None,
):
headers = validate_environment(api_key)
## Load Config
config = litellm.TogetherAIConfig.get_config()
for k, v in config.items():
if k not in optional_params: # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
@ -52,6 +109,7 @@ def completion(
)
else:
prompt = prompt_factory(model=model, messages=messages)
data = {
"model": model,
"prompt": prompt,

litellm/llms/vertex_ai.py (new file, 153 lines)

@ -0,0 +1,153 @@
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm
class VertexAIError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class VertexAIConfig():
"""
Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
The class `VertexAIConfig` provides configuration for VertexAI's API interface. Below are the parameters:
- `temperature` (float): This controls the degree of randomness in token selection.
- `max_output_tokens` (integer): This sets the maximum number of tokens in the text output. The default value is 256.
- `top_p` (float): The tokens are selected from the most probable to the least probable until the sum of their probabilities equals the `top_p` value. Default is 0.95.
- `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.
Note: Please make sure to modify the default parameters as required for your use case.
"""
temperature: Optional[float]=None
max_output_tokens: Optional[int]=None
top_p: Optional[float]=None
top_k: Optional[int]=None
def __init__(self,
temperature: Optional[float]=None,
max_output_tokens: Optional[int]=None,
top_p: Optional[float]=None,
top_k: Optional[int]=None) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != 'self' and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {k: v for k, v in cls.__dict__.items()
if not k.startswith('__')
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
and v is not None}
def completion(
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
logging_obj,
vertex_project=None,
vertex_location=None,
optional_params=None,
litellm_params=None,
logger_fn=None,
):
try:
import vertexai
except:
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
from vertexai.preview.language_models import ChatModel, CodeChatModel, InputOutputTextPair
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
vertexai.init(
project=vertex_project, location=vertex_location
)
## Load Config
config = litellm.VertexAIConfig.get_config()
for k, v in config.items():
if k not in optional_params:
optional_params[k] = v
# vertexai does not use an API key, it looks for credentials.json in the environment
prompt = " ".join([message["content"] for message in messages])
mode = ""
if model in litellm.vertex_chat_models:
chat_model = ChatModel.from_pretrained(model)
mode = "chat"
elif model in litellm.vertex_text_models:
text_model = TextGenerationModel.from_pretrained(model)
mode = "text"
elif model in litellm.vertex_code_text_models:
text_model = CodeGenerationModel.from_pretrained(model)
mode = "text"
else: # vertex_code_chat_models
chat_model = CodeChatModel.from_pretrained(model)
mode = "chat"
if mode == "chat":
chat = chat_model.start_chat()
## LOGGING
logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params})
if "stream" in optional_params and optional_params["stream"] == True:
model_response = chat.send_message_streaming(prompt, **optional_params)
return model_response
completion_response = chat.send_message(prompt, **optional_params)
elif mode == "text":
## LOGGING
logging_obj.pre_call(input=prompt, api_key=None)
if "stream" in optional_params and optional_params["stream"] == True:
model_response = text_model.predict_streaming(prompt, **optional_params)
return model_response
completion_response = text_model.predict(prompt, **optional_params)
## LOGGING
logging_obj.post_call(
input=prompt, api_key=None, original_response=completion_response
)
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = str(completion_response)
model_response["created"] = time.time()
model_response["model"] = model
## CALCULATING USAGE
prompt_tokens = len(
encoding.encode(prompt)
)
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass
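
A hedged usage sketch of the new `vertex_ai` handler as wired up in main.py below. Vertex AI uses application-default credentials rather than an API key; the project id, region, and model name here are assumptions.

import litellm
from litellm import completion

litellm.vertex_project = "my-gcp-project"  # assumed project id
litellm.vertex_location = "us-central1"    # assumed region

# Defaults applied to every Vertex AI call unless overridden per call.
litellm.VertexAIConfig(temperature=0.2, max_output_tokens=128)

response = completion(
    model="chat-bison",  # assumed Vertex AI chat model
    messages=[{"role": "user", "content": "Hello from LiteLLM"}],
)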


@ -45,7 +45,8 @@ from .llms import (
cohere,
petals,
oobabooga,
palm)
palm,
vertex_ai)
from .llms.prompt_templates.factory import prompt_factory, custom_prompt
import tiktoken
from concurrent.futures import ThreadPoolExecutor
@ -810,134 +811,32 @@ def completion(
)
return response
response = model_response
elif model in litellm.vertex_chat_models or model in litellm.vertex_code_chat_models:
try:
import vertexai
except:
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
from vertexai.preview.language_models import ChatModel, CodeChatModel, InputOutputTextPair
elif model in litellm.vertex_chat_models or model in litellm.vertex_code_chat_models or model in litellm.vertex_text_models or model in litellm.vertex_code_text_models:
vertex_ai_project = (litellm.vertex_project
or get_secret("VERTEXAI_PROJECT"))
vertex_ai_location = (litellm.vertex_location
or get_secret("VERTEXAI_LOCATION"))
vertex_project = (litellm.vertex_project or get_secret("VERTEXAI_PROJECT"))
vertex_location = (litellm.vertex_location or get_secret("VERTEXAI_LOCATION"))
vertexai.init(
project=vertex_project, location=vertex_location
# palm does not support streaming as yet :(
model_response = vertex_ai.completion(
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
vertex_location=vertex_ai_location,
vertex_project=vertex_ai_project,
logging_obj=logging
)
# vertexai does not use an API key, it looks for credentials.json in the environment
prompt = " ".join([message["content"] for message in messages])
# contains any default values we need to pass to the provider
VertexAIConfig = {
"top_k": 40 # override by setting kwarg in completion() - e.g. completion(..., top_k=20)
}
if model in litellm.vertex_chat_models:
chat_model = ChatModel.from_pretrained(model)
else: # vertex_code_chat_models
chat_model = CodeChatModel.from_pretrained(model)
chat = chat_model.start_chat()
## Load Config
for k, v in VertexAIConfig.items():
if k not in optional_params:
optional_params[k] = v
## LOGGING
logging.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params})
if "stream" in optional_params and optional_params["stream"] == True:
model_response = chat.send_message_streaming(prompt, **optional_params)
response = CustomStreamWrapper(
model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
)
)
return response
completion_response = chat.send_message(prompt, **optional_params)
## LOGGING
logging.post_call(
input=prompt, api_key=None, original_response=completion_response
)
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = str(completion_response)
model_response["created"] = time.time()
model_response["model"] = model
## CALCULATING USAGE
prompt_tokens = len(
encoding.encode(prompt)
)
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
response = model_response
elif model in litellm.vertex_text_models or model in litellm.vertex_code_text_models:
try:
import vertexai
except:
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
vertexai.init(
project=litellm.vertex_project, location=litellm.vertex_location
)
# vertexai does not use an API key, it looks for credentials.json in the environment
# contains any default values we need to pass to the provider
VertexAIConfig = {
"top_k": 40 # override by setting kwarg in completion() - e.g. completion(..., top_k=20)
}
prompt = " ".join([message["content"] for message in messages])
if model in litellm.vertex_text_models:
vertex_model = TextGenerationModel.from_pretrained(model)
else:
vertex_model = CodeGenerationModel.from_pretrained(model)
## Load Config
for k, v in VertexAIConfig.items():
if k not in optional_params:
optional_params[k] = v
## LOGGING
logging.pre_call(input=prompt, api_key=None)
if "stream" in optional_params and optional_params["stream"] == True:
model_response = vertex_model.predict_streaming(prompt, **optional_params)
response = CustomStreamWrapper(
model_response, model, custom_llm_provider="vertexai", logging_obj=logging
)
return response
completion_response = vertex_model.predict(prompt, **optional_params)
## LOGGING
logging.post_call(
input=prompt, api_key=None, original_response=completion_response
)
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = str(completion_response)
model_response["created"] = time.time()
model_response["model"] = model
## CALCULATING USAGE
prompt_tokens = len(
encoding.encode(prompt)
)
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
response = model_response
elif model in litellm.ai21_models:
custom_llm_provider = "ai21"
@ -1122,10 +1021,16 @@ def completion(
custom_llm_provider == "petals"
or model in litellm.petals_models
):
api_base = (
litellm.api_base or
api_base
)
custom_llm_provider = "petals"
model_response = petals.completion(
model=model,
messages=messages,
api_base=api_base,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
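The removed inline Vertex AI branches above show the default-config pattern that the new provider config classes take over: a dict of provider defaults (e.g. top_k=40) is merged into optional_params only where the caller has not already set a value, so kwargs passed to completion() always win. A minimal sketch of that merge, with illustrative values:

provider_defaults = {"top_k": 40}        # default carried by the provider config
optional_params = {"temperature": 0.2}   # kwargs the caller passed to completion()

## Load Config
for k, v in provider_defaults.items():
    if k not in optional_params:         # caller-supplied kwargs always win
        optional_params[k] = v

print(optional_params)  # {'temperature': 0.2, 'top_k': 40}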

View file

@ -37,6 +37,8 @@ def test_completion_custom_provider_model_name():
def test_completion_claude():
litellm.set_verbose = True
litellm.anthropic_config(max_tokens_to_sample=200, metadata={"user_id": "1224"})
try:
# test without max tokens
response = completion(
@ -48,7 +50,7 @@ def test_completion_claude():
except Exception as e:
pytest.fail(f"Error occurred: {e}")
test_completion_claude()
# test_completion_claude()
def test_completion_claude_max_tokens():
try:
@ -198,6 +200,8 @@ def test_get_hf_task_for_model():
# # TGI model
# # this is a TGI model https://huggingface.co/glaiveai/glaive-coder-7b
# def hf_test_completion_tgi():
# litellm.huggingface_config(return_full_text=True)
# litellm.set_verbose=True
# try:
# response = litellm.completion(
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
@ -321,13 +325,10 @@ def test_get_hf_task_for_model():
def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
try:
litellm.CohereConfig(max_tokens=1000, stop_sequences=["a"])
response = completion(
model="command-nightly",
messages=messages,
max_tokens=100,
n=1,
logit_bias={40: 10},
stop=["a"],
logger_fn=logger_fn
)
# Add any assertions here to check the response
@ -665,13 +666,12 @@ def test_completion_azure_deployment_id():
# Replicate API endpoints are unstable -> they throw random CUDA errors -> our tests can fail even when the tests themselves are correct.
# def test_completion_replicate_llama_2():
# model_name = "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf"
# model_name = "replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
# litellm.replicate_config(max_new_tokens=200)
# try:
# response = completion(
# model=model_name,
# messages=messages,
# max_tokens=20,
# custom_llm_provider="replicate"
# )
# print(response)
# cost = completion_cost(completion_response=response)
@ -1027,7 +1027,7 @@ def test_completion_ai21():
except Exception as e:
pytest.fail(f"Error occurred: {e}")
test_completion_ai21()
# test_completion_ai21()
## test deep infra
def test_completion_deep_infra():
# litellm.set_verbose = True

View file

@ -0,0 +1,395 @@
#### What this tests ####
# This tests setting provider-specific configs across providers
# There are 2 types of tests - overriding the config dynamically per call, or setting it via class variables
# (a condensed sketch of both styles appears after the tests below)
import sys, os
import traceback
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import completion
# Huggingface - expensive to deploy these models and keep them running. Maybe we can try doing this via Baseten?
# def hf_test_completion_tgi():
# litellm.HuggingfaceConfig(max_new_tokens=200)
# litellm.set_verbose=True
# try:
# # OVERRIDE WITH DYNAMIC MAX TOKENS
# response_1 = litellm.completion(
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
# messages=[{ "content": "Hello, how are you?","role": "user"}],
# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud",
# max_tokens=10
# )
# # Add any assertions here to check the response
# print(response_1)
# response_1_text = response_1.choices[0].message.content
# # USE CONFIG TOKENS
# response_2 = litellm.completion(
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
# messages=[{ "content": "Hello, how are you?","role": "user"}],
# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud",
# )
# # Add any assertions here to check the response
# print(response_2)
# response_2_text = response_2.choices[0].message.content
# assert len(response_2_text) > len(response_1_text)
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# hf_test_completion_tgi()
#Anthropic
def claude_test_completion():
litellm.AnthropicConfig(max_tokens_to_sample=200)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="claude-instant-1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
max_tokens=10
)
# Add any assertions here to check the response
print(response_1)
response_1_text = response_1.choices[0].message.content
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="claude-instant-1",
messages=[{ "content": "Hello, how are you?","role": "user"}],
)
# Add any assertions here to check the response
print(response_2)
response_2_text = response_2.choices[0].message.content
assert len(response_2_text) > len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# claude_test_completion()
# Replicate
def replicate_test_completion():
litellm.ReplicateConfig(max_new_tokens=200)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
messages=[{ "content": "Hello, how are you?","role": "user"}],
max_tokens=10
)
# Add any assertions here to check the response
print(response_1)
response_1_text = response_1.choices[0].message.content
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
messages=[{ "content": "Hello, how are you?","role": "user"}],
)
# Add any assertions here to check the response
print(response_2)
response_2_text = response_2.choices[0].message.content
assert len(response_2_text) > len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# replicate_test_completion()
# Cohere
def cohere_test_completion():
litellm.CohereConfig(max_tokens=200)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
max_tokens=10
)
response_1_text = response_1.choices[0].message.content
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="command-nightly",
messages=[{ "content": "Hello, how are you?","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
assert len(response_2_text) > len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# cohere_test_completion()
# AI21
def ai21_test_completion():
litellm.AI21Config(maxTokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="j2-mid",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="j2-mid",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# ai21_test_completion()
# TogetherAI
def togetherai_test_completion():
litellm.TogetherAIConfig(max_tokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="together_ai/togethercomputer/llama-2-70b-chat",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="together_ai/togethercomputer/llama-2-70b-chat",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# togetherai_test_completion()
# Palm
def palm_test_completion():
litellm.PalmConfig(maxOutputTokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="palm/chat-bison",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="palm/chat-bison",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# palm_test_completion()
# NLP Cloud
def nlp_cloud_test_completion():
litellm.NLPCloudConfig(max_length=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="dolphin",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="dolphin",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# nlp_cloud_test_completion()
# AlephAlpha
def aleph_alpha_test_completion():
litellm.AlephAlphaConfig(maximum_tokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="luminous-base",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="luminous-base",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# aleph_alpha_test_completion()
# Petals - calls are too slow, will cause circle ci to fail due to delay. Test locally.
# def petals_completion():
# litellm.PetalsConfig(max_new_tokens=10)
# # litellm.set_verbose=True
# try:
# # OVERRIDE WITH DYNAMIC MAX TOKENS
# response_1 = litellm.completion(
# model="petals/petals-team/StableBeluga2",
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
# api_base="https://chat.petals.dev/api/v1/generate",
# max_tokens=100
# )
# response_1_text = response_1.choices[0].message.content
# print(f"response_1_text: {response_1_text}")
# # USE CONFIG TOKENS
# response_2 = litellm.completion(
# model="petals/petals-team/StableBeluga2",
# api_base="https://chat.petals.dev/api/v1/generate",
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
# )
# response_2_text = response_2.choices[0].message.content
# print(f"response_2_text: {response_2_text}")
# assert len(response_2_text) < len(response_1_text)
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# petals_completion()
# VertexAI
# We don't have vertex ai configured for circle ci yet -- need to figure this out.
# def vertex_ai_test_completion():
# litellm.VertexAIConfig(max_output_tokens=10)
# # litellm.set_verbose=True
# try:
# # OVERRIDE WITH DYNAMIC MAX TOKENS
# response_1 = litellm.completion(
# model="chat-bison",
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
# max_tokens=100
# )
# response_1_text = response_1.choices[0].message.content
# print(f"response_1_text: {response_1_text}")
# # USE CONFIG TOKENS
# response_2 = litellm.completion(
# model="chat-bison",
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
# )
# response_2_text = response_2.choices[0].message.content
# print(f"response_2_text: {response_2_text}")
# assert len(response_2_text) < len(response_1_text)
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# vertex_ai_test_completion()
# Sagemaker
def sagemaker_test_completion():
litellm.SagemakerConfig(max_new_tokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# sagemaker_test_completion()
# Bedrock
def bedrock_test_completion():
litellm.CohereConfig(max_tokens=10)
# litellm.set_verbose=True
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="bedrock/cohere.command-text-v14",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
max_tokens=100
)
response_1_text = response_1.choices[0].message.content
print(f"response_1_text: {response_1_text}")
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="bedrock/cohere.command-text-v14",
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
)
response_2_text = response_2.choices[0].message.content
print(f"response_2_text: {response_2_text}")
assert len(response_2_text) < len(response_1_text)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# bedrock_test_completion()
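Condensed sketch of the two styles exercised above - class-level config vs. a per-call override (Anthropic shown; assumes ANTHROPIC_API_KEY is set and a live call is acceptable):

import litellm

# style 1: class-level config - becomes the default for subsequent Anthropic calls
litellm.AnthropicConfig(max_tokens_to_sample=200)

# style 2: per-call override - the kwarg passed to completion() wins for that call only
short_response = litellm.completion(
    model="claude-instant-1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=10,
)
long_response = litellm.completion(  # no kwarg, so the class-level 200-token default applies
    model="claude-instant-1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
assert len(long_response.choices[0].message.content) > len(short_response.choices[0].message.content)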

View file

@ -1202,8 +1202,6 @@ def get_optional_params( # use the openai defaults
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
if max_tokens:
optional_params["max_tokens_to_sample"] = max_tokens
else:
optional_params["max_tokens_to_sample"] = 256 # anthropic fails without max_tokens_to_sample
if temperature:
optional_params["temperature"] = temperature
if top_p:
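With the hardcoded 256 fallback removed above, max_tokens_to_sample is only set when the caller passes max_tokens; otherwise the default presumably comes from the AnthropicConfig class added in this commit. A minimal standalone sketch of the resulting branch (function name hypothetical, keys as in the diff):

def anthropic_optional_params(max_tokens=None, temperature=None, top_p=None):
    optional_params = {}
    if max_tokens:
        optional_params["max_tokens_to_sample"] = max_tokens
    if temperature:
        optional_params["temperature"] = temperature
    if top_p:
        optional_params["top_p"] = top_p
    return optional_params  # no 256 fallback here any more; the provider config supplies the default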
@ -1226,6 +1224,28 @@ def get_optional_params( # use the openai defaults
optional_params["topP"] = top_p
if stream:
optional_params["stream"] = stream
elif "cohere" in model: # cohere models on bedrock
supported_params = ["stream", "temperature", "max_tokens", "logit_bias", "top_p", "frequency_penalty", "presence_penalty", "stop"]
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
optional_params["stream"] = stream
if temperature:
optional_params["temperature"] = temperature
if max_tokens:
optional_params["max_tokens"] = max_tokens
if n:
optional_params["num_generations"] = n
if logit_bias != {}:
optional_params["logit_bias"] = logit_bias
if top_p:
optional_params["p"] = top_p
if frequency_penalty:
optional_params["frequency_penalty"] = frequency_penalty
if presence_penalty:
optional_params["presence_penalty"] = presence_penalty
if stop:
optional_params["stop_sequences"] = stop
elif model in litellm.aleph_alpha_models:
supported_params = ["max_tokens", "stream", "top_p", "temperature", "presence_penalty", "frequency_penalty", "n", "stop"]
_check_valid_arg(supported_params=supported_params)
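The new cohere-on-bedrock branch above renames OpenAI-style arguments to Cohere's parameter names. A minimal sketch of just that mapping (function name hypothetical):

def map_openai_to_bedrock_cohere(n=None, top_p=None, stop=None, max_tokens=None, logit_bias=None):
    optional_params = {}
    if max_tokens:
        optional_params["max_tokens"] = max_tokens
    if n:
        optional_params["num_generations"] = n        # n -> num_generations
    if logit_bias:
        optional_params["logit_bias"] = logit_bias
    if top_p:
        optional_params["p"] = top_p                  # top_p -> p
    if stop:
        optional_params["stop_sequences"] = stop      # stop -> stop_sequences
    return optional_params

print(map_openai_to_bedrock_cohere(n=2, top_p=0.9, stop=["###"], max_tokens=100))
# {'max_tokens': 100, 'num_generations': 2, 'p': 0.9, 'stop_sequences': ['###']}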
@ -1312,8 +1332,12 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
elif model in litellm.cohere_models:
custom_llm_provider = "cohere"
## replicate
elif model in litellm.replicate_models:
custom_llm_provider = "replicate"
elif model in litellm.replicate_models or ":" in model:
model_parts = model.split(":")
if len(model_parts) > 1 and len(model_parts[1])==64: ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
custom_llm_provider = "replicate"
elif model in litellm.replicate_models:
custom_llm_provider = "replicate"
## openrouter
elif model in litellm.openrouter_models:
custom_llm_provider = "openrouter"
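The widened Replicate check above keys off the 64-character version hash that Replicate appends after a colon in its model identifiers. A standalone sketch of that detection (helper name hypothetical):

def looks_like_replicate(model: str) -> bool:
    # e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
    if ":" not in model:
        return False
    parts = model.split(":")
    return len(parts) > 1 and len(parts[1]) == 64  # Replicate version hashes are 64 hex characters

print(looks_like_replicate("meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"))  # True
print(looks_like_replicate("command-nightly"))  # False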