Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-24 18:24:20 +00:00)

commit dd7e397650 (parent 396d9d8e38)
style(test_completion.py): fix merge conflict

22 changed files with 1535 additions and 250 deletions

@@ -78,6 +78,6 @@ This list is constantly being updated.
|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |
|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | |
|Petals| ✅ | ✅ | | ✅ | | ✅ | | | | |

By default, LiteLLM raises an exception if the param being passed in isn't supported. However, if you want to just drop the param, instead of raising an exception, just set `litellm.drop_params = True`.
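
A small illustration of that flag (the model name and the dropped parameter are illustrative, not taken from this diff): with `drop_params` enabled, an argument the chosen provider has no equivalent for is silently removed instead of raising.

import litellm

litellm.drop_params = True    # unsupported params are dropped instead of raising an exception

response = litellm.completion(
    model="petals/petals-team/StableBeluga2",      # illustrative model name
    messages=[{"role": "user", "content": "hi"}],
    presence_penalty=0.5,                          # dropped if the provider has no equivalent
)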

litellm/__init__.py

@@ -311,6 +311,19 @@ from .utils import (
    get_llm_provider,
    completion_with_config,
)
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
from .llms.replicate import ReplicateConfig
from .llms.cohere import CohereConfig
from .llms.ai21 import AI21Config
from .llms.together_ai import TogetherAIConfig
from .llms.palm import PalmConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
from .llms.vertex_ai import VertexAIConfig
from .llms.sagemaker import SagemakerConfig
from .llms.bedrock import AmazonConfig
from .main import *  # type: ignore
from .integrations import *
from .exceptions import (
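
A quick sketch of what this re-export gives callers (output values are assumptions based on the class defaults shown later in this commit): every provider config becomes reachable from the top-level `litellm` package, and `get_config()` only returns values that have been explicitly set, so most configs start empty while a few ship with defaults.

import litellm

# most configs start empty; get_config() returns only explicitly set values
print(litellm.AI21Config.get_config())        # {}

# some providers ship required defaults, e.g. Anthropic's max_tokens_to_sample
print(litellm.AnthropicConfig.get_config())   # {'max_tokens_to_sample': 256}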

Binary file not shown. (3 files)

litellm/llms/ai21.py

@@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm

class AI21Error(Exception):
    def __init__(self, status_code, message):

@@ -14,6 +15,68 @@ class AI21Error(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class AI21Config():
    """
    Reference: https://docs.ai21.com/reference/j2-complete-ref

    The class `AI21Config` provides configuration for the AI21's API interface. Below are the parameters:

    - `numResults` (int32): Number of completions to sample and return. Optional, default is 1. If the temperature is greater than 0 (non-greedy decoding), a value greater than 1 can be meaningful.
    - `maxTokens` (int32): The maximum number of tokens to generate per result. Optional, default is 16. If no `stopSequences` are given, generation stops after producing `maxTokens`.
    - `minTokens` (int32): The minimum number of tokens to generate per result. Optional, default is 0. If `stopSequences` are given, they are ignored until `minTokens` are generated.
    - `temperature` (float): Modifies the distribution from which tokens are sampled. Optional, default is 0.7. A value of 0 essentially disables sampling and results in greedy decoding.
    - `topP` (float): Used for sampling tokens from the corresponding top percentile of probability mass. Optional, default is 1. For instance, a value of 0.9 considers only tokens comprising the top 90% probability mass.
    - `stopSequences` (array of strings): Stops decoding if any of the input strings is generated. Optional.
    - `topKReturn` (int32): Range between 0 to 10, including both. Optional, default is 0. Specifies the top-K alternative tokens to return. A non-zero value includes the string representations and log-probabilities for each of the top-K alternatives at each position.
    - `frequencyPenalty` (object): Placeholder for frequency penalty object.
    - `presencePenalty` (object): Placeholder for presence penalty object.
    - `countPenalty` (object): Placeholder for count penalty object.
    """
    numResults: Optional[int]=None
    maxTokens: Optional[int]=None
    minTokens: Optional[int]=None
    temperature: Optional[float]=None
    topP: Optional[float]=None
    stopSequences: Optional[list]=None
    topKReturn: Optional[int]=None
    frequencePenalty: Optional[dict]=None
    presencePenalty: Optional[dict]=None
    countPenalty: Optional[dict]=None

    def __init__(self,
                 numResults: Optional[int]=None, maxTokens: Optional[int]=None, minTokens: Optional[int]=None,
                 temperature: Optional[float]=None, topP: Optional[float]=None, stopSequences: Optional[list]=None,
                 topKReturn: Optional[int]=None, frequencePenalty: Optional[dict]=None,
                 presencePenalty: Optional[dict]=None, countPenalty: Optional[dict]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


def validate_environment(api_key):
    if api_key is None:
        raise ValueError(

@@ -53,6 +116,13 @@ def completion(
            )
        else:
            prompt += f"{message['content']}"

    ## Load Config
    config = litellm.AI21Config.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > ai21_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    data = {
        "prompt": prompt,
        # "instruction": prompt, # some baseten models require the prompt to be passed in via the 'instruction' kwarg
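
A short usage sketch of the pattern this hunk enables (model name and credential setup are illustrative, not part of the diff): values set on `AI21Config` are merged into the request only when the caller has not already supplied them.

import litellm

litellm.AI21Config(maxTokens=200, temperature=0.3)   # provider-wide defaults

response = litellm.completion(
    model="j2-mid",                                  # illustrative AI21 model name
    messages=[{"role": "user", "content": "Say hi"}],
    temperature=0.9,                                 # per-call value wins over AI21Config.temperature
)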

litellm/llms/aleph_alpha.py

@@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse

class AlephAlphaError(Exception):

@@ -14,6 +15,139 @@ class AlephAlphaError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class AlephAlphaConfig():
    """
    Reference: https://docs.aleph-alpha.com/api/complete/

    The `AlephAlphaConfig` class represents the configuration for the Aleph Alpha API. Here are the properties:

    - `maximum_tokens` (integer, required): The maximum number of tokens to be generated by the completion. The sum of input tokens and maximum tokens may not exceed 2048.
    - `minimum_tokens` (integer, optional; default value: 0): Generate at least this number of tokens before an end-of-text token is generated.
    - `echo` (boolean, optional; default value: false): Whether to echo the prompt in the completion.
    - `temperature` (number, nullable; default value: 0): Adjusts how creatively the model generates outputs. Use combinations of temperature, top_k, and top_p sensibly.
    - `top_k` (integer, nullable; default value: 0): Introduces randomness into token generation by considering the top k most likely options.
    - `top_p` (number, nullable; default value: 0): Adds randomness by considering the smallest set of tokens whose cumulative probability exceeds top_p.
    - `presence_penalty`, `frequency_penalty`, `sequence_penalty` (number, nullable; default value: 0): Various penalties that can reduce repetition.
    - `sequence_penalty_min_length` (integer; default value: 2): Minimum number of tokens to be considered as a sequence.
    - `repetition_penalties_include_prompt`, `repetition_penalties_include_completion`, `use_multiplicative_presence_penalty`, `use_multiplicative_frequency_penalty`, `use_multiplicative_sequence_penalty` (boolean, nullable; default value: false): Various settings that adjust how the repetition penalties are applied.
    - `penalty_bias` (string, nullable): Text used in addition to the penalized tokens for repetition penalties.
    - `penalty_exceptions` (string[], nullable): Strings that may be generated without penalty.
    - `penalty_exceptions_include_stop_sequences` (boolean, nullable; default value: true): Include all stop_sequences in penalty_exceptions.
    - `best_of` (integer, nullable; default value: 1): The number of completions will be generated on the server side.
    - `n` (integer, nullable; default value: 1): The number of completions to return.
    - `logit_bias` (object, nullable): Adjust the logit scores before sampling.
    - `log_probs` (integer, nullable): Number of top log probabilities for each token generated.
    - `stop_sequences` (string[], nullable): List of strings that will stop generation if they're generated.
    - `tokens` (boolean, nullable; default value: false): Flag indicating whether individual tokens of the completion should be returned or not.
    - `raw_completion` (boolean; default value: false): if True, the raw completion of the model will be returned.
    - `disable_optimizations` (boolean, nullable; default value: false): Disables any applied optimizations to both your prompt and completion.
    - `completion_bias_inclusion`, `completion_bias_exclusion` (string[], default value: []): Set of strings to bias the generation of tokens.
    - `completion_bias_inclusion_first_token_only`, `completion_bias_exclusion_first_token_only` (boolean; default value: false): Consider only the first token for the completion_bias_inclusion/exclusion.
    - `contextual_control_threshold` (number, nullable): Control over how similar tokens are controlled.
    - `control_log_additive` (boolean; default value: true): Method of applying control to attention scores.
    """
    maximum_tokens: Optional[int]=litellm.max_tokens  # aleph alpha requires max tokens
    minimum_tokens: Optional[int]=None
    echo: Optional[bool]=None
    temperature: Optional[int]=None
    top_k: Optional[int]=None
    top_p: Optional[int]=None
    presence_penalty: Optional[int]=None
    frequency_penalty: Optional[int]=None
    sequence_penalty: Optional[int]=None
    sequence_penalty_min_length: Optional[int]=None
    repetition_penalties_include_prompt: Optional[bool]=None
    repetition_penalties_include_completion: Optional[bool]=None
    use_multiplicative_presence_penalty: Optional[bool]=None
    use_multiplicative_frequency_penalty: Optional[bool]=None
    use_multiplicative_sequence_penalty: Optional[bool]=None
    penalty_bias: Optional[str]=None
    penalty_exceptions_include_stop_sequences: Optional[bool]=None
    best_of: Optional[int]=None
    n: Optional[int]=None
    logit_bias: Optional[dict]=None
    log_probs: Optional[int]=None
    stop_sequences: Optional[list]=None
    tokens: Optional[bool]=None
    raw_completion: Optional[bool]=None
    disable_optimizations: Optional[bool]=None
    completion_bias_inclusion: Optional[list]=None
    completion_bias_exclusion: Optional[list]=None
    completion_bias_inclusion_first_token_only: Optional[bool]=None
    completion_bias_exclusion_first_token_only: Optional[bool]=None
    contextual_control_threshold: Optional[int]=None
    control_log_additive: Optional[bool]=None

    def __init__(self,
                 maximum_tokens: Optional[int]=None, minimum_tokens: Optional[int]=None, echo: Optional[bool]=None,
                 temperature: Optional[int]=None, top_k: Optional[int]=None, top_p: Optional[int]=None,
                 presence_penalty: Optional[int]=None, frequency_penalty: Optional[int]=None,
                 sequence_penalty: Optional[int]=None, sequence_penalty_min_length: Optional[int]=None,
                 repetition_penalties_include_prompt: Optional[bool]=None,
                 repetition_penalties_include_completion: Optional[bool]=None,
                 use_multiplicative_presence_penalty: Optional[bool]=None,
                 use_multiplicative_frequency_penalty: Optional[bool]=None,
                 use_multiplicative_sequence_penalty: Optional[bool]=None,
                 penalty_bias: Optional[str]=None, penalty_exceptions_include_stop_sequences: Optional[bool]=None,
                 best_of: Optional[int]=None, n: Optional[int]=None, logit_bias: Optional[dict]=None,
                 log_probs: Optional[int]=None, stop_sequences: Optional[list]=None, tokens: Optional[bool]=None,
                 raw_completion: Optional[bool]=None, disable_optimizations: Optional[bool]=None,
                 completion_bias_inclusion: Optional[list]=None, completion_bias_exclusion: Optional[list]=None,
                 completion_bias_inclusion_first_token_only: Optional[bool]=None,
                 completion_bias_exclusion_first_token_only: Optional[bool]=None,
                 contextual_control_threshold: Optional[int]=None,
                 control_log_additive: Optional[bool]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


def validate_environment(api_key):
    headers = {
        "accept": "application/json",

@@ -37,6 +171,13 @@ def completion(
    default_max_tokens_to_sample=None,
):
    headers = validate_environment(api_key)

    ## Load Config
    config = litellm.AlephAlphaConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > aleph_alpha_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    completion_url = "https://api.aleph-alpha.com/complete"
    model = model
    prompt = ""

@@ -65,7 +206,6 @@ def completion(
    data = {
        "model": model,
        "prompt": prompt,
        "maximum_tokens": optional_params["maximum_tokens"] if "maximum_tokens" in optional_params else default_max_tokens_to_sample,  # required input
        **optional_params,
    }
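
A sketch of the default behaviour this gives Aleph Alpha calls; the assumption here (not shown in the diff) is that `litellm.max_tokens` is 256 at the time of this commit, so the provider's required `maximum_tokens` field is satisfied even when the caller sets nothing.

import litellm

# maximum_tokens defaults to litellm.max_tokens, so a plain completion() call already
# carries the required field
print(litellm.AlephAlphaConfig.get_config())   # {'maximum_tokens': 256} (assumed default)

litellm.AlephAlphaConfig(maximum_tokens=512)   # raise the provider-wide ceiling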

litellm/llms/anthropic.py

@@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm

class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"

@@ -18,11 +19,38 @@ class AnthropicError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class AnthropicConfig():
    """
    Reference: https://docs.anthropic.com/claude/reference/complete_post

# contains any default values we need to pass to the provider
AnthropicConfig = {
    "max_tokens_to_sample": 256 # override by setting - completion(..,max_tokens=300)
}
    to pass metadata to anthropic, it's {"user_id": "any-relevant-information"}
    """
    max_tokens_to_sample: Optional[int]=256  # anthropic requires a default
    stop_sequences: Optional[list[str]]=None
    temperature: Optional[int]=None
    top_p: Optional[int]=None
    top_k: Optional[int]=None
    metadata: Optional[dict]=None

    def __init__(self,
                 max_tokens_to_sample: Optional[int]=256,  # anthropic requires a default
                 stop_sequences: Optional[list[str]]=None, temperature: Optional[int]=None,
                 top_p: Optional[int]=None, top_k: Optional[int]=None,
                 metadata: Optional[dict]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


# makes headers for API call

@@ -72,11 +100,11 @@ def completion(
    prompt += f"{AnthropicConstants.AI_PROMPT.value}"

    ## Load Config
    for k, v in AnthropicConfig.items():
        if k not in optional_params:
    config = litellm.AnthropicConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v
    if optional_params["max_tokens_to_sample"] != 256:  # not default - print for testing
        print_verbose(f"LiteLLM.Anthropic: Max Tokens Set")

    data = {
        "model": model,
        "prompt": prompt,
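
A sketch of how the Anthropic default interacts with call-time arguments (the model name is illustrative): `max_tokens_to_sample` falls back to the class default of 256 unless the call supplies its own value, which the merge step leaves untouched and which triggers the "Max Tokens Set" verbose log above.

import litellm

# nothing set: completion() fills in max_tokens_to_sample=256 from AnthropicConfig
litellm.completion(model="claude-instant-1", messages=[{"role": "user", "content": "hi"}])

# call-time value wins over the config default
litellm.completion(model="claude-instant-1",
                   messages=[{"role": "user", "content": "hi"}],
                   max_tokens=300)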

litellm/llms/bedrock.py

@@ -1,7 +1,8 @@
import json, copy
import json, copy, types
from enum import Enum
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, get_secret

class BedrockError(Exception):

@@ -12,6 +13,38 @@ class BedrockError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class AmazonConfig():
    """
    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1

    Supported Params for the Amazon Titan models:

    - `maxTokenCount` (integer) max tokens,
    - `stopSequences` (string[]) list of stop sequence strings
    - `temperature` (float) temperature for model,
    - `topP` (int) top p for model
    """
    maxTokenCount: Optional[int]=None
    stopSequences: Optional[list]=None
    temperature: Optional[float]=None
    topP: Optional[int]=None

    def __init__(self,
                 maxTokenCount: Optional[int]=None, stopSequences: Optional[list]=None,
                 temperature: Optional[float]=None, topP: Optional[int]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}

class AnthropicConstants(Enum):
    HUMAN_PROMPT = "\n\nHuman:"

@@ -100,22 +133,52 @@ def completion(
    prompt = convert_messages_to_prompt(messages, provider)
    inference_params = copy.deepcopy(optional_params)
    stream = inference_params.pop("stream", False)

    print(f"bedrock provider: {provider}")
    if provider == "anthropic":
        ## LOAD CONFIG
        config = litellm.AnthropicConfig.get_config()
        for k, v in config.items():
            if k not in inference_params:  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v
        data = json.dumps({
            "prompt": prompt,
            **inference_params
        })
    elif provider == "ai21":
        ## LOAD CONFIG
        config = litellm.AI21Config.get_config()
        for k, v in config.items():
            if k not in inference_params:  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v

        data = json.dumps({
            "prompt": prompt,
            **inference_params
        })
    elif provider == "cohere":
        ## LOAD CONFIG
        config = litellm.CohereConfig.get_config()
        for k, v in config.items():
            if k not in inference_params:  # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v
        data = json.dumps({
            "prompt": prompt,
            **inference_params
        })
    elif provider == "amazon":  # amazon titan
        ## LOAD CONFIG
        config = litellm.AmazonConfig.get_config()
        for k, v in config.items():
            if k not in inference_params:  # completion(top_k=3) > amazon_config(top_k=3) <- allows for dynamic variables to be passed in
                inference_params[k] = v

    else:  # amazon titan
        data = json.dumps({
            "inputText": prompt,
            "textGenerationConfig": inference_params,
        })
        ## LOGGING

    ## LOGGING
    logging_obj.pre_call(
        input=prompt,
        api_key="",

@@ -147,7 +210,7 @@ def completion(
        logging_obj.post_call(
            input=prompt,
            api_key="",
            original_response=response,
            original_response=response_body,
            additional_args={"complete_input_dict": data},
        )
        print_verbose(f"raw model_response: {response}")

@@ -158,6 +221,8 @@ def completion(
        elif provider == "anthropic":
            outputText = response_body['completion']
            model_response["finish_reason"] = response_body["stop_reason"]
        elif provider == "cohere":
            outputText = response_body["generations"][0]["text"]
        else:  # amazon titan
            outputText = response_body.get('results')[0].get('outputText')
    if "error" in outputText:
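
A hedged usage sketch of the Titan branch above (the Bedrock model id is illustrative, and AWS credentials are assumed to already be configured in the environment): values set on `AmazonConfig` end up inside the `textGenerationConfig` payload unless the call passes its own.

import litellm

litellm.AmazonConfig(maxTokenCount=512, temperature=0.2)   # Titan-family defaults

response = litellm.completion(
    model="bedrock/amazon.titan-text-express-v1",          # illustrative Bedrock model id
    messages=[{"role": "user", "content": "hello"}],
)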

litellm/llms/cohere.py

@@ -1,10 +1,11 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm

class CohereError(Exception):
    def __init__(self, status_code, message):

@@ -14,6 +15,79 @@ class CohereError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class CohereConfig():
    """
    Reference: https://docs.cohere.com/reference/generate

    The class `CohereConfig` provides configuration for the Cohere's API interface. Below are the parameters:

    - `num_generations` (integer): Maximum number of generations returned. Default is 1, with a minimum value of 1 and a maximum value of 5.
    - `max_tokens` (integer): Maximum number of tokens the model will generate as part of the response. Default value is 20.
    - `truncate` (string): Specifies how the API handles inputs longer than maximum token length. Options include NONE, START, END. Default is END.
    - `temperature` (number): A non-negative float controlling the randomness in generation. Lower temperatures result in less random generations. Default is 0.75.
    - `preset` (string): Identifier of a custom preset, a combination of parameters such as prompt, temperature etc.
    - `end_sequences` (array of strings): The generated text gets cut at the beginning of the earliest occurrence of an end sequence, which will be excluded from the text.
    - `stop_sequences` (array of strings): The generated text gets cut at the end of the earliest occurrence of a stop sequence, which will be included in the text.
    - `k` (integer): Limits generation at each step to top `k` most likely tokens. Default is 0.
    - `p` (number): Limits generation at each step to most likely tokens with total probability mass of `p`. Default is 0.
    - `frequency_penalty` (number): Reduces repetitiveness of generated tokens. Higher values apply stronger penalties to previously occurred tokens.
    - `presence_penalty` (number): Reduces repetitiveness of generated tokens. Similar to frequency_penalty, but this penalty applies equally to all tokens that have already appeared.
    - `return_likelihoods` (string): Specifies how and if token likelihoods are returned with the response. Options include GENERATION, ALL and NONE.
    - `logit_bias` (object): Used to prevent the model from generating unwanted tokens or to incentivize it to include desired tokens. e.g. {"hello_world": 1233}
    """
    num_generations: Optional[int]=None
    max_tokens: Optional[int]=None
    truncate: Optional[str]=None
    temperature: Optional[int]=None
    preset: Optional[str]=None
    end_sequences: Optional[list]=None
    stop_sequences: Optional[list]=None
    k: Optional[int]=None
    p: Optional[int]=None
    frequency_penalty: Optional[int]=None
    presence_penalty: Optional[int]=None
    return_likelihoods: Optional[str]=None
    logit_bias: Optional[dict]=None

    def __init__(self,
                 num_generations: Optional[int]=None, max_tokens: Optional[int]=None, truncate: Optional[str]=None,
                 temperature: Optional[int]=None, preset: Optional[str]=None, end_sequences: Optional[list]=None,
                 stop_sequences: Optional[list]=None, k: Optional[int]=None, p: Optional[int]=None,
                 frequency_penalty: Optional[int]=None, presence_penalty: Optional[int]=None,
                 return_likelihoods: Optional[str]=None, logit_bias: Optional[dict]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}

def validate_environment(api_key):
    headers = {
        "accept": "application/json",

@@ -39,6 +113,13 @@ def completion(
    completion_url = "https://api.cohere.ai/v1/generate"
    model = model
    prompt = " ".join(message["content"] for message in messages)

    ## Load Config
    config=litellm.CohereConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    data = {
        "model": model,
        "prompt": prompt,
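
A small sketch of the merge step in isolation, using only the names introduced above: `get_config()` returns just the values that were explicitly set on the class, and values the caller already supplied are never overwritten.

import litellm

litellm.CohereConfig(max_tokens=100, truncate="END")

optional_params = {"max_tokens": 20}                  # caller-supplied value
for k, v in litellm.CohereConfig.get_config().items():
    if k not in optional_params:
        optional_params[k] = v

print(optional_params)   # {'max_tokens': 20, 'truncate': 'END'} - caller's max_tokens is kept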

litellm/llms/huggingface_restapi.py

@@ -1,9 +1,10 @@
## Uses the huggingface text generation inference API
import os, copy
import os, copy, types
import json
from enum import Enum
import requests
import time
import litellm
from typing import Callable
from litellm.utils import ModelResponse, Choices, Message
from typing import Optional

@@ -17,11 +18,52 @@ class HuggingfaceError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

# contains any default values we need to pass to the provider
HuggingfaceConfig = {
    "return_full_text": False, # override by setting - completion(..,return_full_text=True)
    "details": True # needed for getting logprobs etc. for tgi models. override by setting - completion(..., details=False)
}
class HuggingfaceConfig():
    """
    Reference: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/compat_generate
    """
    best_of: Optional[int] = None
    decoder_input_details: Optional[bool] = None
    details: Optional[bool] = True  # enables returning logprobs + best of
    max_new_tokens: Optional[int] = None
    repetition_penalty: Optional[float] = None
    return_full_text: Optional[bool] = False  # by default don't return the input as part of the output
    seed: Optional[int] = None
    temperature: Optional[float] = None
    top_k: Optional[int] = None
    top_n_tokens: Optional[int] = None
    top_p: Optional[int] = None
    truncate: Optional[int] = None
    typical_p: Optional[float] = None
    watermark: Optional[bool] = None

    def __init__(self,
                 best_of: Optional[int] = None, decoder_input_details: Optional[bool] = None,
                 details: Optional[bool] = None, max_new_tokens: Optional[int] = None,
                 repetition_penalty: Optional[float] = None, return_full_text: Optional[bool] = None,
                 seed: Optional[int] = None, temperature: Optional[float] = None,
                 top_k: Optional[int] = None, top_n_tokens: Optional[int] = None,
                 top_p: Optional[int] = None, truncate: Optional[int] = None,
                 typical_p: Optional[float] = None, watermark: Optional[bool] = None
                 ) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}

def validate_environment(api_key):
    headers = {

@@ -74,8 +116,10 @@ def get_hf_task_for_model(model):
        return "text-generation-inference"
    elif model in conversational_models:
        return "conversational"
    else:
    elif "roneneldan/TinyStories" in model:
        return None
    else:
        return "text-generation-inference"  # default to tgi

def completion(
    model: str,

@@ -108,8 +152,9 @@ def completion(
        completion_url = f"https://api-inference.huggingface.co/models/{model}"

    ## Load Config
    for k, v in HuggingfaceConfig.items():
        if k not in optional_params:
    config=litellm.HuggingfaceConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    ### MAP INPUT PARAMS

@@ -149,19 +194,11 @@ def completion(
            )
        else:
            prompt = prompt_factory(model=model, messages=messages)
        if "https://api-inference.huggingface.co/models" in completion_url:
            inference_params = copy.deepcopy(optional_params)
            data = {
                "inputs": prompt,
                "parameters": inference_params,
                "stream": True if "stream" in inference_params and inference_params["stream"] == True else False,
            }
        else:
            data = {
                "inputs": prompt,
                "parameters": optional_params,
                "stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
            }
        data = {
            "inputs": prompt,
            "parameters": optional_params,
            "stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
        }
        input_text = prompt
    else:
        # Non TGI and Conversational llms
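
Roughly the TGI payload a default call now produces after the config merge (the prompt string is illustrative; the parameter values come from the class defaults above):

# what a plain completion(...) call ends up sending to the inference endpoint
data = {
    "inputs": "Hello",                # prompt built by prompt_factory
    "parameters": {
        "details": True,              # HuggingfaceConfig default
        "return_full_text": False,    # HuggingfaceConfig default
    },
    "stream": False,
}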

litellm/llms/nlp_cloud.py

@@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse

class NLPCloudError(Exception):

@@ -14,6 +15,75 @@ class NLPCloudError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class NLPCloudConfig():
    """
    Reference: https://docs.nlpcloud.com/#generation

    - `max_length` (int): Optional. The maximum number of tokens that the generated text should contain.
    - `length_no_input` (boolean): Optional. Whether `min_length` and `max_length` should not include the length of the input text.
    - `end_sequence` (string): Optional. A specific token that should be the end of the generated sequence.
    - `remove_end_sequence` (boolean): Optional. Whether to remove the `end_sequence` string from the result.
    - `remove_input` (boolean): Optional. Whether to remove the input text from the result.
    - `bad_words` (list of strings): Optional. List of tokens that are not allowed to be generated.
    - `temperature` (float): Optional. Temperature sampling. It modulates the next token probabilities.
    - `top_p` (float): Optional. Top P sampling. Below 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    - `top_k` (int): Optional. Top K sampling. The number of highest probability vocabulary tokens to keep for top k filtering.
    - `repetition_penalty` (float): Optional. Prevents the same word from being repeated too many times.
    - `num_beams` (int): Optional. Number of beams for beam search.
    - `num_return_sequences` (int): Optional. The number of independently computed returned sequences.
    """
    max_length: Optional[int]=None
    length_no_input: Optional[bool]=None
    end_sequence: Optional[str]=None
    remove_end_sequence: Optional[bool]=None
    remove_input: Optional[bool]=None
    bad_words: Optional[list]=None
    temperature: Optional[float]=None
    top_p: Optional[float]=None
    top_k: Optional[int]=None
    repetition_penalty: Optional[float]=None
    num_beams: Optional[int]=None
    num_return_sequences: Optional[int]=None

    def __init__(self,
                 max_length: Optional[int]=None, length_no_input: Optional[bool]=None,
                 end_sequence: Optional[str]=None, remove_end_sequence: Optional[bool]=None,
                 remove_input: Optional[bool]=None, bad_words: Optional[list]=None,
                 temperature: Optional[float]=None, top_p: Optional[float]=None,
                 top_k: Optional[int]=None, repetition_penalty: Optional[float]=None,
                 num_beams: Optional[int]=None, num_return_sequences: Optional[int]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


def validate_environment(api_key):
    headers = {
        "accept": "application/json",

@@ -37,6 +107,13 @@ def completion(
    default_max_tokens_to_sample=None,
):
    headers = validate_environment(api_key)

    ## Load Config
    config = litellm.NLPCloudConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    completion_url_fragment_1 = "https://api.nlpcloud.io/v1/gpu/"
    completion_url_fragment_2 = "/generation"
    model = model

litellm/llms/palm.py

@@ -1,9 +1,10 @@
import os
import os, types
import json
from enum import Enum
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse, get_secret
import litellm
import sys

class PalmError(Exception):

@@ -14,6 +15,57 @@ class PalmError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class PalmConfig():
    """
    Reference: https://developers.generativeai.google/api/python/google/generativeai/chat

    The class `PalmConfig` provides configuration for the Palm's API interface. Here are the parameters:

    - `context` (string): Text that should be provided to the model first, to ground the response. This could be a prompt to guide the model's responses.
    - `examples` (list): Examples of what the model should generate. They are treated identically to conversation messages except that they take precedence over the history in messages if the total input size exceeds the model's input_token_limit.
    - `temperature` (float): Controls the randomness of the output. Must be positive. Higher values produce a more random and varied response. A temperature of zero will be deterministic.
    - `candidate_count` (int): Maximum number of generated response messages to return. This value must be between [1, 8], inclusive. Only unique candidates are returned.
    - `top_k` (int): The API uses combined nucleus and top-k sampling. `top_k` sets the maximum number of tokens to sample from on each step.
    - `top_p` (float): The API uses combined nucleus and top-k sampling. `top_p` configures the nucleus sampling. It sets the maximum cumulative probability of tokens to sample from.
    - `maxOutputTokens` (int): Sets the maximum number of tokens to be returned in the output
    """
    context: Optional[str]=None
    examples: Optional[list]=None
    temperature: Optional[float]=None
    candidate_count: Optional[int]=None
    top_k: Optional[int]=None
    top_p: Optional[float]=None
    maxOutputTokens: Optional[int]=None

    def __init__(self,
                 context: Optional[str]=None, examples: Optional[list]=None,
                 temperature: Optional[float]=None, candidate_count: Optional[int]=None,
                 top_k: Optional[int]=None, top_p: Optional[float]=None,
                 maxOutputTokens: Optional[int]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


def completion(
    model: str,
    messages: list,

@@ -33,6 +85,13 @@ def completion(
    palm.configure(api_key=api_key)

    model = model

    ## Load Config
    config = litellm.PalmConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > palm_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    prompt = ""
    for message in messages:
        if "role" in message:

litellm/llms/petals.py

@@ -1,10 +1,12 @@
import os
import os, types
import json
from enum import Enum
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse
from .prompt_templates.factory import prompt_factory, custom_prompt

class PetalsError(Exception):
    def __init__(self, status_code, message):

@@ -14,13 +16,59 @@ class PetalsError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

PetalsConfig = {
    "max_new_tokens": 256
}
class PetalsConfig():
    """
    Reference: https://github.com/petals-infra/chat.petals.dev#post-apiv1generate
    The `PetalsConfig` class encapsulates the configuration for the Petals API. The properties of this class are described below:

    - `max_length` (integer): This represents the maximum length of the generated text (including the prefix) in tokens.
    - `max_new_tokens` (integer): This represents the maximum number of newly generated tokens (excluding the prefix).

    The generation parameters are compatible with `.generate()` from Hugging Face's Transformers library:

    - `do_sample` (boolean, optional): If set to 0 (default), the API runs greedy generation. If set to 1, the API performs sampling using the parameters below:
    - `temperature` (float, optional): This value sets the temperature for sampling.
    - `top_k` (integer, optional): This value sets the limit for top-k sampling.
    - `top_p` (float, optional): This value sets the limit for top-p (nucleus) sampling.
    - `repetition_penalty` (float, optional): This helps apply the repetition penalty during text generation, as discussed in this paper.
    """
    max_length: Optional[int]=None
    max_new_tokens: Optional[int]=litellm.max_tokens  # petals requires max tokens to be set
    do_sample: Optional[bool]=None
    temperature: Optional[float]=None
    top_k: Optional[int]=None
    top_p: Optional[float]=None
    repetition_penalty: Optional[float]=None

    def __init__(self,
                 max_length: Optional[int]=None,
                 max_new_tokens: Optional[int]=litellm.max_tokens,  # petals requires max tokens to be set
                 do_sample: Optional[bool]=None, temperature: Optional[float]=None,
                 top_k: Optional[int]=None, top_p: Optional[float]=None,
                 repetition_penalty: Optional[float]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}

def completion(
    model: str,
    messages: list,
    api_base: Optional[str],
    model_response: ModelResponse,
    print_verbose: Callable,
    encoding,

@@ -30,61 +78,91 @@ def completion(
    litellm_params=None,
    logger_fn=None,
):
    try:
        import torch
        from transformers import AutoTokenizer
        from petals import AutoDistributedModelForCausalLM
    except:
        raise Exception(
            "Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
        )

    model = model

    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, add_bos_token=False)
    model_obj = AutoDistributedModelForCausalLM.from_pretrained(model)

    prompt = ""
    for message in messages:
        if "role" in message:
            if message["role"] == "user":
                prompt += (
                    f"{message['content']}"
                )
            else:
                prompt += (
                    f"{message['content']}"
                )
        else:
            prompt += f"{message['content']}"

    ## Load Config
    for k, v in PetalsConfig.items():
        if k not in optional_params:
    config = litellm.PetalsConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > petals_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    ## LOGGING
    logging_obj.pre_call(
        input=prompt,
        api_key="",
        additional_args={"complete_input_dict": optional_params},
    if model in litellm.custom_prompt_dict:
        # check if the model has a registered custom prompt
        model_prompt_details = litellm.custom_prompt_dict[model]
        prompt = custom_prompt(
            role_dict=model_prompt_details["roles"],
            initial_prompt_value=model_prompt_details["initial_prompt_value"],
            final_prompt_value=model_prompt_details["final_prompt_value"],
            messages=messages
        )

    ## COMPLETION CALL
    inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]

    # optional params: max_new_tokens=1,temperature=0.9, top_p=0.6
    outputs = model_obj.generate(inputs, **optional_params)
    else:
        prompt = prompt_factory(model=model, messages=messages)

    ## LOGGING
    logging_obj.post_call(
        input=prompt,
        api_key="",
        original_response=outputs,
        additional_args={"complete_input_dict": optional_params},
    )
    ## RESPONSE OBJECT
    output_text = tokenizer.decode(outputs[0])
    if api_base:
        ## LOGGING
        logging_obj.pre_call(
            input=prompt,
            api_key="",
            additional_args={"complete_input_dict": optional_params, "api_base": api_base},
        )
        data = {
            "model": model,
            "inputs": prompt,
            **optional_params
        }

        ## COMPLETION CALL
        response = requests.post(api_base, data=data)

        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key="",
            original_response=response.text,
            additional_args={"complete_input_dict": optional_params},
        )

        ## RESPONSE OBJECT
        try:
            output_text = response.json()["outputs"]
        except Exception as e:
            PetalsError(status_code=response.status_code, message=str(e))

    else:
        try:
            import torch
            from transformers import AutoTokenizer
            from petals import AutoDistributedModelForCausalLM
        except:
            raise Exception(
                "Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
            )

        model = model

        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False, add_bos_token=False)
        model_obj = AutoDistributedModelForCausalLM.from_pretrained(model)

        ## LOGGING
        logging_obj.pre_call(
            input=prompt,
            api_key="",
            additional_args={"complete_input_dict": optional_params},
        )

        ## COMPLETION CALL
        inputs = tokenizer(prompt, return_tensors="pt")["input_ids"]

        # optional params: max_new_tokens=1,temperature=0.9, top_p=0.6
        outputs = model_obj.generate(inputs, **optional_params)

        ## LOGGING
        logging_obj.post_call(
            input=prompt,
            api_key="",
            original_response=outputs,
            additional_args={"complete_input_dict": optional_params},
        )
        ## RESPONSE OBJECT
        output_text = tokenizer.decode(outputs[0])
    model_response["choices"][0]["message"]["content"] = output_text

    prompt_tokens = len(
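
A hedged sketch of what the new `api_base` branch enables: pointing a Petals call at a hosted chat.petals.dev-style HTTP endpoint instead of importing `torch`/`petals` locally. The model path and endpoint URL below are illustrative, not taken from this diff.

import litellm

response = litellm.completion(
    model="petals/petals-team/StableBeluga2",             # illustrative Petals model path
    messages=[{"role": "user", "content": "hello"}],
    api_base="https://chat.petals.dev/api/v1/generate",   # illustrative HTTP endpoint
)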

litellm/llms/replicate.py

@@ -1,9 +1,10 @@
import os
import os, types
import json
import requests
import time
from typing import Callable
from typing import Callable, Optional
from litellm.utils import ModelResponse
import litellm

class ReplicateError(Exception):
    def __init__(self, status_code, message):

@@ -13,6 +14,65 @@ class ReplicateError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class ReplicateConfig():
    """
    Reference: https://replicate.com/meta/llama-2-70b-chat/api
    - `prompt` (string): The prompt to send to the model.
    - `system_prompt` (string): The system prompt to send to the model. This is prepended to the prompt and helps guide system behavior. Default value: `You are a helpful assistant`.
    - `max_new_tokens` (integer): Maximum number of tokens to generate. Typically, a word is made up of 2-3 tokens. Default value: `128`.
    - `min_new_tokens` (integer): Minimum number of tokens to generate. To disable, set to `-1`. A word is usually 2-3 tokens. Default value: `-1`.
    - `temperature` (number): Adjusts the randomness of outputs. Values greater than 1 increase randomness, 0 is deterministic, and 0.75 is a reasonable starting value. Default value: `0.75`.
    - `top_p` (number): During text decoding, it samples from the top `p` percentage of most likely tokens. Reduce this to ignore less probable tokens. Default value: `0.9`.
    - `top_k` (integer): During text decoding, samples from the top `k` most likely tokens. Reduce this to ignore less probable tokens. Default value: `50`.
    - `stop_sequences` (string): A comma-separated list of sequences to stop generation at. For example, inputting '<end>,<stop>' will cease generation at the first occurrence of either 'end' or '<stop>'.
    - `seed` (integer): This is the seed for the random generator. Leave it blank to randomize the seed.
    - `debug` (boolean): If set to `True`, it provides debugging output in logs.

    Please note that Replicate's mapping of these parameters can be inconsistent across different models, indicating that not all of these parameters may be available for use with all models.
    """
    system_prompt: Optional[str]=None
    max_new_tokens: Optional[int]=None
    min_new_tokens: Optional[int]=None
    temperature: Optional[int]=None
    top_p: Optional[int]=None
    top_k: Optional[int]=None
    stop_sequences: Optional[str]=None
    seed: Optional[int]=None
    debug: Optional[bool]=None

    def __init__(self,
                 system_prompt: Optional[str]=None, max_new_tokens: Optional[int]=None,
                 min_new_tokens: Optional[int]=None, temperature: Optional[int]=None,
                 top_p: Optional[int]=None, top_k: Optional[int]=None,
                 stop_sequences: Optional[str]=None, seed: Optional[int]=None,
                 debug: Optional[bool]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}


# Function to start a prediction and get the prediction URL
def start_prediction(version_id, input_data, api_token, logging_obj):
    base_url = "https://api.replicate.com/v1"

@@ -110,6 +170,13 @@ def completion(
):
    # Start a prediction and get the prediction URL
    version_id = model_to_version_id(model)

    ## Load Config
    config = litellm.ReplicateConfig.get_config()
    for k, v in config.items():
        if k not in optional_params:  # completion(top_k=3) > replicate_config(top_k=3) <- allows for dynamic variables to be passed in
            optional_params[k] = v

    if "meta/llama-2-13b-chat" in model:
        system_prompt = ""
        prompt = ""

litellm/llms/sagemaker.py

@@ -1,9 +1,10 @@
import os
import json
import os, types
from enum import Enum
import json
import requests
import time
from typing import Callable
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, get_secret
import sys
from copy import deepcopy

@@ -16,6 +17,32 @@ class SagemakerError(Exception):
            self.message
        )  # Call the base class constructor with the parameters it needs

class SagemakerConfig():
    """
    Reference: https://d-uuwbxj1u4cnu.studio.us-west-2.sagemaker.aws/jupyter/default/lab/workspaces/auto-q/tree/DemoNotebooks/meta-textgeneration-llama-2-7b-SDK_1.ipynb
    """
    max_new_tokens: Optional[int]=None
    top_p: Optional[float]=None
    temperature: Optional[float]=None
    return_full_text: Optional[bool]=None

    def __init__(self,
                 max_new_tokens: Optional[int]=None, top_p: Optional[float]=None,
                 temperature: Optional[float]=None, return_full_text: Optional[bool]=None) -> None:
        locals_ = locals()
        for key, value in locals_.items():
            if key != 'self' and value is not None:
                setattr(self.__class__, key, value)

    @classmethod
    def get_config(cls):
        return {k: v for k, v in cls.__dict__.items()
                if not k.startswith('__')
                and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
                and v is not None}

"""
SAGEMAKER AUTH Keys/Vars
os.environ['AWS_ACCESS_KEY_ID'] = ""

@@ -47,6 +74,16 @@ def completion(
        region_name=region_name
    )

    # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
    inference_params = deepcopy(optional_params)
    inference_params.pop("stream", None)

    ## Load Config
    config = litellm.SagemakerConfig.get_config()
    for k, v in config.items():
        if k not in inference_params:  # completion(top_k=3) > sagemaker_config(top_k=3) <- allows for dynamic variables to be passed in
            inference_params[k] = v

    model = model
    prompt = ""
    for message in messages:

@@ -61,9 +98,7 @@ def completion(
            )
        else:
            prompt += f"{message['content']}"
    # pop streaming if it's in the optional params as 'stream' raises an error with sagemaker
    inference_params = deepcopy(optional_params)
    inference_params.pop("stream", None)

    data = {
        "inputs": prompt,
        "parameters": inference_params
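
A small sketch of the relocated stream handling above, using only standard-library calls: `stream` is stripped before the SageMaker config is merged, so it never reaches the invoke payload.

from copy import deepcopy

optional_params = {"stream": True, "temperature": 0.4}
inference_params = deepcopy(optional_params)
inference_params.pop("stream", None)       # SageMaker rejects a 'stream' key

# config defaults are merged into inference_params after this point
print(inference_params)                    # {'temperature': 0.4}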
@ -1,9 +1,10 @@
|
|||
import os
|
||||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time
|
||||
from typing import Callable
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
@ -15,6 +16,55 @@ class TogetherAIError(Exception):
|
|||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
class TogetherAIConfig():
|
||||
"""
|
||||
Reference: https://docs.together.ai/reference/inference
|
||||
|
||||
The class `TogetherAIConfig` provides configuration for the TogetherAI's API interface. Here are the parameters:
|
||||
|
||||
- `max_tokens` (int32, required): The maximum number of tokens to generate.
|
||||
|
||||
- `stop` (string, optional): A string sequence that will truncate (stop) the inference text output. For example, "\n\n" will stop generation as soon as the model generates two newlines.
|
||||
|
||||
- `temperature` (float, optional): A decimal number that determines the degree of randomness in the response. A value of 1 will always yield the same output. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value greater than 1 introduces more randomness in the output.
|
||||
|
||||
- `top_p` (float, optional): The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text.
|
||||
|
||||
- `top_k` (int32, optional): The `top_k` parameter is used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options.
|
||||
|
||||
- `repetition_penalty` (float, optional): A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition.
|
||||
|
||||
- `logprobs` (int32, optional): The number of top token log probabilities to return for each generated token, where supported.
|
||||
"""
|
||||
max_tokens: Optional[int]=None
|
||||
stop: Optional[str]=None
|
||||
temperature: Optional[float]=None
|
||||
top_p: Optional[float]=None
|
||||
top_k: Optional[int]=None
|
||||
repetition_penalty: Optional[float]=None
|
||||
logprobs: Optional[int]=None
|
||||
|
||||
def __init__(self,
|
||||
max_tokens: Optional[int]=None,
|
||||
stop: Optional[str]=None,
|
||||
temperature: Optional[float]=None,
|
||||
top_p: Optional[float]=None,
|
||||
top_k: Optional[int]=None,
|
||||
repetition_penalty: Optional[float]=None,
|
||||
logprobs: Optional[int]=None) -> None:
|
||||
locals_ = locals()
|
||||
for key, value in locals_.items():
|
||||
if key != 'self' and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {k: v for k, v in cls.__dict__.items()
|
||||
if not k.startswith('__')
|
||||
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
|
||||
and v is not None}
|
||||
|
||||
|
||||
def validate_environment(api_key):
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
|
@ -41,6 +91,13 @@ def completion(
|
|||
logger_fn=None,
|
||||
):
|
||||
headers = validate_environment(api_key)
|
||||
|
||||
## Load Config
|
||||
config = litellm.TogetherAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
|
@ -52,6 +109,7 @@ def completion(
|
|||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
|
||||
data = {
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
|
|
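Tying the `TogetherAIConfig` docstring above to the `## Load Config` block: a hedged usage sketch, assuming a TogetherAI API key is configured in the environment (the model name is the one used by the TogetherAI test later in this diff):

import litellm
from litellm import completion

# provider-wide defaults, set once; any kwarg passed to completion() overrides them
litellm.TogetherAIConfig(max_tokens=256, temperature=0.7)

response = completion(
    model="together_ai/togethercomputer/llama-2-70b-chat",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)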
153
litellm/llms/vertex_ai.py
Normal file
|
@ -0,0 +1,153 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse
|
||||
import litellm
|
||||
|
||||
class VertexAIError(Exception):
|
||||
def __init__(self, status_code, message):
|
||||
self.status_code = status_code
|
||||
self.message = message
|
||||
super().__init__(
|
||||
self.message
|
||||
) # Call the base class constructor with the parameters it needs
|
||||
|
||||
class VertexAIConfig():
|
||||
"""
|
||||
Reference: https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts
|
||||
|
||||
The class `VertexAIConfig` provides configuration for VertexAI's API interface. Below are the parameters:
|
||||
|
||||
- `temperature` (float): This controls the degree of randomness in token selection.
|
||||
|
||||
- `max_output_tokens` (integer): This sets the maximum number of tokens in the text output. The default value is 256.
|
||||
|
||||
- `top_p` (float): The tokens are selected from the most probable to the least probable until the sum of their probabilities equals the `top_p` value. Default is 0.95.
|
||||
|
||||
- `top_k` (integer): The value of `top_k` determines how many of the most probable tokens are considered in the selection. For example, a `top_k` of 1 means the selected token is the most probable among all tokens. The default value is 40.
|
||||
|
||||
Note: Please make sure to modify the default parameters as required for your use case.
|
||||
"""
|
||||
temperature: Optional[float]=None
|
||||
max_output_tokens: Optional[int]=None
|
||||
top_p: Optional[float]=None
|
||||
top_k: Optional[int]=None
|
||||
|
||||
def __init__(self,
|
||||
temperature: Optional[float]=None,
|
||||
max_output_tokens: Optional[int]=None,
|
||||
top_p: Optional[float]=None,
|
||||
top_k: Optional[int]=None) -> None:
|
||||
|
||||
locals_ = locals()
|
||||
for key, value in locals_.items():
|
||||
if key != 'self' and value is not None:
|
||||
setattr(self.__class__, key, value)
|
||||
|
||||
@classmethod
|
||||
def get_config(cls):
|
||||
return {k: v for k, v in cls.__dict__.items()
|
||||
if not k.startswith('__')
|
||||
and not isinstance(v, (types.FunctionType, types.BuiltinFunctionType, classmethod, staticmethod))
|
||||
and v is not None}
|
||||
|
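A small sketch of how the `get_config` filter above behaves once some class-level values are set (the values are illustrative):

import litellm

litellm.VertexAIConfig(temperature=0.2, top_k=20)
print(litellm.VertexAIConfig.get_config())
# expected, given the filtering above: {'temperature': 0.2, 'top_k': 20}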
||||
def completion(
|
||||
model: str,
|
||||
messages: list,
|
||||
model_response: ModelResponse,
|
||||
print_verbose: Callable,
|
||||
encoding,
|
||||
logging_obj,
|
||||
vertex_project=None,
|
||||
vertex_location=None,
|
||||
optional_params=None,
|
||||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
):
|
||||
try:
|
||||
import vertexai
|
||||
except:
|
||||
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
|
||||
from vertexai.preview.language_models import ChatModel, CodeChatModel, InputOutputTextPair
|
||||
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
|
||||
|
||||
vertexai.init(
|
||||
project=vertex_project, location=vertex_location
|
||||
)
|
||||
|
||||
## Load Config
|
||||
config = litellm.VertexAIConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params:
|
||||
optional_params[k] = v
|
||||
|
||||
# vertexai does not use an API key, it looks for credentials.json in the environment
|
||||
|
||||
prompt = " ".join([message["content"] for message in messages])
|
||||
|
||||
mode = ""
|
||||
if model in litellm.vertex_chat_models:
|
||||
chat_model = ChatModel.from_pretrained(model)
|
||||
mode = "chat"
|
||||
elif model in litellm.vertex_text_models:
|
||||
text_model = TextGenerationModel.from_pretrained(model)
|
||||
mode = "text"
|
||||
elif model in litellm.vertex_code_text_models:
|
||||
text_model = CodeGenerationModel.from_pretrained(model)
|
||||
mode = "text"
|
||||
else: # vertex_code_chat_models
|
||||
chat_model = CodeChatModel.from_pretrained(model)
|
||||
mode = "chat"
|
||||
|
||||
if mode == "chat":
|
||||
chat = chat_model.start_chat()
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params})
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
model_response = chat.send_message_streaming(prompt, **optional_params)
|
||||
return model_response
|
||||
|
||||
completion_response = chat.send_message(prompt, **optional_params)
|
||||
elif mode == "text":
|
||||
## LOGGING
|
||||
logging_obj.pre_call(input=prompt, api_key=None)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
model_response = text_model.predict_streaming(prompt, **optional_params)
|
||||
return model_response
|
||||
|
||||
completion_response = text_model.predict(prompt, **optional_params)
|
||||
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=prompt, api_key=None, original_response=completion_response
|
||||
)
|
||||
|
||||
## RESPONSE OBJECT
|
||||
model_response["choices"][0]["message"]["content"] = str(completion_response)
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"]["content"])
|
||||
)
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
return model_response
|
||||
|
||||
|
||||
def embedding():
|
||||
# logic for parsing in - calling - parsing out model embedding calls
|
||||
pass
|
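For reference, a hedged end-to-end sketch of calling this module through `litellm.completion`; the project id and location are placeholders, and Vertex auth relies on Google application-default credentials rather than an API key, as noted in the code above:

import litellm
from litellm import completion

litellm.vertex_project = "my-gcp-project"  # placeholder GCP project id
litellm.vertex_location = "us-central1"    # placeholder region

litellm.VertexAIConfig(max_output_tokens=256)  # class-level default picked up by ## Load Config

response = completion(
    model="chat-bison",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)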
151
litellm/main.py
|
@ -45,7 +45,8 @@ from .llms import (
|
|||
cohere,
|
||||
petals,
|
||||
oobabooga,
|
||||
palm)
|
||||
palm,
|
||||
vertex_ai)
|
||||
from .llms.prompt_templates.factory import prompt_factory, custom_prompt
|
||||
import tiktoken
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
@ -810,134 +811,32 @@ def completion(
|
|||
)
|
||||
return response
|
||||
response = model_response
|
||||
elif model in litellm.vertex_chat_models or model in litellm.vertex_code_chat_models:
|
||||
try:
|
||||
import vertexai
|
||||
except:
|
||||
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
|
||||
from vertexai.preview.language_models import ChatModel, CodeChatModel, InputOutputTextPair
|
||||
elif model in litellm.vertex_chat_models or model in litellm.vertex_code_chat_models or model in litellm.vertex_text_models or model in litellm.vertex_code_text_models:
|
||||
vertex_ai_project = (litellm.vertex_project
|
||||
or get_secret("VERTEXAI_PROJECT"))
|
||||
vertex_ai_location = (litellm.vertex_location
|
||||
or get_secret("VERTEXAI_LOCATION"))
|
||||
|
||||
vertex_project = (litellm.vertex_project or get_secret("VERTEXAI_PROJECT"))
|
||||
vertex_location = (litellm.vertex_location or get_secret("VERTEXAI_LOCATION"))
|
||||
vertexai.init(
|
||||
project=vertex_project, location=vertex_location
|
||||
# palm does not support streaming as yet :(
|
||||
model_response = vertex_ai.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
optional_params=optional_params,
|
||||
litellm_params=litellm_params,
|
||||
logger_fn=logger_fn,
|
||||
encoding=encoding,
|
||||
vertex_location=vertex_ai_location,
|
||||
vertex_project=vertex_ai_project,
|
||||
logging_obj=logging
|
||||
)
|
||||
# vertexai does not use an API key, it looks for credentials.json in the environment
|
||||
|
||||
prompt = " ".join([message["content"] for message in messages])
|
||||
# contains any default values we need to pass to the provider
|
||||
VertexAIConfig = {
|
||||
"top_k": 40 # override by setting kwarg in completion() - e.g. completion(..., top_k=20)
|
||||
}
|
||||
if model in litellm.vertex_chat_models:
|
||||
chat_model = ChatModel.from_pretrained(model)
|
||||
else: # vertex_code_chat_models
|
||||
chat_model = CodeChatModel.from_pretrained(model)
|
||||
|
||||
chat = chat_model.start_chat()
|
||||
|
||||
## Load Config
|
||||
for k, v in VertexAIConfig.items():
|
||||
if k not in optional_params:
|
||||
optional_params[k] = v
|
||||
|
||||
## LOGGING
|
||||
logging.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params})
|
||||
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
model_response = chat.send_message_streaming(prompt, **optional_params)
|
||||
response = CustomStreamWrapper(
|
||||
model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
|
||||
)
|
||||
)
|
||||
return response
|
||||
|
||||
completion_response = chat.send_message(prompt, **optional_params)
|
||||
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=prompt, api_key=None, original_response=completion_response
|
||||
)
|
||||
|
||||
## RESPONSE OBJECT
|
||||
model_response["choices"][0]["message"]["content"] = str(completion_response)
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"]["content"])
|
||||
)
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
response = model_response
|
||||
elif model in litellm.vertex_text_models or model in litellm.vertex_code_text_models:
|
||||
try:
|
||||
import vertexai
|
||||
except:
|
||||
raise Exception("vertexai import failed please run `pip install google-cloud-aiplatform`")
|
||||
from vertexai.language_models import TextGenerationModel, CodeGenerationModel
|
||||
|
||||
vertexai.init(
|
||||
project=litellm.vertex_project, location=litellm.vertex_location
|
||||
)
|
||||
# vertexai does not use an API key, it looks for credentials.json in the environment
|
||||
|
||||
# contains any default values we need to pass to the provider
|
||||
VertexAIConfig = {
|
||||
"top_k": 40 # override by setting kwarg in completion() - e.g. completion(..., top_k=20)
|
||||
}
|
||||
|
||||
prompt = " ".join([message["content"] for message in messages])
|
||||
|
||||
if model in litellm.vertex_text_models:
|
||||
vertex_model = TextGenerationModel.from_pretrained(model)
|
||||
else:
|
||||
vertex_model = CodeGenerationModel.from_pretrained(model)
|
||||
|
||||
## Load Config
|
||||
for k, v in VertexAIConfig.items():
|
||||
if k not in optional_params:
|
||||
optional_params[k] = v
|
||||
|
||||
## LOGGING
|
||||
logging.pre_call(input=prompt, api_key=None)
|
||||
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
model_response = vertex_model.predict_streaming(prompt, **optional_params)
|
||||
response = CustomStreamWrapper(
|
||||
model_response, model, custom_llm_provider="vertexai", logging_obj=logging
|
||||
)
|
||||
return response
|
||||
|
||||
completion_response = vertex_model.predict(prompt, **optional_params)
|
||||
|
||||
## LOGGING
|
||||
logging.post_call(
|
||||
input=prompt, api_key=None, original_response=completion_response
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
model_response["choices"][0]["message"]["content"] = str(completion_response)
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = len(
|
||||
encoding.encode(prompt)
|
||||
)
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"]["content"])
|
||||
)
|
||||
|
||||
model_response["usage"] = {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens,
|
||||
}
|
||||
response = model_response
|
||||
elif model in litellm.ai21_models:
|
||||
custom_llm_provider = "ai21"
|
||||
|
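Since the refactor above routes Vertex AI streaming through `CustomStreamWrapper`, a hedged sketch of consuming such a stream (same placeholder project setup as before):

from litellm import completion

response = completion(
    model="chat-bison",
    messages=[{"role": "user", "content": "Write a short poem."}],
    stream=True,
)
for chunk in response:
    print(chunk)  # chunks are yielded incrementally by CustomStreamWrapper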
@ -1122,10 +1021,16 @@ def completion(
|
|||
custom_llm_provider == "petals"
|
||||
or model in litellm.petals_models
|
||||
):
|
||||
api_base = (
|
||||
litellm.api_base or
|
||||
api_base
|
||||
)
|
||||
|
||||
custom_llm_provider = "petals"
|
||||
model_response = petals.completion(
|
||||
model=model,
|
||||
messages=messages,
|
||||
api_base=api_base,
|
||||
model_response=model_response,
|
||||
print_verbose=print_verbose,
|
||||
optional_params=optional_params,
|
||||
|
|
|
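A hedged sketch of a Petals call exercising the `api_base` resolution above; the model name and public endpoint are taken from the commented Petals test in test_provider_specific_config.py below:

from litellm import completion

response = completion(
    model="petals/petals-team/StableBeluga2",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="https://chat.petals.dev/api/v1/generate",
    max_tokens=10,
)
print(response.choices[0].message.content)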
@ -37,6 +37,8 @@ def test_completion_custom_provider_model_name():
|
|||
|
||||
|
||||
def test_completion_claude():
|
||||
litellm.set_verbose = True
|
||||
litellm.anthropic_config(max_tokens_to_sample=200, metadata={"user_id": "1224"})
|
||||
try:
|
||||
# test without max tokens
|
||||
response = completion(
|
||||
|
@ -48,7 +50,7 @@ def test_completion_claude():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
test_completion_claude()
|
||||
# test_completion_claude()
|
||||
|
||||
def test_completion_claude_max_tokens():
|
||||
try:
|
||||
|
@ -198,6 +200,8 @@ def test_get_hf_task_for_model():
|
|||
# # TGI model
|
||||
# # this is a TGI model https://huggingface.co/glaiveai/glaive-coder-7b
|
||||
# def hf_test_completion_tgi():
|
||||
# litellm.huggingface_config(return_full_text=True)
|
||||
# litellm.set_verbose=True
|
||||
# try:
|
||||
# response = litellm.completion(
|
||||
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
|
||||
|
@ -321,13 +325,10 @@ def test_get_hf_task_for_model():
|
|||
|
||||
def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
|
||||
try:
|
||||
litellm.CohereConfig(max_tokens=1000, stop_sequences=["a"])
|
||||
response = completion(
|
||||
model="command-nightly",
|
||||
messages=messages,
|
||||
max_tokens=100,
|
||||
n=1,
|
||||
logit_bias={40: 10},
|
||||
stop=["a"],
|
||||
logger_fn=logger_fn
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
|
@ -665,13 +666,12 @@ def test_completion_azure_deployment_id():
|
|||
# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
|
||||
|
||||
# def test_completion_replicate_llama_2():
|
||||
# model_name = "replicate/llama-2-70b-chat:2796ee9483c3fd7aa2e171d38f4ca12251a30609463dcfd4cd76703f22e96cdf"
|
||||
# model_name = "replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
|
||||
# litellm.replicate_config(max_new_tokens=200)
|
||||
# try:
|
||||
# response = completion(
|
||||
# model=model_name,
|
||||
# messages=messages,
|
||||
# max_tokens=20,
|
||||
# custom_llm_provider="replicate"
|
||||
# )
|
||||
# print(response)
|
||||
# cost = completion_cost(completion_response=response)
|
||||
|
@ -1027,7 +1027,7 @@ def test_completion_ai21():
|
|||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
test_completion_ai21()
|
||||
# test_completion_ai21()
|
||||
## test deep infra
|
||||
def test_completion_deep_infra():
|
||||
# litellm.set_verbose = True
|
||||
|
|
395
litellm/tests/test_provider_specific_config.py
Normal file
|
@ -0,0 +1,395 @@
|
|||
#### What this tests ####
|
||||
# This tests setting provider specific configs across providers
|
||||
# There are 2 types of tests - changing config dynamically or by setting class variables
|
||||
|
||||
import sys, os
|
||||
import traceback
|
||||
import pytest
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../..")
|
||||
) # Adds the parent directory to the system path
|
||||
import litellm
|
||||
from litellm import completion
|
||||
|
||||
# Huggingface - Expensive to deploy models and keep them running. Maybe we can try doing this via baseten??
|
||||
# def hf_test_completion_tgi():
|
||||
# litellm.HuggingfaceConfig(max_new_tokens=200)
|
||||
# litellm.set_verbose=True
|
||||
# try:
|
||||
# # OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
# response_1 = litellm.completion(
|
||||
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
|
||||
# messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud",
|
||||
# max_tokens=10
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response_1)
|
||||
# response_1_text = response_1.choices[0].message.content
|
||||
|
||||
# # USE CONFIG TOKENS
|
||||
# response_2 = litellm.completion(
|
||||
# model="huggingface/mistralai/Mistral-7B-Instruct-v0.1",
|
||||
# messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
# api_base="https://n9ox93a8sv5ihsow.us-east-1.aws.endpoints.huggingface.cloud",
|
||||
# )
|
||||
# # Add any assertions here to check the response
|
||||
# print(response_2)
|
||||
# response_2_text = response_2.choices[0].message.content
|
||||
|
||||
# assert len(response_2_text) > len(response_1_text)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
# hf_test_completion_tgi()
|
||||
|
||||
#Anthropic
|
||||
|
||||
def claude_test_completion():
|
||||
litellm.AnthropicConfig(max_tokens_to_sample=200)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="claude-instant-1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
max_tokens=10
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response_1)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="claude-instant-1",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response_2)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
|
||||
assert len(response_2_text) > len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# claude_test_completion()
|
||||
|
||||
# Replicate
|
||||
|
||||
def replicate_test_completion():
|
||||
litellm.ReplicateConfig(max_new_tokens=200)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
max_tokens=10
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response_1)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
)
|
||||
# Add any assertions here to check the response
|
||||
print(response_2)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
|
||||
assert len(response_2_text) > len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# replicate_test_completion()
|
||||
|
||||
# Cohere
|
||||
|
||||
def cohere_test_completion():
|
||||
litellm.CohereConfig(max_tokens=200)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="command-nightly",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
max_tokens=10
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="command-nightly",
|
||||
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
|
||||
assert len(response_2_text) > len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# cohere_test_completion()
|
||||
|
||||
# AI21
|
||||
|
||||
def ai21_test_completion():
|
||||
litellm.AI21Config(maxTokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="j2-mid",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="j2-mid",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# ai21_test_completion()
|
||||
|
||||
# TogetherAI
|
||||
|
||||
def togetherai_test_completion():
|
||||
litellm.TogetherAIConfig(max_tokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="together_ai/togethercomputer/llama-2-70b-chat",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="together_ai/togethercomputer/llama-2-70b-chat",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# togetherai_test_completion()
|
||||
|
||||
# Palm
|
||||
|
||||
def palm_test_completion():
|
||||
litellm.PalmConfig(maxOutputTokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="palm/chat-bison",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="palm/chat-bison",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# palm_test_completion()
|
||||
|
||||
# NLP Cloud
|
||||
|
||||
def nlp_cloud_test_completion():
|
||||
litellm.NLPCloudConfig(max_length=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="dolphin",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="dolphin",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# nlp_cloud_test_completion()
|
||||
|
||||
# AlephAlpha
|
||||
|
||||
def aleph_alpha_test_completion():
|
||||
litellm.AlephAlphaConfig(maximum_tokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="luminous-base",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="luminous-base",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# aleph_alpha_test_completion()
|
||||
|
||||
# Petals - calls are too slow, will cause circle ci to fail due to delay. Test locally.
|
||||
# def petals_completion():
|
||||
# litellm.PetalsConfig(max_new_tokens=10)
|
||||
# # litellm.set_verbose=True
|
||||
# try:
|
||||
# # OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
# response_1 = litellm.completion(
|
||||
# model="petals/petals-team/StableBeluga2",
|
||||
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
# api_base="https://chat.petals.dev/api/v1/generate",
|
||||
# max_tokens=100
|
||||
# )
|
||||
# response_1_text = response_1.choices[0].message.content
|
||||
# print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# # USE CONFIG TOKENS
|
||||
# response_2 = litellm.completion(
|
||||
# model="petals/petals-team/StableBeluga2",
|
||||
# api_base="https://chat.petals.dev/api/v1/generate",
|
||||
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
# )
|
||||
# response_2_text = response_2.choices[0].message.content
|
||||
# print(f"response_2_text: {response_2_text}")
|
||||
|
||||
# assert len(response_2_text) < len(response_1_text)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# petals_completion()
|
||||
|
||||
# VertexAI
|
||||
# We don't have vertex ai configured for circle ci yet -- need to figure this out.
|
||||
# def vertex_ai_test_completion():
|
||||
# litellm.VertexAIConfig(max_output_tokens=10)
|
||||
# # litellm.set_verbose=True
|
||||
# try:
|
||||
# # OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
# response_1 = litellm.completion(
|
||||
# model="chat-bison",
|
||||
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
# max_tokens=100
|
||||
# )
|
||||
# response_1_text = response_1.choices[0].message.content
|
||||
# print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# # USE CONFIG TOKENS
|
||||
# response_2 = litellm.completion(
|
||||
# model="chat-bison",
|
||||
# messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
# )
|
||||
# response_2_text = response_2.choices[0].message.content
|
||||
# print(f"response_2_text: {response_2_text}")
|
||||
|
||||
# assert len(response_2_text) < len(response_1_text)
|
||||
# except Exception as e:
|
||||
# pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# vertex_ai_test_completion()
|
||||
|
||||
# Sagemaker
|
||||
|
||||
def sagemaker_test_completion():
|
||||
litellm.SagemakerConfig(max_new_tokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# sagemaker_test_completion()
|
||||
|
||||
# Bedrock
|
||||
|
||||
|
||||
def bedrock_test_completion():
|
||||
litellm.CohereConfig(max_tokens=10)
|
||||
# litellm.set_verbose=True
|
||||
try:
|
||||
# OVERRIDE WITH DYNAMIC MAX TOKENS
|
||||
response_1 = litellm.completion(
|
||||
model="bedrock/cohere.command-text-v14",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
max_tokens=100
|
||||
)
|
||||
response_1_text = response_1.choices[0].message.content
|
||||
print(f"response_1_text: {response_1_text}")
|
||||
|
||||
# USE CONFIG TOKENS
|
||||
response_2 = litellm.completion(
|
||||
model="bedrock/cohere.command-text-v14",
|
||||
messages=[{ "content": "Hello, how are you? Be as verbose as possible","role": "user"}],
|
||||
)
|
||||
response_2_text = response_2.choices[0].message.content
|
||||
print(f"response_2_text: {response_2_text}")
|
||||
|
||||
assert len(response_2_text) < len(response_1_text)
|
||||
except Exception as e:
|
||||
pytest.fail(f"Error occurred: {e}")
|
||||
|
||||
# bedrock_test_completion()
|
|
@ -1202,8 +1202,6 @@ def get_optional_params( # use the openai defaults
|
|||
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
||||
if max_tokens:
|
||||
optional_params["max_tokens_to_sample"] = max_tokens
|
||||
else:
|
||||
optional_params["max_tokens_to_sample"] = 256 # anthropic fails without max_tokens_to_sample
|
||||
if temperature:
|
||||
optional_params["temperature"] = temperature
|
||||
if top_p:
|
||||
|
@ -1226,6 +1224,28 @@ def get_optional_params( # use the openai defaults
|
|||
optional_params["topP"] = top_p
|
||||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
elif "cohere" in model: # cohere models on bedrock
|
||||
supported_params = ["stream", "temperature", "max_tokens", "logit_bias", "top_p", "frequency_penalty", "presence_penalty", "stop"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# handle cohere params
|
||||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
if temperature:
|
||||
optional_params["temperature"] = temperature
|
||||
if max_tokens:
|
||||
optional_params["max_tokens"] = max_tokens
|
||||
if n:
|
||||
optional_params["num_generations"] = n
|
||||
if logit_bias != {}:
|
||||
optional_params["logit_bias"] = logit_bias
|
||||
if top_p:
|
||||
optional_params["p"] = top_p
|
||||
if frequency_penalty:
|
||||
optional_params["frequency_penalty"] = frequency_penalty
|
||||
if presence_penalty:
|
||||
optional_params["presence_penalty"] = presence_penalty
|
||||
if stop:
|
||||
optional_params["stop_sequences"] = stop
|
||||
elif model in litellm.aleph_alpha_models:
|
||||
supported_params = ["max_tokens", "stream", "top_p", "temperature", "presence_penalty", "frequency_penalty", "n", "stop"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
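To make the Cohere-on-Bedrock mapping above concrete, a hedged sketch of an OpenAI-style call whose arguments are renamed before being sent (the model id comes from the Bedrock test above; comments note the translated keys):

from litellm import completion

response = completion(
    model="bedrock/cohere.command-text-v14",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    max_tokens=100,   # stays "max_tokens"
    top_p=0.9,        # becomes "p"
    stop=["\n\n"],    # becomes "stop_sequences"
)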
@ -1312,8 +1332,12 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
|
|||
elif model in litellm.cohere_models:
|
||||
custom_llm_provider = "cohere"
|
||||
## replicate
|
||||
elif model in litellm.replicate_models:
|
||||
custom_llm_provider = "replicate"
|
||||
elif model in litellm.replicate_models or ":" in model:
|
||||
model_parts = model.split(":")
|
||||
if len(model_parts) > 1 and len(model_parts[1])==64: ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
|
||||
custom_llm_provider = "replicate"
|
||||
elif model in litellm.replicate_models:
|
||||
custom_llm_provider = "replicate"
|
||||
## openrouter
|
||||
elif model in litellm.openrouter_models:
|
||||
custom_llm_provider = "openrouter"
|
||||
|
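A hedged sketch of the new Replicate detection: a model string carrying a 64-character version hash after ":" is routed to the replicate provider (the hash is the one from the commented Replicate test above):

import litellm

result = litellm.get_llm_provider(
    "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
)
print(result)  # expected to report custom_llm_provider == "replicate"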
|