diff --git a/docs/my-website/docs/providers/cerebras.md b/docs/my-website/docs/providers/cerebras.md
new file mode 100644
index 000000000..4fabeb31c
--- /dev/null
+++ b/docs/my-website/docs/providers/cerebras.md
@@ -0,0 +1,145 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Cerebras
+https://inference-docs.cerebras.ai/api-reference/chat-completions
+
+:::tip
+
+**We support ALL Cerebras models; just set `model=cerebras/<any-cerebras-model>` as a prefix when sending litellm requests**
+
+:::
+
+## API Key
+```python
+# env variable
+os.environ['CEREBRAS_API_KEY']
+```
+
+## Sample Usage
+```python
+from litellm import completion
+import os
+
+os.environ['CEREBRAS_API_KEY'] = ""
+response = completion(
+    model="cerebras/llama3.1-70b",
+    messages=[
+        {
+            "role": "user",
+            "content": "What's the weather like in Boston today in Fahrenheit?",
+        }
+    ],
+    max_tokens=10,
+    response_format={"type": "json_object"},
+    seed=123,
+    stop=["\n\n"],
+    temperature=0.2,
+    top_p=0.9,
+    tool_choice="auto",
+    tools=[],
+    user="user",
+)
+print(response)
+```
+
+## Sample Usage - Streaming
+```python
+from litellm import completion
+import os
+
+os.environ['CEREBRAS_API_KEY'] = ""
+response = completion(
+    model="cerebras/llama3.1-70b",
+    messages=[
+        {
+            "role": "user",
+            "content": "What's the weather like in Boston today in Fahrenheit?",
+        }
+    ],
+    stream=True,
+    max_tokens=10,
+    response_format={"type": "json_object"},
+    seed=123,
+    stop=["\n\n"],
+    temperature=0.2,
+    top_p=0.9,
+    tool_choice="auto",
+    tools=[],
+    user="user",
+)
+
+for chunk in response:
+    print(chunk)
+```
+
+
+## Usage with LiteLLM Proxy Server
+
+Here's how to call a Cerebras model with the LiteLLM Proxy Server
+
+1. Modify the config.yaml
+
+   ```yaml
+   model_list:
+     - model_name: my-model
+       litellm_params:
+         model: cerebras/<your-cerebras-model>  # add cerebras/ prefix to route as Cerebras provider
+         api_key: api-key                       # your Cerebras API key
+   ```
+
+2. Start the proxy
+
+   ```bash
+   $ litellm --config /path/to/config.yaml
+   ```
+
+3. Send Request to LiteLLM Proxy Server
+
+<Tabs>
+<TabItem value="openai" label="OpenAI Python v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="sk-1234",             # pass litellm proxy key, if you're using virtual keys
+    base_url="http://0.0.0.0:4000" # litellm-proxy-base url
+)
+
+response = client.chat.completions.create(
+    model="my-model",
+    messages=[
+        {
+            "role": "user",
+            "content": "what llm are you"
+        }
+    ],
+)
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Authorization: Bearer sk-1234' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "my-model",
+    "messages": [
+        {
+            "role": "user",
+            "content": "what llm are you"
+        }
+    ]
+}'
+```
+
+</TabItem>
+</Tabs>
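The docs above pass `tool_choice` with an empty `tools` list, which only exercises the parameters. Since both Cerebras models are registered with `supports_function_calling: true` further down in this patch, a concrete tool-calling sketch may be useful; the tool schema and weather example below are illustrative assumptions, not part of the patch:

```python
import os

from litellm import completion

os.environ["CEREBRAS_API_KEY"] = ""  # set your real key here

# One illustrative tool in the OpenAI function-calling schema.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                },
                "required": ["city"],
            },
        },
    }
]

response = completion(
    model="cerebras/llama3.1-70b",
    messages=[{"role": "user", "content": "What's the weather in Boston?"}],
    tools=tools,
    tool_choice="auto",  # let the model decide whether to call the tool
)
print(response.choices[0].message.tool_calls)
```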
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js
index 59db4c363..048b04171 100644
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@@ -138,6 +138,7 @@ const sidebars = {
         "providers/watsonx",
         "providers/predibase",
         "providers/nvidia_nim",
+        "providers/cerebras",
         "providers/volcano",
         "providers/triton-inference-server",
         "providers/ollama",
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 0436e039c..2e7914fab 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -452,6 +452,7 @@ openai_compatible_providers: List = [
     "mistral",
     "groq",
     "nvidia_nim",
+    "cerebras",
     "volcengine",
     "codestral",
     "deepseek",
@@ -690,6 +691,7 @@ provider_list: List = [
     "mistral",
     "groq",
     "nvidia_nim",
+    "cerebras",
     "volcengine",
     "codestral",
     "text-completion-codestral",
@@ -905,6 +907,7 @@ from .llms.openai import (
     AzureAIStudioConfig,
 )
 from .llms.nvidia_nim import NvidiaNimConfig
+from .llms.cerebras.chat import CerebrasConfig
 from .llms.fireworks_ai import FireworksAIConfig
 from .llms.volcengine import VolcEngineConfig
 from .llms.text_completion_codestral import MistralTextCompletionConfig
diff --git a/litellm/llms/cerebras/chat.py b/litellm/llms/cerebras/chat.py
new file mode 100644
index 000000000..13b8f0ee9
--- /dev/null
+++ b/litellm/llms/cerebras/chat.py
@@ -0,0 +1,91 @@
+"""
+Cerebras Chat Completions API
+
+This API is OpenAI-compatible, so no request/response translation is needed.
+"""
+
+import types
+from typing import Optional, Union
+
+
+class CerebrasConfig:
+    """
+    Reference: https://inference-docs.cerebras.ai/api-reference/chat-completions
+
+    Below are the supported parameters:
+    """
+
+    max_tokens: Optional[int] = None
+    response_format: Optional[dict] = None
+    seed: Optional[int] = None
+    stop: Optional[Union[str, list]] = None
+    stream: Optional[bool] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    tool_choice: Optional[str] = None
+    tools: Optional[list] = None
+    user: Optional[str] = None
+
+    def __init__(
+        self,
+        max_tokens: Optional[int] = None,
+        response_format: Optional[dict] = None,
+        seed: Optional[int] = None,
+        stop: Optional[Union[str, list]] = None,
+        stream: Optional[bool] = None,
+        temperature: Optional[float] = None,
+        top_p: Optional[float] = None,
+        tool_choice: Optional[str] = None,
+        tools: Optional[list] = None,
+        user: Optional[str] = None,
+    ) -> None:
+        locals_ = locals().copy()
+        for key, value in locals_.items():
+            if key != "self" and value is not None:
+                setattr(self.__class__, key, value)
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+    def get_supported_openai_params(self, model: str) -> list:
+        """
+        Get the supported OpenAI params for the given model.
+        """
+        return [
+            "max_tokens",
+            "response_format",
+            "seed",
+            "stop",
+            "stream",
+            "temperature",
+            "top_p",
+            "tool_choice",
+            "tools",
+            "user",
+        ]
+
+    def map_openai_params(
+        self, model: str, non_default_params: dict, optional_params: dict
+    ) -> dict:
+        supported_openai_params = self.get_supported_openai_params(model=model)
+        for param, value in non_default_params.items():
+            if param in supported_openai_params:
+                optional_params[param] = value
+        return optional_params
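Because Cerebras is OpenAI-compatible, `CerebrasConfig` only whitelists parameters; it never rewrites them. A quick sketch of what `map_openai_params` does with a mixed parameter dict (this relies only on the class added above, which the `litellm/__init__.py` change exports; `logit_bias` is just an example of an unsupported param):

```python
from litellm import CerebrasConfig

config = CerebrasConfig()

# One supported param, one unsupported param.
non_default_params = {"temperature": 0.2, "logit_bias": {"50256": -100}}

optional_params = config.map_openai_params(
    model="cerebras/llama3.1-8b",
    non_default_params=non_default_params,
    optional_params={},
)

# Only the whitelisted key survives.
print(optional_params)  # {'temperature': 0.2}
```

Note that in the `get_optional_params` path added in `utils.py` below, `_check_valid_arg` runs before this mapping and raises on unsupported params unless parameter dropping is enabled; the filter above is the last step, not the only guard.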
diff --git a/litellm/main.py b/litellm/main.py
index f9ef4a419..7f1431073 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -384,6 +384,7 @@ async def acompletion(
         or custom_llm_provider == "perplexity"
         or custom_llm_provider == "groq"
         or custom_llm_provider == "nvidia_nim"
+        or custom_llm_provider == "cerebras"
         or custom_llm_provider == "volcengine"
         or custom_llm_provider == "codestral"
         or custom_llm_provider == "text-completion-codestral"
@@ -1289,6 +1290,7 @@ def completion(
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "groq"
             or custom_llm_provider == "nvidia_nim"
+            or custom_llm_provider == "cerebras"
             or custom_llm_provider == "volcengine"
             or custom_llm_provider == "codestral"
             or custom_llm_provider == "deepseek"
@@ -3138,6 +3140,7 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse:
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "groq"
             or custom_llm_provider == "nvidia_nim"
+            or custom_llm_provider == "cerebras"
             or custom_llm_provider == "volcengine"
             or custom_llm_provider == "deepseek"
             or custom_llm_provider == "fireworks_ai"
@@ -3789,6 +3792,7 @@ async def atext_completion(
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "groq"
             or custom_llm_provider == "nvidia_nim"
+            or custom_llm_provider == "cerebras"
             or custom_llm_provider == "volcengine"
             or custom_llm_provider == "text-completion-codestral"
             or custom_llm_provider == "deepseek"
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index a60743c65..daf2c502a 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -1273,6 +1273,26 @@
         "mode": "chat",
         "supports_function_calling": true
     },
+    "cerebras/llama3.1-8b": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000001,
+        "litellm_provider": "cerebras",
+        "mode": "chat",
+        "supports_function_calling": true
+    },
+    "cerebras/llama3.1-70b": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.0000006,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "cerebras",
+        "mode": "chat",
+        "supports_function_calling": true
+    },
     "friendliai/mixtral-8x7b-instruct-v0-1": {
         "max_tokens": 32768,
         "max_input_tokens": 32768,
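At the registered rates, both models bill input and output tokens at the same price: $0.10 per million tokens for `llama3.1-8b` and $0.60 per million for `llama3.1-70b`. Cost accounting is therefore a straight multiplication; a sketch of the arithmetic these registry entries imply (LiteLLM's `completion_cost` helper normally derives this from the response object, so this is for illustration only):

```python
# Per-token prices copied from the registry entries above (llama3.1-70b).
INPUT_COST_PER_TOKEN = 0.0000006
OUTPUT_COST_PER_TOKEN = 0.0000006

prompt_tokens = 1_000
completion_tokens = 500

cost = (
    prompt_tokens * INPUT_COST_PER_TOKEN
    + completion_tokens * OUTPUT_COST_PER_TOKEN
)
print(f"${cost:.6f}")  # $0.000900 for 1,500 total tokens
```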
custom_llm_provider == "cerebras": + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.CerebrasConfig().map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + ) elif custom_llm_provider == "fireworks_ai": supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider @@ -4247,6 +4258,8 @@ def get_supported_openai_params( return litellm.FireworksAIConfig().get_supported_openai_params() elif custom_llm_provider == "nvidia_nim": return litellm.NvidiaNimConfig().get_supported_openai_params(model=model) + elif custom_llm_provider == "cerebras": + return litellm.CerebrasConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "volcengine": return litellm.VolcEngineConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "groq": @@ -4674,6 +4687,13 @@ def get_llm_provider( or "https://integrate.api.nvidia.com/v1" ) # type: ignore dynamic_api_key = api_key or get_secret("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + api_base = ( + api_base + or get_secret("CEREBRAS_API_BASE") + or "https://api.cerebras.ai/v1" + ) # type: ignore + dynamic_api_key = api_key or get_secret("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": # volcengine is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.endpoints.anyscale.com/v1 api_base = ( @@ -4824,6 +4844,9 @@ def get_llm_provider( elif endpoint == "https://integrate.api.nvidia.com/v1": custom_llm_provider = "nvidia_nim" dynamic_api_key = get_secret("NVIDIA_NIM_API_KEY") + elif endpoint == "https://api.cerebras.ai/v1": + custom_llm_provider = "cerebras" + dynamic_api_key = get_secret("CEREBRAS_API_KEY") elif endpoint == "https://codestral.mistral.ai/v1": custom_llm_provider = "codestral" dynamic_api_key = get_secret("CODESTRAL_API_KEY") @@ -5749,6 +5772,11 @@ def validate_environment( keys_in_environment = True else: missing_keys.append("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + if "CEREBRAS_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": if "VOLCENGINE_API_KEY" in os.environ: keys_in_environment = True diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index a60743c65..daf2c502a 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1273,6 +1273,26 @@ "mode": "chat", "supports_function_calling": true }, + "cerebras/llama3.1-8b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0000001, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, + "cerebras/llama3.1-70b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000006, + "output_cost_per_token": 0.0000006, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768,