From cb6a0f0237334b38342288bf6fc724d22e393d06 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 08:34:46 -0700 Subject: [PATCH 1/6] add cerebras config --- litellm/llms/cerebras/chat.py | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 litellm/llms/cerebras/chat.py diff --git a/litellm/llms/cerebras/chat.py b/litellm/llms/cerebras/chat.py new file mode 100644 index 000000000..13b8f0ee9 --- /dev/null +++ b/litellm/llms/cerebras/chat.py @@ -0,0 +1,91 @@ +""" +Cerebras Chat Completions API + +This API is OpenAI-compatible, so no request/response translation is needed or performed. +""" + +import types +from typing import Optional, Union + + +class CerebrasConfig: + """ + Reference: https://inference-docs.cerebras.ai/api-reference/chat-completions + + Below are the supported parameters: + """ + + max_tokens: Optional[int] = None + response_format: Optional[dict] = None + seed: Optional[int] = None + stop: Optional[str] = None + stream: Optional[bool] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + tool_choice: Optional[str] = None + tools: Optional[list] = None + user: Optional[str] = None + + def __init__( + self, + max_tokens: Optional[int] = None, + response_format: Optional[dict] = None, + seed: Optional[int] = None, + stop: Optional[str] = None, + stream: Optional[bool] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + tool_choice: Optional[str] = None, + tools: Optional[list] = None, + user: Optional[str] = None, + ) -> None: + locals_ = locals().copy() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params(self, model: str) -> list: + """ + Get the supported OpenAI params for the given model. + """ + return [ + "max_tokens", + "response_format", + "seed", + "stop", + "stream", + "temperature", + "top_p", + "tool_choice", + "tools", + "user", + ] + + def map_openai_params( + self, model: str, non_default_params: dict, optional_params: dict + ) -> dict: + """Copy the supported OpenAI params from non_default_params into optional_params; unsupported params are not copied.""" + supported_openai_params = self.get_supported_openai_params(model=model) + for param, value in non_default_params.items(): + if param in supported_openai_params: + optional_params[param] = value + return optional_params From de9efe76ca01764d84500ce0325beaf710005e73 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 08:35:23 -0700 Subject: [PATCH 2/6] add cerebras api --- litellm/__init__.py | 3 +++ litellm/main.py | 4 ++++ litellm/utils.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/litellm/__init__.py b/litellm/__init__.py index 1c3b8434f..d24003fff 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -452,6 +452,7 @@ openai_compatible_providers: List = [ "mistral", "groq", "nvidia_nim", + "cerebras", "volcengine", "codestral", "deepseek", @@ -690,6 +691,7 @@ provider_list: List = [ "mistral", "groq", "nvidia_nim", + "cerebras", "volcengine", "codestral", "text-completion-codestral", @@ -905,6 +907,7 @@ from .llms.openai import ( AzureAIStudioConfig, ) from .llms.nvidia_nim import NvidiaNimConfig +from .llms.cerebras.chat import CerebrasConfig from .llms.fireworks_ai import FireworksAIConfig from .llms.volcengine import VolcEngineConfig from
.llms.text_completion_codestral import MistralTextCompletionConfig diff --git a/litellm/main.py b/litellm/main.py index 95a106377..658a46258 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -390,6 +390,7 @@ async def acompletion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "codestral" or custom_llm_provider == "text-completion-codestral" @@ -1295,6 +1296,7 @@ def completion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "codestral" or custom_llm_provider == "deepseek" @@ -3144,6 +3146,7 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse: or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "deepseek" or custom_llm_provider == "fireworks_ai" @@ -3795,6 +3798,7 @@ async def atext_completion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "text-completion-codestral" or custom_llm_provider == "deepseek" diff --git a/litellm/utils.py b/litellm/utils.py index ec4ac79c0..faa317c1b 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2854,6 +2854,7 @@ def get_optional_params( and custom_llm_provider != "together_ai" and custom_llm_provider != "groq" and custom_llm_provider != "nvidia_nim" + and custom_llm_provider != "cerebras" and custom_llm_provider != "volcengine" and custom_llm_provider != "deepseek" and custom_llm_provider != "codestral" @@ -3613,6 +3614,16 @@ def get_optional_params( non_default_params=non_default_params, optional_params=optional_params, ) + elif custom_llm_provider == "cerebras": + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.CerebrasConfig().map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + ) elif custom_llm_provider == "fireworks_ai": supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider @@ -4238,6 +4249,8 @@ def get_supported_openai_params( return litellm.FireworksAIConfig().get_supported_openai_params() elif custom_llm_provider == "nvidia_nim": return litellm.NvidiaNimConfig().get_supported_openai_params(model=model) + elif custom_llm_provider == "cerebras": + return litellm.CerebrasConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "volcengine": return litellm.VolcEngineConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "groq": @@ -4665,6 +4678,13 @@ def get_llm_provider( or "https://integrate.api.nvidia.com/v1" ) # type: ignore dynamic_api_key = api_key or get_secret("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + api_base = ( + api_base + or get_secret("CEREBRAS_API_BASE") + or "https://api.cerebras.ai/v1" + ) # type: ignore + dynamic_api_key = api_key or get_secret("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": # volcengine is openai compatible, we just 
need to set this to custom_openai and have the api_base be https://api.endpoints.anyscale.com/v1 api_base = ( @@ -4815,6 +4835,9 @@ def get_llm_provider( elif endpoint == "https://integrate.api.nvidia.com/v1": custom_llm_provider = "nvidia_nim" dynamic_api_key = get_secret("NVIDIA_NIM_API_KEY") + elif endpoint == "https://api.cerebras.ai/v1": + custom_llm_provider = "cerebras" + dynamic_api_key = get_secret("CEREBRAS_API_KEY") elif endpoint == "https://codestral.mistral.ai/v1": custom_llm_provider = "codestral" dynamic_api_key = get_secret("CODESTRAL_API_KEY") @@ -5734,6 +5757,11 @@ def validate_environment( keys_in_environment = True else: missing_keys.append("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + if "CEREBRAS_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": if "VOLCENGINE_API_KEY" in os.environ: keys_in_environment = True From 47ef1f9191bd1ee08405f77082d17c22504a5154 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 31 Aug 2024 14:09:35 -0700 Subject: [PATCH 3/6] anthropic prompt caching cost tracking (#5453) * fix(utils.py): support 'drop_params' for embedding requests Fixes https://github.com/BerriAI/litellm/issues/5444 * feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic * feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out * fix: fix linting errors * test: mark flaky test --- litellm/__init__.py | 4 +- litellm/cost_calculator.py | 42 +++- .../llms/{anthropic.py => anthropic/chat.py} | 10 +- .../completion.py} | 8 +- litellm/llms/anthropic/cost_calculation.py | 42 ++++ litellm/llms/base.py | 13 +- .../vertex_ai_anthropic.py | 13 +- litellm/main.py | 10 +- ...odel_prices_and_context_window_backup.json | 6 + litellm/proxy/_new_secret_config.yaml | 4 +- litellm/tests/test_anthropic_completion.py | 230 ++++++++++++++---- litellm/tests/test_completion_cost.py | 70 ++++++ .../tests/test_dynamic_rate_limit_handler.py | 1 + litellm/tests/test_optional_params.py | 10 + litellm/types/utils.py | 20 +- litellm/utils.py | 27 +- model_prices_and_context_window.json | 6 + 17 files changed, 432 insertions(+), 84 deletions(-) rename litellm/llms/{anthropic.py => anthropic/chat.py} (99%) rename litellm/llms/{anthropic_text.py => anthropic/completion.py} (98%) create mode 100644 litellm/llms/anthropic/cost_calculation.py diff --git a/litellm/__init__.py b/litellm/__init__.py index d24003fff..2e7914fab 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -843,10 +843,10 @@ ALL_LITELLM_RESPONSE_TYPES = [ from .types.utils import ImageObject from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig -from .llms.anthropic import AnthropicConfig +from .llms.anthropic.chat import AnthropicConfig +from .llms.anthropic.completion import AnthropicTextConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig -from .llms.anthropic_text import AnthropicTextConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig from .llms.clarifai import ClarifaiConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 3c025055e..a0645c19a 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import ( cost_router as google_cost_router, ) from 
litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character +from litellm.llms.anthropic.cost_calculation import ( + cost_per_token as anthropic_cost_per_token, +) from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS +from litellm.types.utils import Usage from litellm.utils import ( CallTypes, CostPerToken, @@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( model: str = "", - prompt_tokens: float = 0, - completion_tokens: float = 0, + prompt_tokens: int = 0, + completion_tokens: int = 0, response_time_ms=None, custom_llm_provider: Optional[str] = None, region_name=None, ### CHARACTER PRICING ### - prompt_characters: float = 0, - completion_characters: float = 0, + prompt_characters: int = 0, + completion_characters: int = 0, + ### PROMPT CACHING PRICING ### - used for anthropic + cache_creation_input_tokens: Optional[int] = 0, + cache_read_input_tokens: Optional[int] = 0, ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, custom_cost_per_second: Optional[float] = None, @@ -108,6 +115,16 @@ def cost_per_token( """ if model is None: raise Exception("Invalid arg. Model cannot be none.") + + ## RECONSTRUCT USAGE BLOCK ## + usage_block = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + ## CUSTOM PRICING ## response_cost = _cost_per_token_custom_pricing_helper( prompt_tokens=prompt_tokens, @@ -137,6 +154,7 @@ def cost_per_token( model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) + model_without_prefix = model model_parts = model.split("/") if len(model_parts) > 1: @@ -162,6 +180,7 @@ def cost_per_token( # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models print_verbose(f"Looking up model={model} in model_cost_map") + if custom_llm_provider == "vertex_ai": cost_router = google_cost_router( model=model_without_prefix, @@ -188,6 +207,8 @@ def cost_per_token( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, ) + elif custom_llm_provider == "anthropic": + return anthropic_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, @@ -520,6 +541,8 @@ def completion_cost( prompt_characters = 0 completion_tokens = 0 completion_characters = 0 + cache_creation_input_tokens: Optional[int] = None + cache_read_input_tokens: Optional[int] = None if completion_response is not None and ( isinstance(completion_response, BaseModel) or isinstance(completion_response, dict) @@ -541,6 +564,13 @@ def completion_cost( completion_tokens = completion_response.get("usage", {}).get( "completion_tokens", 0 ) + cache_creation_input_tokens = completion_response.get("usage", {}).get( + "cache_creation_input_tokens", 0 + ) + cache_read_input_tokens = completion_response.get("usage", {}).get( + "cache_read_input_tokens", 0 + ) + total_time = getattr(completion_response, "_response_ms", 0) verbose_logger.debug( f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} " @@ -550,7 +580,7 @@ def completion_cost( ) if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( - "custom_llm_provider", 
custom_llm_provider or "" + "custom_llm_provider", custom_llm_provider or None ) region_name = completion_response._hidden_params.get( "region_name", region_name @@ -697,6 +727,8 @@ def completion_cost( custom_cost_per_token=custom_cost_per_token, prompt_characters=prompt_characters, completion_characters=completion_characters, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, call_type=call_type, ) _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic/chat.py similarity index 99% rename from litellm/llms/anthropic.py rename to litellm/llms/anthropic/chat.py index 813897c66..f62c7246e 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic/chat.py @@ -1,3 +1,7 @@ +""" +Calling + translation logic for anthropic's `/v1/messages` endpoint +""" + import copy import json import os @@ -70,8 +74,8 @@ from litellm.types.llms.openai import ( from litellm.types.utils import Choices, GenericStreamingChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): @@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM): ) except Exception as e: verbose_logger.exception( - "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( + "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( str(e), messages ) ) diff --git a/litellm/llms/anthropic_text.py b/litellm/llms/anthropic/completion.py similarity index 98% rename from litellm/llms/anthropic_text.py rename to litellm/llms/anthropic/completion.py index d20e49daf..dd2d47e53 100644 --- a/litellm/llms/anthropic_text.py +++ b/litellm/llms/anthropic/completion.py @@ -1,3 +1,7 @@ +""" +Translation logic for anthropic's `/v1/complete` endpoint +""" + import json import os import time @@ -12,8 +16,8 @@ import litellm from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): diff --git a/litellm/llms/anthropic/cost_calculation.py b/litellm/llms/anthropic/cost_calculation.py new file mode 100644 index 000000000..d1742aae9 --- /dev/null +++ b/litellm/llms/anthropic/cost_calculation.py @@ -0,0 +1,42 @@ +""" +Helper util for handling anthropic-specific cost calculation +- e.g.: prompt caching +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
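+ + For illustration, with the claude-3-5-sonnet-20240620 rates added later in this PR (input 3e-06, cache creation 3.75e-06, cache read 3e-07 USD per token): + 14 prompt tokens + 100 cache_creation_input_tokens -> prompt cost = 14 * 3e-06 + 100 * 3.75e-06 = 0.000417 USD + 14 prompt tokens + 100 cache_read_input_tokens -> prompt cost = 14 * 3e-06 + 100 * 3e-07 = 0.000072 USD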
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + ## GET MODEL INFO + model_info = get_model_info(model=model, custom_llm_provider="anthropic") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + if model_info.get("cache_creation_input_token_cost") is not None: + prompt_cost += ( + usage._cache_creation_input_tokens # type: ignore + * model_info["cache_creation_input_token_cost"] + ) + if model_info.get("cache_read_input_token_cost") is not None: + prompt_cost += ( + usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"] # type: ignore + ) + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 7e80de9ab..08c5e1992 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,14 @@ ## This is a template base class to be used for adding new LLM providers via API calls +from typing import Any, Optional, Union + +import httpx +import requests + import litellm -import httpx, requests -from typing import Optional, Union -from litellm.litellm_core_utils.litellm_logging import Logging class BaseLLM: + _client_session: Optional[httpx.Client] = None def process_response( @@ -14,7 +17,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], @@ -33,7 +36,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.TextCompletionResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index e85160a43..025b27240 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -267,18 +267,19 @@ def completion( ): try: import vertexai - from anthropic import AnthropicVertex - - from litellm.llms.anthropic import AnthropicChatCompletion - from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( - VertexLLM, - ) except: raise VertexAIError( status_code=400, message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""", ) + from anthropic import AnthropicVertex + + from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexLLM, + ) + if not ( hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") ): diff --git a/litellm/main.py b/litellm/main.py index 658a46258..7f1431073 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache from .llms import ( ai21, aleph_alpha, - anthropic_text, baseten, bedrock, clarifai, cloudflare, - gemini, - huggingface_restapi, maritalk, nlp_cloud, ollama, @@ -93,13 +90,10 @@ from .llms import ( palm, petals, replicate, - together_ai, 
- triton, vllm, - watsonx, ) -from .llms.anthropic import AnthropicChatCompletion -from .llms.anthropic_text import AnthropicTextCompletion +from .llms.anthropic.chat import AnthropicChatCompletion +from .llms.anthropic.completion import AnthropicTextCompletion from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5e6d0f2ab..a60743c65 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index b8f964ab3..b84ef7453 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,4 @@ model_list: - - model_name: "gemini/*" + - model_name: "gpt-3.5-turbo" litellm_params: - model: "gemini/*" \ No newline at end of file + model: "gpt-3.5-turbo" \ No newline at end of file diff --git a/litellm/tests/test_anthropic_completion.py b/litellm/tests/test_anthropic_completion.py index b5e01d448..b8ccf716e 100644 --- a/litellm/tests/test_anthropic_completion.py +++ b/litellm/tests/test_anthropic_completion.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv import litellm.types import litellm.types.utils -from litellm.llms.anthropic import ModelResponseIterator +from litellm.llms.anthropic.chat import ModelResponseIterator load_dotenv() import io @@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream): anthropic_chunk_list = [ - {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}}, - {"type": "content_block_delta", "index": 0, - "delta": {"type": "text_delta", "text": " your question about the weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}}, - {"type": "content_block_delta", 
"index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}}, + { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "text", "text": ""}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "To"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " answer"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " your question about the weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " in Boston and Los"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " Angeles today, I'll"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " need to"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " use"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " the"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " get_current_weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " function"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " both"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " cities"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": ". 
Let"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " me fetch"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " that"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " information"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " you."}, + }, {"type": "content_block_stop", "index": 0}, - {"type": "content_block_start", "index": 1, - "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}}, + { + "type": "content_block_start", + "index": 1, + "content_block": { + "type": "tool_use", + "id": "toolu_12345", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": '{"locat'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'}, + }, {"type": "content_block_stop", "index": 1}, - {"type": "content_block_start", "index": 2, - "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}}, + { + "type": "content_block_start", + "index": 2, + "content_block": { + "type": "tool_use", + "id": "toolu_023423423", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": '{"l'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "oca"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "tio"}, + }, + { + "type": "content_block_delta", + 
"index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "s Angel"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'}, + }, {"type": "content_block_stop", "index": 2}, - {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None}, - "usage": {"output_tokens": 137}}, - {"type": "message_stop"} + { + "type": "message_delta", + "delta": {"stop_reason": "tool_use", "stop_sequence": None}, + "usage": {"output_tokens": 137}, + }, + {"type": "message_stop"}, ] @@ -211,12 +353,12 @@ def test_anthropic_tool_streaming(): correct_tool_index = -1 for chunk in anthropic_chunk_list: parsed_chunk = response_iter.chunk_parser(chunk) - if tool_use := parsed_chunk.get('tool_use'): + if tool_use := parsed_chunk.get("tool_use"): # We only increment when a new block starts - if tool_use.get('id') is not None: + if tool_use.get("id") is not None: correct_tool_index += 1 - assert tool_use['index'] == correct_tool_index + assert tool_use["index"] == correct_tool_index @pytest.mark.asyncio @@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation(): print(translated_params["messages"]) assert len(translated_params["messages"]) > 0 - assert translated_params["messages"][0]["role"] == "user" \ No newline at end of file + assert translated_params["messages"][0]["role"] == "user" diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index e9326752f..f48a85cad 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name(): print(f"mock_client.call_args: {mock_client.call_args.kwargs}") assert "azure/gpt-4" == mock_client.call_args.kwargs["model"] + + +def test_completion_cost_anthropic_prompt_caching(): + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + from litellm.utils import Choices, Message, ModelResponse, Usage + + model = "anthropic/claude-3-5-sonnet-20240620" + + ## WRITE TO CACHE ## (MORE EXPENSIVE) + response_1 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=100, + cache_read_input_tokens=0, + ), + ) + + ## READ FROM CACHE ## (LESS EXPENSIVE) + response_2 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! 
I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=0, + cache_read_input_tokens=100, + ), + ) + + cost_1 = completion_cost(model=model, completion_response=response_1) + cost_2 = completion_cost(model=model, completion_response=response_2) + + assert cost_1 > cost_2 diff --git a/litellm/tests/test_dynamic_rate_limit_handler.py b/litellm/tests/test_dynamic_rate_limit_handler.py index f49a760af..d711de71f 100644 --- a/litellm/tests/test_dynamic_rate_limit_handler.py +++ b/litellm/tests/test_dynamic_rate_limit_handler.py @@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response): @pytest.mark.asyncio +@pytest.mark.flaky(retries=3, delay=1) async def test_update_cache( dynamic_rate_limit_handler, mock_response, user_api_key_auth ): diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index e8bc999f2..54e2e5b43 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings(): assert len(optional_params) == 0 +def test_google_ai_studio_optional_params_embeddings(): + optional_params = get_optional_params_embeddings( + user="John", + encoding_format=None, + custom_llm_provider="gemini", + drop_params=True, + ) + assert len(optional_params) == 0 + + def test_openai_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 81dc268af..aadbdd22a 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False): max_input_tokens: Required[Optional[int]] max_output_tokens: Required[Optional[int]] input_cost_per_token: Required[float] + cache_creation_input_token_cost: Optional[float] + cache_read_input_token_cost: Optional[float] input_cost_per_character: Optional[float] # only for vertex ai models input_cost_per_token_above_128k_tokens: Optional[float] # only for vertex ai models input_cost_per_character_above_128k_tokens: Optional[ @@ -454,6 +456,13 @@ class Choices(OpenAIObject): class Usage(CompletionUsage): + _cache_creation_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. + _cache_read_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. 
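+ # NOTE: pydantic PrivateAttr keeps these two values out of the serialized, OpenAI-compatible usage object (model_dump() / dict()); the anthropic cost calculator reads them directly as usage._cache_creation_input_tokens / usage._cache_read_input_tokens.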
+ def __init__( self, prompt_tokens: Optional[int] = None, @@ -466,9 +475,18 @@ class Usage(CompletionUsage): "completion_tokens": completion_tokens or 0, "total_tokens": total_tokens or 0, } - super().__init__(**data) + if "cache_creation_input_tokens" in params and isinstance( + params["cache_creation_input_tokens"], int + ): + self._cache_creation_input_tokens = params["cache_creation_input_tokens"] + + if "cache_read_input_tokens" in params and isinstance( + params["cache_read_input_tokens"], int + ): + self._cache_read_input_tokens = params["cache_read_input_tokens"] + for k, v in params.items(): setattr(self, k, v) diff --git a/litellm/utils.py b/litellm/utils.py index faa317c1b..facbc6a0a 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2550,6 +2550,7 @@ def get_optional_params_embeddings( encoding_format=None, dimensions=None, custom_llm_provider="", + drop_params: Optional[bool] = None, additional_drop_params: Optional[bool] = None, **kwargs, ): @@ -2560,6 +2561,7 @@ def get_optional_params_embeddings( for k, v in special_params.items(): passed_params[k] = v + drop_params = passed_params.pop("drop_params", None) additional_drop_params = passed_params.pop("additional_drop_params", None) default_params = {"user": None, "encoding_format": None, "dimensions": None} @@ -2571,11 +2573,16 @@ def get_optional_params_embeddings( for k in non_default_params.keys(): if k not in supported_params: unsupported_params[k] = non_default_params[k] - if unsupported_params and not litellm.drop_params: - raise UnsupportedParamsError( - status_code=500, - message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", - ) + if unsupported_params: + if litellm.drop_params is True or ( + drop_params is not None and drop_params is True + ): + pass + else: + raise UnsupportedParamsError( + status_code=500, + message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. 
To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", + ) non_default_params = _get_non_default_params( passed_params=passed_params, @@ -2680,7 +2687,9 @@ def get_optional_params_embeddings( and custom_llm_provider not in litellm.openai_compatible_providers ): if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values + if ( + litellm.drop_params is True or drop_params is True + ): # drop the unsupported non-default values keys = list(non_default_params.keys()) for k in keys: non_default_params.pop(k, None) @@ -5358,6 +5367,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod max_input_tokens=_model_info.get("max_input_tokens", None), max_output_tokens=_model_info.get("max_output_tokens", None), input_cost_per_token=_input_cost_per_token, + cache_creation_input_token_cost=_model_info.get( + "cache_creation_input_token_cost", None + ), + cache_read_input_token_cost=_model_info.get( + "cache_read_input_token_cost", None + ), input_cost_per_character=_model_info.get( "input_cost_per_character", None ), diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5e6d0f2ab..a60743c65 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, From 017dd8891095c4e7533abf4d1a1fe8f75800db7d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Sat, 31 Aug 2024 14:34:00 -0700 Subject: [PATCH 4/6] test: skip test on end of life model --- litellm/tests/test_streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d2ef8aafc..1b8b4e085 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1545,6 +1545,7 @@ def test_completion_bedrock_claude_stream(): # test_completion_bedrock_claude_stream() +@pytest.mark.skip(reason="model end of life") def test_completion_bedrock_ai21_stream(): try: litellm.set_verbose = False From fd4157cf7130f3ec59e880c19448fac5acdbcb68 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 14:57:12 -0700 Subject: [PATCH 5/6] docs add cerebras --- docs/my-website/docs/providers/cerebras.md | 145 +++++++++++++++++++++ docs/my-website/sidebars.js | 1 + 2 files changed, 146 insertions(+) create mode 100644 docs/my-website/docs/providers/cerebras.md diff --git a/docs/my-website/docs/providers/cerebras.md b/docs/my-website/docs/providers/cerebras.md new file mode 100644 index 000000000..4fabeb31c --- /dev/null +++ b/docs/my-website/docs/providers/cerebras.md @@ -0,0 +1,145 @@ +import Tabs 
from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Cerebras +https://inference-docs.cerebras.ai/api-reference/chat-completions + +:::tip + +**We support ALL Cerebras models - just set `model=cerebras/` as a prefix when sending litellm requests** + +::: + +## API Key +```python +# env variable +os.environ['CEREBRAS_API_KEY'] +``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['CEREBRAS_API_KEY'] = "" +response = completion( + model="cerebras/llama3.1-70b", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + max_tokens=10, + response_format={ "type": "json_object" }, + seed=123, + stop=["\n\n"], + temperature=0.2, + top_p=0.9, + tool_choice="auto", + tools=[], + user="user", +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['CEREBRAS_API_KEY'] = "" +response = completion( + model="cerebras/llama3.1-70b", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + stream=True, + max_tokens=10, + response_format={ "type": "json_object" }, + seed=123, + stop=["\n\n"], + temperature=0.2, + top_p=0.9, + tool_choice="auto", + tools=[], + user="user", +) + +for chunk in response: + print(chunk) +``` + + +## Usage with LiteLLM Proxy Server + +Here's how to call a Cerebras model with the LiteLLM Proxy Server + +1. Modify the config.yaml + + ```yaml + model_list: + - model_name: my-model + litellm_params: + model: cerebras/ # add cerebras/ prefix to route as Cerebras provider + api_key: api-key # api key for your model + ``` + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml + ``` + +3.
Send Request to LiteLLM Proxy Server + + + + + + ```python + import openai + client = openai.OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000" # litellm-proxy-base url + ) + + response = client.chat.completions.create( + model="my-model", + messages = [ + { + "role": "user", + "content": "what llm are you" + } + ], + ) + + print(response) + ``` + + + + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' + ``` + + + + + diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 59db4c363..048b04171 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -138,6 +138,7 @@ const sidebars = { "providers/watsonx", "providers/predibase", "providers/nvidia_nim", + "providers/cerebras", "providers/volcano", "providers/triton-inference-server", "providers/ollama", From 4bd85b9d83b75bd669328b2b743f8f1e5e5c66a9 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 16:29:26 -0700 Subject: [PATCH 6/6] add cerebras cost tracking --- ...odel_prices_and_context_window_backup.json | 20 +++++++++++++++++++ model_prices_and_context_window.json | 20 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index a60743c65..daf2c502a 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1273,6 +1273,26 @@ "mode": "chat", "supports_function_calling": true }, + "cerebras/llama3.1-8b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0000001, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, + "cerebras/llama3.1-70b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000006, + "output_cost_per_token": 0.0000006, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index a60743c65..daf2c502a 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1273,6 +1273,26 @@ "mode": "chat", "supports_function_calling": true }, + "cerebras/llama3.1-8b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0000001, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, + "cerebras/llama3.1-70b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000006, + "output_cost_per_token": 0.0000006, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768,
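As a quick end-to-end sanity check of the new Cerebras pricing entries, `litellm.completion_cost` can be pointed at the bundled cost map, the same way the prompt-caching test above does. A minimal sketch, assuming `CEREBRAS_API_KEY` is set in the environment; the token counts in the final comment are illustrative:

```python
# Minimal sketch: verify cost tracking for the cerebras/llama3.1-70b entry above
# (0.0000006 USD per input token and per output token).
import os

import litellm
from litellm import completion, completion_cost

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")  # use the bundled map

response = completion(
    model="cerebras/llama3.1-70b",  # cerebras/ prefix routes to the Cerebras API
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=10,
)

cost = completion_cost(completion_response=response)
# e.g. 14 prompt + 10 completion tokens -> 24 * 0.0000006 = 0.0000144 USD
print(f"cost = ${cost:.8f}")
```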