From cb6a0f0237334b38342288bf6fc724d22e393d06 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 08:34:46 -0700 Subject: [PATCH 1/6] add cerebras config --- litellm/llms/cerebras/chat.py | 91 +++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 litellm/llms/cerebras/chat.py diff --git a/litellm/llms/cerebras/chat.py b/litellm/llms/cerebras/chat.py new file mode 100644 index 000000000..13b8f0ee9 --- /dev/null +++ b/litellm/llms/cerebras/chat.py @@ -0,0 +1,91 @@ +""" +Cerebras Chat Completions API + +This API is OpenAI-compatible, so no request/response translation is needed or performed. +""" + +import types +from typing import Optional, Union + + +class CerebrasConfig: + """ + Reference: https://inference-docs.cerebras.ai/api-reference/chat-completions + + Below are the supported parameters: + """ + + max_tokens: Optional[int] = None + response_format: Optional[dict] = None + seed: Optional[int] = None + stop: Optional[str] = None + stream: Optional[bool] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + tool_choice: Optional[str] = None + tools: Optional[list] = None + user: Optional[str] = None + + def __init__( + self, + max_tokens: Optional[int] = None, + response_format: Optional[dict] = None, + seed: Optional[int] = None, + stop: Optional[str] = None, + stream: Optional[bool] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + tool_choice: Optional[str] = None, + tools: Optional[list] = None, + user: Optional[str] = None, + ) -> None: + locals_ = locals().copy() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params(self, model: str) -> list: + """ + Get the supported OpenAI params for the given model. + """ + return [ + "max_tokens", + "response_format", + "seed", + "stop", + "stream", + "temperature", + "top_p", + "tool_choice", + "tools", + "user", + ] + + def map_openai_params( + self, model: str, non_default_params: dict, optional_params: dict + ) -> dict: + """Copy the supported OpenAI params from non_default_params into optional_params; unsupported params are not copied.""" + supported_openai_params = self.get_supported_openai_params(model=model) + for param, value in non_default_params.items(): + if param in supported_openai_params: + optional_params[param] = value + return optional_params From de9efe76ca01764d84500ce0325beaf710005e73 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 08:35:23 -0700 Subject: [PATCH 2/6] add cerebras api --- litellm/__init__.py | 3 +++ litellm/main.py | 4 ++++ litellm/utils.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/litellm/__init__.py b/litellm/__init__.py index 1c3b8434f..d24003fff 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -452,6 +452,7 @@ openai_compatible_providers: List = [ "mistral", "groq", "nvidia_nim", + "cerebras", "volcengine", "codestral", "deepseek", @@ -690,6 +691,7 @@ provider_list: List = [ "mistral", "groq", "nvidia_nim", + "cerebras", "volcengine", "codestral", "text-completion-codestral", @@ -905,6 +907,7 @@ from .llms.openai import ( AzureAIStudioConfig, ) from .llms.nvidia_nim import NvidiaNimConfig +from .llms.cerebras.chat import CerebrasConfig from .llms.fireworks_ai import FireworksAIConfig from .llms.volcengine import VolcEngineConfig from
.llms.text_completion_codestral import MistralTextCompletionConfig diff --git a/litellm/main.py b/litellm/main.py index 95a106377..658a46258 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -390,6 +390,7 @@ async def acompletion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "codestral" or custom_llm_provider == "text-completion-codestral" @@ -1295,6 +1296,7 @@ def completion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "codestral" or custom_llm_provider == "deepseek" @@ -3144,6 +3146,7 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse: or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "deepseek" or custom_llm_provider == "fireworks_ai" @@ -3795,6 +3798,7 @@ async def atext_completion( or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" or custom_llm_provider == "nvidia_nim" + or custom_llm_provider == "cerebras" or custom_llm_provider == "volcengine" or custom_llm_provider == "text-completion-codestral" or custom_llm_provider == "deepseek" diff --git a/litellm/utils.py b/litellm/utils.py index ec4ac79c0..faa317c1b 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2854,6 +2854,7 @@ def get_optional_params( and custom_llm_provider != "together_ai" and custom_llm_provider != "groq" and custom_llm_provider != "nvidia_nim" + and custom_llm_provider != "cerebras" and custom_llm_provider != "volcengine" and custom_llm_provider != "deepseek" and custom_llm_provider != "codestral" @@ -3613,6 +3614,16 @@ def get_optional_params( non_default_params=non_default_params, optional_params=optional_params, ) + elif custom_llm_provider == "cerebras": + supported_params = get_supported_openai_params( + model=model, custom_llm_provider=custom_llm_provider + ) + _check_valid_arg(supported_params=supported_params) + optional_params = litellm.CerebrasConfig().map_openai_params( + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + ) elif custom_llm_provider == "fireworks_ai": supported_params = get_supported_openai_params( model=model, custom_llm_provider=custom_llm_provider @@ -4238,6 +4249,8 @@ def get_supported_openai_params( return litellm.FireworksAIConfig().get_supported_openai_params() elif custom_llm_provider == "nvidia_nim": return litellm.NvidiaNimConfig().get_supported_openai_params(model=model) + elif custom_llm_provider == "cerebras": + return litellm.CerebrasConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "volcengine": return litellm.VolcEngineConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "groq": @@ -4665,6 +4678,13 @@ def get_llm_provider( or "https://integrate.api.nvidia.com/v1" ) # type: ignore dynamic_api_key = api_key or get_secret("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + api_base = ( + api_base + or get_secret("CEREBRAS_API_BASE") + or "https://api.cerebras.ai/v1" + ) # type: ignore + dynamic_api_key = api_key or get_secret("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": # volcengine is openai compatible, we just 
need to set this to custom_openai and have the api_base be https://api.endpoints.anyscale.com/v1 api_base = ( @@ -4815,6 +4835,9 @@ def get_llm_provider( elif endpoint == "https://integrate.api.nvidia.com/v1": custom_llm_provider = "nvidia_nim" dynamic_api_key = get_secret("NVIDIA_NIM_API_KEY") + elif endpoint == "https://api.cerebras.ai/v1": + custom_llm_provider = "cerebras" + dynamic_api_key = get_secret("CEREBRAS_API_KEY") elif endpoint == "https://codestral.mistral.ai/v1": custom_llm_provider = "codestral" dynamic_api_key = get_secret("CODESTRAL_API_KEY") @@ -5734,6 +5757,11 @@ def validate_environment( keys_in_environment = True else: missing_keys.append("NVIDIA_NIM_API_KEY") + elif custom_llm_provider == "cerebras": + if "CEREBRAS_API_KEY" in os.environ: + keys_in_environment = True + else: + missing_keys.append("CEREBRAS_API_KEY") elif custom_llm_provider == "volcengine": if "VOLCENGINE_API_KEY" in os.environ: keys_in_environment = True From 47ef1f9191bd1ee08405f77082d17c22504a5154 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 31 Aug 2024 14:09:35 -0700 Subject: [PATCH 3/6] anthropic prompt caching cost tracking (#5453) * fix(utils.py): support 'drop_params' for embedding requests Fixes https://github.com/BerriAI/litellm/issues/5444 * feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic * feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out * fix: fix linting errors * test: mark flaky test --- litellm/__init__.py | 4 +- litellm/cost_calculator.py | 42 +++- .../llms/{anthropic.py => anthropic/chat.py} | 10 +- .../completion.py} | 8 +- litellm/llms/anthropic/cost_calculation.py | 42 ++++ litellm/llms/base.py | 13 +- .../vertex_ai_anthropic.py | 13 +- litellm/main.py | 10 +- ...odel_prices_and_context_window_backup.json | 6 + litellm/proxy/_new_secret_config.yaml | 4 +- litellm/tests/test_anthropic_completion.py | 230 ++++++++++++++---- litellm/tests/test_completion_cost.py | 70 ++++++ .../tests/test_dynamic_rate_limit_handler.py | 1 + litellm/tests/test_optional_params.py | 10 + litellm/types/utils.py | 20 +- litellm/utils.py | 27 +- model_prices_and_context_window.json | 6 + 17 files changed, 432 insertions(+), 84 deletions(-) rename litellm/llms/{anthropic.py => anthropic/chat.py} (99%) rename litellm/llms/{anthropic_text.py => anthropic/completion.py} (98%) create mode 100644 litellm/llms/anthropic/cost_calculation.py diff --git a/litellm/__init__.py b/litellm/__init__.py index d24003fff..2e7914fab 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -843,10 +843,10 @@ ALL_LITELLM_RESPONSE_TYPES = [ from .types.utils import ImageObject from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig -from .llms.anthropic import AnthropicConfig +from .llms.anthropic.chat import AnthropicConfig +from .llms.anthropic.completion import AnthropicTextConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig -from .llms.anthropic_text import AnthropicTextConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig from .llms.clarifai import ClarifaiConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 3c025055e..a0645c19a 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import ( cost_router as google_cost_router, ) from 
litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character +from litellm.llms.anthropic.cost_calculation import ( + cost_per_token as anthropic_cost_per_token, +) from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS +from litellm.types.utils import Usage from litellm.utils import ( CallTypes, CostPerToken, @@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( model: str = "", - prompt_tokens: float = 0, - completion_tokens: float = 0, + prompt_tokens: int = 0, + completion_tokens: int = 0, response_time_ms=None, custom_llm_provider: Optional[str] = None, region_name=None, ### CHARACTER PRICING ### - prompt_characters: float = 0, - completion_characters: float = 0, + prompt_characters: int = 0, + completion_characters: int = 0, + ### PROMPT CACHING PRICING ### - used for anthropic + cache_creation_input_tokens: Optional[int] = 0, + cache_read_input_tokens: Optional[int] = 0, ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, custom_cost_per_second: Optional[float] = None, @@ -108,6 +115,16 @@ def cost_per_token( """ if model is None: raise Exception("Invalid arg. Model cannot be none.") + + ## RECONSTRUCT USAGE BLOCK ## + usage_block = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + ## CUSTOM PRICING ## response_cost = _cost_per_token_custom_pricing_helper( prompt_tokens=prompt_tokens, @@ -137,6 +154,7 @@ def cost_per_token( model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) + model_without_prefix = model model_parts = model.split("/") if len(model_parts) > 1: @@ -162,6 +180,7 @@ def cost_per_token( # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models print_verbose(f"Looking up model={model} in model_cost_map") + if custom_llm_provider == "vertex_ai": cost_router = google_cost_router( model=model_without_prefix, @@ -188,6 +207,8 @@ def cost_per_token( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, ) + elif custom_llm_provider == "anthropic": + return anthropic_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, @@ -520,6 +541,8 @@ def completion_cost( prompt_characters = 0 completion_tokens = 0 completion_characters = 0 + cache_creation_input_tokens: Optional[int] = None + cache_read_input_tokens: Optional[int] = None if completion_response is not None and ( isinstance(completion_response, BaseModel) or isinstance(completion_response, dict) @@ -541,6 +564,13 @@ def completion_cost( completion_tokens = completion_response.get("usage", {}).get( "completion_tokens", 0 ) + cache_creation_input_tokens = completion_response.get("usage", {}).get( + "cache_creation_input_tokens", 0 + ) + cache_read_input_tokens = completion_response.get("usage", {}).get( + "cache_read_input_tokens", 0 + ) + total_time = getattr(completion_response, "_response_ms", 0) verbose_logger.debug( f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} " @@ -550,7 +580,7 @@ def completion_cost( ) if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( - "custom_llm_provider", 
custom_llm_provider or "" + "custom_llm_provider", custom_llm_provider or None ) region_name = completion_response._hidden_params.get( "region_name", region_name @@ -697,6 +727,8 @@ def completion_cost( custom_cost_per_token=custom_cost_per_token, prompt_characters=prompt_characters, completion_characters=completion_characters, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, call_type=call_type, ) _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic/chat.py similarity index 99% rename from litellm/llms/anthropic.py rename to litellm/llms/anthropic/chat.py index 813897c66..f62c7246e 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic/chat.py @@ -1,3 +1,7 @@ +""" +Calling + translation logic for anthropic's `/v1/messages` endpoint +""" + import copy import json import os @@ -70,8 +74,8 @@ from litellm.types.llms.openai import ( from litellm.types.utils import Choices, GenericStreamingChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): @@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM): ) except Exception as e: verbose_logger.exception( - "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( + "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( str(e), messages ) ) diff --git a/litellm/llms/anthropic_text.py b/litellm/llms/anthropic/completion.py similarity index 98% rename from litellm/llms/anthropic_text.py rename to litellm/llms/anthropic/completion.py index d20e49daf..dd2d47e53 100644 --- a/litellm/llms/anthropic_text.py +++ b/litellm/llms/anthropic/completion.py @@ -1,3 +1,7 @@ +""" +Translation logic for anthropic's `/v1/complete` endpoint +""" + import json import os import time @@ -12,8 +16,8 @@ import litellm from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): diff --git a/litellm/llms/anthropic/cost_calculation.py b/litellm/llms/anthropic/cost_calculation.py new file mode 100644 index 000000000..d1742aae9 --- /dev/null +++ b/litellm/llms/anthropic/cost_calculation.py @@ -0,0 +1,42 @@ +""" +Helper util for handling anthropic-specific cost calculation +- e.g.: prompt caching +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
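+ + For illustration, with the claude-3-5-sonnet-20240620 rates added later in this PR (input 3e-06, cache creation 3.75e-06, cache read 3e-07 USD per token): + 14 prompt tokens + 100 cache_creation_input_tokens -> prompt cost = 14 * 3e-06 + 100 * 3.75e-06 = 0.000417 USD + 14 prompt tokens + 100 cache_read_input_tokens -> prompt cost = 14 * 3e-06 + 100 * 3e-07 = 0.000072 USD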
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + ## GET MODEL INFO + model_info = get_model_info(model=model, custom_llm_provider="anthropic") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + if model_info.get("cache_creation_input_token_cost") is not None: + prompt_cost += ( + usage._cache_creation_input_tokens # type: ignore + * model_info["cache_creation_input_token_cost"] + ) + if model_info.get("cache_read_input_token_cost") is not None: + prompt_cost += ( + usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"] # type: ignore + ) + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 7e80de9ab..08c5e1992 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,14 @@ ## This is a template base class to be used for adding new LLM providers via API calls +from typing import Any, Optional, Union + +import httpx +import requests + import litellm -import httpx, requests -from typing import Optional, Union -from litellm.litellm_core_utils.litellm_logging import Logging class BaseLLM: + _client_session: Optional[httpx.Client] = None def process_response( @@ -14,7 +17,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], @@ -33,7 +36,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.TextCompletionResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index e85160a43..025b27240 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -267,18 +267,19 @@ def completion( ): try: import vertexai - from anthropic import AnthropicVertex - - from litellm.llms.anthropic import AnthropicChatCompletion - from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( - VertexLLM, - ) except: raise VertexAIError( status_code=400, message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""", ) + from anthropic import AnthropicVertex + + from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexLLM, + ) + if not ( hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") ): diff --git a/litellm/main.py b/litellm/main.py index 658a46258..7f1431073 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache from .llms import ( ai21, aleph_alpha, - anthropic_text, baseten, bedrock, clarifai, cloudflare, - gemini, - huggingface_restapi, maritalk, nlp_cloud, ollama, @@ -93,13 +90,10 @@ from .llms import ( palm, petals, replicate, - together_ai, 
- triton, vllm, - watsonx, ) -from .llms.anthropic import AnthropicChatCompletion -from .llms.anthropic_text import AnthropicTextCompletion +from .llms.anthropic.chat import AnthropicChatCompletion +from .llms.anthropic.completion import AnthropicTextCompletion from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5e6d0f2ab..a60743c65 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index b8f964ab3..b84ef7453 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,4 @@ model_list: - - model_name: "gemini/*" + - model_name: "gpt-3.5-turbo" litellm_params: - model: "gemini/*" \ No newline at end of file + model: "gpt-3.5-turbo" \ No newline at end of file diff --git a/litellm/tests/test_anthropic_completion.py b/litellm/tests/test_anthropic_completion.py index b5e01d448..b8ccf716e 100644 --- a/litellm/tests/test_anthropic_completion.py +++ b/litellm/tests/test_anthropic_completion.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv import litellm.types import litellm.types.utils -from litellm.llms.anthropic import ModelResponseIterator +from litellm.llms.anthropic.chat import ModelResponseIterator load_dotenv() import io @@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream): anthropic_chunk_list = [ - {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}}, - {"type": "content_block_delta", "index": 0, - "delta": {"type": "text_delta", "text": " your question about the weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}}, - {"type": "content_block_delta", 
"index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}}, + { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "text", "text": ""}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "To"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " answer"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " your question about the weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " in Boston and Los"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " Angeles today, I'll"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " need to"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " use"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " the"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " get_current_weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " function"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " both"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " cities"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": ". 
Let"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " me fetch"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " that"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " information"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " you."}, + }, {"type": "content_block_stop", "index": 0}, - {"type": "content_block_start", "index": 1, - "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}}, + { + "type": "content_block_start", + "index": 1, + "content_block": { + "type": "tool_use", + "id": "toolu_12345", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": '{"locat'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'}, + }, {"type": "content_block_stop", "index": 1}, - {"type": "content_block_start", "index": 2, - "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}}, + { + "type": "content_block_start", + "index": 2, + "content_block": { + "type": "tool_use", + "id": "toolu_023423423", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": '{"l'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "oca"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "tio"}, + }, + { + "type": "content_block_delta", + 
"index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "s Angel"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'}, + }, {"type": "content_block_stop", "index": 2}, - {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None}, - "usage": {"output_tokens": 137}}, - {"type": "message_stop"} + { + "type": "message_delta", + "delta": {"stop_reason": "tool_use", "stop_sequence": None}, + "usage": {"output_tokens": 137}, + }, + {"type": "message_stop"}, ] @@ -211,12 +353,12 @@ def test_anthropic_tool_streaming(): correct_tool_index = -1 for chunk in anthropic_chunk_list: parsed_chunk = response_iter.chunk_parser(chunk) - if tool_use := parsed_chunk.get('tool_use'): + if tool_use := parsed_chunk.get("tool_use"): # We only increment when a new block starts - if tool_use.get('id') is not None: + if tool_use.get("id") is not None: correct_tool_index += 1 - assert tool_use['index'] == correct_tool_index + assert tool_use["index"] == correct_tool_index @pytest.mark.asyncio @@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation(): print(translated_params["messages"]) assert len(translated_params["messages"]) > 0 - assert translated_params["messages"][0]["role"] == "user" \ No newline at end of file + assert translated_params["messages"][0]["role"] == "user" diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index e9326752f..f48a85cad 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name(): print(f"mock_client.call_args: {mock_client.call_args.kwargs}") assert "azure/gpt-4" == mock_client.call_args.kwargs["model"] + + +def test_completion_cost_anthropic_prompt_caching(): + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + from litellm.utils import Choices, Message, ModelResponse, Usage + + model = "anthropic/claude-3-5-sonnet-20240620" + + ## WRITE TO CACHE ## (MORE EXPENSIVE) + response_1 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=100, + cache_read_input_tokens=0, + ), + ) + + ## READ FROM CACHE ## (LESS EXPENSIVE) + response_2 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! 
I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=0, + cache_read_input_tokens=100, + ), + ) + + cost_1 = completion_cost(model=model, completion_response=response_1) + cost_2 = completion_cost(model=model, completion_response=response_2) + + assert cost_1 > cost_2 diff --git a/litellm/tests/test_dynamic_rate_limit_handler.py b/litellm/tests/test_dynamic_rate_limit_handler.py index f49a760af..d711de71f 100644 --- a/litellm/tests/test_dynamic_rate_limit_handler.py +++ b/litellm/tests/test_dynamic_rate_limit_handler.py @@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response): @pytest.mark.asyncio +@pytest.mark.flaky(retries=3, delay=1) async def test_update_cache( dynamic_rate_limit_handler, mock_response, user_api_key_auth ): diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index e8bc999f2..54e2e5b43 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings(): assert len(optional_params) == 0 +def test_google_ai_studio_optional_params_embeddings(): + optional_params = get_optional_params_embeddings( + user="John", + encoding_format=None, + custom_llm_provider="gemini", + drop_params=True, + ) + assert len(optional_params) == 0 + + def test_openai_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 81dc268af..aadbdd22a 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False): max_input_tokens: Required[Optional[int]] max_output_tokens: Required[Optional[int]] input_cost_per_token: Required[float] + cache_creation_input_token_cost: Optional[float] + cache_read_input_token_cost: Optional[float] input_cost_per_character: Optional[float] # only for vertex ai models input_cost_per_token_above_128k_tokens: Optional[float] # only for vertex ai models input_cost_per_character_above_128k_tokens: Optional[ @@ -454,6 +456,13 @@ class Choices(OpenAIObject): class Usage(CompletionUsage): + _cache_creation_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. + _cache_read_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. 
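+ # NOTE: pydantic PrivateAttr keeps these two values out of the serialized, OpenAI-compatible usage object (model_dump() / dict()); the anthropic cost calculator reads them directly as usage._cache_creation_input_tokens / usage._cache_read_input_tokens.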
+ def __init__( self, prompt_tokens: Optional[int] = None, @@ -466,9 +475,18 @@ class Usage(CompletionUsage): "completion_tokens": completion_tokens or 0, "total_tokens": total_tokens or 0, } - super().__init__(**data) + if "cache_creation_input_tokens" in params and isinstance( + params["cache_creation_input_tokens"], int + ): + self._cache_creation_input_tokens = params["cache_creation_input_tokens"] + + if "cache_read_input_tokens" in params and isinstance( + params["cache_read_input_tokens"], int + ): + self._cache_read_input_tokens = params["cache_read_input_tokens"] + for k, v in params.items(): setattr(self, k, v) diff --git a/litellm/utils.py b/litellm/utils.py index faa317c1b..facbc6a0a 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2550,6 +2550,7 @@ def get_optional_params_embeddings( encoding_format=None, dimensions=None, custom_llm_provider="", + drop_params: Optional[bool] = None, additional_drop_params: Optional[bool] = None, **kwargs, ): @@ -2560,6 +2561,7 @@ def get_optional_params_embeddings( for k, v in special_params.items(): passed_params[k] = v + drop_params = passed_params.pop("drop_params", None) additional_drop_params = passed_params.pop("additional_drop_params", None) default_params = {"user": None, "encoding_format": None, "dimensions": None} @@ -2571,11 +2573,16 @@ def get_optional_params_embeddings( for k in non_default_params.keys(): if k not in supported_params: unsupported_params[k] = non_default_params[k] - if unsupported_params and not litellm.drop_params: - raise UnsupportedParamsError( - status_code=500, - message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", - ) + if unsupported_params: + if litellm.drop_params is True or ( + drop_params is not None and drop_params is True + ): + pass + else: + raise UnsupportedParamsError( + status_code=500, + message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. 
To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", + ) non_default_params = _get_non_default_params( passed_params=passed_params, @@ -2680,7 +2687,9 @@ def get_optional_params_embeddings( and custom_llm_provider not in litellm.openai_compatible_providers ): if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values + if ( + litellm.drop_params is True or drop_params is True + ): # drop the unsupported non-default values keys = list(non_default_params.keys()) for k in keys: non_default_params.pop(k, None) @@ -5358,6 +5367,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod max_input_tokens=_model_info.get("max_input_tokens", None), max_output_tokens=_model_info.get("max_output_tokens", None), input_cost_per_token=_input_cost_per_token, + cache_creation_input_token_cost=_model_info.get( + "cache_creation_input_token_cost", None + ), + cache_read_input_token_cost=_model_info.get( + "cache_read_input_token_cost", None + ), input_cost_per_character=_model_info.get( "input_cost_per_character", None ), diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5e6d0f2ab..a60743c65 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, From 017dd8891095c4e7533abf4d1a1fe8f75800db7d Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Sat, 31 Aug 2024 14:34:00 -0700 Subject: [PATCH 4/6] test: skip test on end of life model --- litellm/tests/test_streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d2ef8aafc..1b8b4e085 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1545,6 +1545,7 @@ def test_completion_bedrock_claude_stream(): # test_completion_bedrock_claude_stream() +@pytest.mark.skip(reason="model end of life") def test_completion_bedrock_ai21_stream(): try: litellm.set_verbose = False From fd4157cf7130f3ec59e880c19448fac5acdbcb68 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 14:57:12 -0700 Subject: [PATCH 5/6] docs add cerebras --- docs/my-website/docs/providers/cerebras.md | 145 +++++++++++++++++++++ docs/my-website/sidebars.js | 1 + 2 files changed, 146 insertions(+) create mode 100644 docs/my-website/docs/providers/cerebras.md diff --git a/docs/my-website/docs/providers/cerebras.md b/docs/my-website/docs/providers/cerebras.md new file mode 100644 index 000000000..4fabeb31c --- /dev/null +++ b/docs/my-website/docs/providers/cerebras.md @@ -0,0 +1,145 @@ +import Tabs 
from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Cerebras +https://inference-docs.cerebras.ai/api-reference/chat-completions + +:::tip + +**We support ALL Cerebras models - just set `model=cerebras/` as a prefix when sending litellm requests** + +::: + +## API Key +```python +# env variable +os.environ['CEREBRAS_API_KEY'] +``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['CEREBRAS_API_KEY'] = "" +response = completion( + model="cerebras/llama3.1-70b", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + max_tokens=10, + response_format={ "type": "json_object" }, + seed=123, + stop=["\n\n"], + temperature=0.2, + top_p=0.9, + tool_choice="auto", + tools=[], + user="user", +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['CEREBRAS_API_KEY'] = "" +response = completion( + model="cerebras/llama3.1-70b", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + stream=True, + max_tokens=10, + response_format={ "type": "json_object" }, + seed=123, + stop=["\n\n"], + temperature=0.2, + top_p=0.9, + tool_choice="auto", + tools=[], + user="user", +) + +for chunk in response: + print(chunk) +``` + + +## Usage with LiteLLM Proxy Server + +Here's how to call a Cerebras model with the LiteLLM Proxy Server + +1. Modify the config.yaml + + ```yaml + model_list: + - model_name: my-model + litellm_params: + model: cerebras/ # add cerebras/ prefix to route as Cerebras provider + api_key: api-key # api key for your model + ``` + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml + ``` + +3.
Send Request to LiteLLM Proxy Server + + + + + + ```python + import openai + client = openai.OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000" # litellm-proxy-base url + ) + + response = client.chat.completions.create( + model="my-model", + messages = [ + { + "role": "user", + "content": "what llm are you" + } + ], + ) + + print(response) + ``` + + + + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' + ``` + + + + + diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 59db4c363..048b04171 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -138,6 +138,7 @@ const sidebars = { "providers/watsonx", "providers/predibase", "providers/nvidia_nim", + "providers/cerebras", "providers/volcano", "providers/triton-inference-server", "providers/ollama", From 4bd85b9d83b75bd669328b2b743f8f1e5e5c66a9 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 16:29:26 -0700 Subject: [PATCH 6/6] add cerebras cost tracking --- ...odel_prices_and_context_window_backup.json | 20 +++++++++++++++++++ model_prices_and_context_window.json | 20 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index a60743c65..daf2c502a 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1273,6 +1273,26 @@ "mode": "chat", "supports_function_calling": true }, + "cerebras/llama3.1-8b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0000001, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, + "cerebras/llama3.1-70b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000006, + "output_cost_per_token": 0.0000006, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768, diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index a60743c65..daf2c502a 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1273,6 +1273,26 @@ "mode": "chat", "supports_function_calling": true }, + "cerebras/llama3.1-8b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0000001, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, + "cerebras/llama3.1-70b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 128000, + "input_cost_per_token": 0.0000006, + "output_cost_per_token": 0.0000006, + "litellm_provider": "cerebras", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768,
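As a quick end-to-end sanity check of the new Cerebras pricing entries, `litellm.completion_cost` can be pointed at the bundled cost map, the same way the prompt-caching test above does. A minimal sketch, assuming `CEREBRAS_API_KEY` is set in the environment; the token counts in the final comment are illustrative:

```python
# Minimal sketch: verify cost tracking for the cerebras/llama3.1-70b entry above
# (0.0000006 USD per input token and per output token).
import os

import litellm
from litellm import completion, completion_cost

os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")  # use the bundled map

response = completion(
    model="cerebras/llama3.1-70b",  # cerebras/ prefix routes to the Cerebras API
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=10,
)

cost = completion_cost(completion_response=response)
# e.g. 14 prompt + 10 completion tokens -> 24 * 0.0000006 = 0.0000144 USD
print(f"cost = ${cost:.8f}")
```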