forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_add_semantic_cache
Commit 7cb69c72c8
25 changed files with 1499 additions and 342 deletions
@@ -80,7 +80,7 @@ jobs:
       command: |
         pwd
         ls
-        python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5
+        python -m pytest -vv -s litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5
       no_output_timeout: 120m

   # Store test results
.gitignore (vendored, 1 line added)
@@ -43,3 +43,4 @@ ui/litellm-dashboard/package-lock.json
 deploy/charts/litellm-helm/*.tgz
 deploy/charts/litellm-helm/charts/*
 deploy/charts/*.tgz
+litellm/proxy/vertex_key.json
@@ -10,6 +10,12 @@ repos:
         exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
         additional_dependencies: [flake8-print]
         files: litellm/.*\.py
+  - repo: local
+    hooks:
+      - id: check-files-match
+        name: Check if files match
+        entry: python3 ci_cd/check_files_match.py
+        language: system
   - repo: local
     hooks:
       - id: mypy
ci_cd/check_files_match.py (new file, 32 lines added)
@@ -0,0 +1,32 @@
+import sys
+import filecmp
+import shutil
+
+
+def main(argv=None):
+    print(
+        "Comparing model_prices_and_context_window and litellm/model_prices_and_context_window_backup.json files... checking if they match."
+    )
+
+    file1 = "model_prices_and_context_window.json"
+    file2 = "litellm/model_prices_and_context_window_backup.json"
+
+    cmp_result = filecmp.cmp(file1, file2, shallow=False)
+
+    if cmp_result:
+        print(f"Passed! Files {file1} and {file2} match.")
+        return 0
+    else:
+        print(
+            f"Failed! Files {file1} and {file2} do not match. Copying content from {file1} to {file2}."
+        )
+        copy_content(file1, file2)
+        return 1
+
+
+def copy_content(source, destination):
+    shutil.copy2(source, destination)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -122,6 +122,7 @@ response = completion(
         "generation_id": "gen-id22", # set langfuse Generation ID
         "trace_id": "trace-id22", # set langfuse Trace ID
         "trace_user_id": "user-id2", # set langfuse Trace User ID
+        "session_id": "session-1", # set langfuse Session ID
     },
 )
@@ -352,6 +352,22 @@ Request Params:
 }
 ```

+## Upperbound /key/generate params
+Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key.
+
+Set `litellm_settings:upperbound_key_generate_params`:
+```yaml
+litellm_settings:
+  upperbound_key_generate_params:
+    max_budget: 100 # upperbound of $100, for all /key/generate requests
+    duration: "30d" # upperbound of 30 days for all /key/generate requests
+```
+
+** Expected Behavior **
+
+- Send a `/key/generate` request with `max_budget=200`
+- Key will be created with `max_budget=100` since 100 is the upper bound
+
 ## Default /key/generate params
 Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
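For readers trying out the upperbound settings documented above, here is a minimal client-side sketch. It assumes a locally running proxy at http://0.0.0.0:4000 with master key sk-1234; both are illustrative values, not taken from this diff.

```python
import requests

# Hypothetical local proxy and master key; adjust to your deployment.
PROXY_BASE_URL = "http://0.0.0.0:4000"
MASTER_KEY = "sk-1234"

# Ask for a key with limits above the configured upperbounds.
resp = requests.post(
    f"{PROXY_BASE_URL}/key/generate",
    headers={"Authorization": f"Bearer {MASTER_KEY}"},
    json={"max_budget": 200, "duration": "60d"},
)
print(resp.json())
# With upperbound_key_generate_params of max_budget=100 and duration="30d",
# the created key should be capped at those values rather than rejected.
```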
@@ -146,6 +146,7 @@ suppress_debug_info = False
 dynamodb_table_name: Optional[str] = None
 s3_callback_params: Optional[Dict] = None
 default_key_generate_params: Optional[Dict] = None
+upperbound_key_generate_params: Optional[Dict] = None
 default_team_settings: Optional[List] = None
 #### RELIABILITY ####
 request_timeout: Optional[float] = 6000
@@ -55,8 +55,21 @@ class LangFuseLogger:
         else:
             self.upstream_langfuse = None

+    # def log_error(kwargs, response_obj, start_time, end_time):
+    #     generation = trace.generation(
+    #         level ="ERROR" # can be any of DEBUG, DEFAULT, WARNING or ERROR
+    #         status_message='error' # can be any string (e.g. stringified stack trace or error body)
+    #     )
     def log_event(
-        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
+        self,
+        kwargs,
+        response_obj,
+        start_time,
+        end_time,
+        user_id,
+        print_verbose,
+        level="DEFAULT",
+        status_message=None,
     ):
         # Method definition

@@ -84,15 +97,24 @@ class LangFuseLogger:
                 pass

             # end of processing langfuse ########################
-            if kwargs.get("call_type", None) == "embedding" or isinstance(
-                response_obj, litellm.EmbeddingResponse
+            if (
+                level == "ERROR"
+                and status_message is not None
+                and isinstance(status_message, str)
+            ):
+                input = prompt
+                output = status_message
+            elif response_obj is not None and (
+                kwargs.get("call_type", None) == "embedding"
+                or isinstance(response_obj, litellm.EmbeddingResponse)
             ):
                 input = prompt
                 output = response_obj["data"]
-            else:
+            elif response_obj is not None:
                 input = prompt
                 output = response_obj["choices"][0]["message"].json()
-            print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
+            print(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
+            if self._is_langfuse_v2():
                 self._log_langfuse_v2(
                     user_id,
                     metadata,

@@ -103,8 +125,11 @@ class LangFuseLogger:
                     optional_params,
                     input,
                     response_obj,
+                    level,
                     print_verbose,
-                ) if self._is_langfuse_v2() else self._log_langfuse_v1(
+                )
+            elif response_obj is not None:
+                self._log_langfuse_v1(
                     user_id,
                     metadata,
                     output,

@@ -123,15 +148,15 @@ class LangFuseLogger:
             verbose_logger.info(f"Langfuse Layer Logging - logging success")
         except:
             traceback.print_exc()
-            print_verbose(f"Langfuse Layer Error - {traceback.format_exc()}")
+            print(f"Langfuse Layer Error - {traceback.format_exc()}")
             pass

     async def _async_log_event(
         self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
     ):
-        self.log_event(
-            kwargs, response_obj, start_time, end_time, user_id, print_verbose
-        )
+        """
+        TODO: support async calls when langfuse is truly async
+        """

     def _is_langfuse_v2(self):
         import langfuse

@@ -193,10 +218,12 @@ class LangFuseLogger:
         optional_params,
         input,
         response_obj,
+        level,
         print_verbose,
     ):
         import langfuse

+        try:
             tags = []
             supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
             supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")

@@ -211,11 +238,17 @@ class LangFuseLogger:
             trace_params = {
                 "name": generation_name,
                 "input": input,
-                "output": output,
                 "user_id": metadata.get("trace_user_id", user_id),
                 "id": metadata.get("trace_id", None),
+                "session_id": metadata.get("session_id", None),
             }
-            cost = kwargs["response_cost"]
+            if level == "ERROR":
+                trace_params["status_message"] = output
+            else:
+                trace_params["output"] = output
+
+            cost = kwargs.get("response_cost", None)
             print_verbose(f"trace: {cost}")
             if supports_tags:
                 for key, value in metadata.items():

@@ -226,10 +259,22 @@ class LangFuseLogger:

             trace = self.Langfuse.trace(**trace_params)

+            if level == "ERROR":
+                trace.generation(
+                    level="ERROR",  # can be any of DEBUG, DEFAULT, WARNING or ERROR
+                    status_message=output,  # can be any string (e.g. stringified stack trace or error body)
+                )
+                print(f"SUCCESSFULLY LOGGED ERROR")
+            else:
                 # get generation_id
                 generation_id = None
-            if response_obj.get("id", None) is not None:
-                generation_id = litellm.utils.get_logging_id(start_time, response_obj)
+                if (
+                    response_obj is not None
+                    and response_obj.get("id", None) is not None
+                ):
+                    generation_id = litellm.utils.get_logging_id(
+                        start_time, response_obj
+                    )
                 trace.generation(
                     name=generation_name,
                     id=metadata.get("generation_id", generation_id),

@@ -246,3 +291,5 @@ class LangFuseLogger:
                     },
                     metadata=metadata,
                 )
+        except Exception as e:
+            print(f"Langfuse Layer Error - {traceback.format_exc()}")
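The error path added to LangFuseLogger above ends up calling trace.generation(level="ERROR", status_message=...). As a point of reference, here is a minimal standalone sketch of that Langfuse pattern, assuming Langfuse credentials are set in the environment; nothing here beyond the level/status_message kwargs is taken from this diff.

```python
from langfuse import Langfuse

# Assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY (and optionally LANGFUSE_HOST)
# are set in the environment; values and names here are illustrative.
langfuse = Langfuse()

trace = langfuse.trace(
    name="litellm-acompletion",
    user_id="user-id2",
)
# Mirror of the failure path above: no output, just an ERROR-level generation
# carrying the stringified exception as the status message.
trace.generation(
    level="ERROR",  # can be any of DEBUG, DEFAULT, WARNING or ERROR
    status_message="stringified stack trace or error body",
)
langfuse.flush()  # make sure the event is sent before the process exits
```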
@@ -146,7 +146,15 @@ def get_ollama_response(
             optional_params[k] = v

     stream = optional_params.pop("stream", False)
-    data = {"model": model, "prompt": prompt, "options": optional_params}
+    format = optional_params.pop("format", None)
+    data = {
+        "model": model,
+        "prompt": prompt,
+        "options": optional_params,
+        "stream": stream,
+    }
+    if format is not None:
+        data["format"] = format

     ## LOGGING
     logging_obj.pre_call(
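The Ollama change above pops an optional format parameter and forwards it in the request body. A hedged sketch of how a caller might exercise it through litellm.completion, assuming a local Ollama server; the model name and api_base are illustrative assumptions.

```python
import litellm

# Hypothetical local model; any model served by your Ollama instance works.
response = litellm.completion(
    model="ollama/llama2",
    messages=[{"role": "user", "content": "Return a JSON object with a 'city' key."}],
    format="json",  # forwarded to Ollama as data["format"], per the hunk above
    api_base="http://localhost:11434",
)
print(response["choices"][0]["message"]["content"])
```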
@@ -146,7 +146,15 @@ def get_ollama_response(
             optional_params[k] = v

     stream = optional_params.pop("stream", False)
-    data = {"model": model, "messages": messages, "options": optional_params}
+    format = optional_params.pop("format", None)
+    data = {
+        "model": model,
+        "messages": messages,
+        "options": optional_params,
+        "stream": stream,
+    }
+    if format is not None:
+        data["format"] = format
     ## LOGGING
     logging_obj.pre_call(
         input=None,

@@ -320,11 +328,15 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
             model_response["choices"][0]["message"] = message
         else:
             model_response["choices"][0]["message"] = response_json["message"]

         model_response["created"] = int(time.time())
-        model_response["model"] = "ollama/" + data["model"]
+        model_response["model"] = "ollama_chat/" + data["model"]
         prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"]))  # type: ignore
         completion_tokens = response_json.get(
-            "eval_count", litellm.token_counter(text=response_json["message"])
+            "eval_count",
+            litellm.token_counter(
+                text=response_json["message"]["content"], count_response_tokens=True
+            ),
         )
         model_response["usage"] = litellm.Usage(
             prompt_tokens=prompt_tokens,
@@ -263,6 +263,7 @@ async def acompletion(
         or custom_llm_provider == "ollama"
         or custom_llm_provider == "ollama_chat"
         or custom_llm_provider == "vertex_ai"
+        or custom_llm_provider in litellm.openai_compatible_providers
     ):  # currently implemented aiohttp calls for just azure, openai, hf, ollama, vertex ai soon all.
         init_response = await loop.run_in_executor(None, func_with_context)
         if isinstance(init_response, dict) or isinstance(

@@ -3319,6 +3320,10 @@ async def ahealth_check(
         response = {}  # args like remaining ratelimit etc.
         return response
     except Exception as e:
+        if model not in litellm.model_cost and mode is None:
+            raise Exception(
+                "Missing `mode`. Set the `mode` for the model - https://docs.litellm.ai/docs/proxy/health#embedding-models"
+            )
         return {"error": str(e)}

(One file's diff was suppressed because it is too large.)
@@ -78,7 +78,9 @@ litellm_settings:
       type: "redis-semantic"
       similarity_threshold: 0.8
       redis_semantic_cache_embedding_model: azure-embedding-model
-  # cache: True
+  upperbound_key_generate_params:
+    max_budget: 100
+    duration: "30d"
   # setting callback class
   # callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance]
@@ -636,6 +636,36 @@ async def user_api_key_auth(
                 raise Exception(
                     f"Only master key can be used to generate, delete, update or get info for new keys/users. Value of allow_user_auth={allow_user_auth}"
                 )
+
+        # check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
+        # sso/login, ui/login, /key functions and /user functions
+        # this will never be allowed to call /chat/completions
+        token_team = getattr(valid_token, "team_id", None)
+        if token_team is not None:
+            if token_team == "litellm-dashboard":
+                # this token is only used for managing the ui
+                allowed_routes = [
+                    "/sso",
+                    "/login",
+                    "/key",
+                    "/spend",
+                    "/user",
+                ]
+                # check if the current route startswith any of the allowed routes
+                if (
+                    route is not None
+                    and isinstance(route, str)
+                    and any(
+                        route.startswith(allowed_route)
+                        for allowed_route in allowed_routes
+                    )
+                ):
+                    # Do something if the current route starts with any of the allowed routes
+                    pass
+                else:
+                    raise Exception(
+                        f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed"
+                    )
         return UserAPIKeyAuth(api_key=api_key, **valid_token_dict)
     else:
         raise Exception(f"Invalid Key Passed to LiteLLM Proxy")

@@ -758,9 +788,10 @@ async def _PROXY_track_cost_callback(
             verbose_proxy_logger.info(
                 f"response_cost {response_cost}, for user_id {user_id}"
             )
-            if user_api_key and (
-                prisma_client is not None or custom_db_client is not None
-            ):
+            verbose_proxy_logger.debug(
+                f"user_api_key {user_api_key}, prisma_client: {prisma_client}, custom_db_client: {custom_db_client}"
+            )
+            if user_api_key is not None:
                 await update_database(
                     token=user_api_key,
                     response_cost=response_cost,

@@ -770,6 +801,8 @@ async def _PROXY_track_cost_callback(
                     start_time=start_time,
                     end_time=end_time,
                 )
+            else:
+                raise Exception("User API key missing from custom callback.")
         else:
             if kwargs["stream"] != True or (
                 kwargs["stream"] == True
@@ -1361,6 +1394,26 @@ class ProxyConfig:
 proxy_config = ProxyConfig()


+def _duration_in_seconds(duration: str):
+    match = re.match(r"(\d+)([smhd]?)", duration)
+    if not match:
+        raise ValueError("Invalid duration format")
+
+    value, unit = match.groups()
+    value = int(value)
+
+    if unit == "s":
+        return value
+    elif unit == "m":
+        return value * 60
+    elif unit == "h":
+        return value * 3600
+    elif unit == "d":
+        return value * 86400
+    else:
+        raise ValueError("Unsupported duration unit")
+
+
 async def generate_key_helper_fn(
     duration: Optional[str],
     models: list,
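A quick standalone sketch of the _duration_in_seconds helper added above, with a few worked conversions; the function body mirrors the hunk, while the asserts are illustrative.

```python
import re


def _duration_in_seconds(duration: str):
    # Same logic as the helper above: "<int><unit>" where unit is s/m/h/d.
    match = re.match(r"(\d+)([smhd]?)", duration)
    if not match:
        raise ValueError("Invalid duration format")
    value, unit = match.groups()
    value = int(value)
    if unit == "s":
        return value
    elif unit == "m":
        return value * 60
    elif unit == "h":
        return value * 3600
    elif unit == "d":
        return value * 86400
    else:
        raise ValueError("Unsupported duration unit")


assert _duration_in_seconds("30s") == 30
assert _duration_in_seconds("30m") == 1800
assert _duration_in_seconds("30d") == 2_592_000  # 30 * 86400
```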
@@ -1395,25 +1448,6 @@ async def generate_key_helper_fn(
     if token is None:
         token = f"sk-{secrets.token_urlsafe(16)}"

-    def _duration_in_seconds(duration: str):
-        match = re.match(r"(\d+)([smhd]?)", duration)
-        if not match:
-            raise ValueError("Invalid duration format")
-
-        value, unit = match.groups()
-        value = int(value)
-
-        if unit == "s":
-            return value
-        elif unit == "m":
-            return value * 60
-        elif unit == "h":
-            return value * 3600
-        elif unit == "d":
-            return value * 86400
-        else:
-            raise ValueError("Unsupported duration unit")
-
     if duration is None:  # allow tokens that never expire
         expires = None
     else:
@@ -2630,6 +2664,36 @@ async def generate_key_fn(
             elif key == "metadata" and value == {}:
                 setattr(data, key, litellm.default_key_generate_params.get(key, {}))

+    # check if user set default key/generate params on config.yaml
+    if litellm.upperbound_key_generate_params is not None:
+        for elem in data:
+            # if key in litellm.upperbound_key_generate_params, use the min of value and litellm.upperbound_key_generate_params[key]
+            key, value = elem
+            if value is not None and key in litellm.upperbound_key_generate_params:
+                # if value is float/int
+                if key in [
+                    "max_budget",
+                    "max_parallel_requests",
+                    "tpm_limit",
+                    "rpm_limit",
+                ]:
+                    if value > litellm.upperbound_key_generate_params[key]:
+                        # directly compare floats/ints
+                        setattr(
+                            data, key, litellm.upperbound_key_generate_params[key]
+                        )
+                elif key == "budget_duration":
+                    # budgets are in 1s, 1m, 1h, 1d, 1m (30s, 30m, 30h, 30d, 30m)
+                    # compare the duration in seconds and max duration in seconds
+                    upperbound_budget_duration = _duration_in_seconds(
+                        duration=litellm.upperbound_key_generate_params[key]
+                    )
+                    user_set_budget_duration = _duration_in_seconds(duration=value)
+                    if user_set_budget_duration > upperbound_budget_duration:
+                        setattr(
+                            data, key, litellm.upperbound_key_generate_params[key]
+                        )
+
     data_json = data.json()  # type: ignore

     # if we get max_budget passed to /key/generate, then use it as key_max_budget. Since generate_key_helper_fn is used to make new users
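The block above clamps numeric params directly and compares budget_duration values after converting them to seconds. Below is a simplified, self-contained restatement of that rule, not the proxy's actual code path; the example values mirror the test_upperbound_key_params test added later in this diff.

```python
import re

UNIT_SECONDS = {"s": 1, "m": 60, "h": 3600, "d": 86400}


def _seconds(duration: str) -> int:
    # Compact stand-in for the _duration_in_seconds helper shown earlier.
    value, unit = re.match(r"(\d+)([smhd])", duration).groups()
    return int(value) * UNIT_SECONDS[unit]


def clamp_key_params(requested: dict, upperbounds: dict) -> dict:
    """Simplified sketch of the clamping rules above (illustrative only)."""
    clamped = dict(requested)
    for key, value in requested.items():
        if value is None or key not in upperbounds:
            continue
        if key in ("max_budget", "max_parallel_requests", "tpm_limit", "rpm_limit"):
            # Numeric params: cap at the configured upper bound.
            if value > upperbounds[key]:
                clamped[key] = upperbounds[key]
        elif key == "budget_duration":
            # Durations: compare after converting "30d"-style strings to seconds.
            if _seconds(value) > _seconds(upperbounds[key]):
                clamped[key] = upperbounds[key]
    return clamped


print(clamp_key_params(
    {"max_budget": 200000, "budget_duration": "30d"},
    {"max_budget": 0.001, "budget_duration": "1m"},
))
# -> {'max_budget': 0.001, 'budget_duration': '1m'}
```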
@@ -9,21 +9,11 @@ model_list:
     api_key: os.environ/AZURE_CANADA_API_KEY
     model: azure/gpt-35-turbo
   model_name: azure-model
-- litellm_params:
-    api_base: https://gateway.ai.cloudflare.com/v1/0399b10e77ac6668c80404a5ff49eb37/litellm-test/azure-openai/openai-gpt-4-test-v-1
-    api_key: os.environ/AZURE_API_KEY
-    model: azure/chatgpt-v-2
-  model_name: azure-cloudflare-model
 - litellm_params:
     api_base: https://openai-france-1234.openai.azure.com
     api_key: os.environ/AZURE_FRANCE_API_KEY
     model: azure/gpt-turbo
   model_name: azure-model
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-  model_name: test_openai_models
 - litellm_params:
     model: gpt-3.5-turbo
   model_info:

@@ -36,93 +26,8 @@ model_list:
     description: this is a test openai model
     id: 4d1ee26c-abca-450c-8744-8e87fd6755e9
   model_name: test_openai_models
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 00e19c0f-b63d-42bb-88e9-016fb0c60764
-  model_name: test_openai_models
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 79fc75bf-8e1b-47d5-8d24-9365a854af03
-  model_name: test_openai_models
-- litellm_params:
-    api_base: os.environ/AZURE_API_BASE
-    api_key: os.environ/AZURE_API_KEY
-    api_version: 2023-07-01-preview
-    model: azure/azure-embedding-model
-  model_info:
-    mode: embedding
-  model_name: azure-embedding-model
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 55848c55-4162-40f9-a6e2-9a722b9ef404
-  model_name: test_openai_models
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 34339b1e-e030-4bcc-a531-c48559f10ce4
-  model_name: test_openai_models
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: f6f74e14-ac64-4403-9365-319e584dcdc5
-  model_name: test_openai_models
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 9b1ef341-322c-410a-8992-903987fef439
-  model_name: test_openai_models
 - litellm_params:
     model: bedrock/amazon.titan-embed-text-v1
   model_info:
     mode: embedding
   model_name: amazon-embeddings
-- litellm_params:
-    model: sagemaker/berri-benchmarking-gpt-j-6b-fp16
-  model_info:
-    mode: embedding
-  model_name: GPT-J 6B - Sagemaker Text Embedding (Internal)
-- litellm_params:
-    model: dall-e-3
-  model_info:
-    mode: image_generation
-  model_name: dall-e-3
-- litellm_params:
-    api_base: os.environ/AZURE_SWEDEN_API_BASE
-    api_key: os.environ/AZURE_SWEDEN_API_KEY
-    api_version: 2023-12-01-preview
-    model: azure/dall-e-3-test
-  model_info:
-    mode: image_generation
-  model_name: dall-e-3
-- litellm_params:
-    api_base: os.environ/AZURE_API_BASE
-    api_key: os.environ/AZURE_API_KEY
-    api_version: 2023-06-01-preview
-    model: azure/
-  model_info:
-    mode: image_generation
-  model_name: dall-e-2
-- litellm_params:
-    api_base: os.environ/AZURE_API_BASE
-    api_key: os.environ/AZURE_API_KEY
-    api_version: 2023-07-01-preview
-    model: azure/azure-embedding-model
-  model_info:
-    base_model: text-embedding-ada-002
-    mode: embedding
-  model_name: text-embedding-ada-002
-- litellm_params:
-    model: gpt-3.5-turbo
-  model_info:
-    description: this is a test openai model
-    id: 34cb2419-7c63-44ae-a189-53f1d1ce5953
-  model_name: test_openai_models
@@ -490,8 +490,13 @@ def test_dynamo_db_migration(custom_db_client):
     try:

         async def test():
+            request = GenerateKeyRequest(max_budget=1)
+            key = await generate_key_fn(request)
+            print(key)
+
+            generated_key = key.key
             bearer_token = (
-                "Bearer " + "sk-elJDL2pOEjcAuC7zD4psAg"
+                "Bearer " + generated_key
             )  # this works with ishaan's db, it's a never expiring key

             request = Request(scope={"type": "http"})
@@ -44,6 +44,7 @@ from litellm.proxy.proxy_server import (
     info_key_fn,
     update_key_fn,
     generate_key_fn,
+    generate_key_helper_fn,
     spend_user_fn,
     spend_key_fn,
     view_spend_logs,

@@ -1278,6 +1279,40 @@ async def test_default_key_params(prisma_client):
         pytest.fail(f"Got exception {e}")


+@pytest.mark.asyncio()
+async def test_upperbound_key_params(prisma_client):
+    """
+    - create key
+    - get key info
+    - assert key_name is not null
+    """
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
+    litellm.upperbound_key_generate_params = {
+        "max_budget": 0.001,
+        "budget_duration": "1m",
+    }
+    await litellm.proxy.proxy_server.prisma_client.connect()
+    try:
+        request = GenerateKeyRequest(
+            max_budget=200000,
+            budget_duration="30d",
+        )
+        key = await generate_key_fn(request)
+        generated_key = key.key
+
+        result = await info_key_fn(key=generated_key)
+        key_info = result["info"]
+        # assert it used the upper bound for max_budget, and budget_duration
+        assert key_info["max_budget"] == 0.001
+        assert key_info["budget_duration"] == "1m"
+
+        print(result)
+    except Exception as e:
+        print("Got Exception", e)
+        pytest.fail(f"Got exception {e}")
+
+
 def test_get_bearer_token():
     from litellm.proxy.proxy_server import _get_bearer_token

@@ -1378,3 +1413,35 @@ async def test_user_api_key_auth_without_master_key(prisma_client):
     except Exception as e:
         print("Got Exception", e)
         pytest.fail(f"Got exception {e}")
+
+
+@pytest.mark.asyncio
+async def test_key_with_no_permissions(prisma_client):
+    """
+    - create key
+    - get key info
+    - assert key_name is null
+    """
+    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
+    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
+    setattr(litellm.proxy.proxy_server, "general_settings", {"allow_user_auth": False})
+    await litellm.proxy.proxy_server.prisma_client.connect()
+    try:
+        response = await generate_key_helper_fn(
+            **{"duration": "1hr", "key_max_budget": 0, "models": [], "aliases": {}, "config": {}, "spend": 0, "user_id": "ishaan", "team_id": "litellm-dashboard"}  # type: ignore
+        )
+
+        print(response)
+        key = response["token"]
+
+        # make a /chat/completions call -> it should fail
+        request = Request(scope={"type": "http"})
+        request._url = URL(url="/chat/completions")
+
+        # use generated key to auth in
+        result = await user_api_key_auth(request=request, api_key="Bearer " + key)
+        print("result from user auth with new key", result)
+        pytest.fail(f"This should have failed!. IT's an invalid key")
+    except Exception as e:
+        print("Got Exception", e)
+        print(e.message)
@@ -379,6 +379,7 @@ async def test_normal_router_tpm_limit():
         )

     except Exception as e:
+        print("Exception on test_normal_router_tpm_limit", e)
         assert e.status_code == 429

@@ -33,6 +33,11 @@ def test_proxy_gunicorn_startup_direct_config():
     Test both approaches
     """
     try:
+        from litellm._logging import verbose_proxy_logger, verbose_router_logger
+        import logging
+
+        verbose_proxy_logger.setLevel(level=logging.DEBUG)
+        verbose_router_logger.setLevel(level=logging.DEBUG)
         filepath = os.path.dirname(os.path.abspath(__file__))
         # test with worker_config = config yaml
         config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml"

@@ -48,6 +53,11 @@ def test_proxy_gunicorn_startup_direct_config():

 def test_proxy_gunicorn_startup_config_dict():
     try:
+        from litellm._logging import verbose_proxy_logger, verbose_router_logger
+        import logging
+
+        verbose_proxy_logger.setLevel(level=logging.DEBUG)
+        verbose_router_logger.setLevel(level=logging.DEBUG)
         filepath = os.path.dirname(os.path.abspath(__file__))
         # test with worker_config = config yaml
         config_fp = f"{filepath}/test_configs/test_config_no_auth.yaml"
@@ -980,12 +980,9 @@ class Logging:
             self.model_call_details["log_event_type"] = "post_api_call"

             # User Logging -> if you pass in a custom logging function
-            verbose_logger.debug(
+            print_verbose(
                 f"RAW RESPONSE:\n{self.model_call_details.get('original_response', self.model_call_details)}\n\n"
             )
-            verbose_logger.debug(
-                f"Logging Details Post-API Call: LiteLLM Params: {self.model_call_details}"
-            )
             if self.logger_fn and callable(self.logger_fn):
                 try:
                     self.logger_fn(

@@ -1636,34 +1633,6 @@ class Logging:
                             end_time=end_time,
                             print_verbose=print_verbose,
                         )
-                    if callback == "langfuse":
-                        global langFuseLogger
-                        print_verbose("reaches Async langfuse for logging!")
-                        kwargs = {}
-                        for k, v in self.model_call_details.items():
-                            if (
-                                k != "original_response"
-                            ):  # copy.deepcopy raises errors as this could be a coroutine
-                                kwargs[k] = v
-                        # this only logs streaming once, complete_streaming_response exists i.e when stream ends
-                        if self.stream:
-                            if "complete_streaming_response" not in kwargs:
-                                return
-                            else:
-                                print_verbose(
-                                    "reaches Async langfuse for streaming logging!"
-                                )
-                                result = kwargs["complete_streaming_response"]
-                        if langFuseLogger is None:
-                            langFuseLogger = LangFuseLogger()
-                        await langFuseLogger._async_log_event(
-                            kwargs=kwargs,
-                            response_obj=result,
-                            start_time=start_time,
-                            end_time=end_time,
-                            user_id=kwargs.get("user", None),
-                            print_verbose=print_verbose,
-                        )
                 except:
                     print_verbose(
                         f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"

@@ -1788,9 +1757,37 @@ class Logging:
                             response_obj=result,
                             kwargs=self.model_call_details,
                         )
+                    elif callback == "langfuse":
+                        global langFuseLogger
+                        verbose_logger.debug("reaches langfuse for logging!")
+                        kwargs = {}
+                        for k, v in self.model_call_details.items():
+                            if (
+                                k != "original_response"
+                            ):  # copy.deepcopy raises errors as this could be a coroutine
+                                kwargs[k] = v
+                        # this only logs streaming once, complete_streaming_response exists i.e when stream ends
+                        if langFuseLogger is None or (
+                            self.langfuse_public_key != langFuseLogger.public_key
+                            and self.langfuse_secret != langFuseLogger.secret_key
+                        ):
+                            langFuseLogger = LangFuseLogger(
+                                langfuse_public_key=self.langfuse_public_key,
+                                langfuse_secret=self.langfuse_secret,
+                            )
+                        langFuseLogger.log_event(
+                            start_time=start_time,
+                            end_time=end_time,
+                            response_obj=None,
+                            user_id=kwargs.get("user", None),
+                            print_verbose=print_verbose,
+                            status_message=str(exception),
+                            level="ERROR",
+                            kwargs=self.model_call_details,
+                        )
                 except Exception as e:
                     print_verbose(
-                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {traceback.format_exc()}"
+                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while failure logging with integrations {str(e)}"
                     )
                     print_verbose(
                         f"LiteLLM.Logging: is sentry capture exception initialized {capture_exception}"

@@ -3860,6 +3857,8 @@ def get_optional_params(
         and custom_llm_provider != "text-completion-openai"
         and custom_llm_provider != "azure"
         and custom_llm_provider != "vertex_ai"
+        and custom_llm_provider != "anyscale"
+        and custom_llm_provider != "together_ai"
     ):
         if custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat":
             # ollama actually supports json output

@@ -3878,11 +3877,6 @@ def get_optional_params(
                 optional_params[
                     "functions_unsupported_model"
                 ] = non_default_params.pop("functions")
-        elif (
-            custom_llm_provider == "anyscale"
-            and model == "mistralai/Mistral-7B-Instruct-v0.1"
-        ):  # anyscale just supports function calling with mistral
-            pass
         elif (
             litellm.add_function_to_prompt
         ):  # if user opts to add it to prompt instead

@@ -4095,6 +4089,8 @@ def get_optional_params(
             "top_p",
             "stop",
             "frequency_penalty",
+            "tools",
+            "tool_choice",
         ]
         _check_valid_arg(supported_params=supported_params)

@@ -4112,6 +4108,10 @@ def get_optional_params(
             ] = frequency_penalty  # https://docs.together.ai/reference/inference
         if stop is not None:
             optional_params["stop"] = stop
+        if tools is not None:
+            optional_params["tools"] = tools
+        if tool_choice is not None:
+            optional_params["tool_choice"] = tool_choice
     elif custom_llm_provider == "ai21":
         ## check if unsupported param passed in
         supported_params = [
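The last two hunks add tools and tool_choice to the supported OpenAI params for together_ai. A hedged sketch of what that enables via litellm.completion; the model name, tool schema, and environment variable are illustrative assumptions, not taken from this diff.

```python
import litellm

# Assumes a Together AI key is configured in the environment (e.g. TOGETHERAI_API_KEY);
# the model name below is illustrative.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]

response = litellm.completion(
    model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,         # now passed through for together_ai
    tool_choice="auto",  # now passed through for together_ai
)
print(response["choices"][0]["message"])
```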
@@ -156,8 +156,8 @@
     "max_tokens": 4097,
     "max_input_tokens": 4097,
     "max_output_tokens": 4096,
-    "input_cost_per_token": 0.000012,
-    "output_cost_per_token": 0.000016,
+    "input_cost_per_token": 0.000003,
+    "output_cost_per_token": 0.000006,
     "litellm_provider": "openai",
     "mode": "chat"
 },
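The pricing change above can be sanity-checked with simple arithmetic; a minimal sketch (token counts are illustrative):

```python
# Cost of a hypothetical call against the updated entry.
input_cost_per_token = 0.000003
output_cost_per_token = 0.000006

prompt_tokens = 1000
completion_tokens = 500

cost = prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token
print(f"${cost:.4f}")  # 1000*0.000003 + 500*0.000006 = $0.006, versus $0.02 at the old prices
```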
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.22.4"
+version = "1.22.8"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"

@@ -69,7 +69,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.22.4"
+version = "1.22.8"
 version_files = [
     "pyproject.toml:^version"
 ]
@@ -13,7 +13,7 @@ redisvl==0.0.7 # semantic caching
 numpy==1.24.3 # semantic caching
 prisma==0.11.0 # for db
 mangum==0.17.0 # for aws lambda functions
-google-generativeai==0.1.0 # for vertex ai calls
+google-generativeai==0.3.2 # for vertex ai calls
 async_generator==1.10.0 # for async ollama calls
 traceloop-sdk==0.5.3 # for open telemetry logging
 langfuse>=2.6.3 # for langfuse self-hosted logging
@@ -5,8 +5,8 @@ import "./globals.css";
 const inter = Inter({ subsets: ["latin"] });

 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "🚅 LiteLLM",
+  description: "LiteLLM Proxy Admin UI",
 };

 export default function RootLayout({