Merge branch 'main' into litellm_aporio_integration

Krish Dholakia, 2024-07-17 22:14:29 -07:00 (committed by GitHub)
commit 77656d9f11
19 changed files with 512 additions and 142 deletions


@@ -72,7 +72,7 @@ Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/gett
To use Helicone as a proxy for your LLM requests:
1. Set Helicone as your base URL via: litellm.api_base
-2. Pass in Helicone request headers via: litellm.headers
+2. Pass in Helicone request headers via: litellm.metadata
Complete Code:
@@ -99,7 +99,7 @@ print(response)
You can add custom metadata and properties to your requests using Helicone headers. Here are some examples:
```python
-litellm.headers = {
+litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
    "Helicone-User-Id": "user-abc",  # Specify the user making the request
    "Helicone-Property-App": "web",  # Custom property to add additional information
@@ -127,7 +127,7 @@ litellm.headers = {
Enable caching and set up rate limiting policies:
```python
-litellm.headers = {
+litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
    "Helicone-Cache-Enabled": "true",  # Enable caching of responses
    "Cache-Control": "max-age=3600",  # Set cache limit to 1 hour
@@ -140,7 +140,7 @@ litellm.headers = {
Track multi-step and agentic LLM interactions using session IDs and paths:
```python
-litellm.headers = {
+litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
    "Helicone-Session-Id": "session-abc-123",  # The session ID you want to track
    "Helicone-Session-Path": "parent-trace/child-trace",  # The path of the session
@@ -157,7 +157,7 @@ By using these two headers, you can effectively group and visualize multi-step L
Set up retry mechanisms and fallback options:
```python
-litellm.headers = {
+litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
    "Helicone-Retry-Enabled": "true",  # Enable retry mechanism
    "helicone-retry-num": "3",  # Set number of retries


@@ -14,7 +14,7 @@ https://github.com/BerriAI/litellm
An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/

-<Image img={require('../../img/langsmith.png')} />
+<Image img={require('../../img/langsmith_new.png')} />

:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or


@@ -5,6 +5,7 @@ Log Proxy input, output, and exceptions using:
- Langfuse
- OpenTelemetry
- Custom Callbacks
+- Langsmith
- DataDog
- DynamoDB
- s3 Bucket
@@ -1086,6 +1087,50 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Langsmith
1. Set `success_callback: ["langsmith"]` on litellm config.yaml
If you're using a custom LangSmith instance, you can set the
`LANGSMITH_BASE_URL` environment variable to point to your instance.
```yaml
litellm_settings:
  success_callback: ["langsmith"]

environment_variables:
  LANGSMITH_API_KEY: "lsv2_pt_xxxxxxxx"
  LANGSMITH_PROJECT: "litellm-proxy"
  LANGSMITH_BASE_URL: "https://api.smith.langchain.com" # (Optional - only needed if you have a custom Langsmith instance)
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "model": "fake-openai-endpoint",
    "messages": [
        {
            "role": "user",
            "content": "Hello, Claude gm!"
        }
    ],
}
'
```
Expect to see your log on Langsmith
<Image img={require('../../img/langsmith_new.png')} />
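
The same check can be run through any OpenAI-compatible client pointed at the proxy; a sketch, assuming the proxy listens on 0.0.0.0:4000 with the master key used in this commit's example config:

```python
# Sketch of the "Test it!" step using the OpenAI SDK instead of curl.
# Assumes: proxy at http://0.0.0.0:4000, master key "sk-1234" (from the example config).
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
)
print(response)
```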
## Logging LLM IO to Galileo
[BETA]

(Binary image file added, 353 KiB; not shown.)


@@ -38,7 +38,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal[
-    "lago", "openmeter", "logfire", "dynamic_rate_limiter"
+    "lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[


@@ -1,13 +1,43 @@
#### What this does ####
# On success, logs events to Langsmith
-import dotenv, os  # type: ignore
-import requests  # type: ignore
-from datetime import datetime
-import traceback
import asyncio
+import os
+import traceback
import types
+from datetime import datetime
+from typing import Any, List, Optional, Union
+
+import dotenv  # type: ignore
+import httpx
+import requests  # type: ignore
from pydantic import BaseModel  # type: ignore
+
+import litellm
+from litellm._logging import verbose_logger
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+
+
+class LangsmithInputs(BaseModel):
+    model: Optional[str] = None
+    messages: Optional[List[Any]] = None
+    stream: Optional[bool] = None
+    call_type: Optional[str] = None
+    litellm_call_id: Optional[str] = None
+    completion_start_time: Optional[datetime] = None
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = None
+    custom_llm_provider: Optional[str] = None
+    input: Optional[List[Any]] = None
+    log_event_type: Optional[str] = None
+    original_response: Optional[Any] = None
+    response_cost: Optional[float] = None
+
+    # LiteLLM Virtual Key specific fields
+    user_api_key: Optional[str] = None
+    user_api_key_user_id: Optional[str] = None
+    user_api_key_team_alias: Optional[str] = None


def is_serializable(value):
    non_serializable_types = (

@@ -19,7 +49,7 @@ def is_serializable(value):
    return not isinstance(value, non_serializable_types)


-class LangsmithLogger:
+class LangsmithLogger(CustomLogger):
    # Class variables or attributes
    def __init__(self):
        self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
@@ -27,71 +57,121 @@ class LangsmithLogger:
        self.langsmith_default_run_name = os.getenv(
            "LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
        )
+        self.langsmith_base_url = os.getenv(
+            "LANGSMITH_BASE_URL", "https://api.smith.langchain.com"
+        )
+        self.async_httpx_client = AsyncHTTPHandler(
+            timeout=httpx.Timeout(timeout=600.0, connect=5.0)
+        )

-    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
-        # Method definition
-        # inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
-        metadata = (
-            kwargs.get("litellm_params", {}).get("metadata", {}) or {}
-        )  # if metadata is None
-        # set project name and run_name for langsmith logging
-        # users can pass project_name and run name to litellm.completion()
-        # Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
-        # if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
+    def _prepare_log_data(self, kwargs, response_obj, start_time, end_time):
+        import datetime
+        from datetime import timezone
+
+        metadata = kwargs.get("litellm_params", {}).get("metadata", {}) or {}
+
+        kwargs["user_api_key"] = metadata.get("user_api_key", None)
+        kwargs["user_api_key_user_id"] = metadata.get("user_api_key_user_id", None)
+        kwargs["user_api_key_team_alias"] = metadata.get(
+            "user_api_key_team_alias", None
+        )
+
        project_name = metadata.get("project_name", self.langsmith_project)
        run_name = metadata.get("run_name", self.langsmith_default_run_name)
-        print_verbose(
+        run_id = metadata.get("id", None)
+        verbose_logger.debug(
            f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
        )
-        langsmith_base_url = os.getenv(
-            "LANGSMITH_BASE_URL", "https://api.smith.langchain.com"
-        )

        try:
-            print_verbose(
-                f"Langsmith Logging - Enters logging function for model {kwargs}"
-            )
-            import requests
-            import datetime
-            from datetime import timezone
-
-            try:
-                start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat()
-                end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat()
-            except:
-                start_time = datetime.datetime.utcnow().isoformat()
-                end_time = datetime.datetime.utcnow().isoformat()
+            start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat()
+            end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat()
+        except:
+            start_time = datetime.datetime.utcnow().isoformat()
+            end_time = datetime.datetime.utcnow().isoformat()

-            # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs
-            new_kwargs = {}
-            for key in kwargs:
-                value = kwargs[key]
-                if key == "start_time" or key == "end_time" or value is None:
-                    pass
-                elif type(value) == datetime.datetime:
-                    new_kwargs[key] = value.isoformat()
-                elif type(value) != dict and is_serializable(value=value):
-                    new_kwargs[key] = value
+        # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs
+        logged_kwargs = LangsmithInputs(**kwargs)
+        kwargs = logged_kwargs.model_dump()
+
+        new_kwargs = {}
+        for key in kwargs:
+            value = kwargs[key]
+            if key == "start_time" or key == "end_time" or value is None:
+                pass
+            elif key == "original_response" and not isinstance(value, str):
+                new_kwargs[key] = str(value)
+            elif type(value) == datetime.datetime:
+                new_kwargs[key] = value.isoformat()
+            elif type(value) != dict and is_serializable(value=value):
+                new_kwargs[key] = value
+            elif not is_serializable(value=value):
+                continue

-            if isinstance(response_obj, BaseModel):
-                try:
-                    response_obj = response_obj.model_dump()
-                except:
-                    response_obj = response_obj.dict()  # type: ignore
+        if isinstance(response_obj, BaseModel):
+            try:
+                response_obj = response_obj.model_dump()
+            except:
+                response_obj = response_obj.dict()  # type: ignore

-            data = {
-                "name": run_name,
-                "run_type": "llm",  # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
-                "inputs": new_kwargs,
-                "outputs": response_obj,
-                "session_name": project_name,
-                "start_time": start_time,
-                "end_time": end_time,
-            }
+        data = {
+            "name": run_name,
+            "run_type": "llm",  # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
+            "inputs": new_kwargs,
+            "outputs": response_obj,
+            "session_name": project_name,
+            "start_time": start_time,
+            "end_time": end_time,
+        }
+
+        if run_id:
+            data["id"] = run_id
+
+        verbose_logger.debug("Langsmith Logging data on langsmith: %s", data)
+
+        return data
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        try:
+            verbose_logger.debug(
+                "Langsmith Async Layer Logging - kwargs: %s, response_obj: %s",
+                kwargs,
+                response_obj,
+            )
+            data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
+            url = f"{self.langsmith_base_url}/runs"
+            verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...")
+
+            headers = {"x-api-key": self.langsmith_api_key}
+            response = await self.async_httpx_client.post(
+                url=url, json=data, headers=headers
+            )
+
+            if response.status_code >= 300:
+                verbose_logger.error(
+                    f"Langmsith Error: {response.status_code} - {response.text}"
+                )
+            else:
+                verbose_logger.debug(
+                    "Run successfully created, response=%s", response.text
+                )
+            verbose_logger.debug(
+                f"Langsmith Layer Logging - final response object: {response_obj}. Response text from langsmith={response.text}"
+            )
+        except:
+            verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}")
+
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        try:
+            verbose_logger.debug(
+                "Langsmith Sync Layer Logging - kwargs: %s, response_obj: %s",
+                kwargs,
+                response_obj,
+            )
+            data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
+            url = f"{self.langsmith_base_url}/runs"
+            verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...")

-            url = f"{langsmith_base_url}/runs"
-            print_verbose(f"Langsmith Logging - About to send data to {url} ...")
-
            response = requests.post(
                url=url,
                json=data,
@@ -99,12 +179,21 @@ class LangsmithLogger:
            )

            if response.status_code >= 300:
-                print_verbose(f"Error: {response.status_code}")
+                verbose_logger.error(f"Error: {response.status_code} - {response.text}")
            else:
-                print_verbose("Run successfully created")
-            print_verbose(
-                f"Langsmith Layer Logging - final response object: {response_obj}"
-            )
+                verbose_logger.debug("Run successfully created")
+            verbose_logger.debug(
+                f"Langsmith Layer Logging - final response object: {response_obj}. Response text from langsmith={response.text}"
+            )
        except:
-            print_verbose(f"Langsmith Layer Error - {traceback.format_exc()}")
-            pass
+            verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}")
+
+    def get_run_by_id(self, run_id):
+        url = f"{self.langsmith_base_url}/runs/{run_id}"
+        response = requests.get(
+            url=url,
+            headers={"x-api-key": self.langsmith_api_key},
+        )
+
+        return response.json()
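
A short sketch of how the refactored logger is driven from the client side, based on the metadata keys read in `_prepare_log_data` above (the model and metadata values are illustrative):

```python
import uuid
import litellm

litellm.callbacks = ["langsmith"]  # resolves to the CustomLogger-based LangsmithLogger

run_id = str(uuid.uuid4())
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "what llm are u"}],
    mock_response="hi",  # keeps the sketch offline
    metadata={
        "id": run_id,                          # becomes the Langsmith run id
        "run_name": "litellmRUN",              # overrides LANGSMITH_DEFAULT_RUN_NAME
        "project_name": "litellm-completion",  # overrides the configured project
    },
)

# get_run_by_id() can then fetch the logged run back from Langsmith.
```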


@@ -39,7 +39,6 @@ from litellm.utils import (
    add_breadcrumb,
    capture_exception,
    customLogger,
-    langsmithLogger,
    liteDebuggerClient,
    logfireLogger,
    lunaryLogger,
@@ -89,7 +88,6 @@ alerts_channel = None
heliconeLogger = None
athinaLogger = None
promptLayerLogger = None
-langsmithLogger = None
logfireLogger = None
weightsBiasesLogger = None
customLogger = None
@@ -136,7 +134,7 @@ in_memory_trace_id_cache = ServiceTraceIDCache()
class Logging:
-    global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
+    global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
    custom_pricing: bool = False
    stream_options = None
@@ -738,23 +736,6 @@ class Logging:
                        end_time=end_time,
                        print_verbose=print_verbose,
                    )
-                if callback == "langsmith":
-                    print_verbose("reaches langsmith for logging!")
-                    if self.stream:
-                        if "complete_streaming_response" not in kwargs:
-                            continue
-                        else:
-                            print_verbose(
-                                "reaches langsmith for streaming logging!"
-                            )
-                            result = kwargs["complete_streaming_response"]
-                    langsmithLogger.log_event(
-                        kwargs=self.model_call_details,
-                        response_obj=result,
-                        start_time=start_time,
-                        end_time=end_time,
-                        print_verbose=print_verbose,
-                    )
                if callback == "logfire":
                    global logfireLogger
                    verbose_logger.debug("reaches logfire for success logging!")
@@ -1337,7 +1318,14 @@ class Logging:
                if kwargs.get("no-log", False) == True:
                    print_verbose("no-log request, skipping logging")
                    continue
-                if callback == "cache" and litellm.cache is not None:
+                if (
+                    callback == "cache"
+                    and litellm.cache is not None
+                    and self.model_call_details.get("litellm_params", {}).get(
+                        "acompletion", False
+                    )
+                    is True
+                ):
                    # set_cache once complete streaming response is built
                    print_verbose("async success_callback: reaches cache for logging!")
                    kwargs = self.model_call_details
@@ -1417,6 +1405,9 @@ class Logging:
                        end_time=end_time,
                    )
                if callable(callback):  # custom logger functions
+                    global customLogger
+                    if customLogger is None:
+                        customLogger = CustomLogger()
                    if self.stream:
                        if (
                            "async_complete_streaming_response"
@@ -1822,7 +1813,7 @@ def set_callbacks(callback_list, function_id=None):
    """
    Globally sets the callback client
    """
-    global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
+    global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger

    try:
        for callback in callback_list:
@@ -1903,8 +1894,6 @@ def set_callbacks(callback_list, function_id=None):
                s3Logger = S3Logger()
            elif callback == "wandb":
                weightsBiasesLogger = WeightsBiasesLogger()
-            elif callback == "langsmith":
-                langsmithLogger = LangsmithLogger()
            elif callback == "logfire":
                logfireLogger = LogfireLogger()
            elif callback == "aispend":
@@ -1957,6 +1946,15 @@ def _init_custom_logger_compatible_class(
        _in_memory_loggers.append(_openmeter_logger)
        return _openmeter_logger  # type: ignore
+    elif logging_integration == "langsmith":
+        for callback in _in_memory_loggers:
+            if isinstance(callback, LangsmithLogger):
+                return callback  # type: ignore
+
+        _langsmith_logger = LangsmithLogger()
+        _in_memory_loggers.append(_langsmith_logger)
+        return _langsmith_logger  # type: ignore
    elif logging_integration == "galileo":
        for callback in _in_memory_loggers:
            if isinstance(callback, GalileoObserve):
@@ -2025,6 +2023,10 @@ def get_custom_logger_compatible_class(
        for callback in _in_memory_loggers:
            if isinstance(callback, GalileoObserve):
                return callback
+    elif logging_integration == "langsmith":
+        for callback in _in_memory_loggers:
+            if isinstance(callback, LangsmithLogger):
+                return callback
    elif logging_integration == "logfire":
        if "LOGFIRE_TOKEN" not in os.environ:
            raise ValueError("LOGFIRE_TOKEN not found in environment variables")


@@ -1020,6 +1020,26 @@
"mode": "chat",
"supports_function_calling": true
},
"groq/llama3-groq-70b-8192-tool-use-preview": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000089,
"output_cost_per_token": 0.00000089,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
},
"groq/llama3-groq-8b-8192-tool-use-preview": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000019,
"output_cost_per_token": 0.00000019,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
},
"friendliai/mixtral-8x7b-instruct-v0-1": { "friendliai/mixtral-8x7b-instruct-v0-1": {
"max_tokens": 32768, "max_tokens": 32768,
"max_input_tokens": 32768, "max_input_tokens": 32768,
@ -1800,6 +1820,26 @@
"supports_vision": true, "supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
}, },
"medlm-medium": {
"max_tokens": 8192,
"max_input_tokens": 32768,
"max_output_tokens": 8192,
"input_cost_per_character": 0.0000005,
"output_cost_per_character": 0.000001,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"medlm-large": {
"max_tokens": 1024,
"max_input_tokens": 8192,
"max_output_tokens": 1024,
"input_cost_per_character": 0.000005,
"output_cost_per_character": 0.000015,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"vertex_ai/claude-3-sonnet@20240229": { "vertex_ai/claude-3-sonnet@20240229": {
"max_tokens": 4096, "max_tokens": 4096,
"max_input_tokens": 200000, "max_input_tokens": 200000,


@@ -1,10 +1,5 @@
model_list:
-  - model_name: "*"
+  - model_name: llama-3
    litellm_params:
-      model: openai/*
+      model: gpt-4
+      request_timeout: 1

-litellm_settings:
-  guardrails:
-    - prompt_injection:
-        callbacks: ["aporio_prompt_injection"]
-        default_on: true


@@ -17,9 +17,7 @@ model_list:
general_settings:
  master_key: sk-1234
-  litellm_key_header_name: "X-Litellm-Key"

litellm_settings:
-  cache: true
-  callbacks: ["otel"]
+  success_callback: ["langsmith"]


@@ -718,6 +718,9 @@ class Router:
                    data.get(
                        "timeout", None
                    )  # timeout set on litellm_params for this deployment
+                    or data.get(
+                        "request_timeout", None
+                    )  # timeout set on litellm_params for this deployment
                    or self.timeout  # timeout set on router
                    or kwargs.get(
                        "timeout", None


@@ -1579,18 +1579,21 @@ async def test_redis_semantic_cache_acompletion():
    assert response1.id == response2.id


-def test_caching_redis_simple(caplog):
+def test_caching_redis_simple(caplog, capsys):
    """
    Relevant issue - https://github.com/BerriAI/litellm/issues/4511
    """
+    litellm.set_verbose = True  ## REQUIRED FOR TEST.
    litellm.cache = Cache(
        type="redis", url=os.getenv("REDIS_SSL_URL")
    )  # passing `supported_call_types = ["completion"]` has no effect

    s = time.time()
+    uuid_str = str(uuid.uuid4())
    x = completion(
-        model="gpt-4o",
-        messages=[{"role": "user", "content": "Hello, how are you? Wink"}],
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}],
        stream=True,
    )
    for m in x:
@@ -1599,8 +1602,8 @@ def test_caching_redis_simple(caplog):
    s2 = time.time()
    x = completion(
-        model="gpt-4o",
-        messages=[{"role": "user", "content": "Hello, how are you? Wink"}],
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}],
        stream=True,
    )
    for m in x:
@@ -1609,11 +1612,15 @@ def test_caching_redis_simple(caplog):
    redis_async_caching_error = False
    redis_service_logging_error = False
+    captured = capsys.readouterr()
    captured_logs = [rec.message for rec in caplog.records]

    print(f"captured_logs: {captured_logs}")
    for item in captured_logs:
-        if "Error connecting to Async Redis client" in item:
+        if (
+            "Error connecting to Async Redis client" in item
+            or "Set ASYNC Redis Cache" in item
+        ):
            redis_async_caching_error = True

        if "ServiceLogging.async_service_success_hook" in item:
@@ -1621,3 +1628,4 @@ def test_caching_redis_simple(caplog):
    assert redis_async_caching_error is False
    assert redis_service_logging_error is False
+    assert "async success_callback: reaches cache for logging" not in captured.out


@@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt

-# litellm.num_retries = 3
+# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"


@@ -706,6 +706,33 @@ def test_vertex_ai_completion_cost():
    print("calculated_input_cost: {}".format(calculated_input_cost))


+# @pytest.mark.skip(reason="new test - WIP, working on fixing this")
+def test_vertex_ai_medlm_completion_cost():
+    """Test for medlm completion cost."""
+
+    with pytest.raises(Exception) as e:
+        model = "vertex_ai/medlm-medium"
+        messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+        predictive_cost = completion_cost(
+            model=model, messages=messages, custom_llm_provider="vertex_ai"
+        )
+
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    model = "vertex_ai/medlm-medium"
+    messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+    predictive_cost = completion_cost(
+        model=model, messages=messages, custom_llm_provider="vertex_ai"
+    )
+    assert predictive_cost > 0
+
+    model = "vertex_ai/medlm-large"
+    messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+    predictive_cost = completion_cost(model=model, messages=messages)
+    assert predictive_cost > 0
+
+
def test_vertex_ai_claude_completion_cost():
    from litellm import Choices, Message, ModelResponse
    from litellm.utils import Usage


@@ -589,7 +589,7 @@ async def test_triton_embeddings():
        print(f"response: {response}")

        # stubbed endpoint is setup to return this
-        assert response.data[0]["embedding"] == [0.1, 0.2, 0.3]
+        assert response.data[0]["embedding"] == [0.1, 0.2]
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


@@ -1,70 +1,176 @@
-import sys
-import os
import io
+import os
+import sys

sys.path.insert(0, os.path.abspath("../.."))

-from litellm import completion
-import litellm
-litellm.success_callback = ["langsmith"]
+import asyncio
+import logging
+import uuid
+
+import pytest
+
+import litellm
+from litellm import completion
+from litellm._logging import verbose_logger
+from litellm.integrations.langsmith import LangsmithLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+
+verbose_logger.setLevel(logging.DEBUG)
+
litellm.set_verbose = True
import time


-def test_langsmith_logging():
+@pytest.mark.asyncio()
+async def test_async_langsmith_logging():
    try:
-        response = completion(
+        test_langsmith_logger = LangsmithLogger()
+        run_id = str(uuid.uuid4())
+        litellm.set_verbose = True
+        litellm.callbacks = ["langsmith"]
+        response = await litellm.acompletion(
            model="claude-instant-1.2",
            messages=[{"role": "user", "content": "what llm are u"}],
            max_tokens=10,
            temperature=0.2,
+            metadata={
+                "id": run_id,
+                "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
+                "user_api_key_alias": "ishaans-langmsith-key",
+                "user_api_end_user_max_budget": None,
+                "litellm_api_version": "1.40.19",
+                "global_max_parallel_requests": None,
+                "user_api_key_user_id": "admin",
+                "user_api_key_org_id": None,
+                "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709",
+                "user_api_key_team_alias": "testing-team",
+            },
        )
        print(response)
+        await asyncio.sleep(3)
+
+        print("run_id", run_id)
+        logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id)
+
+        print("logged_run_on_langsmith", logged_run_on_langsmith)
+
+        print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys())
+
+        input_fields_on_langsmith = logged_run_on_langsmith.get("inputs")
+        extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get(
+            "invocation_params"
+        )
+
+        print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith)
+
+        print("\nextra fields on langsmith", extra_fields_on_langsmith)
+
+        assert isinstance(input_fields_on_langsmith, dict)
+        assert "api_key" not in input_fields_on_langsmith
+        assert "api_key" not in extra_fields_on_langsmith
+
+        # assert user_api_key in extra_fields_on_langsmith
+        assert "user_api_key" in extra_fields_on_langsmith
+        assert "user_api_key_user_id" in extra_fields_on_langsmith
+        assert "user_api_key_team_alias" in extra_fields_on_langsmith
+
+        for cb in litellm.callbacks:
+            if isinstance(cb, LangsmithLogger):
+                await cb.async_httpx_client.client.aclose()
+        # test_langsmith_logger.async_httpx_client.close()
    except Exception as e:
        print(e)
+        pytest.fail(f"Error occurred: {e}")


# test_langsmith_logging()


-def test_langsmith_logging_with_metadata():
+def test_async_langsmith_logging_with_metadata():
    try:
+        litellm.success_callback = ["langsmith"]
+        litellm.set_verbose = True
        response = completion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "what llm are u"}],
            max_tokens=10,
            temperature=0.2,
-            metadata={
-                "run_name": "litellmRUN",
-                "project_name": "litellm-completion",
-            },
        )
        print(response)
+        time.sleep(3)
+
+        for cb in litellm.callbacks:
+            if isinstance(cb, LangsmithLogger):
+                cb.async_httpx_client.close()
    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
        print(e)


-# test_langsmith_logging_with_metadata()
-def test_langsmith_logging_with_streaming_and_metadata():
+@pytest.mark.parametrize("sync_mode", [False, True])
+@pytest.mark.asyncio
+async def test_async_langsmith_logging_with_streaming_and_metadata(sync_mode):
    try:
-        response = completion(
-            model="gpt-3.5-turbo",
-            messages=[{"role": "user", "content": "what llm are u"}],
-            max_tokens=10,
-            temperature=0.2,
-            metadata={
-                "run_name": "litellmRUN",
-                "project_name": "litellm-completion",
-            },
-            stream=True,
-        )
-        for chunk in response:
-            continue
+        test_langsmith_logger = LangsmithLogger()
+        litellm.success_callback = ["langsmith"]
+        litellm.set_verbose = True
+        run_id = str(uuid.uuid4())
+
+        messages = [{"role": "user", "content": "what llm are u"}]
+        if sync_mode is True:
+            response = completion(
+                model="gpt-3.5-turbo",
+                messages=messages,
+                max_tokens=10,
+                temperature=0.2,
+                stream=True,
+                metadata={"id": run_id},
+            )
+            for cb in litellm.callbacks:
+                if isinstance(cb, LangsmithLogger):
+                    cb.async_httpx_client = AsyncHTTPHandler()
+            for chunk in response:
+                continue
+            time.sleep(3)
+        else:
+            response = await litellm.acompletion(
+                model="gpt-3.5-turbo",
+                messages=messages,
+                max_tokens=10,
+                temperature=0.2,
+                mock_response="This is a mock request",
+                stream=True,
+                metadata={"id": run_id},
+            )
+            for cb in litellm.callbacks:
+                if isinstance(cb, LangsmithLogger):
+                    cb.async_httpx_client = AsyncHTTPHandler()
+            async for chunk in response:
+                continue
+            await asyncio.sleep(3)
+
+        print("run_id", run_id)
+        logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id)
+
+        print("logged_run_on_langsmith", logged_run_on_langsmith)
+
+        print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys())
+
+        input_fields_on_langsmith = logged_run_on_langsmith.get("inputs")
+
+        extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get(
+            "invocation_params"
+        )
+
+        assert logged_run_on_langsmith.get("run_type") == "llm"
+        print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith)
+
+        print("\nextra fields on langsmith", extra_fields_on_langsmith)
+
+        assert isinstance(input_fields_on_langsmith, dict)
    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
        print(e)

-
-test_langsmith_logging_with_streaming_and_metadata()


@@ -515,6 +515,7 @@ async def test_completion_predibase_streaming(sync_mode):
        response = completion(
            model="predibase/llama-3-8b-instruct",
            tenant_id="c4768f95",
+            max_tokens=10,
            api_base="https://serving.app.predibase.com",
            api_key=os.getenv("PREDIBASE_API_KEY"),
            messages=[{"role": "user", "content": "What is the meaning of life?"}],

@@ -539,6 +540,7 @@ async def test_completion_predibase_streaming(sync_mode):
        response = await litellm.acompletion(
            model="predibase/llama-3-8b-instruct",
            tenant_id="c4768f95",
+            max_tokens=10,
            api_base="https://serving.app.predibase.com",
            api_key=os.getenv("PREDIBASE_API_KEY"),
            messages=[{"role": "user", "content": "What is the meaning of life?"}],


@@ -417,6 +417,21 @@ def function_setup(
                    # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
                    litellm._async_success_callback.append(callback)
                    removed_async_items.append(index)
+                elif callback == "langsmith":
+                    callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class(  # type: ignore
+                        callback, internal_usage_cache=None, llm_router=None
+                    )
+
+                    # don't double add a callback
+                    if not any(
+                        isinstance(cb, type(callback_class)) for cb in litellm.callbacks
+                    ):
+                        litellm.callbacks.append(callback_class)  # type: ignore
+                        litellm.input_callback.append(callback_class)  # type: ignore
+                        litellm.success_callback.append(callback_class)  # type: ignore
+                        litellm.failure_callback.append(callback_class)  # type: ignore
+                        litellm._async_success_callback.append(callback_class)  # type: ignore
+                        litellm._async_failure_callback.append(callback_class)  # type: ignore

        # Pop the async items from success_callback in reverse order to avoid index issues
        for index in reversed(removed_async_items):
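
The effect of this branch is that declaring the string callback once is enough to register the class-based logger across the callback lists; a sketch of the assumed behavior:

```python
import litellm
from litellm.integrations.langsmith import LangsmithLogger

litellm.success_callback = ["langsmith"]

litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    mock_response="hello",  # mock response keeps this sketch offline
)

# function_setup should have swapped in a LangsmithLogger instance for the string.
assert any(isinstance(cb, LangsmithLogger) for cb in litellm.callbacks)
```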


@@ -1020,6 +1020,26 @@
"mode": "chat",
"supports_function_calling": true
},
"groq/llama3-groq-70b-8192-tool-use-preview": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000089,
"output_cost_per_token": 0.00000089,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
},
"groq/llama3-groq-8b-8192-tool-use-preview": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000019,
"output_cost_per_token": 0.00000019,
"litellm_provider": "groq",
"mode": "chat",
"supports_function_calling": true
},
"friendliai/mixtral-8x7b-instruct-v0-1": { "friendliai/mixtral-8x7b-instruct-v0-1": {
"max_tokens": 32768, "max_tokens": 32768,
"max_input_tokens": 32768, "max_input_tokens": 32768,
@ -1800,6 +1820,26 @@
"supports_vision": true, "supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
}, },
"medlm-medium": {
"max_tokens": 8192,
"max_input_tokens": 32768,
"max_output_tokens": 8192,
"input_cost_per_character": 0.0000005,
"output_cost_per_character": 0.000001,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"medlm-large": {
"max_tokens": 1024,
"max_input_tokens": 8192,
"max_output_tokens": 1024,
"input_cost_per_character": 0.000005,
"output_cost_per_character": 0.000015,
"litellm_provider": "vertex_ai-language-models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
"vertex_ai/claude-3-sonnet@20240229": { "vertex_ai/claude-3-sonnet@20240229": {
"max_tokens": 4096, "max_tokens": 4096,
"max_input_tokens": 200000, "max_input_tokens": 200000,