diff --git a/docs/my-website/docs/observability/helicone_integration.md b/docs/my-website/docs/observability/helicone_integration.md
index 57e7039fc..7e7f9fcb6 100644
--- a/docs/my-website/docs/observability/helicone_integration.md
+++ b/docs/my-website/docs/observability/helicone_integration.md
@@ -72,7 +72,7 @@ Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/gett
To use Helicone as a proxy for your LLM requests:
1. Set Helicone as your base URL via: litellm.api_base
-2. Pass in Helicone request headers via: litellm.headers
+2. Pass in Helicone request headers via: litellm.metadata
Complete Code:
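For illustration, a minimal sketch of the two steps above (the gateway URL here is an assumption for this sketch, not taken from this diff; the doc's own complete example follows in the file):

```python
import os

import litellm

# Step 1: route requests through Helicone's OpenAI-compatible gateway (assumed URL)
litellm.api_base = "https://oai.helicone.ai/v1"

# Step 2: pass Helicone request headers via litellm.metadata
litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
}

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from LiteLLM via Helicone"}],
)
print(response)
```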
@@ -99,7 +99,7 @@ print(response)
You can add custom metadata and properties to your requests using Helicone headers. Here are some examples:
```python
-litellm.headers = {
+litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-User-Id": "user-abc", # Specify the user making the request
"Helicone-Property-App": "web", # Custom property to add additional information
@@ -127,7 +127,7 @@ litellm.headers = {
Enable caching and set up rate limiting policies:
```python
-litellm.headers = {
+litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Cache-Enabled": "true", # Enable caching of responses
"Cache-Control": "max-age=3600", # Set cache limit to 1 hour
@@ -140,7 +140,7 @@ litellm.headers = {
Track multi-step and agentic LLM interactions using session IDs and paths:
```python
-litellm.headers = {
+litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Session-Id": "session-abc-123", # The session ID you want to track
"Helicone-Session-Path": "parent-trace/child-trace", # The path of the session
@@ -157,7 +157,7 @@ By using these two headers, you can effectively group and visualize multi-step L
Set up retry mechanisms and fallback options:
```python
-litellm.headers = {
+litellm.metadata = {
"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API
"Helicone-Retry-Enabled": "true", # Enable retry mechanism
"helicone-retry-num": "3", # Set number of retries
diff --git a/docs/my-website/docs/observability/langsmith_integration.md b/docs/my-website/docs/observability/langsmith_integration.md
index c038abd82..79d047e33 100644
--- a/docs/my-website/docs/observability/langsmith_integration.md
+++ b/docs/my-website/docs/observability/langsmith_integration.md
@@ -14,7 +14,7 @@ https://github.com/BerriAI/litellm
An all-in-one developer platform for every step of the application lifecycle
https://smith.langchain.com/
-
+
:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md
index 27f1789e0..0d5016645 100644
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@@ -5,6 +5,7 @@ Log Proxy input, output, and exceptions using:
- Langfuse
- OpenTelemetry
- Custom Callbacks
+- Langsmith
- DataDog
- DynamoDB
- s3 Bucket
@@ -1086,6 +1087,50 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
+## Logging LLM IO to Langsmith
+
+1. Set `success_callback: ["langsmith"]` on litellm config.yaml
+
+If you're using a custom LangSmith instance, you can set the
+`LANGSMITH_BASE_URL` environment variable to point to your instance.
+
+```yaml
+litellm_settings:
+ success_callback: ["langsmith"]
+
+environment_variables:
+ LANGSMITH_API_KEY: "lsv2_pt_xxxxxxxx"
+ LANGSMITH_PROJECT: "litellm-proxy"
+
+ LANGSMITH_BASE_URL: "https://api.smith.langchain.com" # (Optional - only needed if you have a custom Langsmith instance)
+```
+
+
+2. Start Proxy
+
+```shell
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "model": "fake-openai-endpoint",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Hello, Claude gm!"
+      }
+    ]
+}'
+```
+Expect to see your logs on Langsmith
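The same request can also be sent from Python with the OpenAI SDK pointed at the proxy; a sketch, assuming the proxy from step 2 is running on `http://0.0.0.0:4000` with the `sk-1234` master key used elsewhere in this diff:

```python
import openai

# Point the OpenAI client at the LiteLLM proxy (address and key are assumptions
# taken from the curl example above and proxy_config.yaml in this diff)
client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

response = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "Hello, Claude gm!"}],
)
print(response)
```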
+
+
## Logging LLM IO to Galileo
[BETA]
diff --git a/docs/my-website/img/langsmith_new.png b/docs/my-website/img/langsmith_new.png
new file mode 100644
index 000000000..d5586bdbe
Binary files /dev/null and b/docs/my-website/img/langsmith_new.png differ
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 645a0bccd..7dcc934a6 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -38,7 +38,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal[
- "lago", "openmeter", "logfire", "dynamic_rate_limiter"
+ "lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
diff --git a/litellm/integrations/langsmith.py b/litellm/integrations/langsmith.py
index 48185afee..81db798ae 100644
--- a/litellm/integrations/langsmith.py
+++ b/litellm/integrations/langsmith.py
@@ -1,13 +1,43 @@
#### What this does ####
# On success, logs events to Langsmith
-import dotenv, os # type: ignore
-import requests # type: ignore
-from datetime import datetime
-import traceback
import asyncio
+import os
+import traceback
import types
+from datetime import datetime
+from typing import Any, List, Optional, Union
+
+import dotenv # type: ignore
+import httpx
+import requests # type: ignore
from pydantic import BaseModel # type: ignore
+import litellm
+from litellm._logging import verbose_logger
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+
+
+class LangsmithInputs(BaseModel):
+ model: Optional[str] = None
+ messages: Optional[List[Any]] = None
+ stream: Optional[bool] = None
+ call_type: Optional[str] = None
+ litellm_call_id: Optional[str] = None
+ completion_start_time: Optional[datetime] = None
+ temperature: Optional[float] = None
+ max_tokens: Optional[int] = None
+ custom_llm_provider: Optional[str] = None
+ input: Optional[List[Any]] = None
+ log_event_type: Optional[str] = None
+ original_response: Optional[Any] = None
+ response_cost: Optional[float] = None
+
+ # LiteLLM Virtual Key specific fields
+ user_api_key: Optional[str] = None
+ user_api_key_user_id: Optional[str] = None
+ user_api_key_team_alias: Optional[str] = None
+
def is_serializable(value):
non_serializable_types = (
@@ -19,7 +49,7 @@ def is_serializable(value):
return not isinstance(value, non_serializable_types)
-class LangsmithLogger:
+class LangsmithLogger(CustomLogger):
# Class variables or attributes
def __init__(self):
self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
@@ -27,71 +57,121 @@ class LangsmithLogger:
self.langsmith_default_run_name = os.getenv(
"LANGSMITH_DEFAULT_RUN_NAME", "LLMRun"
)
+ self.langsmith_base_url = os.getenv(
+ "LANGSMITH_BASE_URL", "https://api.smith.langchain.com"
+ )
+ self.async_httpx_client = AsyncHTTPHandler(
+ timeout=httpx.Timeout(timeout=600.0, connect=5.0)
+ )
- def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
- # Method definition
- # inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb
- metadata = (
- kwargs.get("litellm_params", {}).get("metadata", {}) or {}
- ) # if metadata is None
+ def _prepare_log_data(self, kwargs, response_obj, start_time, end_time):
+ import datetime
+ from datetime import timezone
+
+ metadata = kwargs.get("litellm_params", {}).get("metadata", {}) or {}
+
+ kwargs["user_api_key"] = metadata.get("user_api_key", None)
+ kwargs["user_api_key_user_id"] = metadata.get("user_api_key_user_id", None)
+ kwargs["user_api_key_team_alias"] = metadata.get(
+ "user_api_key_team_alias", None
+ )
- # set project name and run_name for langsmith logging
- # users can pass project_name and run name to litellm.completion()
- # Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"})
- # if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun
project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", self.langsmith_default_run_name)
- print_verbose(
+ run_id = metadata.get("id", None)
+ verbose_logger.debug(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
)
- langsmith_base_url = os.getenv(
- "LANGSMITH_BASE_URL", "https://api.smith.langchain.com"
- )
try:
- print_verbose(
- f"Langsmith Logging - Enters logging function for model {kwargs}"
- )
- import requests
- import datetime
- from datetime import timezone
+ start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat()
+ end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat()
+ except:
+ start_time = datetime.datetime.utcnow().isoformat()
+ end_time = datetime.datetime.utcnow().isoformat()
+    # filter kwargs to exclude dicts - Langsmith throws an error when trying to log them
+ logged_kwargs = LangsmithInputs(**kwargs)
+ kwargs = logged_kwargs.model_dump()
+
+ new_kwargs = {}
+ for key in kwargs:
+ value = kwargs[key]
+ if key == "start_time" or key == "end_time" or value is None:
+ pass
+ elif key == "original_response" and not isinstance(value, str):
+ new_kwargs[key] = str(value)
+ elif type(value) == datetime.datetime:
+ new_kwargs[key] = value.isoformat()
+ elif type(value) != dict and is_serializable(value=value):
+ new_kwargs[key] = value
+ elif not is_serializable(value=value):
+ continue
+
+ if isinstance(response_obj, BaseModel):
try:
- start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat()
- end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat()
+ response_obj = response_obj.model_dump()
except:
- start_time = datetime.datetime.utcnow().isoformat()
- end_time = datetime.datetime.utcnow().isoformat()
+ response_obj = response_obj.dict() # type: ignore
- # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs
- new_kwargs = {}
- for key in kwargs:
- value = kwargs[key]
- if key == "start_time" or key == "end_time" or value is None:
- pass
- elif type(value) == datetime.datetime:
- new_kwargs[key] = value.isoformat()
- elif type(value) != dict and is_serializable(value=value):
- new_kwargs[key] = value
+ data = {
+ "name": run_name,
+ "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
+ "inputs": new_kwargs,
+ "outputs": response_obj,
+ "session_name": project_name,
+ "start_time": start_time,
+ "end_time": end_time,
+ }
- if isinstance(response_obj, BaseModel):
- try:
- response_obj = response_obj.model_dump()
- except:
- response_obj = response_obj.dict() # type: ignore
+ if run_id:
+ data["id"] = run_id
- data = {
- "name": run_name,
- "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
- "inputs": new_kwargs,
- "outputs": response_obj,
- "session_name": project_name,
- "start_time": start_time,
- "end_time": end_time,
- }
+ verbose_logger.debug("Langsmith Logging data on langsmith: %s", data)
+
+ return data
+
+ async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+ try:
+ verbose_logger.debug(
+ "Langsmith Async Layer Logging - kwargs: %s, response_obj: %s",
+ kwargs,
+ response_obj,
+ )
+ data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
+ url = f"{self.langsmith_base_url}/runs"
+ verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...")
+
+ headers = {"x-api-key": self.langsmith_api_key}
+ response = await self.async_httpx_client.post(
+ url=url, json=data, headers=headers
+ )
+
+ if response.status_code >= 300:
+ verbose_logger.error(
+ f"Langmsith Error: {response.status_code} - {response.text}"
+ )
+ else:
+ verbose_logger.debug(
+ "Run successfully created, response=%s", response.text
+ )
+ verbose_logger.debug(
+ f"Langsmith Layer Logging - final response object: {response_obj}. Response text from langsmith={response.text}"
+ )
+ except:
+ verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}")
+
+ def log_success_event(self, kwargs, response_obj, start_time, end_time):
+ try:
+ verbose_logger.debug(
+ "Langsmith Sync Layer Logging - kwargs: %s, response_obj: %s",
+ kwargs,
+ response_obj,
+ )
+ data = self._prepare_log_data(kwargs, response_obj, start_time, end_time)
+ url = f"{self.langsmith_base_url}/runs"
+ verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...")
- url = f"{langsmith_base_url}/runs"
- print_verbose(f"Langsmith Logging - About to send data to {url} ...")
response = requests.post(
url=url,
json=data,
@@ -99,12 +179,21 @@ class LangsmithLogger:
)
if response.status_code >= 300:
- print_verbose(f"Error: {response.status_code}")
+ verbose_logger.error(f"Error: {response.status_code} - {response.text}")
else:
- print_verbose("Run successfully created")
- print_verbose(
- f"Langsmith Layer Logging - final response object: {response_obj}"
+ verbose_logger.debug("Run successfully created")
+ verbose_logger.debug(
+ f"Langsmith Layer Logging - final response object: {response_obj}. Response text from langsmith={response.text}"
)
except:
- print_verbose(f"Langsmith Layer Error - {traceback.format_exc()}")
- pass
+ verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}")
+
+    def get_run_by_id(self, run_id):
+ url = f"{self.langsmith_base_url}/runs/{run_id}"
+ response = requests.get(
+ url=url,
+ headers={"x-api-key": self.langsmith_api_key},
+ )
+
+ return response.json()
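A sketch of exercising the new logger end to end, mirroring the tests later in this diff; it assumes `LANGSMITH_API_KEY` is set and that the run id passed via `metadata={"id": ...}` is what `get_run_by_id` looks up:

```python
import asyncio
import uuid

import litellm
from litellm.integrations.langsmith import LangsmithLogger


async def main():
    run_id = str(uuid.uuid4())
    litellm.callbacks = ["langsmith"]  # registers LangsmithLogger as a CustomLogger

    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "what llm are u"}],
        mock_response="hi",       # avoid a real provider call
        metadata={"id": run_id},  # becomes the Langsmith run id
    )
    await asyncio.sleep(3)  # give the async logger time to flush

    # Fetch the logged run back from the Langsmith API
    print(LangsmithLogger().get_run_by_id(run_id=run_id))


asyncio.run(main())
```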
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index 3fde07815..32633960f 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -39,7 +39,6 @@ from litellm.utils import (
add_breadcrumb,
capture_exception,
customLogger,
- langsmithLogger,
liteDebuggerClient,
logfireLogger,
lunaryLogger,
@@ -89,7 +88,6 @@ alerts_channel = None
heliconeLogger = None
athinaLogger = None
promptLayerLogger = None
-langsmithLogger = None
logfireLogger = None
weightsBiasesLogger = None
customLogger = None
@@ -136,7 +134,7 @@ in_memory_trace_id_cache = ServiceTraceIDCache()
class Logging:
- global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
+ global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app
custom_pricing: bool = False
stream_options = None
@@ -738,23 +736,6 @@ class Logging:
end_time=end_time,
print_verbose=print_verbose,
)
- if callback == "langsmith":
- print_verbose("reaches langsmith for logging!")
- if self.stream:
- if "complete_streaming_response" not in kwargs:
- continue
- else:
- print_verbose(
- "reaches langsmith for streaming logging!"
- )
- result = kwargs["complete_streaming_response"]
- langsmithLogger.log_event(
- kwargs=self.model_call_details,
- response_obj=result,
- start_time=start_time,
- end_time=end_time,
- print_verbose=print_verbose,
- )
if callback == "logfire":
global logfireLogger
verbose_logger.debug("reaches logfire for success logging!")
@@ -1337,7 +1318,14 @@ class Logging:
if kwargs.get("no-log", False) == True:
print_verbose("no-log request, skipping logging")
continue
- if callback == "cache" and litellm.cache is not None:
+ if (
+ callback == "cache"
+ and litellm.cache is not None
+ and self.model_call_details.get("litellm_params", {}).get(
+ "acompletion", False
+ )
+ is True
+ ):
# set_cache once complete streaming response is built
print_verbose("async success_callback: reaches cache for logging!")
kwargs = self.model_call_details
@@ -1417,6 +1405,9 @@ class Logging:
end_time=end_time,
)
if callable(callback): # custom logger functions
+ global customLogger
+ if customLogger is None:
+ customLogger = CustomLogger()
if self.stream:
if (
"async_complete_streaming_response"
@@ -1822,7 +1813,7 @@ def set_callbacks(callback_list, function_id=None):
"""
Globally sets the callback client
"""
- global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
+ global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try:
for callback in callback_list:
@@ -1903,8 +1894,6 @@ def set_callbacks(callback_list, function_id=None):
s3Logger = S3Logger()
elif callback == "wandb":
weightsBiasesLogger = WeightsBiasesLogger()
- elif callback == "langsmith":
- langsmithLogger = LangsmithLogger()
elif callback == "logfire":
logfireLogger = LogfireLogger()
elif callback == "aispend":
@@ -1957,6 +1946,15 @@ def _init_custom_logger_compatible_class(
_in_memory_loggers.append(_openmeter_logger)
return _openmeter_logger # type: ignore
+ elif logging_integration == "langsmith":
+ for callback in _in_memory_loggers:
+ if isinstance(callback, LangsmithLogger):
+ return callback # type: ignore
+
+ _langsmith_logger = LangsmithLogger()
+ _in_memory_loggers.append(_langsmith_logger)
+ return _langsmith_logger # type: ignore
+
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
@@ -2025,6 +2023,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
return callback
+ elif logging_integration == "langsmith":
+ for callback in _in_memory_loggers:
+ if isinstance(callback, LangsmithLogger):
+ return callback
elif logging_integration == "logfire":
if "LOGFIRE_TOKEN" not in os.environ:
raise ValueError("LOGFIRE_TOKEN not found in environment variables")
diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json
index 60f812b2b..2fc6a5771 100644
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@@ -1020,6 +1020,26 @@
"mode": "chat",
"supports_function_calling": true
},
+ "groq/llama3-groq-70b-8192-tool-use-preview": {
+ "max_tokens": 8192,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 8192,
+ "input_cost_per_token": 0.00000089,
+ "output_cost_per_token": 0.00000089,
+ "litellm_provider": "groq",
+ "mode": "chat",
+ "supports_function_calling": true
+ },
+ "groq/llama3-groq-8b-8192-tool-use-preview": {
+ "max_tokens": 8192,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 8192,
+ "input_cost_per_token": 0.00000019,
+ "output_cost_per_token": 0.00000019,
+ "litellm_provider": "groq",
+ "mode": "chat",
+ "supports_function_calling": true
+ },
"friendliai/mixtral-8x7b-instruct-v0-1": {
"max_tokens": 32768,
"max_input_tokens": 32768,
@@ -1800,6 +1820,26 @@
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
+ "medlm-medium": {
+ "max_tokens": 8192,
+ "max_input_tokens": 32768,
+ "max_output_tokens": 8192,
+ "input_cost_per_character": 0.0000005,
+ "output_cost_per_character": 0.000001,
+ "litellm_provider": "vertex_ai-language-models",
+ "mode": "chat",
+ "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+ },
+ "medlm-large": {
+ "max_tokens": 1024,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 1024,
+ "input_cost_per_character": 0.000005,
+ "output_cost_per_character": 0.000015,
+ "litellm_provider": "vertex_ai-language-models",
+ "mode": "chat",
+ "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+ },
"vertex_ai/claude-3-sonnet@20240229": {
"max_tokens": 4096,
"max_input_tokens": 200000,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index b6ac36044..641c70ebc 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,10 +1,5 @@
model_list:
- - model_name: "*"
+ - model_name: llama-3
litellm_params:
- model: openai/*
-
-litellm_settings:
- guardrails:
- - prompt_injection:
- callbacks: ["aporio_prompt_injection"]
- default_on: true
+ model: gpt-4
+ request_timeout: 1
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 42e77475f..3f3b0858e 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -17,9 +17,7 @@ model_list:
general_settings:
master_key: sk-1234
- litellm_key_header_name: "X-Litellm-Key"
litellm_settings:
- cache: true
- callbacks: ["otel"]
+ success_callback: ["langsmith"]
diff --git a/litellm/router.py b/litellm/router.py
index f50723ab9..754210802 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -718,6 +718,9 @@ class Router:
data.get(
"timeout", None
) # timeout set on litellm_params for this deployment
+ or data.get(
+ "request_timeout", None
+            ) # request_timeout set on litellm_params for this deployment
or self.timeout # timeout set on router
or kwargs.get(
"timeout", None
diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py
index fa35f75de..a4a70a535 100644
--- a/litellm/tests/test_caching.py
+++ b/litellm/tests/test_caching.py
@@ -1579,18 +1579,21 @@ async def test_redis_semantic_cache_acompletion():
assert response1.id == response2.id
-def test_caching_redis_simple(caplog):
+def test_caching_redis_simple(caplog, capsys):
"""
Relevant issue - https://github.com/BerriAI/litellm/issues/4511
"""
+ litellm.set_verbose = True ## REQUIRED FOR TEST.
litellm.cache = Cache(
type="redis", url=os.getenv("REDIS_SSL_URL")
) # passing `supported_call_types = ["completion"]` has no effect
s = time.time()
+
+ uuid_str = str(uuid.uuid4())
x = completion(
- model="gpt-4o",
- messages=[{"role": "user", "content": "Hello, how are you? Wink"}],
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}],
stream=True,
)
for m in x:
@@ -1599,8 +1602,8 @@ def test_caching_redis_simple(caplog):
s2 = time.time()
x = completion(
- model="gpt-4o",
- messages=[{"role": "user", "content": "Hello, how are you? Wink"}],
+ model="gpt-3.5-turbo",
+ messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}],
stream=True,
)
for m in x:
@@ -1609,11 +1612,15 @@ def test_caching_redis_simple(caplog):
redis_async_caching_error = False
redis_service_logging_error = False
+ captured = capsys.readouterr()
captured_logs = [rec.message for rec in caplog.records]
print(f"captured_logs: {captured_logs}")
for item in captured_logs:
- if "Error connecting to Async Redis client" in item:
+ if (
+ "Error connecting to Async Redis client" in item
+ or "Set ASYNC Redis Cache" in item
+ ):
redis_async_caching_error = True
if "ServiceLogging.async_service_success_hook" in item:
@@ -1621,3 +1628,4 @@ def test_caching_redis_simple(caplog):
assert redis_async_caching_error is False
assert redis_service_logging_error is False
+ assert "async success_callback: reaches cache for logging" not in captured.out
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index b538edee5..87efa86be 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
-# litellm.num_retries = 3
+# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py
index 1daf1531c..5371c0abd 100644
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@@ -706,6 +706,33 @@ def test_vertex_ai_completion_cost():
print("calculated_input_cost: {}".format(calculated_input_cost))
+# @pytest.mark.skip(reason="new test - WIP, working on fixing this")
+def test_vertex_ai_medlm_completion_cost():
+ """Test for medlm completion cost."""
+
+ with pytest.raises(Exception) as e:
+ model = "vertex_ai/medlm-medium"
+ messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+ predictive_cost = completion_cost(
+ model=model, messages=messages, custom_llm_provider="vertex_ai"
+ )
+
+ os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+ litellm.model_cost = litellm.get_model_cost_map(url="")
+
+ model = "vertex_ai/medlm-medium"
+ messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+ predictive_cost = completion_cost(
+ model=model, messages=messages, custom_llm_provider="vertex_ai"
+ )
+ assert predictive_cost > 0
+
+ model = "vertex_ai/medlm-large"
+ messages = [{"role": "user", "content": "Test MedLM completion cost."}]
+ predictive_cost = completion_cost(model=model, messages=messages)
+ assert predictive_cost > 0
+
+
def test_vertex_ai_claude_completion_cost():
from litellm import Choices, Message, ModelResponse
from litellm.utils import Usage
diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py
index 1e3f5455a..e041ec0af 100644
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@@ -589,7 +589,7 @@ async def test_triton_embeddings():
print(f"response: {response}")
# stubbed endpoint is setup to return this
- assert response.data[0]["embedding"] == [0.1, 0.2, 0.3]
+ assert response.data[0]["embedding"] == [0.1, 0.2]
except Exception as e:
pytest.fail(f"Error occurred: {e}")
diff --git a/litellm/tests/test_langsmith.py b/litellm/tests/test_langsmith.py
index 603a8370d..7c690212e 100644
--- a/litellm/tests/test_langsmith.py
+++ b/litellm/tests/test_langsmith.py
@@ -1,70 +1,176 @@
-import sys
-import os
import io
+import os
+import sys
sys.path.insert(0, os.path.abspath("../.."))
-from litellm import completion
-import litellm
+import asyncio
+import logging
+import uuid
+
+import pytest
+
+import litellm
+from litellm import completion
+from litellm._logging import verbose_logger
+from litellm.integrations.langsmith import LangsmithLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+
+verbose_logger.setLevel(logging.DEBUG)
-litellm.success_callback = ["langsmith"]
litellm.set_verbose = True
import time
-def test_langsmith_logging():
+@pytest.mark.asyncio()
+async def test_async_langsmith_logging():
try:
- response = completion(
+ test_langsmith_logger = LangsmithLogger()
+ run_id = str(uuid.uuid4())
+ litellm.set_verbose = True
+ litellm.callbacks = ["langsmith"]
+ response = await litellm.acompletion(
model="claude-instant-1.2",
messages=[{"role": "user", "content": "what llm are u"}],
max_tokens=10,
temperature=0.2,
+ metadata={
+ "id": run_id,
+ "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
+ "user_api_key_alias": "ishaans-langmsith-key",
+ "user_api_end_user_max_budget": None,
+ "litellm_api_version": "1.40.19",
+ "global_max_parallel_requests": None,
+ "user_api_key_user_id": "admin",
+ "user_api_key_org_id": None,
+ "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709",
+ "user_api_key_team_alias": "testing-team",
+ },
)
print(response)
+ await asyncio.sleep(3)
+
+ print("run_id", run_id)
+ logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id)
+
+ print("logged_run_on_langsmith", logged_run_on_langsmith)
+
+ print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys())
+
+ input_fields_on_langsmith = logged_run_on_langsmith.get("inputs")
+ extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get(
+ "invocation_params"
+ )
+
+ print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith)
+
+ print("\nextra fields on langsmith", extra_fields_on_langsmith)
+
+ assert isinstance(input_fields_on_langsmith, dict)
+ assert "api_key" not in input_fields_on_langsmith
+ assert "api_key" not in extra_fields_on_langsmith
+
+ # assert user_api_key in extra_fields_on_langsmith
+ assert "user_api_key" in extra_fields_on_langsmith
+ assert "user_api_key_user_id" in extra_fields_on_langsmith
+ assert "user_api_key_team_alias" in extra_fields_on_langsmith
+
+ for cb in litellm.callbacks:
+ if isinstance(cb, LangsmithLogger):
+ await cb.async_httpx_client.client.aclose()
+ # test_langsmith_logger.async_httpx_client.close()
+
except Exception as e:
print(e)
+ pytest.fail(f"Error occurred: {e}")
# test_langsmith_logging()
-def test_langsmith_logging_with_metadata():
+def test_async_langsmith_logging_with_metadata():
try:
+ litellm.success_callback = ["langsmith"]
+ litellm.set_verbose = True
response = completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "what llm are u"}],
max_tokens=10,
temperature=0.2,
- metadata={
- "run_name": "litellmRUN",
- "project_name": "litellm-completion",
- },
)
print(response)
+ time.sleep(3)
+
+ for cb in litellm.callbacks:
+ if isinstance(cb, LangsmithLogger):
+ cb.async_httpx_client.close()
+
except Exception as e:
+ pytest.fail(f"Error occurred: {e}")
print(e)
-# test_langsmith_logging_with_metadata()
-
-
-def test_langsmith_logging_with_streaming_and_metadata():
+@pytest.mark.parametrize("sync_mode", [False, True])
+@pytest.mark.asyncio
+async def test_async_langsmith_logging_with_streaming_and_metadata(sync_mode):
try:
- response = completion(
- model="gpt-3.5-turbo",
- messages=[{"role": "user", "content": "what llm are u"}],
- max_tokens=10,
- temperature=0.2,
- metadata={
- "run_name": "litellmRUN",
- "project_name": "litellm-completion",
- },
- stream=True,
+ test_langsmith_logger = LangsmithLogger()
+ litellm.success_callback = ["langsmith"]
+ litellm.set_verbose = True
+ run_id = str(uuid.uuid4())
+
+ messages = [{"role": "user", "content": "what llm are u"}]
+ if sync_mode is True:
+ response = completion(
+ model="gpt-3.5-turbo",
+ messages=messages,
+ max_tokens=10,
+ temperature=0.2,
+ stream=True,
+ metadata={"id": run_id},
+ )
+ for cb in litellm.callbacks:
+ if isinstance(cb, LangsmithLogger):
+ cb.async_httpx_client = AsyncHTTPHandler()
+ for chunk in response:
+ continue
+ time.sleep(3)
+ else:
+ response = await litellm.acompletion(
+ model="gpt-3.5-turbo",
+ messages=messages,
+ max_tokens=10,
+ temperature=0.2,
+ mock_response="This is a mock request",
+ stream=True,
+ metadata={"id": run_id},
+ )
+ for cb in litellm.callbacks:
+ if isinstance(cb, LangsmithLogger):
+ cb.async_httpx_client = AsyncHTTPHandler()
+ async for chunk in response:
+ continue
+ await asyncio.sleep(3)
+
+ print("run_id", run_id)
+ logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id)
+
+ print("logged_run_on_langsmith", logged_run_on_langsmith)
+
+ print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys())
+
+ input_fields_on_langsmith = logged_run_on_langsmith.get("inputs")
+
+ extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get(
+ "invocation_params"
)
- for chunk in response:
- continue
+
+ assert logged_run_on_langsmith.get("run_type") == "llm"
+ print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith)
+
+ print("\nextra fields on langsmith", extra_fields_on_langsmith)
+
+ assert isinstance(input_fields_on_langsmith, dict)
except Exception as e:
+ pytest.fail(f"Error occurred: {e}")
print(e)
-
-
-test_langsmith_logging_with_streaming_and_metadata()
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index eab202406..8c7943893 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -515,6 +515,7 @@ async def test_completion_predibase_streaming(sync_mode):
response = completion(
model="predibase/llama-3-8b-instruct",
tenant_id="c4768f95",
+ max_tokens=10,
api_base="https://serving.app.predibase.com",
api_key=os.getenv("PREDIBASE_API_KEY"),
messages=[{"role": "user", "content": "What is the meaning of life?"}],
@@ -539,6 +540,7 @@ async def test_completion_predibase_streaming(sync_mode):
response = await litellm.acompletion(
model="predibase/llama-3-8b-instruct",
tenant_id="c4768f95",
+ max_tokens=10,
api_base="https://serving.app.predibase.com",
api_key=os.getenv("PREDIBASE_API_KEY"),
messages=[{"role": "user", "content": "What is the meaning of life?"}],
diff --git a/litellm/utils.py b/litellm/utils.py
index b9c3f983c..a02a276b7 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -417,6 +417,21 @@ def function_setup(
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback)
removed_async_items.append(index)
+ elif callback == "langsmith":
+ callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class( # type: ignore
+ callback, internal_usage_cache=None, llm_router=None
+ )
+
+ # don't double add a callback
+ if not any(
+ isinstance(cb, type(callback_class)) for cb in litellm.callbacks
+ ):
+ litellm.callbacks.append(callback_class) # type: ignore
+ litellm.input_callback.append(callback_class) # type: ignore
+ litellm.success_callback.append(callback_class) # type: ignore
+ litellm.failure_callback.append(callback_class) # type: ignore
+ litellm._async_success_callback.append(callback_class) # type: ignore
+ litellm._async_failure_callback.append(callback_class) # type: ignore
# Pop the async items from success_callback in reverse order to avoid index issues
for index in reversed(removed_async_items):
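A sketch of the user-facing effect of this `function_setup` change: the "langsmith" string callback is swapped for a `LangsmithLogger` instance, and the guard above keeps it from being registered twice across repeated calls (mock_response avoids real provider calls):

```python
import litellm
from litellm.integrations.langsmith import LangsmithLogger

litellm.success_callback = ["langsmith"]

for _ in range(2):
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        mock_response="hi",  # no real LLM call is made
    )

# Exactly one LangsmithLogger instance was added to litellm.callbacks
assert len([cb for cb in litellm.callbacks if isinstance(cb, LangsmithLogger)]) == 1
```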
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index 60f812b2b..2fc6a5771 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -1020,6 +1020,26 @@
"mode": "chat",
"supports_function_calling": true
},
+ "groq/llama3-groq-70b-8192-tool-use-preview": {
+ "max_tokens": 8192,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 8192,
+ "input_cost_per_token": 0.00000089,
+ "output_cost_per_token": 0.00000089,
+ "litellm_provider": "groq",
+ "mode": "chat",
+ "supports_function_calling": true
+ },
+ "groq/llama3-groq-8b-8192-tool-use-preview": {
+ "max_tokens": 8192,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 8192,
+ "input_cost_per_token": 0.00000019,
+ "output_cost_per_token": 0.00000019,
+ "litellm_provider": "groq",
+ "mode": "chat",
+ "supports_function_calling": true
+ },
"friendliai/mixtral-8x7b-instruct-v0-1": {
"max_tokens": 32768,
"max_input_tokens": 32768,
@@ -1800,6 +1820,26 @@
"supports_vision": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
},
+ "medlm-medium": {
+ "max_tokens": 8192,
+ "max_input_tokens": 32768,
+ "max_output_tokens": 8192,
+ "input_cost_per_character": 0.0000005,
+ "output_cost_per_character": 0.000001,
+ "litellm_provider": "vertex_ai-language-models",
+ "mode": "chat",
+ "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+ },
+ "medlm-large": {
+ "max_tokens": 1024,
+ "max_input_tokens": 8192,
+ "max_output_tokens": 1024,
+ "input_cost_per_character": 0.000005,
+ "output_cost_per_character": 0.000015,
+ "litellm_provider": "vertex_ai-language-models",
+ "mode": "chat",
+ "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
+ },
"vertex_ai/claude-3-sonnet@20240229": {
"max_tokens": 4096,
"max_input_tokens": 200000,