diff --git a/docs/my-website/docs/observability/helicone_integration.md b/docs/my-website/docs/observability/helicone_integration.md index 57e7039fc..7e7f9fcb6 100644 --- a/docs/my-website/docs/observability/helicone_integration.md +++ b/docs/my-website/docs/observability/helicone_integration.md @@ -72,7 +72,7 @@ Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/gett To use Helicone as a proxy for your LLM requests: 1. Set Helicone as your base URL via: litellm.api_base -2. Pass in Helicone request headers via: litellm.headers +2. Pass in Helicone request headers via: litellm.metadata Complete Code: @@ -99,7 +99,7 @@ print(response) You can add custom metadata and properties to your requests using Helicone headers. Here are some examples: ```python -litellm.headers = { +litellm.metadata = { "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API "Helicone-User-Id": "user-abc", # Specify the user making the request "Helicone-Property-App": "web", # Custom property to add additional information @@ -127,7 +127,7 @@ litellm.headers = { Enable caching and set up rate limiting policies: ```python -litellm.headers = { +litellm.metadata = { "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API "Helicone-Cache-Enabled": "true", # Enable caching of responses "Cache-Control": "max-age=3600", # Set cache limit to 1 hour @@ -140,7 +140,7 @@ litellm.headers = { Track multi-step and agentic LLM interactions using session IDs and paths: ```python -litellm.headers = { +litellm.metadata = { "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API "Helicone-Session-Id": "session-abc-123", # The session ID you want to track "Helicone-Session-Path": "parent-trace/child-trace", # The path of the session @@ -157,7 +157,7 @@ By using these two headers, you can effectively group and visualize multi-step L Set up retry mechanisms and fallback options: ```python -litellm.headers = { +litellm.metadata = { "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API "Helicone-Retry-Enabled": "true", # Enable retry mechanism "helicone-retry-num": "3", # Set number of retries diff --git a/docs/my-website/docs/observability/langsmith_integration.md b/docs/my-website/docs/observability/langsmith_integration.md index c038abd82..79d047e33 100644 --- a/docs/my-website/docs/observability/langsmith_integration.md +++ b/docs/my-website/docs/observability/langsmith_integration.md @@ -14,7 +14,7 @@ https://github.com/BerriAI/litellm An all-in-one developer platform for every step of the application lifecycle https://smith.langchain.com/ - + :::info We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index 27f1789e0..0d5016645 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -5,6 +5,7 @@ Log Proxy input, output, and exceptions using: - Langfuse - OpenTelemetry - Custom Callbacks +- Langsmith - DataDog - DynamoDB - s3 Bucket @@ -1086,6 +1087,50 @@ litellm_settings: Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API +## Logging LLM IO to Langsmith + +1. 
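Tying the Helicone proxy steps above together, a minimal end-to-end sketch. The proxy base URL and model name here are illustrative placeholders (check Helicone's docs for the exact endpoint); the two settings used, `litellm.api_base` and `litellm.metadata`, are the ones the Helicone section describes:

```python
import os

import litellm

# Helicone proxy endpoint - illustrative placeholder, confirm the URL in Helicone's docs
litellm.api_base = "https://oai.hconeai.com/v1"

# Helicone request headers are passed through litellm.metadata
litellm.metadata = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # authenticate to Helicone
    "Helicone-Property-App": "web",  # optional custom property
}

response = litellm.completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "Hello, world"}],
)
print(response)
```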
Set `success_callback: ["langsmith"]` on litellm config.yaml

If you're using a custom LangSmith instance, you can set the
`LANGSMITH_BASE_URL` environment variable to point to your instance.

```yaml
litellm_settings:
  success_callback: ["langsmith"]

environment_variables:
  LANGSMITH_API_KEY: "lsv2_pt_xxxxxxxx"
  LANGSMITH_PROJECT: "litellm-proxy"

  LANGSMITH_BASE_URL: "https://api.smith.langchain.com" # (Optional - only needed if you have a custom Langsmith instance)
```

2. Start Proxy

```
litellm --config /path/to/config.yaml
```

3. Test it!

```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
    "model": "fake-openai-endpoint",
    "messages": [
      {
        "role": "user",
        "content": "Hello, Claude gm!"
      }
    ]
  }
'
```

Expect to see your log on Langsmith

## Logging LLM IO to Galileo [BETA]

diff --git a/docs/my-website/img/langsmith_new.png b/docs/my-website/img/langsmith_new.png
new file mode 100644
index 000000000..d5586bdbe
Binary files /dev/null and b/docs/my-website/img/langsmith_new.png differ
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 645a0bccd..7dcc934a6 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -38,7 +38,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal[
-    "lago", "openmeter", "logfire", "dynamic_rate_limiter"
+    "lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
]
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
diff --git a/litellm/integrations/langsmith.py b/litellm/integrations/langsmith.py
index 48185afee..81db798ae 100644
--- a/litellm/integrations/langsmith.py
+++ b/litellm/integrations/langsmith.py
@@ -1,13 +1,43 @@
#### What this does ####
# On success, logs events to Langsmith
-import dotenv, os  # type: ignore
-import requests  # type: ignore
-from datetime import datetime
-import traceback
import asyncio
+import os
+import traceback
import types
+from datetime import datetime
+from typing import Any, List, Optional, Union
+
+import dotenv  # type: ignore
+import httpx
+import requests  # type: ignore
from pydantic import BaseModel  # type: ignore
+
+import litellm
+from litellm._logging import verbose_logger
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+
+
+class LangsmithInputs(BaseModel):
+    model: Optional[str] = None
+    messages: Optional[List[Any]] = None
+    stream: Optional[bool] = None
+    call_type: Optional[str] = None
+    litellm_call_id: Optional[str] = None
+    completion_start_time: Optional[datetime] = None
+    temperature: Optional[float] = None
+    max_tokens: Optional[int] = None
+    custom_llm_provider: Optional[str] = None
+    input: Optional[List[Any]] = None
+    log_event_type: Optional[str] = None
+    original_response: Optional[Any] = None
+    response_cost: Optional[float] = None
+
+    # LiteLLM Virtual Key specific fields
+    user_api_key: Optional[str] = None
+    user_api_key_user_id: Optional[str] = None
+    user_api_key_team_alias: Optional[str] = None
+

def is_serializable(value):
    non_serializable_types = (
@@ -19,7 +49,7 @@ def is_serializable(value):
    return not isinstance(value, non_serializable_types)


-class LangsmithLogger:
+class LangsmithLogger(CustomLogger):
    # Class variables or attributes
    def 
__init__(self): self.langsmith_api_key = os.getenv("LANGSMITH_API_KEY") @@ -27,71 +57,121 @@ class LangsmithLogger: self.langsmith_default_run_name = os.getenv( "LANGSMITH_DEFAULT_RUN_NAME", "LLMRun" ) + self.langsmith_base_url = os.getenv( + "LANGSMITH_BASE_URL", "https://api.smith.langchain.com" + ) + self.async_httpx_client = AsyncHTTPHandler( + timeout=httpx.Timeout(timeout=600.0, connect=5.0) + ) - def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): - # Method definition - # inspired by Langsmith http api here: https://github.com/langchain-ai/langsmith-cookbook/blob/main/tracing-examples/rest/rest.ipynb - metadata = ( - kwargs.get("litellm_params", {}).get("metadata", {}) or {} - ) # if metadata is None + def _prepare_log_data(self, kwargs, response_obj, start_time, end_time): + import datetime + from datetime import timezone + + metadata = kwargs.get("litellm_params", {}).get("metadata", {}) or {} + + kwargs["user_api_key"] = metadata.get("user_api_key", None) + kwargs["user_api_key_user_id"] = metadata.get("user_api_key_user_id", None) + kwargs["user_api_key_team_alias"] = metadata.get( + "user_api_key_team_alias", None + ) - # set project name and run_name for langsmith logging - # users can pass project_name and run name to litellm.completion() - # Example: litellm.completion(model, messages, metadata={"project_name": "my-litellm-project", "run_name": "my-langsmith-run"}) - # if not set litellm will fallback to the environment variable LANGSMITH_PROJECT, then to the default project_name = litellm-completion, run_name = LLMRun project_name = metadata.get("project_name", self.langsmith_project) run_name = metadata.get("run_name", self.langsmith_default_run_name) - print_verbose( + run_id = metadata.get("id", None) + verbose_logger.debug( f"Langsmith Logging - project_name: {project_name}, run_name {run_name}" ) - langsmith_base_url = os.getenv( - "LANGSMITH_BASE_URL", "https://api.smith.langchain.com" - ) try: - print_verbose( - f"Langsmith Logging - Enters logging function for model {kwargs}" - ) - import requests - import datetime - from datetime import timezone + start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat() + end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat() + except: + start_time = datetime.datetime.utcnow().isoformat() + end_time = datetime.datetime.utcnow().isoformat() + # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs + logged_kwargs = LangsmithInputs(**kwargs) + kwargs = logged_kwargs.model_dump() + + new_kwargs = {} + for key in kwargs: + value = kwargs[key] + if key == "start_time" or key == "end_time" or value is None: + pass + elif key == "original_response" and not isinstance(value, str): + new_kwargs[key] = str(value) + elif type(value) == datetime.datetime: + new_kwargs[key] = value.isoformat() + elif type(value) != dict and is_serializable(value=value): + new_kwargs[key] = value + elif not is_serializable(value=value): + continue + + if isinstance(response_obj, BaseModel): try: - start_time = kwargs["start_time"].astimezone(timezone.utc).isoformat() - end_time = kwargs["end_time"].astimezone(timezone.utc).isoformat() + response_obj = response_obj.model_dump() except: - start_time = datetime.datetime.utcnow().isoformat() - end_time = datetime.datetime.utcnow().isoformat() + response_obj = response_obj.dict() # type: ignore - # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs - new_kwargs = {} - for key 
in kwargs: - value = kwargs[key] - if key == "start_time" or key == "end_time" or value is None: - pass - elif type(value) == datetime.datetime: - new_kwargs[key] = value.isoformat() - elif type(value) != dict and is_serializable(value=value): - new_kwargs[key] = value + data = { + "name": run_name, + "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain" + "inputs": new_kwargs, + "outputs": response_obj, + "session_name": project_name, + "start_time": start_time, + "end_time": end_time, + } - if isinstance(response_obj, BaseModel): - try: - response_obj = response_obj.model_dump() - except: - response_obj = response_obj.dict() # type: ignore + if run_id: + data["id"] = run_id - data = { - "name": run_name, - "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain" - "inputs": new_kwargs, - "outputs": response_obj, - "session_name": project_name, - "start_time": start_time, - "end_time": end_time, - } + verbose_logger.debug("Langsmith Logging data on langsmith: %s", data) + + return data + + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + try: + verbose_logger.debug( + "Langsmith Async Layer Logging - kwargs: %s, response_obj: %s", + kwargs, + response_obj, + ) + data = self._prepare_log_data(kwargs, response_obj, start_time, end_time) + url = f"{self.langsmith_base_url}/runs" + verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...") + + headers = {"x-api-key": self.langsmith_api_key} + response = await self.async_httpx_client.post( + url=url, json=data, headers=headers + ) + + if response.status_code >= 300: + verbose_logger.error( + f"Langmsith Error: {response.status_code} - {response.text}" + ) + else: + verbose_logger.debug( + "Run successfully created, response=%s", response.text + ) + verbose_logger.debug( + f"Langsmith Layer Logging - final response object: {response_obj}. Response text from langsmith={response.text}" + ) + except: + verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}") + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + try: + verbose_logger.debug( + "Langsmith Sync Layer Logging - kwargs: %s, response_obj: %s", + kwargs, + response_obj, + ) + data = self._prepare_log_data(kwargs, response_obj, start_time, end_time) + url = f"{self.langsmith_base_url}/runs" + verbose_logger.debug(f"Langsmith Logging - About to send data to {url} ...") - url = f"{langsmith_base_url}/runs" - print_verbose(f"Langsmith Logging - About to send data to {url} ...") response = requests.post( url=url, json=data, @@ -99,12 +179,21 @@ class LangsmithLogger: ) if response.status_code >= 300: - print_verbose(f"Error: {response.status_code}") + verbose_logger.error(f"Error: {response.status_code} - {response.text}") else: - print_verbose("Run successfully created") - print_verbose( - f"Langsmith Layer Logging - final response object: {response_obj}" + verbose_logger.debug("Run successfully created") + verbose_logger.debug( + f"Langsmith Layer Logging - final response object: {response_obj}. 
Response text from langsmith={response.text}" ) except: - print_verbose(f"Langsmith Layer Error - {traceback.format_exc()}") - pass + verbose_logger.error(f"Langsmith Layer Error - {traceback.format_exc()}") + + def get_run_by_id(self, run_id): + + url = f"{self.langsmith_base_url}/runs/{run_id}" + response = requests.get( + url=url, + headers={"x-api-key": self.langsmith_api_key}, + ) + + return response.json() diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 3fde07815..32633960f 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -39,7 +39,6 @@ from litellm.utils import ( add_breadcrumb, capture_exception, customLogger, - langsmithLogger, liteDebuggerClient, logfireLogger, lunaryLogger, @@ -89,7 +88,6 @@ alerts_channel = None heliconeLogger = None athinaLogger = None promptLayerLogger = None -langsmithLogger = None logfireLogger = None weightsBiasesLogger = None customLogger = None @@ -136,7 +134,7 @@ in_memory_trace_id_cache = ServiceTraceIDCache() class Logging: - global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app + global supabaseClient, liteDebuggerClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app custom_pricing: bool = False stream_options = None @@ -738,23 +736,6 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) - if callback == "langsmith": - print_verbose("reaches langsmith for logging!") - if self.stream: - if "complete_streaming_response" not in kwargs: - continue - else: - print_verbose( - "reaches langsmith for streaming logging!" 
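With the legacy `langsmithLogger` branch removed here, Langsmith logging goes through the CustomLogger-compatible `LangsmithLogger` shown earlier. A minimal SDK-side sketch (the model and message are placeholders, and it assumes `LANGSMITH_API_KEY` is exported in the environment):

```python
import litellm

# Assumes LANGSMITH_API_KEY (and optionally LANGSMITH_PROJECT / LANGSMITH_BASE_URL)
# are set in the environment; LangsmithLogger.__init__ reads them at setup time.
litellm.callbacks = ["langsmith"]

response = litellm.completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "what llm are u"}],
    # optional overrides read by _prepare_log_data
    metadata={"project_name": "litellm-completion", "run_name": "litellmRUN"},
)
print(response)
```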
- ) - result = kwargs["complete_streaming_response"] - langsmithLogger.log_event( - kwargs=self.model_call_details, - response_obj=result, - start_time=start_time, - end_time=end_time, - print_verbose=print_verbose, - ) if callback == "logfire": global logfireLogger verbose_logger.debug("reaches logfire for success logging!") @@ -1337,7 +1318,14 @@ class Logging: if kwargs.get("no-log", False) == True: print_verbose("no-log request, skipping logging") continue - if callback == "cache" and litellm.cache is not None: + if ( + callback == "cache" + and litellm.cache is not None + and self.model_call_details.get("litellm_params", {}).get( + "acompletion", False + ) + is True + ): # set_cache once complete streaming response is built print_verbose("async success_callback: reaches cache for logging!") kwargs = self.model_call_details @@ -1417,6 +1405,9 @@ class Logging: end_time=end_time, ) if callable(callback): # custom logger functions + global customLogger + if customLogger is None: + customLogger = CustomLogger() if self.stream: if ( "async_complete_streaming_response" @@ -1822,7 +1813,7 @@ def set_callbacks(callback_list, function_id=None): """ Globally sets the callback client """ - global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger + global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, logfireLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger try: for callback in callback_list: @@ -1903,8 +1894,6 @@ def set_callbacks(callback_list, function_id=None): s3Logger = S3Logger() elif callback == "wandb": weightsBiasesLogger = WeightsBiasesLogger() - elif callback == "langsmith": - langsmithLogger = LangsmithLogger() elif callback == "logfire": logfireLogger = LogfireLogger() elif callback == "aispend": @@ -1957,6 +1946,15 @@ def _init_custom_logger_compatible_class( _in_memory_loggers.append(_openmeter_logger) return _openmeter_logger # type: ignore + elif logging_integration == "langsmith": + for callback in _in_memory_loggers: + if isinstance(callback, LangsmithLogger): + return callback # type: ignore + + _langsmith_logger = LangsmithLogger() + _in_memory_loggers.append(_langsmith_logger) + return _langsmith_logger # type: ignore + elif logging_integration == "galileo": for callback in _in_memory_loggers: if isinstance(callback, GalileoObserve): @@ -2025,6 +2023,10 @@ def get_custom_logger_compatible_class( for callback in _in_memory_loggers: if isinstance(callback, GalileoObserve): return callback + elif logging_integration == "langsmith": + for callback in _in_memory_loggers: + if isinstance(callback, LangsmithLogger): + return callback elif logging_integration == "logfire": if "LOGFIRE_TOKEN" not in os.environ: raise ValueError("LOGFIRE_TOKEN not found in environment variables") diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 60f812b2b..2fc6a5771 100644 --- 
a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1020,6 +1020,26 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama3-groq-70b-8192-tool-use-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000089, + "output_cost_per_token": 0.00000089, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama3-groq-8b-8192-tool-use-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000019, + "output_cost_per_token": 0.00000019, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768, @@ -1800,6 +1820,26 @@ "supports_vision": true, "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" }, + "medlm-medium": { + "max_tokens": 8192, + "max_input_tokens": 32768, + "max_output_tokens": 8192, + "input_cost_per_character": 0.0000005, + "output_cost_per_character": 0.000001, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + }, + "medlm-large": { + "max_tokens": 1024, + "max_input_tokens": 8192, + "max_output_tokens": 1024, + "input_cost_per_character": 0.000005, + "output_cost_per_character": 0.000015, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + }, "vertex_ai/claude-3-sonnet@20240229": { "max_tokens": 4096, "max_input_tokens": 200000, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index b6ac36044..641c70ebc 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,10 +1,5 @@ model_list: - - model_name: "*" + - model_name: llama-3 litellm_params: - model: openai/* - -litellm_settings: - guardrails: - - prompt_injection: - callbacks: ["aporio_prompt_injection"] - default_on: true + model: gpt-4 + request_timeout: 1 diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 42e77475f..3f3b0858e 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -17,9 +17,7 @@ model_list: general_settings: master_key: sk-1234 - litellm_key_header_name: "X-Litellm-Key" litellm_settings: - cache: true - callbacks: ["otel"] + success_callback: ["langsmith"] diff --git a/litellm/router.py b/litellm/router.py index f50723ab9..754210802 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -718,6 +718,9 @@ class Router: data.get( "timeout", None ) # timeout set on litellm_params for this deployment + or data.get( + "request_timeout", None + ) # timeout set on litellm_params for this deployment or self.timeout # timeout set on router or kwargs.get( "timeout", None diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index fa35f75de..a4a70a535 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1579,18 +1579,21 @@ async def test_redis_semantic_cache_acompletion(): assert response1.id == response2.id -def test_caching_redis_simple(caplog): +def test_caching_redis_simple(caplog, capsys): """ Relevant issue - https://github.com/BerriAI/litellm/issues/4511 """ + 
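The `request_timeout` lookup added to the Router above means a per-deployment timeout can live in `litellm_params`. A small sketch mirroring the `llama-3` / `request_timeout: 1` sample config in this diff (the model names are just the sample values):

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "llama-3",
            "litellm_params": {
                "model": "gpt-4",
                "request_timeout": 1,  # seconds; picked up by the new data.get("request_timeout") branch
            },
        }
    ]
)

# Requests to this deployment now use the ~1s deployment timeout unless overridden per-call
response = router.completion(
    model="llama-3",
    messages=[{"role": "user", "content": "hello"}],
)
```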
litellm.set_verbose = True ## REQUIRED FOR TEST. litellm.cache = Cache( type="redis", url=os.getenv("REDIS_SSL_URL") ) # passing `supported_call_types = ["completion"]` has no effect s = time.time() + + uuid_str = str(uuid.uuid4()) x = completion( - model="gpt-4o", - messages=[{"role": "user", "content": "Hello, how are you? Wink"}], + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}], stream=True, ) for m in x: @@ -1599,8 +1602,8 @@ def test_caching_redis_simple(caplog): s2 = time.time() x = completion( - model="gpt-4o", - messages=[{"role": "user", "content": "Hello, how are you? Wink"}], + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": f"Hello, how are you? Wink {uuid_str}"}], stream=True, ) for m in x: @@ -1609,11 +1612,15 @@ def test_caching_redis_simple(caplog): redis_async_caching_error = False redis_service_logging_error = False + captured = capsys.readouterr() captured_logs = [rec.message for rec in caplog.records] print(f"captured_logs: {captured_logs}") for item in captured_logs: - if "Error connecting to Async Redis client" in item: + if ( + "Error connecting to Async Redis client" in item + or "Set ASYNC Redis Cache" in item + ): redis_async_caching_error = True if "ServiceLogging.async_service_success_hook" in item: @@ -1621,3 +1628,4 @@ def test_caching_redis_simple(caplog): assert redis_async_caching_error is False assert redis_service_logging_error is False + assert "async success_callback: reaches cache for logging" not in captured.out diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index b538edee5..87efa86be 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries = 3 +# litellm.num_retries=3 litellm.cache = None litellm.success_callback = [] user_message = "Write a short poem about the sky" diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 1daf1531c..5371c0abd 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -706,6 +706,33 @@ def test_vertex_ai_completion_cost(): print("calculated_input_cost: {}".format(calculated_input_cost)) +# @pytest.mark.skip(reason="new test - WIP, working on fixing this") +def test_vertex_ai_medlm_completion_cost(): + """Test for medlm completion cost.""" + + with pytest.raises(Exception) as e: + model = "vertex_ai/medlm-medium" + messages = [{"role": "user", "content": "Test MedLM completion cost."}] + predictive_cost = completion_cost( + model=model, messages=messages, custom_llm_provider="vertex_ai" + ) + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + model = "vertex_ai/medlm-medium" + messages = [{"role": "user", "content": "Test MedLM completion cost."}] + predictive_cost = completion_cost( + model=model, messages=messages, custom_llm_provider="vertex_ai" + ) + assert predictive_cost > 0 + + model = "vertex_ai/medlm-large" + messages = [{"role": "user", "content": "Test MedLM completion cost."}] + predictive_cost = completion_cost(model=model, messages=messages) + assert predictive_cost > 0 + + def test_vertex_ai_claude_completion_cost(): from litellm import Choices, Message, 
ModelResponse from litellm.utils import Usage diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 1e3f5455a..e041ec0af 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -589,7 +589,7 @@ async def test_triton_embeddings(): print(f"response: {response}") # stubbed endpoint is setup to return this - assert response.data[0]["embedding"] == [0.1, 0.2, 0.3] + assert response.data[0]["embedding"] == [0.1, 0.2] except Exception as e: pytest.fail(f"Error occurred: {e}") diff --git a/litellm/tests/test_langsmith.py b/litellm/tests/test_langsmith.py index 603a8370d..7c690212e 100644 --- a/litellm/tests/test_langsmith.py +++ b/litellm/tests/test_langsmith.py @@ -1,70 +1,176 @@ -import sys -import os import io +import os +import sys sys.path.insert(0, os.path.abspath("../..")) -from litellm import completion -import litellm +import asyncio +import logging +import uuid + +import pytest + +import litellm +from litellm import completion +from litellm._logging import verbose_logger +from litellm.integrations.langsmith import LangsmithLogger +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler + +verbose_logger.setLevel(logging.DEBUG) -litellm.success_callback = ["langsmith"] litellm.set_verbose = True import time -def test_langsmith_logging(): +@pytest.mark.asyncio() +async def test_async_langsmith_logging(): try: - response = completion( + test_langsmith_logger = LangsmithLogger() + run_id = str(uuid.uuid4()) + litellm.set_verbose = True + litellm.callbacks = ["langsmith"] + response = await litellm.acompletion( model="claude-instant-1.2", messages=[{"role": "user", "content": "what llm are u"}], max_tokens=10, temperature=0.2, + metadata={ + "id": run_id, + "user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c", + "user_api_key_alias": "ishaans-langmsith-key", + "user_api_end_user_max_budget": None, + "litellm_api_version": "1.40.19", + "global_max_parallel_requests": None, + "user_api_key_user_id": "admin", + "user_api_key_org_id": None, + "user_api_key_team_id": "dbe2f686-a686-4896-864a-4c3924458709", + "user_api_key_team_alias": "testing-team", + }, ) print(response) + await asyncio.sleep(3) + + print("run_id", run_id) + logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id) + + print("logged_run_on_langsmith", logged_run_on_langsmith) + + print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys()) + + input_fields_on_langsmith = logged_run_on_langsmith.get("inputs") + extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get( + "invocation_params" + ) + + print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith) + + print("\nextra fields on langsmith", extra_fields_on_langsmith) + + assert isinstance(input_fields_on_langsmith, dict) + assert "api_key" not in input_fields_on_langsmith + assert "api_key" not in extra_fields_on_langsmith + + # assert user_api_key in extra_fields_on_langsmith + assert "user_api_key" in extra_fields_on_langsmith + assert "user_api_key_user_id" in extra_fields_on_langsmith + assert "user_api_key_team_alias" in extra_fields_on_langsmith + + for cb in litellm.callbacks: + if isinstance(cb, LangsmithLogger): + await cb.async_httpx_client.client.aclose() + # test_langsmith_logger.async_httpx_client.close() + except Exception as e: print(e) + pytest.fail(f"Error occurred: {e}") # test_langsmith_logging() -def test_langsmith_logging_with_metadata(): +def test_async_langsmith_logging_with_metadata(): try: + 
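Mirroring the async test above, a caller can pin the Langsmith run id up front via `metadata` and later fetch the run back with the new `get_run_by_id` helper. A sketch (the model and the sleep are placeholders, and it assumes the async logger has had time to flush):

```python
import asyncio
import uuid

import litellm
from litellm.integrations.langsmith import LangsmithLogger


async def main():
    litellm.callbacks = ["langsmith"]
    run_id = str(uuid.uuid4())

    await litellm.acompletion(
        model="gpt-3.5-turbo",  # placeholder model
        messages=[{"role": "user", "content": "what llm are u"}],
        max_tokens=10,
        metadata={"id": run_id},  # becomes the Langsmith run id
    )

    await asyncio.sleep(3)  # give the async logger time to POST the run

    # get_run_by_id is the helper added to LangsmithLogger in this diff
    print(LangsmithLogger().get_run_by_id(run_id=run_id))


asyncio.run(main())
```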
litellm.success_callback = ["langsmith"] + litellm.set_verbose = True response = completion( model="gpt-3.5-turbo", messages=[{"role": "user", "content": "what llm are u"}], max_tokens=10, temperature=0.2, - metadata={ - "run_name": "litellmRUN", - "project_name": "litellm-completion", - }, ) print(response) + time.sleep(3) + + for cb in litellm.callbacks: + if isinstance(cb, LangsmithLogger): + cb.async_httpx_client.close() + except Exception as e: + pytest.fail(f"Error occurred: {e}") print(e) -# test_langsmith_logging_with_metadata() - - -def test_langsmith_logging_with_streaming_and_metadata(): +@pytest.mark.parametrize("sync_mode", [False, True]) +@pytest.mark.asyncio +async def test_async_langsmith_logging_with_streaming_and_metadata(sync_mode): try: - response = completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "what llm are u"}], - max_tokens=10, - temperature=0.2, - metadata={ - "run_name": "litellmRUN", - "project_name": "litellm-completion", - }, - stream=True, + test_langsmith_logger = LangsmithLogger() + litellm.success_callback = ["langsmith"] + litellm.set_verbose = True + run_id = str(uuid.uuid4()) + + messages = [{"role": "user", "content": "what llm are u"}] + if sync_mode is True: + response = completion( + model="gpt-3.5-turbo", + messages=messages, + max_tokens=10, + temperature=0.2, + stream=True, + metadata={"id": run_id}, + ) + for cb in litellm.callbacks: + if isinstance(cb, LangsmithLogger): + cb.async_httpx_client = AsyncHTTPHandler() + for chunk in response: + continue + time.sleep(3) + else: + response = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + max_tokens=10, + temperature=0.2, + mock_response="This is a mock request", + stream=True, + metadata={"id": run_id}, + ) + for cb in litellm.callbacks: + if isinstance(cb, LangsmithLogger): + cb.async_httpx_client = AsyncHTTPHandler() + async for chunk in response: + continue + await asyncio.sleep(3) + + print("run_id", run_id) + logged_run_on_langsmith = test_langsmith_logger.get_run_by_id(run_id=run_id) + + print("logged_run_on_langsmith", logged_run_on_langsmith) + + print("fields in logged_run_on_langsmith", logged_run_on_langsmith.keys()) + + input_fields_on_langsmith = logged_run_on_langsmith.get("inputs") + + extra_fields_on_langsmith = logged_run_on_langsmith.get("extra").get( + "invocation_params" ) - for chunk in response: - continue + + assert logged_run_on_langsmith.get("run_type") == "llm" + print("\nLogged INPUT ON LANGSMITH", input_fields_on_langsmith) + + print("\nextra fields on langsmith", extra_fields_on_langsmith) + + assert isinstance(input_fields_on_langsmith, dict) except Exception as e: + pytest.fail(f"Error occurred: {e}") print(e) - - -test_langsmith_logging_with_streaming_and_metadata() diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index eab202406..8c7943893 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -515,6 +515,7 @@ async def test_completion_predibase_streaming(sync_mode): response = completion( model="predibase/llama-3-8b-instruct", tenant_id="c4768f95", + max_tokens=10, api_base="https://serving.app.predibase.com", api_key=os.getenv("PREDIBASE_API_KEY"), messages=[{"role": "user", "content": "What is the meaning of life?"}], @@ -539,6 +540,7 @@ async def test_completion_predibase_streaming(sync_mode): response = await litellm.acompletion( model="predibase/llama-3-8b-instruct", tenant_id="c4768f95", + max_tokens=10, 
api_base="https://serving.app.predibase.com", api_key=os.getenv("PREDIBASE_API_KEY"), messages=[{"role": "user", "content": "What is the meaning of life?"}], diff --git a/litellm/utils.py b/litellm/utils.py index b9c3f983c..a02a276b7 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -417,6 +417,21 @@ def function_setup( # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy litellm._async_success_callback.append(callback) removed_async_items.append(index) + elif callback == "langsmith": + callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class( # type: ignore + callback, internal_usage_cache=None, llm_router=None + ) + + # don't double add a callback + if not any( + isinstance(cb, type(callback_class)) for cb in litellm.callbacks + ): + litellm.callbacks.append(callback_class) # type: ignore + litellm.input_callback.append(callback_class) # type: ignore + litellm.success_callback.append(callback_class) # type: ignore + litellm.failure_callback.append(callback_class) # type: ignore + litellm._async_success_callback.append(callback_class) # type: ignore + litellm._async_failure_callback.append(callback_class) # type: ignore # Pop the async items from success_callback in reverse order to avoid index issues for index in reversed(removed_async_items): diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 60f812b2b..2fc6a5771 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1020,6 +1020,26 @@ "mode": "chat", "supports_function_calling": true }, + "groq/llama3-groq-70b-8192-tool-use-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000089, + "output_cost_per_token": 0.00000089, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, + "groq/llama3-groq-8b-8192-tool-use-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000019, + "output_cost_per_token": 0.00000019, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true + }, "friendliai/mixtral-8x7b-instruct-v0-1": { "max_tokens": 32768, "max_input_tokens": 32768, @@ -1800,6 +1820,26 @@ "supports_vision": true, "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" }, + "medlm-medium": { + "max_tokens": 8192, + "max_input_tokens": 32768, + "max_output_tokens": 8192, + "input_cost_per_character": 0.0000005, + "output_cost_per_character": 0.000001, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + }, + "medlm-large": { + "max_tokens": 1024, + "max_input_tokens": 8192, + "max_output_tokens": 1024, + "input_cost_per_character": 0.000005, + "output_cost_per_character": 0.000015, + "litellm_provider": "vertex_ai-language-models", + "mode": "chat", + "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + }, "vertex_ai/claude-3-sonnet@20240229": { "max_tokens": 4096, "max_input_tokens": 200000,
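Given the per-character MedLM prices registered above, a quick cost sanity check mirroring the new `test_vertex_ai_medlm_completion_cost` test (the prompt text is arbitrary):

```python
import os

import litellm
from litellm import completion_cost

# Load the local price map so the medlm-medium / medlm-large entries above are used
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")

messages = [{"role": "user", "content": "Test MedLM completion cost."}]

medium_cost = completion_cost(
    model="vertex_ai/medlm-medium", messages=messages, custom_llm_provider="vertex_ai"
)
large_cost = completion_cost(model="vertex_ai/medlm-large", messages=messages)

# Both should come out greater than zero once the entries are loaded
print(medium_cost, large_cost)
```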