diff --git a/.circleci/config.yml b/.circleci/config.yml index 26a2ae356b..b43a8aa64c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -125,6 +125,7 @@ jobs: pip install tiktoken pip install aiohttp pip install click + pip install "boto3==1.34.34" pip install jinja2 pip install tokenizers pip install openai @@ -287,6 +288,7 @@ jobs: pip install "pytest==7.3.1" pip install "pytest-mock==3.12.0" pip install "pytest-asyncio==0.21.1" + pip install "boto3==1.34.34" pip install mypy pip install pyarrow pip install numpydoc diff --git a/Dockerfile b/Dockerfile index c8e9956b29..bd840eaf54 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,6 +62,11 @@ COPY --from=builder /wheels/ /wheels/ RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels # Generate prisma client +ENV PRISMA_BINARY_CACHE_DIR=/app/prisma +RUN mkdir -p /.cache +RUN chmod -R 777 /.cache +RUN pip install nodejs-bin +RUN pip install prisma RUN prisma generate RUN chmod +x entrypoint.sh diff --git a/Dockerfile.database b/Dockerfile.database index 22084bab89..c995939e5b 100644 --- a/Dockerfile.database +++ b/Dockerfile.database @@ -62,6 +62,11 @@ RUN pip install PyJWT --no-cache-dir RUN chmod +x build_admin_ui.sh && ./build_admin_ui.sh # Generate prisma client +ENV PRISMA_BINARY_CACHE_DIR=/app/prisma +RUN mkdir -p /.cache +RUN chmod -R 777 /.cache +RUN pip install nodejs-bin +RUN pip install prisma RUN prisma generate RUN chmod +x entrypoint.sh diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md index bf159cd07e..1d12a22ba0 100644 --- a/docs/my-website/docs/completion/json_mode.md +++ b/docs/my-website/docs/completion/json_mode.md @@ -84,17 +84,20 @@ from litellm import completion # add to env var os.environ["OPENAI_API_KEY"] = "" -messages = [{"role": "user", "content": "List 5 cookie recipes"}] +messages = [{"role": "user", "content": "List 5 important events in the XIX century"}] class 
CalendarEvent(BaseModel): name: str date: str participants: list[str] +class EventsList(BaseModel): + events: list[CalendarEvent] + resp = completion( model="gpt-4o-2024-08-06", messages=messages, - response_format=CalendarEvent + response_format=EventsList ) print("Received={}".format(resp)) diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 7c254ed35d..9f21068e03 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -705,6 +705,29 @@ docker run ghcr.io/berriai/litellm:main-latest \ Provide an ssl certificate when starting litellm proxy server +### 3. Providing LiteLLM config.yaml file as a s3 Object/url + +Use this if you cannot mount a config file on your deployment service (example - AWS Fargate, Railway etc) + +LiteLLM Proxy will read your config.yaml from an s3 Bucket + +Set the following .env vars +```shell +LITELLM_CONFIG_BUCKET_NAME = "litellm-proxy" # your bucket name on s3 +LITELLM_CONFIG_BUCKET_OBJECT_KEY = "litellm_proxy_config.yaml" # object key on s3 +``` + +Start litellm proxy with these env vars - litellm will read your config from s3 + +```shell +docker run --name litellm-proxy \ + -e DATABASE_URL=<database_url> \ + -e LITELLM_CONFIG_BUCKET_NAME=<bucket_name> \ + -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="<object_key>" \ + -p 4000:4000 \ + ghcr.io/berriai/litellm-database:main-latest +``` + ## Platform-specific Guide diff --git a/docs/my-website/docs/proxy/model_management.md b/docs/my-website/docs/proxy/model_management.md index 02ce4ba23b..a8cc66ae76 100644 --- a/docs/my-website/docs/proxy/model_management.md +++ b/docs/my-website/docs/proxy/model_management.md @@ -17,7 +17,7 @@ model_list: ## Get Model Information - `/model/info` -Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled the model_info you set and the litellm model cost map. 
Sensitive details like API keys are excluded for security purposes. +Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the model_info you set and the [litellm model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Sensitive details like API keys are excluded for security purposes. - + + ```bash curl -X POST "http://0.0.0.0:4000/model/new" \ - -H "accept: application/json" \ - -H "Content-Type: application/json" \ - -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }' + -H "accept: application/json" \ + -H "Content-Type: application/json" \ + -d '{ "model_name": "azure-gpt-turbo", "litellm_params": {"model": "azure/gpt-3.5-turbo", "api_key": "os.environ/AZURE_API_KEY", "api_base": "my-azure-api-base"} }' ``` - + + + +```yaml +model_list: + - model_name: gpt-3.5-turbo ### RECEIVED MODEL NAME ### `openai.chat.completions.create(model="gpt-3.5-turbo",...)` + litellm_params: # all params accepted by litellm.completion() - https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/types/router.py#L297 + model: azure/gpt-turbo-small-eu ### MODEL NAME sent to `litellm.completion()` ### + api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ + api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU") + rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm) + model_info: + my_custom_key: my_custom_value # additional model metadata +``` + + @@ -85,4 +96,83 @@ Keep in mind that as both endpoints are in [BETA], you may need to visit the ass - Get Model Information: [Issue #933](https://github.com/BerriAI/litellm/issues/933) - Add a New Model: [Issue 
#964](https://github.com/BerriAI/litellm/issues/964) -Feedback on the beta endpoints is valuable and helps improve the API for all users. \ No newline at end of file +Feedback on the beta endpoints is valuable and helps improve the API for all users. + + +## Add Additional Model Information + +If you want the ability to add a display name, description, and labels for models, just use `model_info:` + +```yaml +model_list: + - model_name: "gpt-4" + litellm_params: + model: "gpt-4" + api_key: "os.environ/OPENAI_API_KEY" + model_info: # 👈 KEY CHANGE + my_custom_key: "my_custom_value" +``` + +### Usage + +1. Add additional information to model + +```yaml +model_list: + - model_name: "gpt-4" + litellm_params: + model: "gpt-4" + api_key: "os.environ/OPENAI_API_KEY" + model_info: # 👈 KEY CHANGE + my_custom_key: "my_custom_value" +``` + +2. Call with `/model/info` + +Use a key with access to the model `gpt-4`. + +```bash +curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \ +-H 'Authorization: Bearer LITELLM_KEY' +``` + +3. 
**Expected Response** + +Returned `model_info = Your custom model_info + (if exists) LITELLM MODEL INFO` + + +[**How LiteLLM Model Info is found**](https://github.com/BerriAI/litellm/blob/9b46ec05b02d36d6e4fb5c32321e51e7f56e4a6e/litellm/proxy/proxy_server.py#L7460) + +[Tell us how this can be improved!](https://github.com/BerriAI/litellm/issues) + +```bash +{ + "data": [ + { + "model_name": "gpt-4", + "litellm_params": { + "model": "gpt-4" + }, + "model_info": { + "id": "e889baacd17f591cce4c63639275ba5e8dc60765d6c553e6ee5a504b19e50ddc", + "db_model": false, + "my_custom_key": "my_custom_value", # 👈 CUSTOM INFO + "key": "gpt-4", # 👈 KEY in LiteLLM MODEL INFO/COST MAP - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json + "max_tokens": 4096, + "max_input_tokens": 8192, + "max_output_tokens": 4096, + "input_cost_per_token": 3e-05, + "input_cost_per_character": null, + "input_cost_per_token_above_128k_tokens": null, + "output_cost_per_token": 6e-05, + "output_cost_per_character": null, + "output_cost_per_token_above_128k_tokens": null, + "output_cost_per_character_above_128k_tokens": null, + "output_vector_size": null, + "litellm_provider": "openai", + "mode": "chat" + } + }, + ] +} +``` diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 6c856f58b3..4b913d2e82 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -72,15 +72,15 @@ http://localhost:4000/metrics | Metric Name | Description | |----------------------|--------------------------------------| -| `deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. | +| `litellm_deployment_state` | The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage. 
| | `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment | | `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` return from LLM API Deployment | - `llm_deployment_success_responses` | Total number of successful LLM API calls for deployment | -| `llm_deployment_failure_responses` | Total number of failed LLM API calls for deployment | -| `llm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure | -| `llm_deployment_latency_per_output_token` | Latency per output token for deployment | -| `llm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model | -| `llm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model | + `litellm_deployment_success_responses` | Total number of successful LLM API calls for deployment | +| `litellm_deployment_failure_responses` | Total number of failed LLM API calls for deployment | +| `litellm_deployment_total_requests` | Total number of LLM API calls for deployment - success + failure | +| `litellm_deployment_latency_per_output_token` | Latency per output token for deployment | +| `litellm_deployment_successful_fallbacks` | Number of successful fallback requests from primary model -> fallback model | +| `litellm_deployment_failed_fallbacks` | Number of failed fallback requests from primary model -> fallback model | diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 7df5e61578..3c3e1cbf97 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -151,7 +151,7 @@ const sidebars = { }, { type: "category", - label: "Chat Completions (litellm.completion)", + label: "Chat Completions (litellm.completion + PROXY)", link: { type: "generated-index", title: "Chat Completions", diff --git a/litellm/integrations/gcs_bucket.py b/litellm/integrations/gcs_bucket.py index 46f55f8f01..be7f8e39c2 
100644 --- a/litellm/integrations/gcs_bucket.py +++ b/litellm/integrations/gcs_bucket.py @@ -1,5 +1,6 @@ import json import os +import uuid from datetime import datetime from typing import Any, Dict, List, Optional, TypedDict, Union @@ -29,6 +30,8 @@ class GCSBucketPayload(TypedDict): end_time: str response_cost: Optional[float] spend_log_metadata: str + exception: Optional[str] + log_event_type: Optional[str] class GCSBucketLogger(CustomLogger): @@ -79,6 +82,7 @@ class GCSBucketLogger(CustomLogger): logging_payload: GCSBucketPayload = await self.get_gcs_payload( kwargs, response_obj, start_time_str, end_time_str ) + logging_payload["log_event_type"] = "successful_api_call" json_logged_payload = json.dumps(logging_payload) @@ -103,7 +107,56 @@ class GCSBucketLogger(CustomLogger): verbose_logger.error("GCS Bucket logging error: %s", str(e)) async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - pass + from litellm.proxy.proxy_server import premium_user + + if premium_user is not True: + raise ValueError( + f"GCS Bucket logging is a premium feature. Please upgrade to use it. 
{CommonProxyErrors.not_premium_user.value}" + ) + try: + verbose_logger.debug( + "GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s", + kwargs, + response_obj, + ) + + start_time_str = start_time.strftime("%Y-%m-%d %H:%M:%S") + end_time_str = end_time.strftime("%Y-%m-%d %H:%M:%S") + headers = await self.construct_request_headers() + + logging_payload: GCSBucketPayload = await self.get_gcs_payload( + kwargs, response_obj, start_time_str, end_time_str + ) + logging_payload["log_event_type"] = "failed_api_call" + + _litellm_params = kwargs.get("litellm_params") or {} + metadata = _litellm_params.get("metadata") or {} + + json_logged_payload = json.dumps(logging_payload) + + # Get the current date + current_date = datetime.now().strftime("%Y-%m-%d") + + # Modify the object_name to include the date-based folder + object_name = f"{current_date}/failure-{uuid.uuid4().hex}" + + if "gcs_log_id" in metadata: + object_name = metadata["gcs_log_id"] + + response = await self.async_httpx_client.post( + headers=headers, + url=f"https://storage.googleapis.com/upload/storage/v1/b/{self.BUCKET_NAME}/o?uploadType=media&name={object_name}", + data=json_logged_payload, + ) + + if response.status_code != 200: + verbose_logger.error("GCS Bucket logging error: %s", str(response.text)) + + verbose_logger.debug("GCS Bucket response %s", response) + verbose_logger.debug("GCS Bucket status code %s", response.status_code) + verbose_logger.debug("GCS Bucket response.text %s", response.text) + except Exception as e: + verbose_logger.error("GCS Bucket logging error: %s", str(e)) async def construct_request_headers(self) -> Dict[str, str]: from litellm import vertex_chat_completion @@ -139,9 +192,18 @@ class GCSBucketLogger(CustomLogger): optional_params=kwargs.get("optional_params", None), ) response_dict = {} - response_dict = convert_litellm_response_object_to_dict( - response_obj=response_obj - ) + if response_obj: + response_dict = 
convert_litellm_response_object_to_dict( + response_obj=response_obj + ) + + exception_str = None + + # Handle logging exception attributes + if "exception" in kwargs: + exception_str = kwargs.get("exception", "") + if not isinstance(exception_str, str): + exception_str = str(exception_str) _spend_log_payload: SpendLogsPayload = get_logging_payload( kwargs=kwargs, @@ -156,8 +218,10 @@ class GCSBucketLogger(CustomLogger): response_obj=response_dict, start_time=start_time, end_time=end_time, - spend_log_metadata=_spend_log_payload["metadata"], + spend_log_metadata=_spend_log_payload.get("metadata", ""), response_cost=kwargs.get("response_cost", None), + exception=exception_str, + log_event_type=None, ) return gcs_payload diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 8797807ac6..08431fd7af 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -141,42 +141,42 @@ class PrometheusLogger(CustomLogger): ] # Metric for deployment state - self.deployment_state = Gauge( - "deployment_state", + self.litellm_deployment_state = Gauge( + "litellm_deployment_state", "LLM Deployment Analytics - The state of the deployment: 0 = healthy, 1 = partial outage, 2 = complete outage", labelnames=_logged_llm_labels, ) - self.llm_deployment_success_responses = Counter( - name="llm_deployment_success_responses", + self.litellm_deployment_success_responses = Counter( + name="litellm_deployment_success_responses", documentation="LLM Deployment Analytics - Total number of successful LLM API calls via litellm", labelnames=_logged_llm_labels, ) - self.llm_deployment_failure_responses = Counter( - name="llm_deployment_failure_responses", + self.litellm_deployment_failure_responses = Counter( + name="litellm_deployment_failure_responses", documentation="LLM Deployment Analytics - Total number of failed LLM API calls via litellm", labelnames=_logged_llm_labels, ) - self.llm_deployment_total_requests = Counter( - 
name="llm_deployment_total_requests", + self.litellm_deployment_total_requests = Counter( + name="litellm_deployment_total_requests", documentation="LLM Deployment Analytics - Total number of LLM API calls via litellm - success + failure", labelnames=_logged_llm_labels, ) # Deployment Latency tracking - self.llm_deployment_latency_per_output_token = Histogram( - name="llm_deployment_latency_per_output_token", + self.litellm_deployment_latency_per_output_token = Histogram( + name="litellm_deployment_latency_per_output_token", documentation="LLM Deployment Analytics - Latency per output token", labelnames=_logged_llm_labels, ) - self.llm_deployment_successful_fallbacks = Counter( - "llm_deployment_successful_fallbacks", + self.litellm_deployment_successful_fallbacks = Counter( + "litellm_deployment_successful_fallbacks", "LLM Deployment Analytics - Number of successful fallback requests from primary model -> fallback model", ["primary_model", "fallback_model"], ) - self.llm_deployment_failed_fallbacks = Counter( - "llm_deployment_failed_fallbacks", + self.litellm_deployment_failed_fallbacks = Counter( + "litellm_deployment_failed_fallbacks", "LLM Deployment Analytics - Number of failed fallback requests from primary model -> fallback model", ["primary_model", "fallback_model"], ) @@ -358,14 +358,14 @@ class PrometheusLogger(CustomLogger): api_provider=llm_provider, ) - self.llm_deployment_failure_responses.labels( + self.litellm_deployment_failure_responses.labels( litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, api_provider=llm_provider, ).inc() - self.llm_deployment_total_requests.labels( + self.litellm_deployment_total_requests.labels( litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, @@ -438,14 +438,14 @@ class PrometheusLogger(CustomLogger): api_provider=llm_provider, ) - self.llm_deployment_success_responses.labels( + self.litellm_deployment_success_responses.labels( litellm_model_name=litellm_model_name, 
model_id=model_id, api_base=api_base, api_provider=llm_provider, ).inc() - self.llm_deployment_total_requests.labels( + self.litellm_deployment_total_requests.labels( litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, @@ -475,7 +475,7 @@ class PrometheusLogger(CustomLogger): latency_per_token = None if output_tokens is not None and output_tokens > 0: latency_per_token = _latency_seconds / output_tokens - self.llm_deployment_latency_per_output_token.labels( + self.litellm_deployment_latency_per_output_token.labels( litellm_model_name=litellm_model_name, model_id=model_id, api_base=api_base, @@ -497,7 +497,7 @@ class PrometheusLogger(CustomLogger): kwargs, ) _new_model = kwargs.get("model") - self.llm_deployment_successful_fallbacks.labels( + self.litellm_deployment_successful_fallbacks.labels( primary_model=original_model_group, fallback_model=_new_model ).inc() @@ -508,11 +508,11 @@ class PrometheusLogger(CustomLogger): kwargs, ) _new_model = kwargs.get("model") - self.llm_deployment_failed_fallbacks.labels( + self.litellm_deployment_failed_fallbacks.labels( primary_model=original_model_group, fallback_model=_new_model ).inc() - def set_deployment_state( + def set_litellm_deployment_state( self, state: int, litellm_model_name: str, @@ -520,7 +520,7 @@ class PrometheusLogger(CustomLogger): api_base: str, api_provider: str, ): - self.deployment_state.labels( + self.litellm_deployment_state.labels( litellm_model_name, model_id, api_base, api_provider ).set(state) @@ -531,7 +531,7 @@ class PrometheusLogger(CustomLogger): api_base: str, api_provider: str, ): - self.set_deployment_state( + self.set_litellm_deployment_state( 0, litellm_model_name, model_id, api_base, api_provider ) @@ -542,7 +542,7 @@ class PrometheusLogger(CustomLogger): api_base: str, api_provider: str, ): - self.set_deployment_state( + self.set_litellm_deployment_state( 1, litellm_model_name, model_id, api_base, api_provider ) @@ -553,7 +553,7 @@ class 
PrometheusLogger(CustomLogger): api_base: str, api_provider: str, ): - self.set_deployment_state( + self.set_litellm_deployment_state( 2, litellm_model_name, model_id, api_base, api_provider ) diff --git a/litellm/integrations/prometheus_helpers/prometheus_api.py b/litellm/integrations/prometheus_helpers/prometheus_api.py index 86764df7dd..13ccc15620 100644 --- a/litellm/integrations/prometheus_helpers/prometheus_api.py +++ b/litellm/integrations/prometheus_helpers/prometheus_api.py @@ -41,8 +41,8 @@ async def get_fallback_metric_from_prometheus(): """ response_message = "" relevant_metrics = [ - "llm_deployment_successful_fallbacks_total", - "llm_deployment_failed_fallbacks_total", + "litellm_deployment_successful_fallbacks_total", + "litellm_deployment_failed_fallbacks_total", ] for metric in relevant_metrics: response_json = await get_metric_from_prometheus( diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py index ffc096f762..c433c32b7d 100644 --- a/litellm/llms/bedrock_httpx.py +++ b/litellm/llms/bedrock_httpx.py @@ -1055,8 +1055,8 @@ class BedrockLLM(BaseLLM): }, ) raise BedrockError( - status_code=400, - message="Bedrock HTTPX: Unsupported provider={}, model={}".format( + status_code=404, + message="Bedrock HTTPX: Unknown provider={}, model={}".format( provider, model ), ) diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 6b984e1d82..f699cf0f5f 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -601,12 +601,13 @@ def ollama_embeddings( ): return asyncio.run( ollama_aembeddings( - api_base, - model, - prompts, - optional_params, - logging_obj, - model_response, - encoding, + api_base=api_base, + model=model, + prompts=prompts, + model_response=model_response, + optional_params=optional_params, + logging_obj=logging_obj, + encoding=encoding, ) + ) diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index b0dd5d905a..ea84fa95cf 100644 --- a/litellm/llms/ollama_chat.py +++ 
b/litellm/llms/ollama_chat.py @@ -356,6 +356,7 @@ def ollama_completion_stream(url, api_key, data, logging_obj): "json": data, "method": "POST", "timeout": litellm.request_timeout, + "follow_redirects": True } if api_key is not None: _request["headers"] = {"Authorization": "Bearer {}".format(api_key)} diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 7c3c7e80fb..f39273c1a2 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -1701,12 +1701,12 @@ def cohere_messages_pt_v2( assistant_tool_calls: List[ToolCallObject] = [] ## MERGE CONSECUTIVE ASSISTANT CONTENT ## while msg_i < len(messages) and messages[msg_i]["role"] == "assistant": - assistant_text = ( - messages[msg_i].get("content") or "" - ) # either string or none - if assistant_text: - assistant_content += assistant_text - + if isinstance(messages[msg_i].get("content"), list): + for m in messages[msg_i]["content"]: + if m.get("type", "") == "text": + assistant_content += m["text"] + else: + assistant_content += messages[msg_i].get("content") or "" # content is None/missing on tool-call-only assistant turns if messages[msg_i].get( "tool_calls", [] ): # support assistant tool invoke conversion diff --git a/litellm/llms/triton.py b/litellm/llms/triton.py index 7d0338d069..14a2e828b4 100644 --- a/litellm/llms/triton.py +++ b/litellm/llms/triton.py @@ -240,10 +240,10 @@ class TritonChatCompletion(BaseLLM): handler = HTTPHandler() if stream: return self._handle_stream( - handler, api_base, data_for_triton, model, logging_obj + handler, api_base, json_data_for_triton, model, logging_obj ) else: - response = handler.post(url=api_base, data=json_data_for_triton, headers=headers) + response = handler.post(url=api_base, data=json_data_for_triton, headers=headers) return self._handle_response( response, model_response, logging_obj, type_of_model=type_of_model ) diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 
455fe1e3c5..d30270c5c8 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -57,6 +57,18 @@ "supports_parallel_function_calling": true, "supports_vision": true }, + "chatgpt-4o-latest": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.000005, + "output_cost_per_token": 0.000015, + "litellm_provider": "openai", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "gpt-4o-2024-05-13": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -2062,7 +2074,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-5-sonnet@20240620": { "max_tokens": 4096, @@ -2073,7 +2086,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -2084,7 +2098,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -2095,7 +2110,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/meta/llama3-405b-instruct-maas": { "max_tokens": 32000, @@ -4519,6 +4535,69 @@ "litellm_provider": "perplexity", "mode": "chat" }, + "perplexity/llama-3.1-70b-instruct": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.000001, + 
"output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-8b-instruct": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-huge-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.000005, + "output_cost_per_token": 0.000005, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-large-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-large-128k-chat": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-small-128k-chat": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-small-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, "perplexity/pplx-7b-chat": { "max_tokens": 8192, "max_input_tokens": 8192, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index bc3e0680f8..0cffb3f8a9 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,13 +1,7 @@ 
model_list: - - model_name: "*" + - model_name: "gpt-4" litellm_params: - model: "*" + model: "gpt-4" + model_info: + my_custom_key: "my_custom_value" -general_settings: - master_key: sk-1234 - pass_through_endpoints: - - path: "/api/public/ingestion" # route you want to add to LiteLLM Proxy Server - target: "https://us.cloud.langfuse.com/api/public/ingestion" # URL this route should forward - headers: - LANGFUSE_PUBLIC_KEY: "os.environ/LANGFUSE_PUBLIC_KEY" # your langfuse account public key - LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_SECRET_KEY" # your langfuse account secret key \ No newline at end of file diff --git a/litellm/proxy/auth/user_api_key_auth.py b/litellm/proxy/auth/user_api_key_auth.py index 5ae149f1bd..00e78f64e6 100644 --- a/litellm/proxy/auth/user_api_key_auth.py +++ b/litellm/proxy/auth/user_api_key_auth.py @@ -12,7 +12,7 @@ import json import secrets import traceback from datetime import datetime, timedelta, timezone -from typing import Optional +from typing import Optional, Tuple from uuid import uuid4 import fastapi @@ -125,7 +125,7 @@ async def user_api_key_auth( # Check 2. 
FILTER IP ADDRESS await check_if_request_size_is_safe(request=request) - is_valid_ip = _check_valid_ip( + is_valid_ip, passed_in_ip = _check_valid_ip( allowed_ips=general_settings.get("allowed_ips", None), use_x_forwarded_for=general_settings.get("use_x_forwarded_for", False), request=request, @@ -134,7 +134,7 @@ async def user_api_key_auth( if not is_valid_ip: raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, - detail="Access forbidden: IP address not allowed.", + detail=f"Access forbidden: IP address {passed_in_ip} not allowed.", ) pass_through_endpoints: Optional[List[dict]] = general_settings.get( @@ -1251,12 +1251,12 @@ def _check_valid_ip( allowed_ips: Optional[List[str]], request: Request, use_x_forwarded_for: Optional[bool] = False, -) -> bool: +) -> Tuple[bool, Optional[str]]: """ Returns if ip is allowed or not """ if allowed_ips is None: # if not set, assume true - return True + return True, None # if general_settings.get("use_x_forwarded_for") is True then use x-forwarded-for client_ip = None @@ -1267,9 +1267,9 @@ def _check_valid_ip( # Check if IP address is allowed if client_ip not in allowed_ips: - return False + return False, client_ip - return True + return True, client_ip def get_api_key_from_custom_header( diff --git a/litellm/proxy/common_utils/load_config_utils.py b/litellm/proxy/common_utils/load_config_utils.py new file mode 100644 index 0000000000..bded2e3470 --- /dev/null +++ b/litellm/proxy/common_utils/load_config_utils.py @@ -0,0 +1,53 @@ +import tempfile + +import boto3 +import yaml + +from litellm._logging import verbose_proxy_logger + + +def get_file_contents_from_s3(bucket_name, object_key): + # v0 rely on boto3 for authentication - allowing boto3 to handle IAM credentials etc + from botocore.config import Config + from botocore.credentials import Credentials + + from litellm.main import bedrock_converse_chat_completion + + credentials: Credentials = bedrock_converse_chat_completion.get_credentials() + s3_client = 
boto3.client( + "s3", + aws_access_key_id=credentials.access_key, + aws_secret_access_key=credentials.secret_key, + aws_session_token=credentials.token, # Optional, if using temporary credentials + ) + + try: + verbose_proxy_logger.debug( + f"Retrieving {object_key} from S3 bucket: {bucket_name}" + ) + response = s3_client.get_object(Bucket=bucket_name, Key=object_key) + verbose_proxy_logger.debug(f"Response: {response}") + + # Read the file contents + file_contents = response["Body"].read().decode("utf-8") + verbose_proxy_logger.debug(f"File contents retrieved from S3") + + # Create a temporary file with YAML extension + with tempfile.NamedTemporaryFile(delete=False, suffix=".yaml") as temp_file: + temp_file.write(file_contents.encode("utf-8")) + temp_file_path = temp_file.name + verbose_proxy_logger.debug(f"File stored temporarily at: {temp_file_path}") + + # Load the YAML file content + with open(temp_file_path, "r") as yaml_file: + config = yaml.safe_load(yaml_file) + + return config + except Exception as e: + verbose_proxy_logger.error(f"Error retrieving file contents: {str(e)}") + return None + + +# # Example usage +# bucket_name = 'litellm-proxy' +# object_key = 'litellm_proxy_config.yaml' diff --git a/litellm/proxy/litellm_pre_call_utils.py b/litellm/proxy/litellm_pre_call_utils.py index 990cb52337..9b896f66c2 100644 --- a/litellm/proxy/litellm_pre_call_utils.py +++ b/litellm/proxy/litellm_pre_call_utils.py @@ -5,7 +5,12 @@ from fastapi import Request import litellm from litellm._logging import verbose_logger, verbose_proxy_logger -from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth +from litellm.proxy._types import ( + AddTeamCallback, + CommonProxyErrors, + TeamCallbackMetadata, + UserAPIKeyAuth, +) from litellm.types.utils import SupportedCacheControls if TYPE_CHECKING: @@ -59,6 +64,42 @@ def safe_add_api_version_from_query_params(data: dict, request: Request): verbose_logger.error("error checking api version in query 
def convert_key_logging_metadata_to_callback(
    data: AddTeamCallback, team_callback_settings_obj: Optional[TeamCallbackMetadata]
) -> TeamCallbackMetadata:
    """
    Merge one key-level logging entry into a callback settings object.

    Args:
        data: A single logging entry. This function reads its
            ``callback_name``, ``callback_type`` and ``callback_vars``.
        team_callback_settings_obj: Settings accumulated so far, or ``None``
            to start from a fresh ``TeamCallbackMetadata()``.

    Returns:
        The (possibly newly created) settings object, mutated in place:

        * ``callback_type == "success"`` adds the callback name to
          ``success_callback``.
        * ``callback_type == "failure"`` adds it to ``failure_callback``.
        * ``callback_type == "success_and_failure"`` adds it to both.
        * Any other ``callback_type`` registers no callback, but the
          callback vars are still copied.

        A name is never added twice to the same list, and a list is only
        created (replacing ``None``) when the callback type targets it.
    """
    if team_callback_settings_obj is None:
        team_callback_settings_obj = TeamCallbackMetadata()

    callback_name = data.callback_name
    register_success = data.callback_type in ("success", "success_and_failure")
    register_failure = data.callback_type in ("failure", "success_and_failure")

    if register_success:
        if team_callback_settings_obj.success_callback is None:
            team_callback_settings_obj.success_callback = []
        if callback_name not in team_callback_settings_obj.success_callback:
            team_callback_settings_obj.success_callback.append(callback_name)

    if register_failure:
        if team_callback_settings_obj.failure_callback is None:
            team_callback_settings_obj.failure_callback = []
        # Fix: the "success_and_failure" branch previously tested `in` here,
        # so the callback was never registered for failures when missing and
        # was duplicated when already present.
        if callback_name not in team_callback_settings_obj.failure_callback:
            team_callback_settings_obj.failure_callback.append(callback_name)

    for var, value in data.callback_vars.items():
        # Create the dict lazily so callback_vars stays None when the entry
        # carries no vars (same as the original behavior).
        if team_callback_settings_obj.callback_vars is None:
            team_callback_settings_obj.callback_vars = {}
        # NOTE(review): presumably get_secret resolves "os.environ/<NAME>"
        # style references to the actual secret value - confirm.
        team_callback_settings_obj.callback_vars[var] = litellm.get_secret(value)

    return team_callback_settings_obj
user_api_key_dict.team_metadata is not None: team_metadata = user_api_key_dict.team_metadata if "callback_settings" in team_metadata: @@ -241,13 +283,25 @@ async def add_litellm_data_to_request( } } """ - data["success_callback"] = callback_settings_obj.success_callback - data["failure_callback"] = callback_settings_obj.failure_callback + elif ( + user_api_key_dict.metadata is not None + and "logging" in user_api_key_dict.metadata + ): + for item in user_api_key_dict.metadata["logging"]: - if callback_settings_obj.callback_vars is not None: - # unpack callback_vars in data - for k, v in callback_settings_obj.callback_vars.items(): - data[k] = v + callback_settings_obj = convert_key_logging_metadata_to_callback( + data=AddTeamCallback(**item), + team_callback_settings_obj=callback_settings_obj, + ) + + if callback_settings_obj is not None: + data["success_callback"] = callback_settings_obj.success_callback + data["failure_callback"] = callback_settings_obj.failure_callback + + if callback_settings_obj.callback_vars is not None: + # unpack callback_vars in data + for k, v in callback_settings_obj.callback_vars.items(): + data[k] = v return data diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 660c27f249..4a1fc84a80 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -39,7 +39,4 @@ general_settings: litellm_settings: fallbacks: [{"gemini-1.5-pro-001": ["gpt-4o"]}] - success_callback: ["langfuse", "prometheus"] - langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] - failure_callback: ["prometheus"] - cache: True + callbacks: ["gcs_bucket"] diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c79a18a5cc..b637bee21b 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -151,6 +151,7 @@ from 
litellm.proxy.common_utils.http_parsing_utils import ( check_file_size_under_limit, ) from litellm.proxy.common_utils.init_callbacks import initialize_callbacks_on_proxy +from litellm.proxy.common_utils.load_config_utils import get_file_contents_from_s3 from litellm.proxy.common_utils.openai_endpoint_utils import ( remove_sensitive_info_from_deployment, ) @@ -1402,7 +1403,18 @@ class ProxyConfig: global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, user_custom_key_generate, use_background_health_checks, health_check_interval, use_queue, custom_db_client, proxy_budget_rescheduler_max_time, proxy_budget_rescheduler_min_time, ui_access_mode, litellm_master_key_hash, proxy_batch_write_at, disable_spend_logs, prompt_injection_detection_obj, redis_usage_cache, store_model_in_db, premium_user, open_telemetry_logger, health_check_details # Load existing config - config = await self.get_config(config_file_path=config_file_path) + if os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None: + bucket_name = os.environ.get("LITELLM_CONFIG_BUCKET_NAME") + object_key = os.environ.get("LITELLM_CONFIG_BUCKET_OBJECT_KEY") + verbose_proxy_logger.debug( + "bucket_name: %s, object_key: %s", bucket_name, object_key + ) + config = get_file_contents_from_s3( + bucket_name=bucket_name, object_key=object_key + ) + else: + # default to file + config = await self.get_config(config_file_path=config_file_path) ## PRINT YAML FOR CONFIRMING IT WORKS printed_yaml = copy.deepcopy(config) printed_yaml.pop("environment_variables", None) @@ -2601,6 +2613,15 @@ async def startup_event(): ) else: await initialize(**worker_config) + elif os.environ.get("LITELLM_CONFIG_BUCKET_NAME") is not None: + ( + llm_router, + llm_model_list, + general_settings, + ) = await proxy_config.load_config( + router=llm_router, config_file_path=worker_config + ) + else: # if not, assume it's a json string worker_config = json.loads(os.getenv("WORKER_CONFIG")) diff --git 
a/litellm/proxy/spend_tracking/spend_tracking_utils.py b/litellm/proxy/spend_tracking/spend_tracking_utils.py index cd7004e41d..6a28d70b17 100644 --- a/litellm/proxy/spend_tracking/spend_tracking_utils.py +++ b/litellm/proxy/spend_tracking/spend_tracking_utils.py @@ -21,6 +21,8 @@ def get_logging_payload( if kwargs is None: kwargs = {} + if response_obj is None: + response_obj = {} # standardize this function to be used across, s3, dynamoDB, langfuse logging litellm_params = kwargs.get("litellm_params", {}) metadata = ( diff --git a/litellm/tests/test_bedrock_completion.py b/litellm/tests/test_bedrock_completion.py index 4da18144d0..c331021213 100644 --- a/litellm/tests/test_bedrock_completion.py +++ b/litellm/tests/test_bedrock_completion.py @@ -1159,8 +1159,8 @@ def test_bedrock_tools_pt_invalid_names(): assert result[1]["toolSpec"]["name"] == "another_invalid_name" -def test_bad_request_error(): - with pytest.raises(litellm.BadRequestError): +def test_not_found_error(): + with pytest.raises(litellm.NotFoundError): completion( model="bedrock/bad_model", messages=[ diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index db0239ca33..83031aba08 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries = 3 +# litellm.num_retries =3 litellm.cache = None litellm.success_callback = [] user_message = "Write a short poem about the sky" @@ -3705,19 +3705,21 @@ def test_completion_anyscale_api(): # test_completion_anyscale_api() -@pytest.mark.skip(reason="flaky test, times out frequently") +# @pytest.mark.skip(reason="flaky test, times out frequently") def test_completion_cohere(): try: # litellm.set_verbose=True messages = [ {"role": "system", 
"content": "You're a good bot"}, + {"role": "assistant", "content": [{"text": "2", "type": "text"}]}, + {"role": "assistant", "content": [{"text": "3", "type": "text"}]}, { "role": "user", "content": "Hey", }, ] response = completion( - model="command-nightly", + model="command-r", messages=messages, ) print(response) diff --git a/litellm/tests/test_function_call_parsing.py b/litellm/tests/test_function_call_parsing.py index d223a7c8f6..fab9cf110c 100644 --- a/litellm/tests/test_function_call_parsing.py +++ b/litellm/tests/test_function_call_parsing.py @@ -1,23 +1,27 @@ # What is this? ## Test to make sure function call response always works with json.loads() -> no extra parsing required. Relevant issue - https://github.com/BerriAI/litellm/issues/2654 -import sys, os +import os +import sys import traceback + from dotenv import load_dotenv load_dotenv() -import os, io +import io +import os sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path -import pytest -import litellm import json import warnings - -from litellm import completion from typing import List +import pytest + +import litellm +from litellm import completion + # Just a stub to keep the sample code simple class Trade: @@ -78,58 +82,60 @@ def trade(model_name: str) -> List[Trade]: }, } - response = completion( - model_name, - [ - { - "role": "system", - "content": """You are an expert asset manager, managing a portfolio. + try: + response = completion( + model_name, + [ + { + "role": "system", + "content": """You are an expert asset manager, managing a portfolio. - Always use the `trade` function. Make sure that you call it correctly. For example, the following is a valid call: + Always use the `trade` function. Make sure that you call it correctly. 
For example, the following is a valid call: + ``` + trade({ + "orders": [ + {"action": "buy", "asset": "BTC", "amount": 0.1}, + {"action": "sell", "asset": "ETH", "amount": 0.2} + ] + }) + ``` + + If there are no trades to make, call `trade` with an empty array: + ``` + trade({ "orders": [] }) + ``` + """, + }, + { + "role": "user", + "content": """Manage the portfolio. + + Don't jabber. + + This is the current market data: ``` - trade({ - "orders": [ - {"action": "buy", "asset": "BTC", "amount": 0.1}, - {"action": "sell", "asset": "ETH", "amount": 0.2} - ] - }) + {market_data} ``` - If there are no trades to make, call `trade` with an empty array: + Your portfolio is as follows: ``` - trade({ "orders": [] }) + {portfolio} ``` - """, + """.replace( + "{market_data}", "BTC: 64,000 USD\nETH: 3,500 USD" + ).replace( + "{portfolio}", "USD: 1000, BTC: 0.1, ETH: 0.2" + ), + }, + ], + tools=[tool_spec], + tool_choice={ + "type": "function", + "function": {"name": tool_spec["function"]["name"]}, # type: ignore }, - { - "role": "user", - "content": """Manage the portfolio. - - Don't jabber. 
- - This is the current market data: - ``` - {market_data} - ``` - - Your portfolio is as follows: - ``` - {portfolio} - ``` - """.replace( - "{market_data}", "BTC: 64,000 USD\nETH: 3,500 USD" - ).replace( - "{portfolio}", "USD: 1000, BTC: 0.1, ETH: 0.2" - ), - }, - ], - tools=[tool_spec], - tool_choice={ - "type": "function", - "function": {"name": tool_spec["function"]["name"]}, # type: ignore - }, - ) - + ) + except litellm.InternalServerError: + pass calls = response.choices[0].message.tool_calls trades = [trade for call in calls for trade in parse_call(call)] return trades diff --git a/litellm/tests/test_gcs_bucket.py b/litellm/tests/test_gcs_bucket.py index c21988c73d..f0aaf8d8dd 100644 --- a/litellm/tests/test_gcs_bucket.py +++ b/litellm/tests/test_gcs_bucket.py @@ -147,6 +147,117 @@ async def test_basic_gcs_logger(): assert gcs_payload["response_cost"] > 0.0 + assert gcs_payload["log_event_type"] == "successful_api_call" + gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"]) + + assert ( + gcs_payload["spend_log_metadata"]["user_api_key"] + == "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b" + ) + assert ( + gcs_payload["spend_log_metadata"]["user_api_key_user_id"] + == "116544810872468347480" + ) + + # Delete Object from GCS + print("deleting object from GCS") + await gcs_logger.delete_gcs_object(object_name=object_name) + + +@pytest.mark.asyncio +async def test_basic_gcs_logger_failure(): + load_vertex_ai_credentials() + gcs_logger = GCSBucketLogger() + print("GCSBucketLogger", gcs_logger) + + gcs_log_id = f"failure-test-{uuid.uuid4().hex}" + + litellm.callbacks = [gcs_logger] + + try: + response = await litellm.acompletion( + model="gpt-3.5-turbo", + temperature=0.7, + messages=[{"role": "user", "content": "This is a test"}], + max_tokens=10, + user="ishaan-2", + mock_response=litellm.BadRequestError( + model="gpt-3.5-turbo", + message="Error: 400: Bad Request: Invalid API key, please check your API key and 
try again.", + llm_provider="openai", + ), + metadata={ + "gcs_log_id": gcs_log_id, + "tags": ["model-anthropic-claude-v2.1", "app-ishaan-prod"], + "user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "user_api_key_alias": None, + "user_api_end_user_max_budget": None, + "litellm_api_version": "0.0.0", + "global_max_parallel_requests": None, + "user_api_key_user_id": "116544810872468347480", + "user_api_key_org_id": None, + "user_api_key_team_id": None, + "user_api_key_team_alias": None, + "user_api_key_metadata": {}, + "requester_ip_address": "127.0.0.1", + "spend_logs_metadata": {"hello": "world"}, + "headers": { + "content-type": "application/json", + "user-agent": "PostmanRuntime/7.32.3", + "accept": "*/*", + "postman-token": "92300061-eeaa-423b-a420-0b44896ecdc4", + "host": "localhost:4000", + "accept-encoding": "gzip, deflate, br", + "connection": "keep-alive", + "content-length": "163", + }, + "endpoint": "http://localhost:4000/chat/completions", + "model_group": "gpt-3.5-turbo", + "deployment": "azure/chatgpt-v-2", + "model_info": { + "id": "4bad40a1eb6bebd1682800f16f44b9f06c52a6703444c99c7f9f32e9de3693b4", + "db_model": False, + }, + "api_base": "https://openai-gpt-4-test-v-1.openai.azure.com/", + "caching_groups": None, + "raw_request": "\n\nPOST Request Sent from LiteLLM:\ncurl -X POST \\\nhttps://openai-gpt-4-test-v-1.openai.azure.com//openai/ \\\n-H 'Authorization: *****' \\\n-d '{'model': 'chatgpt-v-2', 'messages': [{'role': 'system', 'content': 'you are a helpful assistant.\\n'}, {'role': 'user', 'content': 'bom dia'}], 'stream': False, 'max_tokens': 10, 'user': '116544810872468347480', 'extra_body': {}}'\n", + }, + ) + except: + pass + + await asyncio.sleep(5) + + # Get the current date + # Get the current date + current_date = datetime.now().strftime("%Y-%m-%d") + + # Modify the object_name to include the date-based folder + object_name = gcs_log_id + + print("object_name", object_name) + + # Check if object landed 
on GCS + object_from_gcs = await gcs_logger.download_gcs_object(object_name=object_name) + print("object from gcs=", object_from_gcs) + # convert object_from_gcs from bytes to DICT + parsed_data = json.loads(object_from_gcs) + print("object_from_gcs as dict", parsed_data) + + print("type of object_from_gcs", type(parsed_data)) + + gcs_payload = GCSBucketPayload(**parsed_data) + + print("gcs_payload", gcs_payload) + + assert gcs_payload["request_kwargs"]["model"] == "gpt-3.5-turbo" + assert gcs_payload["request_kwargs"]["messages"] == [ + {"role": "user", "content": "This is a test"} + ] + + assert gcs_payload["response_cost"] == 0 + assert gcs_payload["log_event_type"] == "failed_api_call" + gcs_payload["spend_log_metadata"] = json.loads(gcs_payload["spend_log_metadata"]) assert ( diff --git a/litellm/tests/test_prometheus.py b/litellm/tests/test_prometheus.py index 64e824e6db..7574beb9d9 100644 --- a/litellm/tests/test_prometheus.py +++ b/litellm/tests/test_prometheus.py @@ -76,6 +76,6 @@ async def test_async_prometheus_success_logging(): print("metrics from prometheus", metrics) assert metrics["litellm_requests_metric_total"] == 1.0 assert metrics["litellm_total_tokens_total"] == 30.0 - assert metrics["llm_deployment_success_responses_total"] == 1.0 - assert metrics["llm_deployment_total_requests_total"] == 1.0 - assert metrics["llm_deployment_latency_per_output_token_bucket"] == 1.0 + assert metrics["litellm_deployment_success_responses_total"] == 1.0 + assert metrics["litellm_deployment_total_requests_total"] == 1.0 + assert metrics["litellm_deployment_latency_per_output_token_bucket"] == 1.0 diff --git a/litellm/tests/test_proxy_server.py b/litellm/tests/test_proxy_server.py index 757eef6d62..9a1c091267 100644 --- a/litellm/tests/test_proxy_server.py +++ b/litellm/tests/test_proxy_server.py @@ -966,3 +966,203 @@ async def test_user_info_team_list(prisma_client): pass mock_client.assert_called() + + +@pytest.mark.skip(reason="Local test") +@pytest.mark.asyncio 
async def test_add_callback_via_key(prisma_client):
    """
    Test if callback specified in key, is used.

    Sends a mocked chat completion through `chat_completion` with a
    `UserAPIKeyAuth` whose metadata carries a "logging" entry for langfuse,
    while `LangFuseLogger` is replaced by a MagicMock. Asserts that the
    logger was constructed and `log_event` was called, and that the
    `callback_vars` echoed back in `user_api_key_metadata` still hold
    "os.environ" references for every var whose name contains "key".
    """
    # NOTE(review): `headers` is declared global but not used in this test.
    global headers
    import json

    from fastapi import HTTPException, Request, Response
    from starlette.datastructures import URL

    from litellm.proxy.proxy_server import chat_completion

    # Point the proxy server module at the test DB client and master key.
    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
    await litellm.proxy.proxy_server.prisma_client.connect()

    litellm.set_verbose = True

    try:
        # Your test data
        test_data = {
            "model": "azure/chatgpt-v-2",
            "messages": [
                {"role": "user", "content": "write 1 sentence poem"},
            ],
            "max_tokens": 10,
            "mock_response": "Hello world",
            "api_key": "my-fake-key",
        }

        # Build a minimal request by hand; URL and body are set on the
        # private attributes directly.
        request = Request(scope={"type": "http", "method": "POST", "headers": {}})
        request._url = URL(url="/chat/completions")

        json_bytes = json.dumps(test_data).encode("utf-8")

        request._body = json_bytes

        # Replace the Langfuse logger class so calls can be inspected.
        with patch.object(
            litellm.litellm_core_utils.litellm_logging,
            "LangFuseLogger",
            new=MagicMock(),
        ) as mock_client:
            resp = await chat_completion(
                request=request,
                fastapi_response=Response(),
                user_api_key_dict=UserAPIKeyAuth(
                    metadata={
                        "logging": [
                            {
                                "callback_name": "langfuse",  # 'otel', 'langfuse', 'lunary'
                                "callback_type": "success",  # set, if required by integration - future improvement, have logging tools work for success + failure by default
                                "callback_vars": {
                                    "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
                                    "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
                                    "langfuse_host": "https://us.cloud.langfuse.com",
                                },
                            }
                        ]
                    }
                ),
            )
            print(resp)
            mock_client.assert_called()
            mock_client.return_value.log_event.assert_called()
            args, kwargs = mock_client.return_value.log_event.call_args
            # log_event receives the request kwargs under the "kwargs" keyword.
            kwargs = kwargs["kwargs"]
            assert "user_api_key_metadata" in kwargs["litellm_params"]["metadata"]
            assert (
                "logging"
                in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"]
            )
            # Guard against the loop below silently checking nothing.
            checked_keys = False
            for item in kwargs["litellm_params"]["metadata"]["user_api_key_metadata"][
                "logging"
            ]:
                for k, v in item["callback_vars"].items():
                    print("k={}, v={}".format(k, v))
                    if "key" in k:
                        # The logged metadata must keep the unresolved
                        # "os.environ/..." reference for key vars.
                        assert "os.environ" in v
                        checked_keys = True

            assert checked_keys
    except Exception as e:
        pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")


@pytest.mark.asyncio
async def test_add_callback_via_key_litellm_pre_call_utils(prisma_client):
    """
    Test that key-level logging metadata is unpacked into the request data.

    Calls `add_litellm_data_to_request` with a `UserAPIKeyAuth` whose
    metadata contains a langfuse "success" logging entry (and no team
    metadata), then asserts the returned data has
    `success_callback == ["langfuse"]` and contains the
    `langfuse_public_key` / `langfuse_secret_key` entries.
    """
    import json

    from fastapi import HTTPException, Request, Response
    from starlette.datastructures import URL

    from litellm.proxy.litellm_pre_call_utils import add_litellm_data_to_request

    # Point the proxy server module at the test DB client and master key.
    setattr(litellm.proxy.proxy_server, "prisma_client", prisma_client)
    setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
    await litellm.proxy.proxy_server.prisma_client.connect()

    proxy_config = getattr(litellm.proxy.proxy_server, "proxy_config")

    request = Request(scope={"type": "http", "method": "POST", "headers": {}})
    request._url = URL(url="/chat/completions")

    test_data = {
        "model": "azure/chatgpt-v-2",
        "messages": [
            {"role": "user", "content": "write 1 sentence poem"},
        ],
        "max_tokens": 10,
        "mock_response": "Hello world",
        "api_key": "my-fake-key",
    }

    json_bytes = json.dumps(test_data).encode("utf-8")

    request._body = json_bytes

    # Keyword arguments for add_litellm_data_to_request; "data" repeats the
    # same payload that was serialized into the request body above.
    data = {
        "data": {
            "model": "azure/chatgpt-v-2",
            "messages": [{"role": "user", "content": "write 1 sentence poem"}],
            "max_tokens": 10,
            "mock_response": "Hello world",
            "api_key": "my-fake-key",
        },
        "request": request,
        "user_api_key_dict": UserAPIKeyAuth(
            token=None,
            key_name=None,
            key_alias=None,
            spend=0.0,
            max_budget=None,
            expires=None,
            models=[],
            aliases={},
            config={},
            user_id=None,
            team_id=None,
            max_parallel_requests=None,
            # Key-level logging config under test.
            metadata={
                "logging": [
                    {
                        "callback_name": "langfuse",
                        "callback_type": "success",
                        "callback_vars": {
                            "langfuse_public_key": "os.environ/LANGFUSE_PUBLIC_KEY",
                            "langfuse_secret_key": "os.environ/LANGFUSE_SECRET_KEY",
                            "langfuse_host": "https://us.cloud.langfuse.com",
                        },
                    }
                ]
            },
            tpm_limit=None,
            rpm_limit=None,
            budget_duration=None,
            budget_reset_at=None,
            allowed_cache_controls=[],
            permissions={},
            model_spend={},
            model_max_budget={},
            soft_budget_cooldown=False,
            litellm_budget_table=None,
            org_id=None,
            team_spend=None,
            team_alias=None,
            team_tpm_limit=None,
            team_rpm_limit=None,
            team_max_budget=None,
            team_models=[],
            team_blocked=False,
            soft_budget=None,
            team_model_aliases=None,
            team_member_spend=None,
            # No team metadata, so only the key-level "logging" entry applies.
            team_metadata=None,
            end_user_id=None,
            end_user_tpm_limit=None,
            end_user_rpm_limit=None,
            end_user_max_budget=None,
            last_refreshed_at=None,
            api_key=None,
            user_role=None,
            allowed_model_region=None,
            parent_otel_span=None,
        ),
        "proxy_config": proxy_config,
        "general_settings": {},
        "version": "0.0.0",
    }

    new_data = await add_litellm_data_to_request(**data)

    assert "success_callback" in new_data
    assert new_data["success_callback"] == ["langfuse"]
    assert "langfuse_public_key" in new_data
    assert "langfuse_secret_key" in new_data
+ assert _check_valid_ip(allowed_ips, request, use_x_forwarded_for=True)[0] == expected_result # type: ignore @pytest.mark.asyncio diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 455fe1e3c5..d30270c5c8 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -57,6 +57,18 @@ "supports_parallel_function_calling": true, "supports_vision": true }, + "chatgpt-4o-latest": { + "max_tokens": 4096, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.000005, + "output_cost_per_token": 0.000015, + "litellm_provider": "openai", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_vision": true + }, "gpt-4o-2024-05-13": { "max_tokens": 4096, "max_input_tokens": 128000, @@ -2062,7 +2074,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-5-sonnet@20240620": { "max_tokens": 4096, @@ -2073,7 +2086,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, @@ -2084,7 +2098,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, @@ -2095,7 +2110,8 @@ "litellm_provider": "vertex_ai-anthropic_models", "mode": "chat", "supports_function_calling": true, - "supports_vision": true + "supports_vision": true, + "supports_assistant_prefill": true }, "vertex_ai/meta/llama3-405b-instruct-maas": { "max_tokens": 32000, @@ -4519,6 +4535,69 @@ "litellm_provider": 
"perplexity", "mode": "chat" }, + "perplexity/llama-3.1-70b-instruct": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-8b-instruct": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-huge-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.000005, + "output_cost_per_token": 0.000005, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-large-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-large-128k-chat": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000001, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-small-128k-chat": { + "max_tokens": 131072, + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, + "perplexity/llama-3.1-sonar-small-128k-online": { + "max_tokens": 127072, + "max_input_tokens": 127072, + "max_output_tokens": 127072, + "input_cost_per_token": 0.0000002, + "output_cost_per_token": 0.0000002, + "litellm_provider": "perplexity", + "mode": "chat" + }, "perplexity/pplx-7b-chat": { "max_tokens": 8192, "max_input_tokens": 8192, diff --git a/pyproject.toml 
b/pyproject.toml index ae9ba13da2..73fa657017 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.43.9" +version = "1.43.12" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.43.9" +version = "1.43.12" version_files = [ "pyproject.toml:^version" ]