diff --git a/.gitignore b/.gitignore index abc4ecb0ce..b75a92309a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .venv .env +litellm/proxy/myenv/* litellm_uuid.txt __pycache__/ *.pyc @@ -52,3 +53,6 @@ litellm/proxy/_new_secret_config.yaml litellm/proxy/_new_secret_config.yaml litellm/proxy/_super_secret_config.yaml litellm/proxy/_super_secret_config.yaml +litellm/proxy/myenv/bin/activate +litellm/proxy/myenv/bin/Activate.ps1 +myenv/* \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc41d85f14..e8bb1ff66a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,11 +16,11 @@ repos: name: Check if files match entry: python3 ci_cd/check_files_match.py language: system -- repo: local - hooks: - - id: mypy - name: mypy - entry: python3 -m mypy --ignore-missing-imports - language: system - types: [python] - files: ^litellm/ \ No newline at end of file +# - repo: local +# hooks: +# - id: mypy +# name: mypy +# entry: python3 -m mypy --ignore-missing-imports +# language: system +# types: [python] +# files: ^litellm/ \ No newline at end of file diff --git a/deploy/azure_resource_manager/azure_marketplace.zip b/deploy/azure_resource_manager/azure_marketplace.zip new file mode 100644 index 0000000000..3475125863 Binary files /dev/null and b/deploy/azure_resource_manager/azure_marketplace.zip differ diff --git a/deploy/azure_resource_manager/azure_marketplace/createUiDefinition.json b/deploy/azure_resource_manager/azure_marketplace/createUiDefinition.json new file mode 100644 index 0000000000..4eba73bdba --- /dev/null +++ b/deploy/azure_resource_manager/azure_marketplace/createUiDefinition.json @@ -0,0 +1,15 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#", + "handler": "Microsoft.Azure.CreateUIDef", + "version": "0.1.2-preview", + "parameters": { + "config": { + "isWizard": false, + "basics": { } + }, + "basics": [ ], + "steps": [ ], + "outputs": { }, + "resourceTypes": [ ] + } +} \ No newline at end of file diff --git a/deploy/azure_resource_manager/azure_marketplace/mainTemplate.json b/deploy/azure_resource_manager/azure_marketplace/mainTemplate.json new file mode 100644 index 0000000000..114e855bf5 --- /dev/null +++ b/deploy/azure_resource_manager/azure_marketplace/mainTemplate.json @@ -0,0 +1,63 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "imageName": { + "type": "string", + "defaultValue": "ghcr.io/berriai/litellm:main-latest" + }, + "containerName": { + "type": "string", + "defaultValue": "litellm-container" + }, + "dnsLabelName": { + "type": "string", + "defaultValue": "litellm" + }, + "portNumber": { + "type": "int", + "defaultValue": 4000 + } + }, + "resources": [ + { + "type": "Microsoft.ContainerInstance/containerGroups", + "apiVersion": "2021-03-01", + "name": "[parameters('containerName')]", + "location": "[resourceGroup().location]", + "properties": { + "containers": [ + { + "name": "[parameters('containerName')]", + "properties": { + "image": "[parameters('imageName')]", + "resources": { + "requests": { + "cpu": 1, + "memoryInGB": 2 + } + }, + "ports": [ + { + "port": "[parameters('portNumber')]" + } + ] + } + } + ], + "osType": "Linux", + "restartPolicy": "Always", + "ipAddress": { + "type": "Public", + "ports": [ + { + "protocol": "tcp", + "port": "[parameters('portNumber')]" + } + ], + "dnsNameLabel": "[parameters('dnsLabelName')]" + } + } + 
} + ] + } \ No newline at end of file diff --git a/deploy/azure_resource_manager/main.bicep b/deploy/azure_resource_manager/main.bicep new file mode 100644 index 0000000000..b104cefe1e --- /dev/null +++ b/deploy/azure_resource_manager/main.bicep @@ -0,0 +1,42 @@ +param imageName string = 'ghcr.io/berriai/litellm:main-latest' +param containerName string = 'litellm-container' +param dnsLabelName string = 'litellm' +param portNumber int = 4000 + +resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = { + name: containerName + location: resourceGroup().location + properties: { + containers: [ + { + name: containerName + properties: { + image: imageName + resources: { + requests: { + cpu: 1 + memoryInGB: 2 + } + } + ports: [ + { + port: portNumber + } + ] + } + } + ] + osType: 'Linux' + restartPolicy: 'Always' + ipAddress: { + type: 'Public' + ports: [ + { + protocol: 'tcp' + port: portNumber + } + ] + dnsNameLabel: dnsLabelName + } + } +} diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index 11ca131210..451deaac47 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -83,6 +83,7 @@ def completion( top_p: Optional[float] = None, n: Optional[int] = None, stream: Optional[bool] = None, + stream_options: Optional[dict] = None, stop=None, max_tokens: Optional[int] = None, presence_penalty: Optional[float] = None, @@ -139,6 +140,10 @@ def completion( - `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message. +- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true` + + - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value. + - `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens. - `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion. 
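For reference, a minimal caller-side sketch of the `stream_options` parameter documented above (not part of this diff; the model name and the exact chunk attributes are illustrative assumptions):

```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",  # hypothetical example model
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
    stream_options={"include_usage": True},  # only valid together with stream=True
)

for chunk in response:
    # Normal chunks carry content deltas; with include_usage, a final chunk
    # arrives with an empty choices list and usage stats for the whole request.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
    if getattr(chunk, "usage", None):
        print("\nusage:", chunk.usage)
```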
diff --git a/enterprise/utils.py b/enterprise/utils.py index 4a42dc996f..05bd7dac6e 100644 --- a/enterprise/utils.py +++ b/enterprise/utils.py @@ -291,7 +291,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]): def _forecast_daily_cost(data: list): - import requests + import requests # type: ignore from datetime import datetime, timedelta if len(data) == 0: diff --git a/litellm/_redis.py b/litellm/_redis.py index d7789472c1..d72016dcd9 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -10,8 +10,8 @@ # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation import os import inspect -import redis, litellm -import redis.asyncio as async_redis +import redis, litellm # type: ignore +import redis.asyncio as async_redis # type: ignore from typing import List, Optional diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index 8410157537..9ef4bfafa9 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -10,7 +10,7 @@ import os, json, time import litellm from litellm.utils import ModelResponse -import requests, threading +import requests, threading # type: ignore from typing import Optional, Union, Literal diff --git a/litellm/integrations/aispend.py b/litellm/integrations/aispend.py index 2015d45ddd..a893f8923b 100644 --- a/litellm/integrations/aispend.py +++ b/litellm/integrations/aispend.py @@ -1,7 +1,6 @@ #### What this does #### # On success + failure, log events to aispend.io import dotenv, os -import requests dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/integrations/athina.py b/litellm/integrations/athina.py index 897cf6c8d5..660dd51efb 100644 --- a/litellm/integrations/athina.py +++ b/litellm/integrations/athina.py @@ -4,18 +4,30 @@ import datetime class AthinaLogger: def __init__(self): import os + self.athina_api_key = os.getenv("ATHINA_API_KEY") self.headers = { "athina-api-key": self.athina_api_key, - "Content-Type": "application/json" + "Content-Type": "application/json", } self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference" - self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"] + self.additional_keys = [ + "environment", + "prompt_slug", + "customer_id", + "customer_user_id", + "session_id", + "external_reference_id", + "context", + "expected_response", + "user_query", + ] def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): - import requests + import requests # type: ignore import json import traceback + try: response_json = response_obj.model_dump() if response_obj else {} data = { @@ -23,32 +35,51 @@ class AthinaLogger: "request": kwargs, "response": response_json, "prompt_tokens": response_json.get("usage", {}).get("prompt_tokens"), - "completion_tokens": response_json.get("usage", {}).get("completion_tokens"), + "completion_tokens": response_json.get("usage", {}).get( + "completion_tokens" + ), "total_tokens": response_json.get("usage", {}).get("total_tokens"), } - - if type(end_time) == datetime.datetime and type(start_time) == datetime.datetime: - data["response_time"] = int((end_time - start_time).total_seconds() * 1000) + + if ( + type(end_time) == datetime.datetime + and type(start_time) == datetime.datetime + ): + data["response_time"] = int( + (end_time - start_time).total_seconds() * 1000 + ) if "messages" in kwargs: data["prompt"] = 
kwargs.get("messages", None) # Directly add tools or functions if present optional_params = kwargs.get("optional_params", {}) - data.update((k, v) for k, v in optional_params.items() if k in ["tools", "functions"]) + data.update( + (k, v) + for k, v in optional_params.items() + if k in ["tools", "functions"] + ) # Add additional metadata keys - metadata = kwargs.get("litellm_params", {}).get("metadata", {}) + metadata = kwargs.get("litellm_params", {}).get("metadata", {}) if metadata: for key in self.additional_keys: if key in metadata: data[key] = metadata[key] - response = requests.post(self.athina_logging_url, headers=self.headers, data=json.dumps(data, default=str)) + response = requests.post( + self.athina_logging_url, + headers=self.headers, + data=json.dumps(data, default=str), + ) if response.status_code != 200: - print_verbose(f"Athina Logger Error - {response.text}, {response.status_code}") + print_verbose( + f"Athina Logger Error - {response.text}, {response.status_code}" + ) else: print_verbose(f"Athina Logger Succeeded - {response.text}") except Exception as e: - print_verbose(f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}") - pass \ No newline at end of file + print_verbose( + f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}" + ) + pass diff --git a/litellm/integrations/berrispend.py b/litellm/integrations/berrispend.py index 7d91ffca7f..1f0ae4581f 100644 --- a/litellm/integrations/berrispend.py +++ b/litellm/integrations/berrispend.py @@ -1,7 +1,7 @@ #### What this does #### # On success + failure, log events to aispend.io import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/integrations/clickhouse.py b/litellm/integrations/clickhouse.py index d5000e5c46..7d1fb37d94 100644 --- a/litellm/integrations/clickhouse.py +++ b/litellm/integrations/clickhouse.py @@ -3,7 +3,6 @@ #### What this does #### # On success, logs events to Promptlayer import dotenv, os -import requests from litellm.proxy._types import UserAPIKeyAuth from litellm.caching import DualCache diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py index b288036ad5..8a3e0f4673 100644 --- a/litellm/integrations/custom_logger.py +++ b/litellm/integrations/custom_logger.py @@ -1,7 +1,6 @@ #### What this does #### # On success, logs events to Promptlayer import dotenv, os -import requests from litellm.proxy._types import UserAPIKeyAuth from litellm.caching import DualCache diff --git a/litellm/integrations/datadog.py b/litellm/integrations/datadog.py index f5db5bf1f7..d969341fc4 100644 --- a/litellm/integrations/datadog.py +++ b/litellm/integrations/datadog.py @@ -2,7 +2,7 @@ # On success + failure, log events to Supabase import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/integrations/dynamodb.py b/litellm/integrations/dynamodb.py index 2ed6c3f9f7..b5462ee7fa 100644 --- a/litellm/integrations/dynamodb.py +++ b/litellm/integrations/dynamodb.py @@ -2,7 +2,7 @@ # On success + failure, log events to Supabase import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/integrations/greenscale.py b/litellm/integrations/greenscale.py index 3ff808ddbb..78190d69dc 100644 --- a/litellm/integrations/greenscale.py +++ 
b/litellm/integrations/greenscale.py @@ -1,15 +1,17 @@ -import requests +import requests # type: ignore import json import traceback from datetime import datetime, timezone + class GreenscaleLogger: def __init__(self): import os + self.greenscale_api_key = os.getenv("GREENSCALE_API_KEY") self.headers = { "api-key": self.greenscale_api_key, - "Content-Type": "application/json" + "Content-Type": "application/json", } self.greenscale_logging_url = os.getenv("GREENSCALE_ENDPOINT") @@ -19,33 +21,48 @@ class GreenscaleLogger: data = { "modelId": kwargs.get("model"), "inputTokenCount": response_json.get("usage", {}).get("prompt_tokens"), - "outputTokenCount": response_json.get("usage", {}).get("completion_tokens"), + "outputTokenCount": response_json.get("usage", {}).get( + "completion_tokens" + ), } - data["timestamp"] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ') - - if type(end_time) == datetime and type(start_time) == datetime: - data["invocationLatency"] = int((end_time - start_time).total_seconds() * 1000) + data["timestamp"] = datetime.now(timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + if type(end_time) == datetime and type(start_time) == datetime: + data["invocationLatency"] = int( + (end_time - start_time).total_seconds() * 1000 + ) # Add additional metadata keys to tags tags = [] metadata = kwargs.get("litellm_params", {}).get("metadata", {}) for key, value in metadata.items(): - if key.startswith("greenscale"): + if key.startswith("greenscale"): if key == "greenscale_project": data["project"] = value elif key == "greenscale_application": data["application"] = value else: - tags.append({"key": key.replace("greenscale_", ""), "value": str(value)}) - + tags.append( + {"key": key.replace("greenscale_", ""), "value": str(value)} + ) + data["tags"] = tags - response = requests.post(self.greenscale_logging_url, headers=self.headers, data=json.dumps(data, default=str)) + response = requests.post( + self.greenscale_logging_url, + headers=self.headers, + data=json.dumps(data, default=str), + ) if response.status_code != 200: - print_verbose(f"Greenscale Logger Error - {response.text}, {response.status_code}") + print_verbose( + f"Greenscale Logger Error - {response.text}, {response.status_code}" + ) else: print_verbose(f"Greenscale Logger Succeeded - {response.text}") except Exception as e: - print_verbose(f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}") - pass \ No newline at end of file + print_verbose( + f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}" + ) + pass diff --git a/litellm/integrations/helicone.py b/litellm/integrations/helicone.py index cb86637732..c8c1075419 100644 --- a/litellm/integrations/helicone.py +++ b/litellm/integrations/helicone.py @@ -1,7 +1,7 @@ #### What this does #### # On success, logs events to Helicone import dotenv, os -import requests +import requests # type: ignore import litellm dotenv.load_dotenv() # Loading env variables using dotenv diff --git a/litellm/integrations/langsmith.py b/litellm/integrations/langsmith.py index 415f3d2d20..8a0fb38522 100644 --- a/litellm/integrations/langsmith.py +++ b/litellm/integrations/langsmith.py @@ -1,15 +1,14 @@ #### What this does #### # On success, logs events to Langsmith -import dotenv, os -import requests -import requests +import dotenv, os # type: ignore +import requests # type: ignore from datetime import datetime dotenv.load_dotenv() # Loading env variables using dotenv import traceback import asyncio import types -from pydantic import BaseModel 
+from pydantic import BaseModel # type: ignore def is_serializable(value): @@ -79,8 +78,6 @@ class LangsmithLogger: except: response_obj = response_obj.dict() # type: ignore - print(f"response_obj: {response_obj}") - data = { "name": run_name, "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain" @@ -90,7 +87,6 @@ class LangsmithLogger: "start_time": start_time, "end_time": end_time, } - print(f"data: {data}") response = requests.post( "https://api.smith.langchain.com/runs", diff --git a/litellm/integrations/openmeter.py b/litellm/integrations/openmeter.py index 237a40eb8d..a454739d54 100644 --- a/litellm/integrations/openmeter.py +++ b/litellm/integrations/openmeter.py @@ -2,7 +2,6 @@ ## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268 import dotenv, os, json -import requests import litellm dotenv.load_dotenv() # Loading env variables using dotenv @@ -60,7 +59,7 @@ class OpenMeterLogger(CustomLogger): "total_tokens": response_obj["usage"].get("total_tokens"), } - subject = kwargs.get("user", None), # end-user passed in via 'user' param + subject = (kwargs.get("user", None),) # end-user passed in via 'user' param if not subject: raise Exception("OpenMeter: user is required") diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index e3c6e8e774..577946ce18 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -3,7 +3,7 @@ # On success, log events to Prometheus import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback @@ -19,7 +19,6 @@ class PrometheusLogger: **kwargs, ): try: - print(f"in init prometheus metrics") from prometheus_client import Counter self.litellm_llm_api_failed_requests_metric = Counter( diff --git a/litellm/integrations/prometheus_services.py b/litellm/integrations/prometheus_services.py index 0249a71d0f..d276bb85ba 100644 --- a/litellm/integrations/prometheus_services.py +++ b/litellm/integrations/prometheus_services.py @@ -4,7 +4,7 @@ import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback @@ -183,7 +183,6 @@ class PrometheusServicesLogger: ) async def async_service_failure_hook(self, payload: ServiceLoggerPayload): - print(f"received error payload: {payload.error}") if self.mock_testing: self.mock_testing_failure_calls += 1 diff --git a/litellm/integrations/prompt_layer.py b/litellm/integrations/prompt_layer.py index 39a80940b7..ce610e1ef1 100644 --- a/litellm/integrations/prompt_layer.py +++ b/litellm/integrations/prompt_layer.py @@ -1,12 +1,13 @@ #### What this does #### # On success, logs events to Promptlayer import dotenv, os -import requests +import requests # type: ignore from pydantic import BaseModel dotenv.load_dotenv() # Loading env variables using dotenv import traceback + class PromptLayerLogger: # Class variables or attributes def __init__(self): @@ -32,7 +33,11 @@ class PromptLayerLogger: tags = kwargs["litellm_params"]["metadata"]["pl_tags"] # Remove "pl_tags" from metadata - metadata = {k:v for k, v in kwargs["litellm_params"]["metadata"].items() if k != "pl_tags"} + metadata = { + k: v + for k, v in kwargs["litellm_params"]["metadata"].items() + if k != "pl_tags" + } print_verbose( f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}" diff --git 
a/litellm/integrations/s3.py b/litellm/integrations/s3.py index dc35430bc1..d31b158402 100644 --- a/litellm/integrations/s3.py +++ b/litellm/integrations/s3.py @@ -2,7 +2,6 @@ # On success + failure, log events to Supabase import dotenv, os -import requests dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/integrations/supabase.py b/litellm/integrations/supabase.py index a99e4abc4b..58beba8a3d 100644 --- a/litellm/integrations/supabase.py +++ b/litellm/integrations/supabase.py @@ -2,7 +2,7 @@ # On success + failure, log events to Supabase import dotenv, os -import requests +import requests # type: ignore dotenv.load_dotenv() # Loading env variables using dotenv import traceback diff --git a/litellm/llms/ai21.py b/litellm/llms/ai21.py index 73d5afebe8..a39a83f157 100644 --- a/litellm/llms/ai21.py +++ b/litellm/llms/ai21.py @@ -1,8 +1,8 @@ import os, types, traceback import json from enum import Enum -import requests -import time, httpx +import requests # type: ignore +import time, httpx # type: ignore from typing import Callable, Optional from litellm.utils import ModelResponse, Choices, Message import litellm diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py index 86a30a9ec9..7edd11964b 100644 --- a/litellm/llms/aleph_alpha.py +++ b/litellm/llms/aleph_alpha.py @@ -1,12 +1,12 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional import litellm from litellm.utils import ModelResponse, Choices, Message, Usage -import httpx +import httpx # type: ignore class AlephAlphaError(Exception): diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index 3fc374dce7..818c4ecb3a 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -1,7 +1,7 @@ import os, types import json from enum import Enum -import requests, copy +import requests, copy # type: ignore import time from typing import Callable, Optional, List from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper @@ -9,7 +9,7 @@ import litellm from .prompt_templates.factory import prompt_factory, custom_prompt from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from .base import BaseLLM -import httpx +import httpx # type: ignore class AnthropicConstants(Enum): diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index e7af9d43b6..f416d14377 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Any +from typing import Optional, Union, Any, Literal import types, requests from .base import BaseLLM from litellm.utils import ( @@ -12,7 +12,7 @@ from litellm.utils import ( from typing import Callable, Optional, BinaryIO from litellm import OpenAIConfig import litellm, json -import httpx +import httpx # type: ignore from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport from openai import AzureOpenAI, AsyncAzureOpenAI import uuid @@ -952,6 +952,81 @@ class AzureChatCompletion(BaseLLM): ) raise e + def get_headers( + self, + model: Optional[str], + api_key: str, + api_base: str, + api_version: str, + timeout: float, + mode: str, + messages: Optional[list] = None, + input: Optional[list] = None, + prompt: Optional[str] = None, + ) -> dict: + client_session = litellm.client_session or httpx.Client( + transport=CustomHTTPTransport(), # handle dall-e-2 calls + ) + if "gateway.ai.cloudflare.com" in api_base: + ## build base url 
- assume api base includes resource name + if not api_base.endswith("/"): + api_base += "/" + api_base += f"{model}" + client = AzureOpenAI( + base_url=api_base, + api_version=api_version, + api_key=api_key, + timeout=timeout, + http_client=client_session, + ) + model = None + # cloudflare ai gateway, needs model=None + else: + client = AzureOpenAI( + api_version=api_version, + azure_endpoint=api_base, + api_key=api_key, + timeout=timeout, + http_client=client_session, + ) + + # only run this check if it's not cloudflare ai gateway + if model is None and mode != "image_generation": + raise Exception("model is not set") + + completion = None + + if messages is None: + messages = [{"role": "user", "content": "Hey"}] + try: + completion = client.chat.completions.with_raw_response.create( + model=model, # type: ignore + messages=messages, # type: ignore + ) + except Exception as e: + raise e + response = {} + + if completion is None or not hasattr(completion, "headers"): + raise Exception("invalid completion response") + + if ( + completion.headers.get("x-ratelimit-remaining-requests", None) is not None + ): # not provided for dall-e requests + response["x-ratelimit-remaining-requests"] = completion.headers[ + "x-ratelimit-remaining-requests" + ] + + if completion.headers.get("x-ratelimit-remaining-tokens", None) is not None: + response["x-ratelimit-remaining-tokens"] = completion.headers[ + "x-ratelimit-remaining-tokens" + ] + + if completion.headers.get("x-ms-region", None) is not None: + response["x-ms-region"] = completion.headers["x-ms-region"] + + return response + async def ahealth_check( self, model: Optional[str], @@ -963,7 +1038,7 @@ class AzureChatCompletion(BaseLLM): messages: Optional[list] = None, input: Optional[list] = None, prompt: Optional[str] = None, - ): + ) -> dict: client_session = litellm.aclient_session or httpx.AsyncClient( transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls ) @@ -1040,4 +1115,8 @@ class AzureChatCompletion(BaseLLM): response["x-ratelimit-remaining-tokens"] = completion.headers[ "x-ratelimit-remaining-tokens" ] + + if completion.headers.get("x-ms-region", None) is not None: + response["x-ms-region"] = completion.headers["x-ms-region"] + return response diff --git a/litellm/llms/azure_text.py b/litellm/llms/azure_text.py index e0d5474774..640ab82223 100644 --- a/litellm/llms/azure_text.py +++ b/litellm/llms/azure_text.py @@ -1,5 +1,5 @@ from typing import Optional, Union, Any -import types, requests +import types, requests # type: ignore from .base import BaseLLM from litellm.utils import ( ModelResponse, diff --git a/litellm/llms/baseten.py b/litellm/llms/baseten.py index 75db9ab465..643dae5304 100644 --- a/litellm/llms/baseten.py +++ b/litellm/llms/baseten.py @@ -1,7 +1,7 @@ import os import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable from litellm.utils import ModelResponse, Usage diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 2f26ae4a9a..08433ba18b 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -163,10 +163,9 @@ class AmazonAnthropicClaude3Config: "stop", "temperature", "top_p", - "extra_headers" + "extra_headers", ] - def map_openai_params(self, non_default_params: dict, optional_params: dict): for param, value in non_default_params.items(): if param == "max_tokens": @@ -534,10 +533,12 @@ class AmazonStabilityConfig: def add_custom_header(headers): """Closure to capture the headers and add them.""" + def callback(request, 
**kwargs): """Actual callback function that Boto3 will call.""" for header_name, header_value in headers.items(): request.headers.add_header(header_name, header_value) + return callback @@ -672,7 +673,9 @@ def init_bedrock_client( config=config, ) if extra_headers: - client.meta.events.register('before-sign.bedrock-runtime.*', add_custom_header(extra_headers)) + client.meta.events.register( + "before-sign.bedrock-runtime.*", add_custom_header(extra_headers) + ) return client @@ -1224,7 +1227,7 @@ def _embedding_func_single( "input_type", "search_document" ) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3 data = {"texts": [input], **inference_params} # type: ignore - body = json.dumps(data).encode("utf-8") + body = json.dumps(data).encode("utf-8") # type: ignore ## LOGGING request_str = f""" response = client.invoke_model( @@ -1416,7 +1419,7 @@ def image_generation( ## LOGGING request_str = f""" response = client.invoke_model( - body={body}, + body={body}, # type: ignore modelId={modelId}, accept="application/json", contentType="application/json", diff --git a/litellm/llms/cloudflare.py b/litellm/llms/cloudflare.py index b8187cbc94..5a24b3b443 100644 --- a/litellm/llms/cloudflare.py +++ b/litellm/llms/cloudflare.py @@ -1,11 +1,11 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional import litellm -import httpx +import httpx # type: ignore from litellm.utils import ModelResponse, Usage from .prompt_templates.factory import prompt_factory, custom_prompt diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py index b867559c3b..0ebdf38f19 100644 --- a/litellm/llms/cohere.py +++ b/litellm/llms/cohere.py @@ -1,12 +1,12 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time, traceback from typing import Callable, Optional from litellm.utils import ModelResponse, Choices, Message, Usage import litellm -import httpx +import httpx # type: ignore class CohereError(Exception): diff --git a/litellm/llms/cohere_chat.py b/litellm/llms/cohere_chat.py index 2a9bc320b9..e4de6ddcb0 100644 --- a/litellm/llms/cohere_chat.py +++ b/litellm/llms/cohere_chat.py @@ -1,12 +1,12 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time, traceback from typing import Callable, Optional from litellm.utils import ModelResponse, Choices, Message, Usage import litellm -import httpx +import httpx # type: ignore from .prompt_templates.factory import cohere_message_pt diff --git a/litellm/llms/maritalk.py b/litellm/llms/maritalk.py index 4c6b86d3cd..dfe53e9df0 100644 --- a/litellm/llms/maritalk.py +++ b/litellm/llms/maritalk.py @@ -1,7 +1,7 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time, traceback from typing import Callable, Optional, List from litellm.utils import ModelResponse, Choices, Message, Usage diff --git a/litellm/llms/nlp_cloud.py b/litellm/llms/nlp_cloud.py index 86648118f9..cd5f17a90b 100644 --- a/litellm/llms/nlp_cloud.py +++ b/litellm/llms/nlp_cloud.py @@ -1,7 +1,7 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional import litellm diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 02f3d29aa5..2884ff04a9 100644 --- 
a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -1,10 +1,10 @@ from itertools import chain -import requests, types, time +import requests, types, time # type: ignore import json, uuid import traceback from typing import Optional import litellm -import httpx, aiohttp, asyncio +import httpx, aiohttp, asyncio # type: ignore from .prompt_templates.factory import prompt_factory, custom_prompt @@ -245,7 +245,10 @@ def get_ollama_response( tool_calls=[ { "id": f"call_{str(uuid.uuid4())}", - "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, + "function": { + "name": function_call["name"], + "arguments": json.dumps(function_call["arguments"]), + }, "type": "function", } ], @@ -257,7 +260,9 @@ def get_ollama_response( model_response["created"] = int(time.time()) model_response["model"] = "ollama/" + model prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore - completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) + completion_tokens = response_json.get( + "eval_count", len(response_json.get("message", dict()).get("content", "")) + ) model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -298,7 +303,10 @@ def ollama_completion_stream(url, data, logging_obj): tool_calls=[ { "id": f"call_{str(uuid.uuid4())}", - "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, + "function": { + "name": function_call["name"], + "arguments": json.dumps(function_call["arguments"]), + }, "type": "function", } ], @@ -339,9 +347,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob first_chunk_content = first_chunk.choices[0].delta.content or "" response_content = first_chunk_content + "".join( [ - chunk.choices[0].delta.content - async for chunk in streamwrapper - if chunk.choices[0].delta.content] + chunk.choices[0].delta.content + async for chunk in streamwrapper + if chunk.choices[0].delta.content + ] ) function_call = json.loads(response_content) delta = litellm.utils.Delta( @@ -349,7 +358,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob tool_calls=[ { "id": f"call_{str(uuid.uuid4())}", - "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, + "function": { + "name": function_call["name"], + "arguments": json.dumps(function_call["arguments"]), + }, "type": "function", } ], @@ -398,7 +410,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): tool_calls=[ { "id": f"call_{str(uuid.uuid4())}", - "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, + "function": { + "name": function_call["name"], + "arguments": json.dumps(function_call["arguments"]), + }, "type": "function", } ], @@ -412,7 +427,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj): model_response["created"] = int(time.time()) model_response["model"] = "ollama/" + data["model"] prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore - completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) + completion_tokens = response_json.get( + "eval_count", + len(response_json.get("message", dict()).get("content", "")), + ) 
model_response["usage"] = litellm.Usage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -500,6 +518,7 @@ async def ollama_aembeddings( } return model_response + def ollama_embeddings( api_base: str, model: str, @@ -517,5 +536,6 @@ def ollama_embeddings( optional_params, logging_obj, model_response, - encoding) + encoding, ) + ) diff --git a/litellm/llms/oobabooga.py b/litellm/llms/oobabooga.py index b166c9069e..f8f32e0fe4 100644 --- a/litellm/llms/oobabooga.py +++ b/litellm/llms/oobabooga.py @@ -1,7 +1,7 @@ import os import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional from litellm.utils import ModelResponse, Usage diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index f007507c9b..d542cbe079 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -22,7 +22,6 @@ from litellm.utils import ( TextCompletionResponse, ) from typing import Callable, Optional -import aiohttp, requests import litellm from .prompt_templates.factory import prompt_factory, custom_prompt from openai import OpenAI, AsyncOpenAI @@ -531,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM): model=model, custom_llm_provider="openai", logging_obj=logging_obj, + stream_options=data.get("stream_options", None), ) return streamwrapper @@ -580,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM): model=model, custom_llm_provider="openai", logging_obj=logging_obj, + stream_options=data.get("stream_options", None), ) return streamwrapper except ( diff --git a/litellm/llms/petals.py b/litellm/llms/petals.py index 25403f5980..334b80d388 100644 --- a/litellm/llms/petals.py +++ b/litellm/llms/petals.py @@ -1,7 +1,7 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional import litellm diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 0820303685..24a076dd0c 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -981,7 +981,7 @@ def anthropic_messages_pt(messages: list): # add role=tool support to allow function call result/error submission user_message_types = {"user", "tool", "function"} # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them. 
- new_messages = [] + new_messages: list = [] msg_i = 0 tool_use_param = False while msg_i < len(messages): diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 65052e3179..c297281347 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -1,11 +1,11 @@ import os, types import json -import requests +import requests # type: ignore import time from typing import Callable, Optional from litellm.utils import ModelResponse, Usage import litellm -import httpx +import httpx # type: ignore from .prompt_templates.factory import prompt_factory, custom_prompt diff --git a/litellm/llms/sagemaker.py b/litellm/llms/sagemaker.py index 27d3ff72a9..8e75428bb7 100644 --- a/litellm/llms/sagemaker.py +++ b/litellm/llms/sagemaker.py @@ -1,14 +1,14 @@ import os, types, traceback from enum import Enum import json -import requests +import requests # type: ignore import time from typing import Callable, Optional, Any import litellm from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage import sys from copy import deepcopy -import httpx +import httpx # type: ignore from .prompt_templates.factory import prompt_factory, custom_prompt @@ -295,7 +295,7 @@ def completion( EndpointName={model}, InferenceComponentName={model_id}, ContentType="application/json", - Body={data}, + Body={data}, # type: ignore CustomAttributes="accept_eula=true", ) """ # type: ignore @@ -321,7 +321,7 @@ def completion( response = client.invoke_endpoint( EndpointName={model}, ContentType="application/json", - Body={data}, + Body={data}, # type: ignore CustomAttributes="accept_eula=true", ) """ # type: ignore @@ -688,7 +688,7 @@ def embedding( response = client.invoke_endpoint( EndpointName={model}, ContentType="application/json", - Body={data}, + Body={data}, # type: ignore CustomAttributes="accept_eula=true", )""" # type: ignore logging_obj.pre_call( diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py index 3f9d3b9dee..47453ca885 100644 --- a/litellm/llms/together_ai.py +++ b/litellm/llms/together_ai.py @@ -6,11 +6,11 @@ Reference: https://docs.together.ai/docs/openai-api-compatibility import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional import litellm -import httpx +import httpx # type: ignore from litellm.utils import ModelResponse, Usage from .prompt_templates.factory import prompt_factory, custom_prompt diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py index ce0ccc73a0..cab7ae19f2 100644 --- a/litellm/llms/vertex_ai.py +++ b/litellm/llms/vertex_ai.py @@ -1,12 +1,12 @@ import os, types import json from enum import Enum -import requests +import requests # type: ignore import time from typing import Callable, Optional, Union, List from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason import litellm, uuid -import httpx, inspect +import httpx, inspect # type: ignore class VertexAIError(Exception): diff --git a/litellm/llms/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_anthropic.py index e73545f992..3bdcf4fd61 100644 --- a/litellm/llms/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_anthropic.py @@ -3,7 +3,7 @@ import os, types import json from enum import Enum -import requests, copy +import requests, copy # type: ignore import time, uuid from typing import Callable, Optional, List from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper @@ -17,7 +17,7 @@ from .prompt_templates.factory 
import ( extract_between_tags, parse_xml_params, ) -import httpx +import httpx # type: ignore class VertexAIError(Exception): diff --git a/litellm/llms/vllm.py b/litellm/llms/vllm.py index 15f18cbdca..b2a9dd54db 100644 --- a/litellm/llms/vllm.py +++ b/litellm/llms/vllm.py @@ -1,8 +1,8 @@ import os import json from enum import Enum -import requests -import time, httpx +import requests # type: ignore +import time, httpx # type: ignore from typing import Callable, Any from litellm.utils import ModelResponse, Usage from .prompt_templates.factory import prompt_factory, custom_prompt diff --git a/litellm/llms/watsonx.py b/litellm/llms/watsonx.py index ac38a2a8fe..99f2d18baf 100644 --- a/litellm/llms/watsonx.py +++ b/litellm/llms/watsonx.py @@ -3,8 +3,8 @@ import json, types, time # noqa: E401 from contextlib import contextmanager from typing import Callable, Dict, Optional, Any, Union, List -import httpx -import requests +import httpx # type: ignore +import requests # type: ignore import litellm from litellm.utils import ModelResponse, get_secret, Usage diff --git a/litellm/main.py b/litellm/main.py index bff9886acb..99e556bfa2 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -187,6 +187,7 @@ async def acompletion( top_p: Optional[float] = None, n: Optional[int] = None, stream: Optional[bool] = None, + stream_options: Optional[dict] = None, stop=None, max_tokens: Optional[int] = None, presence_penalty: Optional[float] = None, @@ -206,6 +207,7 @@ async def acompletion( api_version: Optional[str] = None, api_key: Optional[str] = None, model_list: Optional[list] = None, # pass in a list of api_base,keys, etc. + extra_headers: Optional[dict] = None, # Optional liteLLM function params **kwargs, ): @@ -223,6 +225,7 @@ async def acompletion( top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). n (int, optional): The number of completions to generate (default is 1). stream (bool, optional): If True, return a streaming response (default is False). + stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. @@ -260,6 +263,7 @@ async def acompletion( "top_p": top_p, "n": n, "stream": stream, + "stream_options": stream_options, "stop": stop, "max_tokens": max_tokens, "presence_penalty": presence_penalty, @@ -457,6 +461,7 @@ def completion( top_p: Optional[float] = None, n: Optional[int] = None, stream: Optional[bool] = None, + stream_options: Optional[dict] = None, stop=None, max_tokens: Optional[int] = None, presence_penalty: Optional[float] = None, @@ -496,6 +501,7 @@ def completion( top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). n (int, optional): The number of completions to generate (default is 1). stream (bool, optional): If True, return a streaming response (default is False). + stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). 
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. @@ -573,6 +579,7 @@ def completion( "top_p", "n", "stream", + "stream_options", "stop", "max_tokens", "presence_penalty", @@ -648,6 +655,8 @@ def completion( "base_model", "stream_timeout", "supports_system_message", + "region_name", + "allowed_model_region", ] default_params = openai_params + litellm_params non_default_params = { @@ -783,6 +792,7 @@ def completion( top_p=top_p, n=n, stream=stream, + stream_options=stream_options, stop=stop, max_tokens=max_tokens, presence_penalty=presence_penalty, @@ -2716,6 +2726,8 @@ def embedding( "ttl", "cache", "no-log", + "region_name", + "allowed_model_region", ] default_params = openai_params + litellm_params non_default_params = { @@ -3589,6 +3601,8 @@ def image_generation( "caching_groups", "ttl", "cache", + "region_name", + "allowed_model_region", ] default_params = openai_params + litellm_params non_default_params = { diff --git a/litellm/proxy/_experimental/out/404.html b/litellm/proxy/_experimental/out/404.html index 8a47f9984e..448d7cf877 100644 --- a/litellm/proxy/_experimental/out/404.html +++ b/litellm/proxy/_experimental/out/404.html @@ -1 +1 @@ -
[minified Next.js build output for the regenerated proxy UI static assets under litellm/proxy/_experimental/out/ omitted]
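The `stream_options` plumbing added to `acompletion` and to the OpenAI stream wrapper earlier in this diff works the same way on the async path. A minimal sketch under the same assumptions as the earlier example:

```python
import asyncio
import litellm

async def main():
    stream = await litellm.acompletion(
        model="gpt-3.5-turbo",  # illustrative model name
        messages=[{"role": "user", "content": "Say hi"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")
        if getattr(chunk, "usage", None):
            print("\nusage:", chunk.usage)

asyncio.run(main())
```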