diff --git a/.circleci/config.yml b/.circleci/config.yml index 7a742afe0..b996dc312 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -625,7 +625,7 @@ jobs: paths: - llm_translation_coverage.xml - llm_translation_coverage - logging_testing: + pass_through_unit_testing: docker: - image: cimg/python:3.11 auth: @@ -646,6 +646,94 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "respx==0.21.1" # Run pytest and generate JUnit XML report + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/pass_through_unit_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml pass_through_unit_tests_coverage.xml + mv .coverage pass_through_unit_tests_coverage + + # Store test results + - store_test_results: + path: test-results + - persist_to_workspace: + root: . + paths: + - pass_through_unit_tests_coverage.xml + - pass_through_unit_tests_coverage + image_gen_testing: + docker: + - image: cimg/python:3.11 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + + steps: + - checkout + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-cov==5.0.0" + pip install "pytest-asyncio==0.21.1" + pip install "respx==0.21.1" + # Run pytest and generate JUnit XML report + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/image_gen_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml image_gen_coverage.xml + mv .coverage image_gen_coverage + + # Store test results + - store_test_results: + path: test-results + - persist_to_workspace: + root: . + paths: + - image_gen_coverage.xml + - image_gen_coverage + logging_testing: + docker: + - image: cimg/python:3.11 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + + steps: + - checkout + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-cov==5.0.0" + pip install "pytest-asyncio==0.21.1" + pip install pytest-mock + pip install "respx==0.21.1" + pip install "google-generativeai==0.3.2" + pip install "google-cloud-aiplatform==1.43.0" + pip install "mlflow==2.17.2" + # Run pytest and generate JUnit XML report - run: name: Run tests command: | @@ -719,11 +807,14 @@ jobs: curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - run: python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 
🚨'; exit 1) - run: ruff check ./litellm - - run: python ./tests/documentation_tests/test_general_setting_keys.py + # - run: python ./tests/documentation_tests/test_general_setting_keys.py - run: python ./tests/code_coverage_tests/router_code_coverage.py - run: python ./tests/code_coverage_tests/test_router_strategy_async.py - run: python ./tests/code_coverage_tests/litellm_logging_code_coverage.py - run: python ./tests/documentation_tests/test_env_keys.py + - run: python ./tests/documentation_tests/test_router_settings.py + - run: python ./tests/documentation_tests/test_api_docs.py + - run: python ./tests/code_coverage_tests/ensure_async_clients_test.py - run: helm lint ./deploy/charts/litellm-helm db_migration_disable_update_check: @@ -875,7 +966,7 @@ jobs: command: | pwd ls - python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation + python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests no_output_timeout: 120m # Store test results @@ -962,6 +1053,7 @@ jobs: -e AWS_REGION_NAME=$AWS_REGION_NAME \ -e APORIA_API_KEY_1=$APORIA_API_KEY_1 \ -e COHERE_API_KEY=$COHERE_API_KEY \ + -e GCS_FLUSH_INTERVAL="1" \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \ -v $(pwd)/litellm/proxy/example_config_yaml/custom_guardrail.py:/app/custom_guardrail.py \ @@ -991,6 +1083,48 @@ jobs: ls python -m pytest -vv tests/otel_tests -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m + # Clean up first container + - run: + name: Stop and remove first container + command: | + docker stop my-app + docker rm my-app + + # Second Docker Container Run with Different Config + # NOTE: We intentionally pass a "bad" license here. We need to ensure proxy starts and serves request even with bad license + - run: + name: Run Second Docker container + command: | + docker run -d \ + -p 4000:4000 \ + -e DATABASE_URL=$PROXY_DATABASE_URL \ + -e REDIS_HOST=$REDIS_HOST \ + -e REDIS_PASSWORD=$REDIS_PASSWORD \ + -e REDIS_PORT=$REDIS_PORT \ + -e LITELLM_MASTER_KEY="sk-1234" \ + -e OPENAI_API_KEY=$OPENAI_API_KEY \ + -e LITELLM_LICENSE="bad-license" \ + --name my-app-3 \ + -v $(pwd)/litellm/proxy/example_config_yaml/enterprise_config.yaml:/app/config.yaml \ + my-app:latest \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug + + - run: + name: Start outputting logs for second container + command: docker logs -f my-app-2 + background: true + + - run: + name: Wait for second app to be ready + command: dockerize -wait http://localhost:4000 -timeout 5m + + - run: + name: Run second round of tests + command: | + python -m pytest -vv tests/basic_proxy_startup_tests -x --junitxml=test-results/junit-2.xml --durations=5 + no_output_timeout: 120m # Store test results - store_test_results: @@ -1045,6 +1179,8 @@ jobs: pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" pip install "google-cloud-aiplatform==1.59.0" + pip install anthropic + # Run pytest and generate JUnit XML report - run: name: Build Docker image command: docker build -t my-app:latest -f ./docker/Dockerfile.database . 
@@ -1056,6 +1192,8 @@ jobs: -e DATABASE_URL=$PROXY_DATABASE_URL \ -e LITELLM_MASTER_KEY="sk-1234" \ -e OPENAI_API_KEY=$OPENAI_API_KEY \ + -e GEMINI_API_KEY=$GEMINI_API_KEY \ + -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \ -e LITELLM_LICENSE=$LITELLM_LICENSE \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/pass_through_config.yaml:/app/config.yaml \ @@ -1079,6 +1217,27 @@ jobs: - run: name: Wait for app to be ready command: dockerize -wait http://localhost:4000 -timeout 5m + # New steps to run Node.js test + - run: + name: Install Node.js + command: | + curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - + sudo apt-get install -y nodejs + node --version + npm --version + + - run: + name: Install Node.js dependencies + command: | + npm install @google-cloud/vertexai + npm install @google/generative-ai + npm install --save-dev jest + + - run: + name: Run Vertex AI, Google AI Studio Node.js tests + command: | + npx jest tests/pass_through_tests --verbose + no_output_timeout: 30m - run: name: Run tests command: | @@ -1086,7 +1245,6 @@ jobs: ls python -m pytest -vv tests/pass_through_tests/ -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - # Store test results - store_test_results: path: test-results @@ -1112,7 +1270,7 @@ jobs: python -m venv venv . venv/bin/activate pip install coverage - coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage + coverage combine llm_translation_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage coverage xml - codecov/upload: file: ./coverage.xml @@ -1218,6 +1376,7 @@ jobs: name: Install Dependencies command: | npm install -D @playwright/test + npm install @google-cloud/vertexai pip install "pytest==7.3.1" pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" @@ -1249,7 +1408,7 @@ jobs: command: | docker run -d \ -p 4000:4000 \ - -e DATABASE_URL=$PROXY_DATABASE_URL \ + -e DATABASE_URL=$PROXY_DATABASE_URL_2 \ -e LITELLM_MASTER_KEY="sk-1234" \ -e OPENAI_API_KEY=$OPENAI_API_KEY \ -e UI_USERNAME="admin" \ @@ -1279,7 +1438,7 @@ jobs: - run: name: Run Playwright Tests command: | - npx playwright test --reporter=html --output=test-results + npx playwright test e2e_ui_tests/ --reporter=html --output=test-results no_output_timeout: 120m - store_test_results: path: test-results @@ -1401,6 +1560,18 @@ workflows: only: - main - /litellm_.*/ + - pass_through_unit_testing: + filters: + branches: + only: + - main + - /litellm_.*/ + - image_gen_testing: + filters: + branches: + only: + - main + - /litellm_.*/ - logging_testing: filters: branches: @@ -1410,6 +1581,8 @@ workflows: - upload-coverage: requires: - llm_translation_testing + - pass_through_unit_testing + - image_gen_testing - logging_testing - litellm_router_testing - caching_unit_tests @@ -1449,6 +1622,8 @@ workflows: - load_testing - test_bad_database_url - llm_translation_testing + - pass_through_unit_testing + - image_gen_testing - logging_testing - litellm_router_testing - caching_unit_tests diff --git a/README.md b/README.md index e13732000..5d3efe355 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ for part in response: ## Logging Observability 
([Docs](https://docs.litellm.ai/docs/observability/callbacks)) -LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack +LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack, MLflow ```python from litellm import completion @@ -305,6 +305,36 @@ Step 4: Submit a PR with your changes! 🚀 - push your fork to your GitHub repo - submit a PR from there +### Building LiteLLM Docker Image + +Follow these instructions if you want to build / run the LiteLLM Docker Image yourself. + +Step 1: Clone the repo + +``` +git clone https://github.com/BerriAI/litellm.git +``` + +Step 2: Build the Docker Image + +Build using Dockerfile.non_root +``` +docker build -f docker/Dockerfile.non_root -t litellm_test_image . +``` + +Step 3: Run the Docker Image + +Make sure config.yaml is present in the root directory. This is your litellm proxy config file. +``` +docker run \ + -v $(pwd)/proxy_config.yaml:/app/config.yaml \ + -e DATABASE_URL="postgresql://xxxxxxxx" \ + -e LITELLM_MASTER_KEY="sk-1234" \ + -p 4000:4000 \ + litellm_test_image \ + --config /app/config.yaml --detailed_debug +``` + # Enterprise For companies that need better security, user management and professional support diff --git a/deploy/charts/litellm-helm/templates/migrations-job.yaml b/deploy/charts/litellm-helm/templates/migrations-job.yaml new file mode 100644 index 000000000..010d2d1b5 --- /dev/null +++ b/deploy/charts/litellm-helm/templates/migrations-job.yaml @@ -0,0 +1,30 @@ +# This job runs the prisma migrations for the LiteLLM DB. + +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "litellm.fullname" . }}-migrations + annotations: + argocd.argoproj.io/hook: PreSync + argocd.argoproj.io/hook-delete-policy: Never # keep this resource so we can debug status on ArgoCD + checksum/config: {{ toYaml .Values | sha256sum }} +spec: + template: + spec: + containers: + - name: prisma-migrations + image: ghcr.io/berriai/litellm-database:main-latest + command: ["python", "litellm/proxy/prisma_migration.py"] + workingDir: "/app" + env: + {{- if .Values.db.useExisting }} + - name: DATABASE_URL + value: {{ .Values.db.url | quote }} + {{- else }} + - name: DATABASE_URL + value: postgresql://{{ .Values.postgresql.auth.username }}:{{ .Values.postgresql.auth.password }}@{{ .Release.Name }}-postgresql/{{ .Values.postgresql.auth.database }} + {{- end }} + - name: DISABLE_SCHEMA_UPDATE + value: "false" # always run the migration from the Helm PreSync hook, override the value set + restartPolicy: OnFailure + backoffLimit: {{ .Values.migrationJob.backoffLimit }} diff --git a/deploy/charts/litellm-helm/values.yaml b/deploy/charts/litellm-helm/values.yaml index a2c55f2fa..c8e4aa1f2 100644 --- a/deploy/charts/litellm-helm/values.yaml +++ b/deploy/charts/litellm-helm/values.yaml @@ -179,3 +179,12 @@ postgresql: redis: enabled: false architecture: standalone + +# Prisma migration job settings +migrationJob: + enabled: true # Enable or disable the schema migration Job + retries: 3 # Number of retries for the Job in case of failure + backoffLimit: 4 # Backoff limit for Job restarts + disableSchemaUpdate: false # Skip schema migrations for specific environments. When True, the job will exit with code 0. 
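+
+  # Example override (hypothetical release name and values): the migration job
+  # resolves DATABASE_URL from db.useExisting / db.url (see
+  # templates/migrations-job.yaml), so an install against an existing database
+  # could look like:
+  #   helm upgrade --install litellm ./deploy/charts/litellm-helm \
+  #     --set db.useExisting=true \
+  #     --set db.url="postgresql://user:pass@my-postgres:5432/litellm" \
+  #     --set migrationJob.backoffLimit=4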
+ + diff --git a/docs/my-website/.gitignore b/docs/my-website/.gitignore index b2d6de306..4d8604572 100644 --- a/docs/my-website/.gitignore +++ b/docs/my-website/.gitignore @@ -18,3 +18,4 @@ npm-debug.log* yarn-debug.log* yarn-error.log* +yarn.lock diff --git a/docs/my-website/docs/anthropic_completion.md b/docs/my-website/docs/anthropic_completion.md deleted file mode 100644 index ca65f3f6f..000000000 --- a/docs/my-website/docs/anthropic_completion.md +++ /dev/null @@ -1,54 +0,0 @@ -# [BETA] Anthropic `/v1/messages` - -Call 100+ LLMs in the Anthropic format. - - -1. Setup config.yaml - -```yaml -model_list: - - model_name: my-test-model - litellm_params: - model: gpt-3.5-turbo -``` - -2. Start proxy - -```bash -litellm --config /path/to/config.yaml -``` - -3. Test it! - -```bash -curl -X POST 'http://0.0.0.0:4000/v1/messages' \ --H 'x-api-key: sk-1234' \ --H 'content-type: application/json' \ --D '{ - "model": "my-test-model", - "max_tokens": 1024, - "messages": [ - {"role": "user", "content": "Hello, world"} - ] -}' -``` - -## Test with Anthropic SDK - -```python -import os -from anthropic import Anthropic - -client = Anthropic(api_key="sk-1234", base_url="http://0.0.0.0:4000") # 👈 CONNECT TO PROXY - -message = client.messages.create( - messages=[ - { - "role": "user", - "content": "Hello, Claude", - } - ], - model="my-test-model", # 👈 set 'model_name' -) -print(message.content) -``` \ No newline at end of file diff --git a/docs/my-website/docs/benchmarks.md b/docs/my-website/docs/benchmarks.md new file mode 100644 index 000000000..86699008b --- /dev/null +++ b/docs/my-website/docs/benchmarks.md @@ -0,0 +1,41 @@ +# Benchmarks + +Benchmarks for LiteLLM Gateway (Proxy Server) + +Locust Settings: +- 2500 Users +- 100 user Ramp Up + + +## Basic Benchmarks + +Overhead when using a Deployed Proxy vs Direct to LLM +- Latency overhead added by LiteLLM Proxy: 107ms + +| Metric | Direct to Fake Endpoint | Basic Litellm Proxy | +|--------|------------------------|---------------------| +| RPS | 1196 | 1133.2 | +| Median Latency (ms) | 33 | 140 | + + +## Logging Callbacks + +### [GCS Bucket Logging](https://docs.litellm.ai/docs/proxy/bucket) + +Using GCS Bucket has **no impact on latency, RPS compared to Basic Litellm Proxy** + +| Metric | Basic Litellm Proxy | LiteLLM Proxy with GCS Bucket Logging | +|--------|------------------------|---------------------| +| RPS | 1133.2 | 1137.3 | +| Median Latency (ms) | 140 | 138 | + + +### [LangSmith logging](https://docs.litellm.ai/docs/proxy/logging) + +Using LangSmith has **no impact on latency, RPS compared to Basic Litellm Proxy** + +| Metric | Basic Litellm Proxy | LiteLLM Proxy with LangSmith | +|--------|------------------------|---------------------| +| RPS | 1133.2 | 1135 | +| Median Latency (ms) | 140 | 132 | + diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index c563a5bf0..e55c160e0 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -41,7 +41,7 @@ Use `litellm.get_supported_openai_params()` for an updated list of params for ea | Provider | temperature | max_completion_tokens | max_tokens | top_p | stream | stream_options | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers | |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | 
| |✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | +|Anthropic| ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | | ✅ | ✅ | | | ✅ | |OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ | |Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ | |Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md index a782bfb0a..379775bf2 100644 --- a/docs/my-website/docs/completion/json_mode.md +++ b/docs/my-website/docs/completion/json_mode.md @@ -75,6 +75,9 @@ Works for: - Google AI Studio - Gemini models - Vertex AI models (Gemini + Anthropic) - Bedrock Models +- Anthropic API Models +- Groq Models +- Ollama Models diff --git a/docs/my-website/docs/completion/prefix.md b/docs/my-website/docs/completion/prefix.md index e3619a2a0..d413ad989 100644 --- a/docs/my-website/docs/completion/prefix.md +++ b/docs/my-website/docs/completion/prefix.md @@ -93,7 +93,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ ## Check Model Support -Call `litellm.get_model_info` to check if a model/provider supports `response_format`. +Call `litellm.get_model_info` to check if a model/provider supports `prefix`. @@ -116,4 +116,4 @@ curl -X GET 'http://0.0.0.0:4000/v1/model/info' \ -H 'Authorization: Bearer $LITELLM_KEY' \ ``` - \ No newline at end of file + diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index 5250ea403..603e04dd9 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Embedding Models +# Embeddings ## Quick Start ```python diff --git a/docs/my-website/docs/guides/finetuned_models.md b/docs/my-website/docs/guides/finetuned_models.md new file mode 100644 index 000000000..cb0d49b44 --- /dev/null +++ b/docs/my-website/docs/guides/finetuned_models.md @@ -0,0 +1,74 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + +# Calling Finetuned Models + +## OpenAI + + +| Model Name | Function Call | +|---------------------------|-----------------------------------------------------------------| +| fine tuned `gpt-4-0613` | `response = completion(model="ft:gpt-4-0613", messages=messages)` | +| fine tuned `gpt-4o-2024-05-13` | `response = completion(model="ft:gpt-4o-2024-05-13", messages=messages)` | +| fine tuned `gpt-3.5-turbo-0125` | `response = completion(model="ft:gpt-3.5-turbo-0125", messages=messages)` | +| fine tuned `gpt-3.5-turbo-1106` | `response = completion(model="ft:gpt-3.5-turbo-1106", messages=messages)` | +| fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` | + + +## Vertex AI + +Fine tuned models on vertex have a numerical model/endpoint id. + + + + +```python +from litellm import completion +import os + +## set ENV variables +os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +response = completion( + model="vertex_ai/", # e.g. vertex_ai/4965075652664360960 + messages=[{ "content": "Hello, how are you?","role": "user"}], + base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing +) +``` + + + + +1. Add Vertex Credentials to your env + +```bash +!gcloud auth application-default login +``` + +2. 
Setup config.yaml + +```yaml +- model_name: finetuned-gemini + litellm_params: + model: vertex_ai/ + vertex_project: + vertex_location: + model_info: + base_model: vertex_ai/gemini-1.5-pro # IMPORTANT +``` + +3. Test it! + +```bash +curl --location 'https://0.0.0.0:4000/v1/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: ' \ +--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}' +``` + + + + + diff --git a/docs/my-website/docs/image_generation.md b/docs/my-website/docs/image_generation.md index 5a7ef6f4f..958ff4c02 100644 --- a/docs/my-website/docs/image_generation.md +++ b/docs/my-website/docs/image_generation.md @@ -1,4 +1,4 @@ -# Image Generation +# Images ## Quick Start diff --git a/docs/my-website/docs/moderation.md b/docs/my-website/docs/moderation.md new file mode 100644 index 000000000..6dd092fb5 --- /dev/null +++ b/docs/my-website/docs/moderation.md @@ -0,0 +1,135 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Moderation + + +### Usage + + + +```python +from litellm import moderation + +response = moderation( + input="hello from litellm", + model="text-moderation-stable" +) +``` + + + + +For `/moderations` endpoint, there is **no need to specify `model` in the request or on the litellm config.yaml** + +Start litellm proxy server + +``` +litellm +``` + + + + + +```python +from openai import OpenAI + +# set base_url to your proxy server +# set api_key to send to proxy server +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") + +response = client.moderations.create( + input="hello from litellm", + model="text-moderation-stable" # optional, defaults to `omni-moderation-latest` +) + +print(response) +``` + + + + +```shell +curl --location 'http://0.0.0.0:4000/moderations' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --data '{"input": "Sample text goes here", "model": "text-moderation-stable"}' +``` + + + + + + +## Input Params +LiteLLM accepts and translates the [OpenAI Moderation params](https://platform.openai.com/docs/api-reference/moderations) across all supported providers. + +### Required Fields + +- `input`: *string or array* - Input (or inputs) to classify. Can be a single string, an array of strings, or an array of multi-modal input objects similar to other models. + - If string: A string of text to classify for moderation + - If array of strings: An array of strings to classify for moderation + - If array of objects: An array of multi-modal inputs to the moderation model, where each object can be: + - An object describing an image to classify with: + - `type`: *string, required* - Always `image_url` + - `image_url`: *object, required* - Contains either an image URL or a data URL for a base64 encoded image + - An object describing text to classify with: + - `type`: *string, required* - Always `text` + - `text`: *string, required* - A string of text to classify + +### Optional Fields + +- `model`: *string (optional)* - The moderation model to use. Defaults to `omni-moderation-latest`. 
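+
+For example, a multi-modal request through the proxy might look like the sketch
+below (OpenAI SDK pointed at the proxy; the image URL is a placeholder, and this
+assumes the upstream moderation model supports image inputs, e.g. `omni-moderation-latest`):
+
+```python
+from openai import OpenAI
+
+# point the OpenAI SDK at the LiteLLM proxy
+client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+
+response = client.moderations.create(
+    model="omni-moderation-latest",
+    input=[
+        {"type": "text", "text": "Sample text goes here"},
+        # hypothetical image URL, for illustration only
+        {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
+    ],
+)
+
+print(response.results[0].flagged)
+```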
+ +## Output Format +Here's the exact json output and type you can expect from all moderation calls: + +[**LiteLLM follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/moderations/object) + + +```python +{ + "id": "modr-AB8CjOTu2jiq12hp1AQPfeqFWaORR", + "model": "text-moderation-007", + "results": [ + { + "flagged": true, + "categories": { + "sexual": false, + "hate": false, + "harassment": true, + "self-harm": false, + "sexual/minors": false, + "hate/threatening": false, + "violence/graphic": false, + "self-harm/intent": false, + "self-harm/instructions": false, + "harassment/threatening": true, + "violence": true + }, + "category_scores": { + "sexual": 0.000011726012417057063, + "hate": 0.22706663608551025, + "harassment": 0.5215635299682617, + "self-harm": 2.227119921371923e-6, + "sexual/minors": 7.107352217872176e-8, + "hate/threatening": 0.023547329008579254, + "violence/graphic": 0.00003391829886822961, + "self-harm/intent": 1.646940972932498e-6, + "self-harm/instructions": 1.1198755256458526e-9, + "harassment/threatening": 0.5694745779037476, + "violence": 0.9971134662628174 + } + } + ] +} + +``` + + +## **Supported Providers** + +| Provider | +|-------------| +| OpenAI | diff --git a/docs/my-website/docs/observability/argilla.md b/docs/my-website/docs/observability/argilla.md index 8d20b9daa..dad28ce90 100644 --- a/docs/my-website/docs/observability/argilla.md +++ b/docs/my-website/docs/observability/argilla.md @@ -4,24 +4,63 @@ import TabItem from '@theme/TabItem'; # Argilla -Argilla is a tool for annotating datasets. +Argilla is a collaborative annotation tool for AI engineers and domain experts who need to build high-quality datasets for their projects. +## Getting Started -## Usage +To log the data to Argilla, first you need to deploy the Argilla server. If you have not deployed the Argilla server, please follow the instructions [here](https://docs.argilla.io/latest/getting_started/quickstart/). + +Next, you will need to configure and create the Argilla dataset. + +```python +import argilla as rg + +client = rg.Argilla(api_url="", api_key="") + +settings = rg.Settings( + guidelines="These are some guidelines.", + fields=[ + rg.ChatField( + name="user_input", + ), + rg.TextField( + name="llm_output", + ), + ], + questions=[ + rg.RatingQuestion( + name="rating", + values=[1, 2, 3, 4, 5, 6, 7], + ), + ], +) + +dataset = rg.Dataset( + name="my_first_dataset", + settings=settings, +) + +dataset.create() +``` + +For further configuration, please refer to the [Argilla documentation](https://docs.argilla.io/latest/how_to_guides/dataset/). + + +## Usage ```python -from litellm import completion +import os import litellm -import os +from litellm import completion # add env vars os.environ["ARGILLA_API_KEY"]="argilla.apikey" os.environ["ARGILLA_BASE_URL"]="http://localhost:6900" -os.environ["ARGILLA_DATASET_NAME"]="my_second_dataset" +os.environ["ARGILLA_DATASET_NAME"]="my_first_dataset" os.environ["OPENAI_API_KEY"]="sk-proj-..." litellm.callbacks = ["argilla"] diff --git a/docs/my-website/docs/observability/mlflow.md b/docs/my-website/docs/observability/mlflow.md new file mode 100644 index 000000000..3b1e1d477 --- /dev/null +++ b/docs/my-website/docs/observability/mlflow.md @@ -0,0 +1,108 @@ +# MLflow + +## What is MLflow? 
+ +**MLflow** is an end-to-end open source MLOps platform for [experiment tracking](https://www.mlflow.org/docs/latest/tracking.html), [model management](https://www.mlflow.org/docs/latest/models.html), [evaluation](https://www.mlflow.org/docs/latest/llms/llm-evaluate/index.html), [observability (tracing)](https://www.mlflow.org/docs/latest/llms/tracing/index.html), and [deployment](https://www.mlflow.org/docs/latest/deployment/index.html). MLflow empowers teams to collaboratively develop and refine LLM applications efficiently. + +MLflow’s integration with LiteLLM supports advanced observability compatible with OpenTelemetry. + + + + + +## Getting Started + +Install MLflow: + +```shell +pip install mlflow +``` + +To enable LiteLLM tracing: + +```python +import mlflow + +mlflow.litellm.autolog() + +# Alternative, you can set the callback manually in LiteLLM +# litellm.callbacks = ["mlflow"] +``` + +Since MLflow is open-source, no sign-up or API key is needed to log traces! + +``` +import litellm +import os + +# Set your LLM provider's API key +os.environ["OPENAI_API_KEY"] = "" + +# Call LiteLLM as usual +response = litellm.completion( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": "Hi 👋 - i'm openai"} + ] +) +``` + +Open the MLflow UI and go to the `Traces` tab to view logged traces: + +```bash +mlflow ui +``` + +## Exporting Traces to OpenTelemetry collectors + +MLflow traces are compatible with OpenTelemetry. You can export traces to any OpenTelemetry collector (e.g., Jaeger, Zipkin, Datadog, New Relic) by setting the endpoint URL in the environment variables. + +``` +# Set the endpoint of the OpenTelemetry Collector +os.environ["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = "http://localhost:4317/v1/traces" +# Optionally, set the service name to group traces +os.environ["OTEL_SERVICE_NAME"] = "" +``` + +See [MLflow documentation](https://mlflow.org/docs/latest/llms/tracing/index.html#using-opentelemetry-collector-for-exporting-traces) for more details. + +## Combine LiteLLM Trace with Your Application Trace + +LiteLLM is often part of larger LLM applications, such as agentic models. MLflow Tracing allows you to instrument custom Python code, which can then be combined with LiteLLM traces. + +```python +import litellm +import mlflow +from mlflow.entities import SpanType + +# Enable LiteLLM tracing +mlflow.litellm.autolog() + + +class CustomAgent: + # Use @mlflow.trace to instrument Python functions. + @mlflow.trace(span_type=SpanType.AGENT) + def run(self, query: str): + # do something + + while i < self.max_turns: + response = litellm.completion( + model="gpt-4o-mini", + messages=messages, + ) + + action = self.get_action(response) + ... + + @mlflow.trace + def get_action(llm_response): + ... +``` + +This approach generates a unified trace, combining your custom Python code with LiteLLM calls. + + +## Support + +* For advanced usage and integrations of tracing, visit the [MLflow Tracing documentation](https://mlflow.org/docs/latest/llms/tracing/index.html). +* For any question or issue with this integration, please [submit an issue](https://github.com/mlflow/mlflow/issues/new/choose) on our [Github](https://github.com/mlflow/mlflow) repository! 
\ No newline at end of file diff --git a/docs/my-website/docs/observability/opentelemetry_integration.md b/docs/my-website/docs/observability/opentelemetry_integration.md index ba5ef2ff8..5df82c93c 100644 --- a/docs/my-website/docs/observability/opentelemetry_integration.md +++ b/docs/my-website/docs/observability/opentelemetry_integration.md @@ -49,9 +49,19 @@ OTEL_ENDPOINT="http://0.0.0.0:4317" + + +```shell +OTEL_EXPORTER="otlp_grpc" +OTEL_ENDPOINT="https://api.lmnr.ai:8443" +OTEL_HEADERS="authorization=Bearer " +``` + + + -Use just 2 lines of code, to instantly log your LLM responses **across all providers** with OpenTelemetry: +Use just 1 line of code, to instantly log your LLM responses **across all providers** with OpenTelemetry: ```python litellm.callbacks = ["otel"] @@ -76,3 +86,20 @@ Be aware that if you are continuing an existing trace, and you set `update_trace ## Support For any question or issue with the integration you can reach out to the OpenLLMetry maintainers on [Slack](https://traceloop.com/slack) or via [email](mailto:dev@traceloop.com). + +## Troubleshooting + +### Trace LiteLLM Proxy user/key/org/team information on failed requests + +LiteLLM emits the user_api_key_metadata +- key hash +- key_alias +- org_id +- user_id +- team_id + +for successful + failed requests + +click under `litellm_request` in the trace + + \ No newline at end of file diff --git a/docs/my-website/docs/pass_through/anthropic_completion.md b/docs/my-website/docs/pass_through/anthropic_completion.md new file mode 100644 index 000000000..2e052f7cd --- /dev/null +++ b/docs/my-website/docs/pass_through/anthropic_completion.md @@ -0,0 +1,371 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Anthropic SDK + +Pass-through endpoints for Anthropic - call provider-specific endpoint, in native format (no translation). + +Just replace `https://api.anthropic.com` with `LITELLM_PROXY_BASE_URL/anthropic` + +#### **Example Usage** + + + + + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --header "Authorization: bearer sk-anything" \ + --data '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + + + + +```python +from anthropic import Anthropic + +# Initialize client with proxy base URL +client = Anthropic( + base_url="http://0.0.0.0:4000/anthropic", # /anthropic + api_key="sk-anything" # proxy virtual key +) + +# Make a completion request +response = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1024, + messages=[ + {"role": "user", "content": "Hello, world"} + ] +) + +print(response) +``` + + + + +Supports **ALL** Anthropic Endpoints (including streaming). + +[**See All Anthropic Endpoints**](https://docs.anthropic.com/en/api/messages) + +## Quick Start + +Let's call the Anthropic [`/messages` endpoint](https://docs.anthropic.com/en/api/messages) + +1. Add Anthropic API Key to your environment + +```bash +export ANTHROPIC_API_KEY="" +``` + +2. Start LiteLLM Proxy + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! 
+ +Let's call the Anthropic /messages endpoint + +```bash +curl http://0.0.0.0:4000/anthropic/v1/messages \ + --header "x-api-key: $LITELLM_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "content-type: application/json" \ + --data \ + '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + + +## Examples + +Anything after `http://0.0.0.0:4000/anthropic` is treated as a provider-specific route, and handled accordingly. + +Key Changes: + +| **Original Endpoint** | **Replace With** | +|------------------------------------------------------|-----------------------------------| +| `https://api.anthropic.com` | `http://0.0.0.0:4000/anthropic` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") | +| `bearer $ANTHROPIC_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) | + + +### **Example 1: Messages endpoint** + +#### LiteLLM Proxy Call + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages \ + --header "x-api-key: $LITELLM_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "content-type: application/json" \ + --data '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + +#### Direct Anthropic API Call + +```bash +curl https://api.anthropic.com/v1/messages \ + --header "x-api-key: $ANTHROPIC_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "content-type: application/json" \ + --data \ + '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + +### **Example 2: Token Counting API** + +#### LiteLLM Proxy Call + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages/count_tokens \ + --header "x-api-key: $LITELLM_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: token-counting-2024-11-01" \ + --header "content-type: application/json" \ + --data \ + '{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + +#### Direct Anthropic API Call + +```bash +curl https://api.anthropic.com/v1/messages/count_tokens \ + --header "x-api-key: $ANTHROPIC_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: token-counting-2024-11-01" \ + --header "content-type: application/json" \ + --data \ +'{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "Hello, world"} + ] +}' +``` + +### **Example 3: Batch Messages** + + +#### LiteLLM Proxy Call + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages/batches \ + --header "x-api-key: $LITELLM_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: message-batches-2024-09-24" \ + --header "content-type: application/json" \ + --data \ +'{ + "requests": [ + { + "custom_id": "my-first-request", + "params": { + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + } + }, + { + "custom_id": "my-second-request", + "params": { + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hi again, friend"} + ] + } + } + ] +}' +``` + +#### Direct Anthropic API Call + +```bash +curl https://api.anthropic.com/v1/messages/batches \ + 
--header "x-api-key: $ANTHROPIC_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: message-batches-2024-09-24" \ + --header "content-type: application/json" \ + --data \ +'{ + "requests": [ + { + "custom_id": "my-first-request", + "params": { + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + } + }, + { + "custom_id": "my-second-request", + "params": { + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hi again, friend"} + ] + } + } + ] +}' +``` + + +## Advanced + +Pre-requisites +- [Setup proxy with DB](../proxy/virtual_keys.md#setup) + +Use this, to avoid giving developers the raw Anthropic API key, but still letting them use Anthropic endpoints. + +### Use with Virtual Keys + +1. Setup environment + +```bash +export DATABASE_URL="" +export LITELLM_MASTER_KEY="" +export COHERE_API_KEY="" +``` + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +2. Generate virtual key + +```bash +curl -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{}' +``` + +Expected Response + +```bash +{ + ... + "key": "sk-1234ewknldferwedojwojw" +} +``` + +3. Test it! + + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --header "Authorization: bearer sk-1234ewknldferwedojwojw" \ + --data '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` + + +### Send `litellm_metadata` (tags) + + + + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --header "Authorization: bearer sk-anything" \ + --data '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ], + "litellm_metadata": { + "tags": ["test-tag-1", "test-tag-2"] + } + }' +``` + + + + +```python +from anthropic import Anthropic + +client = Anthropic( + base_url="http://0.0.0.0:4000/anthropic", + api_key="sk-anything" +) + +response = client.messages.create( + model="claude-3-5-sonnet-20241022", + max_tokens=1024, + messages=[ + {"role": "user", "content": "Hello, world"} + ], + extra_body={ + "litellm_metadata": { + "tags": ["test-tag-1", "test-tag-2"] + } + } +) + +print(response) +``` + + + \ No newline at end of file diff --git a/docs/my-website/docs/pass_through/cohere.md b/docs/my-website/docs/pass_through/cohere.md index 715afc1ed..64edf18b2 100644 --- a/docs/my-website/docs/pass_through/cohere.md +++ b/docs/my-website/docs/pass_through/cohere.md @@ -1,4 +1,4 @@ -# Cohere API +# Cohere SDK Pass-through endpoints for Cohere - call provider-specific endpoint, in native format (no translation). 
diff --git a/docs/my-website/docs/pass_through/google_ai_studio.md b/docs/my-website/docs/pass_through/google_ai_studio.md index 34fba97a4..ee5eecc19 100644 --- a/docs/my-website/docs/pass_through/google_ai_studio.md +++ b/docs/my-website/docs/pass_through/google_ai_studio.md @@ -1,12 +1,21 @@ -# Google AI Studio +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + +# Google AI Studio SDK Pass-through endpoints for Google AI Studio - call provider-specific endpoint, in native format (no translation). -Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` 🚀 +Just replace `https://generativelanguage.googleapis.com` with `LITELLM_PROXY_BASE_URL/gemini` #### **Example Usage** + + + + ```bash -http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \ +curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-anything' \ -H 'Content-Type: application/json' \ -d '{ "contents": [{ @@ -17,6 +26,53 @@ http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-any }' ``` + + + +```javascript +const { GoogleGenerativeAI } = require("@google/generative-ai"); + +const modelParams = { + model: 'gemini-pro', +}; + +const requestOptions = { + baseUrl: 'http://localhost:4000/gemini', // http:///gemini +}; + +const genAI = new GoogleGenerativeAI("sk-1234"); // litellm proxy API key +const model = genAI.getGenerativeModel(modelParams, requestOptions); + +async function main() { + try { + const result = await model.generateContent("Explain how AI works"); + console.log(result.response.text()); + } catch (error) { + console.error('Error:', error); + } +} + +// For streaming responses +async function main_streaming() { + try { + const streamingResult = await model.generateContentStream("Explain how AI works"); + for await (const chunk of streamingResult.stream) { + console.log('Stream chunk:', JSON.stringify(chunk)); + } + const aggregatedResponse = await streamingResult.response; + console.log('Aggregated response:', JSON.stringify(aggregatedResponse)); + } catch (error) { + console.error('Error:', error); + } +} + +main(); +// main_streaming(); +``` + + + + Supports **ALL** Google AI Studio Endpoints (including streaming). [**See All Google AI Studio Endpoints**](https://ai.google.dev/api) @@ -166,14 +222,14 @@ curl -X POST "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5 ``` -## Advanced - Use with Virtual Keys +## Advanced Pre-requisites - [Setup proxy with DB](../proxy/virtual_keys.md#setup) Use this, to avoid giving developers the raw Google AI Studio key, but still letting them use Google AI Studio endpoints. -### Usage +### Use with Virtual Keys 1. Setup environment @@ -220,4 +276,66 @@ http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:countTokens?key=sk-123 }] }] }' -``` \ No newline at end of file +``` + + +### Send `tags` in request headers + +Use this if you want `tags` to be tracked in the LiteLLM DB and on logging callbacks. + +Pass tags in request headers as a comma separated list. In the example below the following tags will be tracked + +``` +tags: ["gemini-js-sdk", "pass-through-endpoint"] +``` + + + + +```bash +curl 'http://0.0.0.0:4000/gemini/v1beta/models/gemini-1.5-flash:generateContent?key=sk-anything' \ +-H 'Content-Type: application/json' \ +-H 'tags: gemini-js-sdk,pass-through-endpoint' \ +-d '{ + "contents": [{ + "parts":[{ + "text": "The quick brown fox jumps over the lazy dog." 
+ }] + }] +}' +``` + + + + +```javascript +const { GoogleGenerativeAI } = require("@google/generative-ai"); + +const modelParams = { + model: 'gemini-pro', +}; + +const requestOptions = { + baseUrl: 'http://localhost:4000/gemini', // http:///gemini + customHeaders: { + "tags": "gemini-js-sdk,pass-through-endpoint" + } +}; + +const genAI = new GoogleGenerativeAI("sk-1234"); +const model = genAI.getGenerativeModel(modelParams, requestOptions); + +async function main() { + try { + const result = await model.generateContent("Explain how AI works"); + console.log(result.response.text()); + } catch (error) { + console.error('Error:', error); + } +} + +main(); +``` + + + diff --git a/docs/my-website/docs/pass_through/langfuse.md b/docs/my-website/docs/pass_through/langfuse.md index 68d9903e6..7b95751b6 100644 --- a/docs/my-website/docs/pass_through/langfuse.md +++ b/docs/my-website/docs/pass_through/langfuse.md @@ -1,4 +1,4 @@ -# Langfuse Endpoints +# Langfuse SDK Pass-through endpoints for Langfuse - call langfuse endpoints with LiteLLM Virtual Key. diff --git a/docs/my-website/docs/pass_through/vertex_ai.md b/docs/my-website/docs/pass_through/vertex_ai.md index 1bf555823..601f89f4b 100644 --- a/docs/my-website/docs/pass_through/vertex_ai.md +++ b/docs/my-website/docs/pass_through/vertex_ai.md @@ -2,15 +2,110 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# [BETA] Vertex AI Endpoints +# Vertex AI SDK -Use VertexAI SDK to call endpoints on LiteLLM Gateway (native provider format) +Pass-through endpoints for Vertex AI - call provider-specific endpoint, in native format (no translation). -:::tip +Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai` + + +#### **Example Usage** + + + + +```bash +curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \ + -H "Content-Type: application/json" \ + -H "x-litellm-api-key: Bearer sk-1234" \ + -d '{ + "contents":[{ + "role": "user", + "parts":[{"text": "How are you doing today?"}] + }] + }' +``` + + + + +```javascript +const { VertexAI } = require('@google-cloud/vertexai'); + +const vertexAI = new VertexAI({ + project: 'your-project-id', // enter your vertex project id + location: 'us-central1', // enter your vertex region + apiEndpoint: "localhost:4000/vertex_ai" // /vertex_ai # note, do not include 'https://' in the url +}); + +const model = vertexAI.getGenerativeModel({ + model: 'gemini-1.0-pro' +}, { + customHeaders: { + "x-litellm-api-key": "sk-1234" // Your litellm Virtual Key + } +}); + +async function generateContent() { + try { + const prompt = { + contents: [{ + role: 'user', + parts: [{ text: 'How are you doing today?' }] + }] + }; + + const response = await model.generateContent(prompt); + console.log('Response:', response); + } catch (error) { + console.error('Error:', error); + } +} + +generateContent(); +``` + + + + + +## Quick Start + +Let's call the Vertex AI [`/generateContent` endpoint](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference) + +1. Add Vertex AI Credentials to your environment + +```bash +export DEFAULT_VERTEXAI_PROJECT="" # "adroit-crow-413218" +export DEFAULT_VERTEXAI_LOCATION="" # "us-central1" +export DEFAULT_GOOGLE_APPLICATION_CREDENTIALS="" # "/Users/Downloads/adroit-crow-413218-a956eef1a2a8.json" +``` + +2. Start LiteLLM Proxy + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! 
+ +Let's call the Google AI Studio token counting endpoint + +```bash +curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "contents":[{ + "role": "user", + "parts":[{"text": "How are you doing today?"}] + }] + }' +``` -Looking for the Unified API (OpenAI format) for VertexAI ? [Go here - using vertexAI with LiteLLM SDK or LiteLLM Proxy Server](../providers/vertex.md) -::: ## Supported API Endpoints @@ -22,7 +117,7 @@ Looking for the Unified API (OpenAI format) for VertexAI ? [Go here - using vert - Tuning API - CountTokens API -## Authentication to Vertex AI +#### Authentication to Vertex AI LiteLLM Proxy Server supports two methods of authentication to Vertex AI: @@ -30,715 +125,60 @@ LiteLLM Proxy Server supports two methods of authentication to Vertex AI: 2. Set Vertex AI credentials on proxy server -## Quick Start Usage - - - - - -#### 1. Start litellm proxy - -```shell -litellm --config /path/to/config.yaml -``` - -#### 2. Test it - -```python -import vertexai -from vertexai.preview.generative_models import GenerativeModel - -LITE_LLM_ENDPOINT = "http://localhost:4000" - -vertexai.init( - project="", # enter your project id - location="", # enter your region - api_endpoint=f"{LITE_LLM_ENDPOINT}/vertex-ai", # route on litellm - api_transport="rest", -) - -model = GenerativeModel(model_name="gemini-1.0-pro") -model.generate_content("hi") - -``` - - - - - - -#### 1. Set `default_vertex_config` on your `config.yaml` - - -Add the following credentials to your litellm config.yaml to use the Vertex AI endpoints. - -```yaml -default_vertex_config: - vertex_project: "adroit-crow-413218" - vertex_location: "us-central1" - vertex_credentials: "/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json -``` - -#### 2. Start litellm proxy - -```shell -litellm --config /path/to/config.yaml -``` - -#### 3. Test it - -```python -import vertexai -from google.auth.credentials import Credentials -from vertexai.generative_models import GenerativeModel - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - - -class CredentialsWrapper(Credentials): - def __init__(self, token=None): - super().__init__() - self.token = token - self.expiry = None # or set to a future date if needed - - def refresh(self, request): - pass - - def apply(self, headers, token=None): - headers["Authorization"] = f"Bearer {self.token}" - - @property - def expired(self): - return False # Always consider the token as non-expired - - @property - def valid(self): - return True # Always consider the credentials as valid - - -credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - credentials=credentials, - api_transport="rest", -) - -model = GenerativeModel("gemini-1.5-flash-001") - -response = model.generate_content( - "What's a good name for a flower shop that specializes in selling bouquets of dried flowers?" 
-) - -print(response.text) -``` - - - - ## Usage Examples ### Gemini API (Generate Content) - - -```python -import vertexai -from vertexai.generative_models import GenerativeModel - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - api_transport="rest", - -) - -model = GenerativeModel("gemini-1.5-flash-001") - -response = model.generate_content( - "What's a good name for a flower shop that specializes in selling bouquets of dried flowers?" -) - -print(response.text) -``` - - - - -```python -import vertexai -from google.auth.credentials import Credentials -from vertexai.generative_models import GenerativeModel - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - - -class CredentialsWrapper(Credentials): - def __init__(self, token=None): - super().__init__() - self.token = token - self.expiry = None # or set to a future date if needed - - def refresh(self, request): - pass - - def apply(self, headers, token=None): - headers["Authorization"] = f"Bearer {self.token}" - - @property - def expired(self): - return False # Always consider the token as non-expired - - @property - def valid(self): - return True # Always consider the credentials as valid - - -credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - credentials=credentials, - api_transport="rest", - -) - -model = GenerativeModel("gemini-1.5-flash-001") - -response = model.generate_content( - "What's a good name for a flower shop that specializes in selling bouquets of dried flowers?" -) - -print(response.text) -``` - - - ```shell -curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:generateContent \ +curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:generateContent \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}' ``` - - ### Embeddings API - - - - -```python -from typing import List, Optional -from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel -import vertexai -from vertexai.generative_models import GenerativeModel - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - api_transport="rest", -) - - -def embed_text( - texts: List[str] = ["banana muffins? ", "banana bread? 
banana muffins?"], - task: str = "RETRIEVAL_DOCUMENT", - model_name: str = "text-embedding-004", - dimensionality: Optional[int] = 256, -) -> List[List[float]]: - """Embeds texts with a pre-trained, foundational model.""" - model = TextEmbeddingModel.from_pretrained(model_name) - inputs = [TextEmbeddingInput(text, task) for text in texts] - kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {} - embeddings = model.get_embeddings(inputs, **kwargs) - return [embedding.values for embedding in embeddings] -``` - - - - - -```python -from typing import List, Optional -from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel -import vertexai -from google.auth.credentials import Credentials -from vertexai.generative_models import GenerativeModel - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - - -class CredentialsWrapper(Credentials): - def __init__(self, token=None): - super().__init__() - self.token = token - self.expiry = None # or set to a future date if needed - - def refresh(self, request): - pass - - def apply(self, headers, token=None): - headers["Authorization"] = f"Bearer {self.token}" - - @property - def expired(self): - return False # Always consider the token as non-expired - - @property - def valid(self): - return True # Always consider the credentials as valid - - -credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - credentials=credentials, - api_transport="rest", -) - - -def embed_text( - texts: List[str] = ["banana muffins? ", "banana bread? banana muffins?"], - task: str = "RETRIEVAL_DOCUMENT", - model_name: str = "text-embedding-004", - dimensionality: Optional[int] = 256, -) -> List[List[float]]: - """Embeds texts with a pre-trained, foundational model.""" - model = TextEmbeddingModel.from_pretrained(model_name) - inputs = [TextEmbeddingInput(text, task) for text in texts] - kwargs = dict(output_dimensionality=dimensionality) if dimensionality else {} - embeddings = model.get_embeddings(inputs, **kwargs) - return [embedding.values for embedding in embeddings] -``` - - - ```shell -curl http://localhost:4000/vertex-ai/publishers/google/models/textembedding-gecko@001:predict \ +curl http://localhost:4000/vertex_ai/publishers/google/models/textembedding-gecko@001:predict \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"instances":[{"content": "gm"}]}' ``` - - - ### Imagen API - - - - - -```python -from typing import List, Optional -from vertexai.preview.vision_models import ImageGenerationModel -import vertexai -from google.auth.credentials import Credentials - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - api_transport="rest", -) - -model = ImageGenerationModel.from_pretrained("imagen-3.0-generate-001") - -images = model.generate_images( - prompt=prompt, - # Optional parameters - number_of_images=1, - language="en", - # You can't use a seed value and watermark at the same time. - # add_watermark=False, - # seed=100, - aspect_ratio="1:1", - safety_filter_level="block_some", - person_generation="allow_adult", -) - -images[0].save(location=output_file, include_generation_parameters=False) - -# Optional. 
View the generated image in a notebook. -# images[0].show() - -print(f"Created output image using {len(images[0]._image_bytes)} bytes") - -``` - - - - -```python -from typing import List, Optional -from vertexai.preview.vision_models import ImageGenerationModel -import vertexai -from google.auth.credentials import Credentials - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - - -class CredentialsWrapper(Credentials): - def __init__(self, token=None): - super().__init__() - self.token = token - self.expiry = None # or set to a future date if needed - - def refresh(self, request): - pass - - def apply(self, headers, token=None): - headers["Authorization"] = f"Bearer {self.token}" - - @property - def expired(self): - return False # Always consider the token as non-expired - - @property - def valid(self): - return True # Always consider the credentials as valid - - -credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - credentials=credentials, - api_transport="rest", -) - -model = ImageGenerationModel.from_pretrained("imagen-3.0-generate-001") - -images = model.generate_images( - prompt=prompt, - # Optional parameters - number_of_images=1, - language="en", - # You can't use a seed value and watermark at the same time. - # add_watermark=False, - # seed=100, - aspect_ratio="1:1", - safety_filter_level="block_some", - person_generation="allow_adult", -) - -images[0].save(location=output_file, include_generation_parameters=False) - -# Optional. View the generated image in a notebook. -# images[0].show() - -print(f"Created output image using {len(images[0]._image_bytes)} bytes") - -``` - - - - - ```shell -curl http://localhost:4000/vertex-ai/publishers/google/models/imagen-3.0-generate-001:predict \ +curl http://localhost:4000/vertex_ai/publishers/google/models/imagen-3.0-generate-001:predict \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}' ``` - - - ### Count Tokens API - - - - - - -```python -from typing import List, Optional -from vertexai.generative_models import GenerativeModel -import vertexai - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - api_transport="rest", -) - - -model = GenerativeModel("gemini-1.5-flash-001") - -prompt = "Why is the sky blue?" 
- -# Prompt tokens count -response = model.count_tokens(prompt) -print(f"Prompt Token Count: {response.total_tokens}") -print(f"Prompt Character Count: {response.total_billable_characters}") - -# Send text to Gemini -response = model.generate_content(prompt) - -# Response tokens count -usage_metadata = response.usage_metadata -print(f"Prompt Token Count: {usage_metadata.prompt_token_count}") -print(f"Candidates Token Count: {usage_metadata.candidates_token_count}") -print(f"Total Token Count: {usage_metadata.total_token_count}") -``` - - - - - - -```python -from typing import List, Optional -from vertexai.generative_models import GenerativeModel -import vertexai -from google.auth.credentials import Credentials - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - -import datetime - - -class CredentialsWrapper(Credentials): - def __init__(self, token=None): - super().__init__() - self.token = token - self.expiry = None # or set to a future date if needed - - def refresh(self, request): - pass - - def apply(self, headers, token=None): - headers["Authorization"] = f"Bearer {self.token}" - - @property - def expired(self): - return False # Always consider the token as non-expired - - @property - def valid(self): - return True # Always consider the credentials as valid - - -credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - credentials=credentials, - api_transport="rest", -) - - -model = GenerativeModel("gemini-1.5-flash-001") - -prompt = "Why is the sky blue?" - -# Prompt tokens count -response = model.count_tokens(prompt) -print(f"Prompt Token Count: {response.total_tokens}") -print(f"Prompt Character Count: {response.total_billable_characters}") - -# Send text to Gemini -response = model.generate_content(prompt) - -# Response tokens count -usage_metadata = response.usage_metadata -print(f"Prompt Token Count: {usage_metadata.prompt_token_count}") -print(f"Candidates Token Count: {usage_metadata.candidates_token_count}") -print(f"Total Token Count: {usage_metadata.total_token_count}") -``` - - - - - - - ```shell -curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.5-flash-001:countTokens \ +curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:countTokens \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}' ``` - - - - ### Tuning API Create Fine Tuning Job - - - - -```python -from typing import List, Optional -from vertexai.preview.tuning import sft -import vertexai - -LITELLM_PROXY_API_KEY = "sk-1234" -LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" - - -vertexai.init( - project="adroit-crow-413218", - location="us-central1", - api_endpoint=LITELLM_PROXY_BASE, - api_transport="rest", -) - - -# TODO(developer): Update project -vertexai.init(project=PROJECT_ID, location="us-central1") - -sft_tuning_job = sft.train( - source_model="gemini-1.0-pro-002", - train_dataset="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl", -) - -# Polling for job completion -while not sft_tuning_job.has_ended: - time.sleep(60) - sft_tuning_job.refresh() - -print(sft_tuning_job.tuned_model_name) -print(sft_tuning_job.tuned_model_endpoint_name) -print(sft_tuning_job.experiment) - -``` - - - - - -```python -from typing import List, Optional -from 
vertexai.preview.tuning import sft
-import vertexai
-from google.auth.credentials import Credentials
-
-LITELLM_PROXY_API_KEY = "sk-1234"
-LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai"
-
-import datetime
-
-
-class CredentialsWrapper(Credentials):
-    def __init__(self, token=None):
-        super().__init__()
-        self.token = token
-        self.expiry = None  # or set to a future date if needed
-
-    def refresh(self, request):
-        pass
-
-    def apply(self, headers, token=None):
-        headers["Authorization"] = f"Bearer {self.token}"
-
-    @property
-    def expired(self):
-        return False  # Always consider the token as non-expired
-
-    @property
-    def valid(self):
-        return True  # Always consider the credentials as valid
-
-
-credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY)
-
-vertexai.init(
-    project="adroit-crow-413218",
-    location="us-central1",
-    api_endpoint=LITELLM_PROXY_BASE,
-    credentials=credentials,
-    api_transport="rest",
-)
-
-
-# TODO(developer): Update project
-vertexai.init(project=PROJECT_ID, location="us-central1")
-
-sft_tuning_job = sft.train(
-    source_model="gemini-1.0-pro-002",
-    train_dataset="gs://cloud-samples-data/ai-platform/generative_ai/sft_train_data.jsonl",
-)
-
-# Polling for job completion
-while not sft_tuning_job.has_ended:
-    time.sleep(60)
-    sft_tuning_job.refresh()
-
-print(sft_tuning_job.tuned_model_name)
-print(sft_tuning_job.tuned_model_endpoint_name)
-print(sft_tuning_job.experiment)
-```
-
-
-
 ```shell
-curl http://localhost:4000/vertex-ai/tuningJobs \
+curl http://localhost:4000/vertex_ai/tuningJobs \
      -H "Content-Type: application/json" \
-      -H "Authorization: Bearer sk-1234" \
+      -H "x-litellm-api-key: Bearer sk-1234" \
      -d '{
  "baseModel": "gemini-1.0-pro-002",
  "supervisedTuningSpec" : {
@@ -747,112 +187,130 @@ curl http://localhost:4000/vertex-ai/tuningJobs \
  }'
```

-
+## Advanced

-
+Pre-requisites
+- [Setup proxy with DB](../proxy/virtual_keys.md#setup)
+
+Use this to avoid giving developers your raw Vertex AI service account credentials, while still letting them use Vertex AI endpoints.
+
+### Use with Virtual Keys
+
+1. Setup environment
+
+```bash
+export DATABASE_URL=""
+export LITELLM_MASTER_KEY=""
+
+# vertex ai credentials
+export DEFAULT_VERTEXAI_PROJECT="" # "adroit-crow-413218"
+export DEFAULT_VERTEXAI_LOCATION="" # "us-central1"
+export DEFAULT_GOOGLE_APPLICATION_CREDENTIALS="" # "/Users/Downloads/adroit-crow-413218-a956eef1a2a8.json"
+```

+```bash
+litellm
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+2. Generate virtual key
+
+```bash
+curl -X POST 'http://0.0.0.0:4000/key/generate' \
+-H 'x-litellm-api-key: Bearer sk-1234' \
+-H 'Content-Type: application/json' \
+-d '{}'
+```
+
+Expected Response
+
+```bash
+{
+    ...
+    "key": "sk-1234ewknldferwedojwojw"
+}
+```
+
+3. Test it!

-### Context Caching

+```bash
+curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \
+  -H "Content-Type: application/json" \
+  -H "x-litellm-api-key: Bearer sk-1234" \
+  -d '{
+    "contents":[{
+      "role": "user",
+      "parts":[{"text": "How are you doing today?"}]
+    }]
+  }'
+```

-Use Vertex AI Context Caching

+### Send `tags` in request headers

-[**Relevant VertexAI Docs**](https://cloud.google.com/vertex-ai/generative-ai/docs/context-cache/context-cache-overview)

+Use this if you want `tags` to be tracked in the LiteLLM DB and on logging callbacks.
+
+Pass `tags` in request headers as a comma-separated list. In the example below, the following tags will be tracked:
+
+```
+tags: ["vertex-js-sdk", "pass-through-endpoint"]
+```
+
-
-
-1. 
Add model to config.yaml -```yaml -model_list: - # used for /chat/completions, /completions, /embeddings endpoints - - model_name: gemini-1.5-pro-001 - litellm_params: - model: vertex_ai/gemini-1.5-pro-001 - vertex_project: "project-id" - vertex_location: "us-central1" - vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json - -# used for the /cachedContent and vertexAI native endpoints -default_vertex_config: - vertex_project: "adroit-crow-413218" - vertex_location: "us-central1" - vertex_credentials: "adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json - +```bash +curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \ + -H "Content-Type: application/json" \ + -H "x-litellm-api-key: Bearer sk-1234" \ + -H "tags: vertex-js-sdk,pass-through-endpoint" \ + -d '{ + "contents":[{ + "role": "user", + "parts":[{"text": "How are you doing today?"}] + }] + }' ``` -2. Start Proxy + + -``` -$ litellm --config /path/to/config.yaml -``` +```javascript +const { VertexAI } = require('@google-cloud/vertexai'); -3. Make Request! -We make the request in two steps: -- Create a cachedContents object -- Use the cachedContents object in your /chat/completions +const vertexAI = new VertexAI({ + project: 'your-project-id', // enter your vertex project id + location: 'us-central1', // enter your vertex region + apiEndpoint: "localhost:4000/vertex_ai" // /vertex_ai # note, do not include 'https://' in the url +}); -**Create a cachedContents object** - -First, create a cachedContents object by calling the Vertex `cachedContents` endpoint. The LiteLLM proxy forwards the `/cachedContents` request to the VertexAI API. - -```python -import httpx - -# Set Litellm proxy variables -LITELLM_BASE_URL = "http://0.0.0.0:4000" -LITELLM_PROXY_API_KEY = "sk-1234" - -httpx_client = httpx.Client(timeout=30) - -print("Creating cached content") -create_cache = httpx_client.post( - url=f"{LITELLM_BASE_URL}/vertex-ai/cachedContents", - headers={"Authorization": f"Bearer {LITELLM_PROXY_API_KEY}"}, - json={ - "model": "gemini-1.5-pro-001", - "contents": [ - { - "role": "user", - "parts": [{ - "text": "This is sample text to demonstrate explicit caching." * 4000 - }] - } - ], +const model = vertexAI.getGenerativeModel({ + model: 'gemini-1.0-pro' +}, { + customHeaders: { + "x-litellm-api-key": "sk-1234", // Your litellm Virtual Key + "tags": "vertex-js-sdk,pass-through-endpoint" } -) +}); -print("Response from create_cache:", create_cache) -create_cache_response = create_cache.json() -print("JSON from create_cache:", create_cache_response) -cached_content_name = create_cache_response["name"] -``` +async function generateContent() { + try { + const prompt = { + contents: [{ + role: 'user', + parts: [{ text: 'How are you doing today?' 
}] + }] + }; -**Use the cachedContents object in your /chat/completions request to VertexAI** + const response = await model.generateContent(prompt); + console.log('Response:', response); + } catch (error) { + console.error('Error:', error); + } +} -```python -import openai - -# Set Litellm proxy variables -LITELLM_BASE_URL = "http://0.0.0.0:4000" -LITELLM_PROXY_API_KEY = "sk-1234" - -client = openai.OpenAI(api_key=LITELLM_PROXY_API_KEY, base_url=LITELLM_BASE_URL) - -response = client.chat.completions.create( - model="gemini-1.5-pro-001", - max_tokens=8192, - messages=[ - { - "role": "user", - "content": "What is the sample text about?", - }, - ], - temperature=0.7, - extra_body={"cached_content": cached_content_name}, # Use the cached content -) - -print("Response from proxy:", response) +generateContent(); ``` diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index 0c7b2a442..b3bfe333c 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -10,6 +10,35 @@ LiteLLM supports all anthropic models. - `claude-2.1` - `claude-instant-1.2` + +| Property | Details | +|-------|-------| +| Description | Claude is a highly performant, trustworthy, and intelligent AI platform built by Anthropic. Claude excels at tasks involving language, reasoning, analysis, coding, and more. | +| Provider Route on LiteLLM | `anthropic/` (add this prefix to the model name, to route any requests to Anthropic - e.g. `anthropic/claude-3-5-sonnet-20240620`) | +| Provider Doc | [Anthropic ↗](https://docs.anthropic.com/en/docs/build-with-claude/overview) | +| API Endpoint for Provider | https://api.anthropic.com | +| Supported Endpoints | `/chat/completions` | + + +## Supported OpenAI Parameters + +Check this in code, [here](../completion/input.md#translated-openai-params) + +``` +"stream", +"stop", +"temperature", +"top_p", +"max_tokens", +"max_completion_tokens", +"tools", +"tool_choice", +"extra_headers", +"parallel_tool_calls", +"response_format", +"user" +``` + :::info Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed. @@ -864,3 +893,145 @@ Human: How do I boil water? Assistant: ``` + +## Usage - PDF + +Pass base64 encoded PDF files to Anthropic models using the `image_url` field. + + + + +### **using base64** +```python +from litellm import completion, supports_pdf_input +import base64 +import requests + +# URL of the file +url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf" + +# Download the file +response = requests.get(url) +file_data = response.content + +encoded_file = base64.b64encode(file_data).decode("utf-8") + +## check if model supports pdf input - (2024/11/11) only claude-3-5-haiku-20241022 supports it +supports_pdf_input("anthropic/claude-3-5-haiku-20241022") # True + +response = completion( + model="anthropic/claude-3-5-haiku-20241022", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."}, + { + "type": "image_url", + "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + }, + ], + } + ], + max_tokens=300, +) + +print(response.choices[0]) +``` + + + +1. 
Add model to config
+
+```yaml
+- model_name: claude-3-5-haiku-20241022
+  litellm_params:
+    model: anthropic/claude-3-5-haiku-20241022
+    api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer " \
+  -d '{
+    "model": "claude-3-5-haiku-20241022",
+    "messages": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "text",
+            "text": "You are a very professional document summarization specialist. Please summarize the given document"
+          },
+          {
+            "type": "image_url",
+            "image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF
+          }
+        ]
+      }
+    ],
+    "max_tokens": 300
+  }'
+
+```
+
+
+
+## Usage - passing 'user_id' to Anthropic
+
+LiteLLM translates the OpenAI `user` param to Anthropic's `metadata[user_id]` param.
+
+
+
+```python
+response = completion(
+    model="claude-3-5-sonnet-20240620",
+    messages=messages,
+    user="user_123",
+)
+```
+
+
+
+1. Setup config.yaml
+
+```yaml
+model_list:
+  - model_name: claude-3-5-sonnet-20240620
+    litellm_params:
+      model: anthropic/claude-3-5-sonnet-20240620
+      api_key: os.environ/ANTHROPIC_API_KEY
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it!
+
+```bash
+curl http://0.0.0.0:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer " \
+  -d '{
+    "model": "claude-3-5-sonnet-20240620",
+    "messages": [{"role": "user", "content": "What is Anthropic?"}],
+    "user": "user_123"
+  }'
```
+
+
+
diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md
index afd1fee39..579353d65 100644
--- a/docs/my-website/docs/providers/bedrock.md
+++ b/docs/my-website/docs/providers/bedrock.md
@@ -1082,5 +1082,6 @@ print(f"response: {response}")
 
 | Model Name           | Function Call                               |
 |----------------------|---------------------------------------------|
+| Stable Diffusion 3 - v0 | `embedding(model="bedrock/stability.stability.sd3-large-v1:0", prompt=prompt)` |
 | Stable Diffusion - v0 | `embedding(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` |
 | Stable Diffusion - v0 | `embedding(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` |
\ No newline at end of file
diff --git a/docs/my-website/docs/providers/gemini.md b/docs/my-website/docs/providers/gemini.md
index da83448c0..dc56e047b 100644
--- a/docs/my-website/docs/providers/gemini.md
+++ b/docs/my-website/docs/providers/gemini.md
@@ -10,6 +10,7 @@ import TabItem from '@theme/TabItem';
 | Provider Route on LiteLLM | `gemini/` |
 | Provider Doc | [Google AI Studio ↗](https://ai.google.dev/aistudio) |
 | API Endpoint for Provider | https://generativelanguage.googleapis.com |
+| Supported Endpoints | `/chat/completions`, `/embeddings` |
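+
+For example, a minimal sketch showing both supported endpoints through the LiteLLM SDK. The model names `gemini/gemini-1.5-flash` and `gemini/text-embedding-004` are illustrative; substitute any Gemini model enabled for your Google AI Studio key.
+
+```python
+import os
+from litellm import completion, embedding
+
+os.environ["GEMINI_API_KEY"] = "your-google-ai-studio-key"  # assumption: a Google AI Studio API key
+
+# /chat/completions
+chat_response = completion(
+    model="gemini/gemini-1.5-flash",
+    messages=[{"role": "user", "content": "Say hello in one sentence."}],
+)
+print(chat_response.choices[0].message.content)
+
+# /embeddings (OpenAI-format response)
+embedding_response = embedding(
+    model="gemini/text-embedding-004",
+    input=["good morning from litellm"],
+)
+print(len(embedding_response.data[0]["embedding"]))  # vector length
+```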
diff --git a/docs/my-website/docs/providers/huggingface.md b/docs/my-website/docs/providers/huggingface.md index 4620a6c5d..5297a688b 100644 --- a/docs/my-website/docs/providers/huggingface.md +++ b/docs/my-website/docs/providers/huggingface.md @@ -37,7 +37,7 @@ os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}] # e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API -response = litellm.completion( +response = completion( model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", messages=[{ "content": "Hello, how are you?","role": "user"}], stream=True @@ -165,14 +165,14 @@ Steps to use ```python import os -import litellm +from litellm import completion os.environ["HUGGINGFACE_API_KEY"] = "" # TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b # add the 'huggingface/' prefix to the model to set huggingface as the provider # set api base to your deployed api endpoint from hugging face -response = litellm.completion( +response = completion( model="huggingface/glaiveai/glaive-coder-7b", messages=[{ "content": "Hello, how are you?","role": "user"}], api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud" @@ -383,6 +383,8 @@ def default_pt(messages): #### Custom prompt templates ```python +import litellm + # Create your own custom prompt template works litellm.register_prompt_template( model="togethercomputer/LLaMA-2-7B-32K", diff --git a/docs/my-website/docs/providers/jina_ai.md b/docs/my-website/docs/providers/jina_ai.md index 499cf6709..6c13dbf1a 100644 --- a/docs/my-website/docs/providers/jina_ai.md +++ b/docs/my-website/docs/providers/jina_ai.md @@ -1,6 +1,13 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Jina AI https://jina.ai/embeddings/ +Supported endpoints: +- /embeddings +- /rerank + ## API Key ```python # env variable @@ -8,6 +15,10 @@ os.environ['JINA_AI_API_KEY'] ``` ## Sample Usage - Embedding + + + + ```python from litellm import embedding import os @@ -19,6 +30,142 @@ response = embedding( ) print(response) ``` + + + +1. Add to config.yaml +```yaml +model_list: + - model_name: embedding-model + litellm_params: + model: jina_ai/jina-embeddings-v3 + api_key: os.environ/JINA_AI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000/ +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/embeddings' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{"input": ["hello world"], "model": "embedding-model"}' +``` + + + + +## Sample Usage - Rerank + + + + +```python +from litellm import rerank +import os + +os.environ["JINA_AI_API_KEY"] = "sk-..." + +query = "What is the capital of the United States?" +documents = [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. is the capital of the United States.", + "Capital punishment has existed in the United States since before it was a country.", +] + +response = rerank( + model="jina_ai/jina-reranker-v2-base-multilingual", + query=query, + documents=documents, + top_n=3, +) +print(response) +``` + + + +1. 
Add to config.yaml +```yaml +model_list: + - model_name: rerank-model + litellm_params: + model: jina_ai/jina-reranker-v2-base-multilingual + api_key: os.environ/JINA_AI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/rerank' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "rerank-model", + "query": "What is the capital of the United States?", + "documents": [ + "Carson City is the capital city of the American state of Nevada.", + "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.", + "Washington, D.C. is the capital of the United States.", + "Capital punishment has existed in the United States since before it was a country." + ], + "top_n": 3 +}' +``` + + + ## Supported Models All models listed here https://jina.ai/embeddings/ are supported + +## Supported Optional Rerank Parameters + +All cohere rerank parameters are supported. + +## Supported Optional Embeddings Parameters + +``` +dimensions +``` + +## Provider-specific parameters + +Pass any jina ai specific parameters as a keyword argument to the `embedding` or `rerank` function, e.g. + + + + +```python +response = embedding( + model="jina_ai/jina-embeddings-v3", + input=["good morning from litellm"], + dimensions=1536, + my_custom_param="my_custom_value", # any other jina ai specific parameters +) +``` + + + +```bash +curl -L -X POST 'http://0.0.0.0:4000/embeddings' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{"input": ["good morning from litellm"], "model": "jina_ai/jina-embeddings-v3", "dimensions": 1536, "my_custom_param": "my_custom_value"}' +``` + + + diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index b69e8ee56..a7b363be1 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -572,6 +572,96 @@ Here's how to use Vertex AI with the LiteLLM Proxy Server
+
+## Authentication - vertex_project, vertex_location, etc.
+
+Set your vertex credentials via:
+- dynamic params
+OR
+- env vars
+
+
+### **Dynamic Params**
+
+You can set:
+- `vertex_credentials` (str) - can be a json string or filepath to your vertex ai service account.json
+- `vertex_location` (str) - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
+- `vertex_project` Optional[str] - use if the vertex project is different from the one in vertex_credentials
+
+as dynamic params for a `litellm.completion` call.
+
+
+
+```python
+from litellm import completion
+import json
+
+## GET CREDENTIALS
+file_path = 'path/to/vertex_ai_service_account.json'
+
+# Load the JSON file
+with open(file_path, 'r') as file:
+    vertex_credentials = json.load(file)
+
+# Convert to JSON string
+vertex_credentials_json = json.dumps(vertex_credentials)
+
+
+response = completion(
+  model="vertex_ai/gemini-pro",
+  messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}],
+  vertex_credentials=vertex_credentials_json,
+  vertex_project="my-special-project",
+  vertex_location="my-special-location"
+)
+```
+
+
+
+```yaml
+model_list:
+  - model_name: gemini-1.5-pro
+    litellm_params:
+      model: gemini-1.5-pro
+      vertex_credentials: os.environ/VERTEX_FILE_PATH_ENV_VAR # os.environ["VERTEX_FILE_PATH_ENV_VAR"] = "/path/to/service_account.json"
+      vertex_project: "my-special-project"
+      vertex_location: "my-special-location"
+```
+
+
+
+
+
+### **Environment Variables**
+
+You can set:
+- `GOOGLE_APPLICATION_CREDENTIALS` - store the filepath for your service_account.json in here (used by vertex sdk directly).
+- VERTEXAI_LOCATION - place where vertex model is deployed (us-central1, asia-southeast1, etc.)
+- VERTEXAI_PROJECT - Optional[str] - use if the vertex project is different from the one in vertex_credentials
+
+1. GOOGLE_APPLICATION_CREDENTIALS
+
+```bash
+export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service_account.json"
+```
+
+2. VERTEXAI_LOCATION
+
+```bash
+export VERTEXAI_LOCATION="us-central1" # can be any vertex location
+```
+
+3. VERTEXAI_PROJECT
+
+```bash
+export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is different from service account project
+```
+
+
 ## Specifying Safety Settings 
 In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simply pass the `safety_settings` argument to `completion` or `acompletion`. For example:
 
@@ -1161,12 +1251,96 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
 
 ## Model Garden
 
-| Model Name       | Function Call                        |
-|------------------|--------------------------------------|
-| llama2   | `completion('vertex_ai/', messages)` |
+
+:::tip
+
+All OpenAI compatible models from Vertex Model Garden are supported. 
+ +::: #### Using Model Garden +**Almost all Vertex Model Garden models are OpenAI compatible.** + + + + + +| Property | Details | +|----------|---------| +| Provider Route | `vertex_ai/openai/{MODEL_ID}` | +| Vertex Documentation | [Vertex Model Garden - OpenAI Chat Completions](https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_gradio_streaming_chat_completions.ipynb), [Vertex Model Garden](https://cloud.google.com/model-garden?hl=en) | +| Supported Operations | `/chat/completions`, `/embeddings` | + + + + +```python +from litellm import completion +import os + +## set ENV variables +os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +response = completion( + model="vertex_ai/openai/", + messages=[{ "content": "Hello, how are you?","role": "user"}] +) +``` + + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: llama3-1-8b-instruct + litellm_params: + model: vertex_ai/openai/5464397967697903616 + vertex_ai_project: "my-test-project" + vertex_ai_location: "us-east-1" +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "llama3-1-8b-instruct", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + + + + + + + + + ```python from litellm import completion import os @@ -1181,6 +1355,11 @@ response = completion( ) ``` + + + + + ## Gemini Pro | Model Name | Function Call | |------------------|--------------------------------------| @@ -1562,6 +1741,10 @@ curl http://0.0.0.0:4000/v1/chat/completions \ ## **Embedding Models** #### Usage - Embedding + + + + ```python import litellm from litellm import embedding @@ -1574,6 +1757,49 @@ response = embedding( ) print(response) ``` + + + + + +1. Add model to config.yaml +```yaml +model_list: + - model_name: snowflake-arctic-embed-m-long-1731622468876 + litellm_params: + model: vertex_ai/ + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. 
Make Request using OpenAI Python SDK, Langchain Python SDK + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.embeddings.create( + model="snowflake-arctic-embed-m-long-1731622468876", + input = ["good morning from litellm", "this is another item"], +) + +print(response) +``` + + + + #### Supported Embedding Models All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported @@ -1589,6 +1815,7 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02 | textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` | | text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` | | text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` | +| Fine-tuned OR Custom Embedding models | `embedding(model="vertex_ai/", input)` | ### Supported OpenAI (Unified) Params @@ -2166,97 +2393,6 @@ print("response from proxy", response) - - -## Authentication - vertex_project, vertex_location, etc. - -Set your vertex credentials via: -- dynamic params -OR -- env vars - - -### **Dynamic Params** - -You can set: -- `vertex_credentials` (str) - can be a json string or filepath to your vertex ai service account.json -- `vertex_location` (str) - place where vertex model is deployed (us-central1, asia-southeast1, etc.) -- `vertex_project` Optional[str] - use if vertex project different from the one in vertex_credentials - -as dynamic params for a `litellm.completion` call. - - - - -```python -from litellm import completion -import json - -## GET CREDENTIALS -file_path = 'path/to/vertex_ai_service_account.json' - -# Load the JSON file -with open(file_path, 'r') as file: - vertex_credentials = json.load(file) - -# Convert to JSON string -vertex_credentials_json = json.dumps(vertex_credentials) - - -response = completion( - model="vertex_ai/gemini-pro", - messages=[{"content": "You are a good bot.","role": "system"}, {"content": "Hello, how are you?","role": "user"}], - vertex_credentials=vertex_credentials_json, - vertex_project="my-special-project", - vertex_location="my-special-location" -) -``` - - - - -```yaml -model_list: - - model_name: gemini-1.5-pro - litellm_params: - model: gemini-1.5-pro - vertex_credentials: os.environ/VERTEX_FILE_PATH_ENV_VAR # os.environ["VERTEX_FILE_PATH_ENV_VAR"] = "/path/to/service_account.json" - vertex_project: "my-special-project" - vertex_location: "my-special-location: -``` - - - - - - - -### **Environment Variables** - -You can set: -- `GOOGLE_APPLICATION_CREDENTIALS` - store the filepath for your service_account.json in here (used by vertex sdk directly). -- VERTEXAI_LOCATION - place where vertex model is deployed (us-central1, asia-southeast1, etc.) -- VERTEXAI_PROJECT - Optional[str] - use if vertex project different from the one in vertex_credentials - -1. GOOGLE_APPLICATION_CREDENTIALS - -```bash -export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service_account.json" -``` - -2. VERTEXAI_LOCATION - -```bash -export VERTEXAI_LOCATION="us-central1" # can be any vertex location -``` - -3. 
VERTEXAI_PROJECT - -```bash -export VERTEXAI_PROJECT="my-test-project" # ONLY use if model project is different from service account project -``` - - ## Extra ### Using `GOOGLE_APPLICATION_CREDENTIALS` diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md index c4fd22d1e..a5519157c 100644 --- a/docs/my-website/docs/proxy/alerting.md +++ b/docs/my-website/docs/proxy/alerting.md @@ -2,7 +2,7 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 🚨 Alerting / Webhooks +# Alerting / Webhooks Get alerts for: diff --git a/docs/my-website/docs/proxy/bucket.md b/docs/my-website/docs/proxy/bucket.md index 3422d0371..d1b9e6076 100644 --- a/docs/my-website/docs/proxy/bucket.md +++ b/docs/my-website/docs/proxy/bucket.md @@ -9,7 +9,7 @@ LiteLLM Supports Logging to the following Cloud Buckets - (Enterprise) ✨ [Google Cloud Storage Buckets](#logging-proxy-inputoutput-to-google-cloud-storage-buckets) - (Free OSS) [Amazon s3 Buckets](#logging-proxy-inputoutput---s3-buckets) -## Logging Proxy Input/Output to Google Cloud Storage Buckets +## Google Cloud Storage Buckets Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en) @@ -20,6 +20,14 @@ Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage? ::: +| Property | Details | +|----------|---------| +| Description | Log LLM Input/Output to cloud storage buckets | +| Load Test Benchmarks | [Benchmarks](https://docs.litellm.ai/docs/benchmarks) | +| Google Docs on Cloud Storage | [Google Cloud Storage](https://cloud.google.com/storage?hl=en) | + + + ### Usage 1. Add `gcs_bucket` to LiteLLM Config.yaml @@ -85,7 +93,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ 6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT` -## Logging Proxy Input/Output - s3 Buckets +## s3 Buckets We will use the `--config` to set diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index d81db5b93..3f5342c7e 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -136,6 +136,7 @@ litellm_settings: type: "redis" service_name: "mymaster" sentinel_nodes: [["localhost", 26379]] + sentinel_password: "password" # [OPTIONAL] ``` @@ -149,6 +150,7 @@ You can configure redis sentinel in your .env by setting `REDIS_SENTINEL_NODES` ```env REDIS_SENTINEL_NODES='[["localhost", 26379]]' REDIS_SERVICE_NAME = "mymaster" +REDIS_SENTINEL_PASSWORD = "password" ``` :::note diff --git a/docs/my-website/docs/proxy/config_management.md b/docs/my-website/docs/proxy/config_management.md new file mode 100644 index 000000000..4f7c5775b --- /dev/null +++ b/docs/my-website/docs/proxy/config_management.md @@ -0,0 +1,59 @@ +# File Management + +## `include` external YAML files in a config.yaml + +You can use `include` to include external YAML files in a config.yaml. + +**Quick Start Usage:** + +To include a config file, use `include` with either a single file or a list of files. 
+ +Contents of `parent_config.yaml`: +```yaml +include: + - model_config.yaml # 👈 Key change, will include the contents of model_config.yaml + +litellm_settings: + callbacks: ["prometheus"] +``` + + +Contents of `model_config.yaml`: +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + - model_name: fake-anthropic-endpoint + litellm_params: + model: anthropic/fake + api_base: https://exampleanthropicendpoint-production.up.railway.app/ + +``` + +Start proxy server + +This will start the proxy server with config `parent_config.yaml`. Since the `include` directive is used, the server will also include the contents of `model_config.yaml`. +``` +litellm --config parent_config.yaml --detailed_debug +``` + + + + + +## Examples using `include` + +Include a single file: +```yaml +include: + - model_config.yaml +``` + +Include multiple files: +```yaml +include: + - model_config.yaml + - another_config.yaml +``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md new file mode 100644 index 000000000..c762a0716 --- /dev/null +++ b/docs/my-website/docs/proxy/config_settings.md @@ -0,0 +1,507 @@ +# All settings + + +```yaml +environment_variables: {} + +model_list: + - model_name: string + litellm_params: {} + model_info: + id: string + mode: embedding + input_cost_per_token: 0 + output_cost_per_token: 0 + max_tokens: 2048 + base_model: gpt-4-1106-preview + additionalProp1: {} + +litellm_settings: + # Logging/Callback settings + success_callback: ["langfuse"] # list of success callbacks + failure_callback: ["sentry"] # list of failure callbacks + callbacks: ["otel"] # list of callbacks - runs on success and failure + service_callbacks: ["datadog", "prometheus"] # logs redis, postgres failures on datadog, prometheus + turn_off_message_logging: boolean # prevent the messages and responses from being logged to on your callbacks, but request metadata will still be logged. + redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging. + langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging + + # Networking settings + request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout + force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API + + set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION + json_logs: boolean # if true, logs will be in json format + + # Fallbacks, reliability + default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad. 
+ content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors + context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors + + + + # Caching settings + cache: true + cache_params: # set cache params for redis + type: redis # type of cache to initialize + + # Optional - Redis Settings + host: "localhost" # The host address for the Redis cache. Required if type is "redis". + port: 6379 # The port number for the Redis cache. Required if type is "redis". + password: "your_password" # The password for the Redis cache. Required if type is "redis". + namespace: "litellm.caching.caching" # namespace for redis cache + + # Optional - Redis Cluster Settings + redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] + + # Optional - Redis Sentinel Settings + service_name: "mymaster" + sentinel_nodes: [["localhost", 26379]] + + # Optional - Qdrant Semantic Cache Settings + qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list + qdrant_collection_name: test_collection + qdrant_quantization_config: binary + similarity_threshold: 0.8 # similarity threshold for semantic cache + + # Optional - S3 Cache Settings + s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 + s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket + + # Common Cache settings + # Optional - Supported call types for caching + supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] + # /chat/completions, /completions, /embeddings, /audio/transcriptions + mode: default_off # if default_off, you need to opt in to caching on a per call basis + ttl: 600 # ttl for caching + + +callback_settings: + otel: + message_logging: boolean # OTEL logging callback specific settings + +general_settings: + completion_model: string + disable_spend_logs: boolean # turn off writing each transaction to the db + disable_master_key_return: boolean # turn off returning master key on UI (checked on '/user/info' endpoint) + disable_retry_on_max_parallel_request_limit_error: boolean # turn off retries when max parallel request limit is reached + disable_reset_budget: boolean # turn off reset budget scheduled task + disable_adding_master_key_hash_to_db: boolean # turn off storing master key hash in db, for spend tracking + enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims + enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param + allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. 
(currently JWT-Auth only) + key_management_system: google_kms # either google_kms or azure_kms + master_key: string + + # Database Settings + database_url: string + database_connection_pool_limit: 0 # default 100 + database_connection_timeout: 0 # default 60s + allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work + + custom_auth: string + max_parallel_requests: 0 # the max parallel requests allowed per deployment + global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up + infer_model_from_keys: true + background_health_checks: true + health_check_interval: 300 + alerting: ["slack", "email"] + alerting_threshold: 0 + use_client_credentials_pass_through_routes: boolean # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints +``` + +### litellm_settings - Reference + +| Name | Type | Description | +|------|------|-------------| +| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | +| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | +| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | +| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. datadog, prometheus) [Doc Metrics](prometheus) | +| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) | +| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider | +| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.| +| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) | +| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) | +| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION | +| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) | +| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) | +| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) | +| force_ipv4 | boolean | If true, litellm will force ipv4 for all LLM requests. 
Some users have seen httpx ConnectionError when using ipv6 + Anthropic API | +| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) | +| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) | +| cache | boolean | If true, enables caching. [Further docs](./caching) | +| cache_params | object | Parameters for the cache. [Further docs](./caching) | +| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Furher docs](./caching) | +| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". | +| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". | +| cache_params.password | string | The password for the Redis cache. Required if type is "redis". | +| cache_params.namespace | string | The namespace for the Redis cache. | +| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) | +| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) | +| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) | +| cache_params.ttl | integer | The time (in seconds) to store entries in cache. | +| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. | +| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. | +| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. | +| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. | +| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the semantic cache. | +| cache_params.s3_region_name | string | The region name for the S3 bucket. | +| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. | +| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. | +| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. | +| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) | +| cache_params.mode | string | The mode of the cache. [Further docs](./caching) | +| disable_end_user_cost_tracking | boolean | If true, turns off end user cost tracking on prometheus metrics + litellm spend logs table on proxy. | +| key_generation_settings | object | Restricts who can generate keys. [Further docs](./virtual_keys.md#restricting-key-generation) | + +### general_settings - Reference + +| Name | Type | Description | +|------|------|-------------| +| completion_model | string | The default model to use for completions when `model` is not specified in the request | +| disable_spend_logs | boolean | If true, turns off writing each transaction to the database | +| disable_master_key_return | boolean | If true, turns off returning master key on UI. 
(checked on '/user/info' endpoint) | +| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached | +| disable_reset_budget | boolean | If true, turns off reset budget scheduled task | +| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db | +| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) | +| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)| +| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)| +| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) | +| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) | +| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) | +| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) | +| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) | +| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key | +| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) | +| max_parallel_requests | integer | The max parallel requests allowed per deployment | +| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall | +| infer_model_from_keys | boolean | If true, infers the model from the provided keys | +| background_health_checks | boolean | If true, enables background health checks. [Doc on health checks](health) | +| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) | +| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) | +| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) | +| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) | +| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). 
[Doc on health checks](health) | +| public_routes | List[str] | (Enterprise Feature) Control list of public routes | +| alert_types | List[str] | Control list of alert types to send to slack (Doc on alert types)[./alerting.md] | +| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy | +| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication | +| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address | +| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] | +| image_generation_model | str | The default model to use for image generation - ignores model set in request | +| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) | +| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. | +| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. | +| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** | +| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. **Default is 605 seconds** | +| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 10 seconds** | +| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) | +| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) | +| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. | +| embedding_model | str | The default model to use for embeddings - ignores model set in request | +| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). | +| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) | +| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) | +| allow_user_auth | boolean | (Deprecated) old approach for user authentication. | +| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. | +| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB | +| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. [Doc on custom headers](./virtual_keys.md#custom-headers) | +| moderation_model | str | The default model to use for moderation. | +| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) | +| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) | +| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. 
[Doc on admin only routes](./enterprise#control-available-public-private-routes) | +| use_azure_key_vault | boolean | If true, load keys from azure key vault | +| use_google_kms | boolean | If true, load keys from google kms | +| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) | +| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) | +| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) | +| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) | +| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings | +| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) | +| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication | +| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). | +| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call | + +### router_settings - Reference + +:::info + +Most values can also be set via `litellm_settings`. If you see overlapping values, settings on `router_settings` will override those on `litellm_settings`. +::: + +```yaml +router_settings: + routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" + redis_host: # string + redis_password: # string + redis_port: # string + enable_pre_call_check: true # bool - Before call is made check if a call is within model context window + allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. + cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails + disable_cooldowns: True # bool - Disable cooldowns for all models + enable_tag_filtering: True # bool - Use tag based routing for requests + retry_policy: { # Dict[str, int]: retry policy for different types of exceptions + "AuthenticationErrorRetries": 3, + "TimeoutErrorRetries": 3, + "RateLimitErrorRetries": 3, + "ContentPolicyViolationErrorRetries": 4, + "InternalServerErrorRetries": 4 + } + allowed_fails_policy: { + "BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment + "AuthenticationErrorAllowedFails": 10, # int + "TimeoutErrorAllowedFails": 12, # int + "RateLimitErrorAllowedFails": 10000, # int + "ContentPolicyViolationErrorAllowedFails": 15, # int + "InternalServerErrorAllowedFails": 20, # int + } + content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations + fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors +``` + +| Name | Type | Description | +|------|------|-------------| +| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) | +| redis_host | string | The host address for the Redis server. 
**Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** | +| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** | +| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**| +| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) | +| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) | +| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) | +| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) | +| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. | +| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) | +| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) | +| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) | +| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. [More information here](reliability) | +| default_max_parallel_requests | Optional[int] | The default maximum number of parallel requests for a deployment. | +| default_priority | (Optional[int]) | The default priority for a request. Only for '.scheduler_acompletion()'. Default is None. | +| polling_interval | (Optional[float]) | frequency of polling queue. Only for '.scheduler_acompletion()'. Default is 3ms. | +| max_fallbacks | Optional[int] | The maximum number of fallbacks to try before exiting the call. Defaults to 5. | +| default_litellm_params | Optional[dict] | The default litellm parameters to add to all requests (e.g. `temperature`, `max_tokens`). | +| timeout | Optional[float] | The default timeout for a request. | +| debug_level | Literal["DEBUG", "INFO"] | The debug level for the logging library in the router. Defaults to "INFO". | +| client_ttl | int | Time-to-live for cached clients in seconds. Defaults to 3600. | +| cache_kwargs | dict | Additional keyword arguments for the cache initialization. | +| routing_strategy_args | dict | Additional keyword arguments for the routing strategy - e.g. lowest latency routing default ttl | +| model_group_alias | dict | Model group alias mapping. E.g. `{"claude-3-haiku": "claude-3-haiku-20240229"}` | +| num_retries | int | Number of retries for a request. Defaults to 3. | +| default_fallbacks | Optional[List[str]] | Fallbacks to try if no model group-specific fallbacks are defined. | +| caching_groups | Optional[List[tuple]] | List of model groups for caching across model groups. Defaults to None. - e.g. caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]| +| alerting_config | AlertingConfig | [SDK-only arg] Slack alerting configuration. Defaults to None. [Further Docs](../routing.md#alerting-) | +| assistants_config | AssistantsConfig | Set on proxy via `assistant_settings`. 
[Further docs](../assistants.md) | +| set_verbose | boolean | [DEPRECATED PARAM - see debug docs](./debugging.md) If true, sets the logging level to verbose. | +| retry_after | int | Time to wait before retrying a request in seconds. Defaults to 0. If `x-retry-after` is received from LLM API, this value is overridden. | +| provider_budget_config | ProviderBudgetConfig | Provider budget configuration. Use this to set llm_provider budget limits. example $100/day to OpenAI, $100/day to Azure, etc. Defaults to None. [Further Docs](./provider_budget_routing.md) | +| enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) | +| model_group_retry_policy | Dict[str, RetryPolicy] | [SDK-only arg] Set retry policy for model groups. | +| context_window_fallbacks | List[Dict[str, List[str]]] | Fallback models for context window violations. | +| redis_url | str | URL for Redis server. **Known performance issue with Redis URL.** | +| cache_responses | boolean | Flag to enable caching LLM Responses, if cache set under `router_settings`. If true, caches responses. Defaults to False. | +| router_general_settings | RouterGeneralSettings | [SDK-Only] Router general settings - contains optimizations like 'async_only_mode'. [Docs](../routing.md#router-general-settings) | + +### environment variables - Reference + +| Name | Description | +|------|-------------| +| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions +| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions +| AISPEND_ACCOUNT_ID | Account ID for AI Spend +| AISPEND_API_KEY | API Key for AI Spend +| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access +| ARIZE_API_KEY | API key for Arize platform integration +| ARIZE_SPACE_KEY | Space key for Arize platform +| ARGILLA_BATCH_SIZE | Batch size for Argilla logging +| ARGILLA_API_KEY | API key for Argilla platform +| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging +| ARGILLA_DATASET_NAME | Dataset name for Argilla logging +| ARGILLA_BASE_URL | Base URL for Argilla service +| ATHINA_API_KEY | API key for Athina service +| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key) +| AWS_ACCESS_KEY_ID | Access Key ID for AWS services +| AWS_PROFILE_NAME | AWS CLI profile name to be used +| AWS_REGION_NAME | Default AWS region for service interactions +| AWS_ROLE_NAME | Role name for AWS IAM usage +| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services +| AWS_SESSION_NAME | Name for AWS session +| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS +| AZURE_API_VERSION | Version of the Azure API being used +| AZURE_AUTHORITY_HOST | Azure authority host URL +| AZURE_CLIENT_ID | Client ID for Azure services +| AZURE_CLIENT_SECRET | Client secret for Azure services +| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token +| AZURE_KEY_VAULT_URI | URI for Azure Key Vault +| AZURE_TENANT_ID | Tenant ID for Azure Active Directory +| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service +| BRAINTRUST_API_KEY | API key for Braintrust integration +| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI +| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI +| CONFIG_FILE_PATH | File path for configuration file +| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache +| DATABASE_HOST | Hostname for the database server +| DATABASE_NAME | Name of the database +| DATABASE_PASSWORD | 
Password for the database user +| DATABASE_PORT | Port number for database connection +| DATABASE_SCHEMA | Schema name used in the database +| DATABASE_URL | Connection URL for the database +| DATABASE_USER | Username for database connection +| DATABASE_USERNAME | Alias for database user +| DATABRICKS_API_BASE | Base URL for Databricks API +| DD_BASE_URL | Base URL for Datadog integration +| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration +| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration +| DD_API_KEY | API key for Datadog integration +| DD_SITE | Site URL for Datadog (e.g., datadoghq.com) +| DD_SOURCE | Source identifier for Datadog logs +| DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback +| DD_SERVICE | Service identifier for Datadog logs. Defaults to "litellm-server" +| DD_VERSION | Version identifier for Datadog logs. Defaults to "unknown" +| DEBUG_OTEL | Enable debug mode for OpenTelemetry +| DIRECT_URL | Direct URL for service endpoint +| DISABLE_ADMIN_UI | Toggle to disable the admin UI +| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates +| DOCS_DESCRIPTION | Description text for documentation pages +| DOCS_FILTERED | Flag indicating filtered documentation +| DOCS_TITLE | Title of the documentation pages +| DOCS_URL | The path to the Swagger API documentation. **By default this is "/"** +| EMAIL_SUPPORT_CONTACT | Support contact email address +| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket +| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file +| GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. **Default is 20 seconds** +| GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `BATCH_SIZE` is set to 10, logs are flushed every 10 logs. 
**Default is 2048** +| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers +| GENERIC_CLIENT_ID | Client ID for generic OAuth providers +| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers +| GENERIC_CLIENT_STATE | State parameter for generic client authentication +| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth +| GENERIC_SCOPE | Scope settings for generic OAuth providers +| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers +| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth +| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth +| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth +| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth +| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth +| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider +| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role +| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth +| GALILEO_BASE_URL | Base URL for Galileo platform +| GALILEO_PASSWORD | Password for Galileo authentication +| GALILEO_PROJECT_ID | Project ID for Galileo usage +| GALILEO_USERNAME | Username for Galileo authentication +| GREENSCALE_API_KEY | API key for Greenscale service +| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service +| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file +| GOOGLE_CLIENT_ID | Client ID for Google OAuth +| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth +| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS +| HF_API_BASE | Base URL for Hugging Face API +| HELICONE_API_KEY | API key for Helicone service +| HUGGINGFACE_API_BASE | Base URL for Hugging Face API +| IAM_TOKEN_DB_AUTH | IAM token for database authentication +| JSON_LOGS | Enable JSON formatted logging +| JWT_AUDIENCE | Expected audience for JWT tokens +| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification +| LAGO_API_BASE | Base URL for Lago API +| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago +| LAGO_API_EVENT_CODE | Event code for Lago API events +| LAGO_API_KEY | API key for accessing Lago services +| LANGFUSE_DEBUG | Toggle debug mode for Langfuse +| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs +| LANGFUSE_HOST | Host URL for Langfuse service +| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication +| LANGFUSE_RELEASE | Release version of Langfuse integration +| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication +| LANGSMITH_API_KEY | API key for Langsmith platform +| LANGSMITH_BASE_URL | Base URL for Langsmith service +| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith +| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run +| LANGSMITH_PROJECT | Project name for Langsmith integration +| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging +| LANGTRACE_API_KEY | API key for Langtrace service +| LITERAL_API_KEY | API key for Literal integration +| LITERAL_API_URL | API URL for Literal service +| LITERAL_BATCH_SIZE | Batch size for Literal operations +| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI +| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests +| LITELLM_EMAIL | Email associated with LiteLLM account +| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel 
requests in LiteLLM +| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM +| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM +| LITELLM_LICENSE | License key for LiteLLM usage +| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM +| LITELLM_LOG | Enable detailed logging for LiteLLM +| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development) +| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM +| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM +| LITELLM_TOKEN | Access token for LiteLLM integration +| LOGFIRE_TOKEN | Token for Logfire logging service +| MICROSOFT_CLIENT_ID | Client ID for Microsoft services +| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services +| MICROSOFT_TENANT | Tenant ID for Microsoft Azure +| NO_DOCS | Flag to disable documentation generation +| NO_PROXY | List of addresses to bypass proxy +| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval +| OPENAI_API_BASE | Base URL for OpenAI API +| OPENAI_API_KEY | API key for OpenAI services +| OPENAI_ORGANIZATION | Organization identifier for OpenAI +| OPENID_BASE_URL | Base URL for OpenID Connect services +| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication +| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication +| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration +| OPENMETER_API_KEY | API key for OpenMeter services +| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter +| OTEL_ENDPOINT | OpenTelemetry endpoint for traces +| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry +| OTEL_EXPORTER | Exporter type for OpenTelemetry +| OTEL_HEADERS | Headers for OpenTelemetry requests +| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry +| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing +| PREDIBASE_API_BASE | Base URL for Predibase API +| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service +| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service +| PROMETHEUS_URL | URL for Prometheus service +| PROMPTLAYER_API_KEY | API key for PromptLayer integration +| PROXY_ADMIN_ID | Admin identifier for proxy server +| PROXY_BASE_URL | Base URL for proxy service +| PROXY_LOGOUT_URL | URL for logging out of the proxy service +| PROXY_MASTER_KEY | Master key for proxy authentication +| QDRANT_API_BASE | Base URL for Qdrant API +| QDRANT_API_KEY | API key for Qdrant service +| QDRANT_URL | Connection URL for Qdrant database +| REDIS_HOST | Hostname for Redis server +| REDIS_PASSWORD | Password for Redis service +| REDIS_PORT | Port number for Redis server +| REDOC_URL | The path to the Redoc Fast API documentation. 
**By default this is "/redoc"** +| SERVER_ROOT_PATH | Root path for the server application +| SET_VERBOSE | Flag to enable verbose logging +| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly) +| SLACK_WEBHOOK_URL | Webhook URL for Slack integration +| SMTP_HOST | Hostname for the SMTP server +| SMTP_PASSWORD | Password for SMTP authentication +| SMTP_PORT | Port number for SMTP server +| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions +| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP +| SMTP_TLS | Flag to enable or disable TLS for SMTP connections +| SMTP_USERNAME | Username for SMTP authentication +| SPEND_LOGS_URL | URL for retrieving spend logs +| SSL_CERTIFICATE | Path to the SSL certificate file +| SSL_VERIFY | Flag to enable or disable SSL certificate verification +| SUPABASE_KEY | API key for Supabase service +| SUPABASE_URL | Base URL for Supabase instance +| TEST_EMAIL_ADDRESS | Email address used for testing purposes +| UI_LOGO_PATH | Path to the logo image used in the UI +| UI_PASSWORD | Password for accessing the UI +| UI_USERNAME | Username for accessing the UI +| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse +| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service +| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication +| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse +| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication +| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption +| WEBHOOK_URL | URL for receiving webhooks from external services + diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 1adc4943d..7876c9dec 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -2,7 +2,7 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Proxy Config.yaml +# Overview Set model list, `api_base`, `api_key`, `temperature` & proxy server settings (`master-key`) on the config.yaml. 
| Param Name | Description | @@ -357,77 +357,6 @@ curl --location 'http://0.0.0.0:4000/v1/model/info' \ --data '' ``` - -### Provider specific wildcard routing -**Proxy all models from a provider** - -Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml** - -**Step 1** - define provider specific routing on config.yaml -```yaml -model_list: - # provider specific wildcard routing - - model_name: "anthropic/*" - litellm_params: - model: "anthropic/*" - api_key: os.environ/ANTHROPIC_API_KEY - - model_name: "groq/*" - litellm_params: - model: "groq/*" - api_key: os.environ/GROQ_API_KEY - - model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*" - litellm_params: - model: "openai/fo::*:static::*" - api_key: os.environ/OPENAI_API_KEY -``` - -Step 2 - Run litellm proxy - -```shell -$ litellm --config /path/to/config.yaml -``` - -Step 3 Test it - -Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*` -```shell -curl http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ - -d '{ - "model": "anthropic/claude-3-sonnet-20240229", - "messages": [ - {"role": "user", "content": "Hello, Claude!"} - ] - }' -``` - -Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*` -```shell -curl http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ - -d '{ - "model": "groq/llama3-8b-8192", - "messages": [ - {"role": "user", "content": "Hello, Claude!"} - ] - }' -``` - -Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*` -```shell -curl http://localhost:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-1234" \ - -d '{ - "model": "fo::hi::static::hi", - "messages": [ - {"role": "user", "content": "Hello, Claude!"} - ] - }' -``` - ### Load Balancing :::info @@ -597,472 +526,6 @@ general_settings: database_connection_timeout: 60 # sets a 60s timeout for any connection call to the db ``` -## **All settings** - - -```yaml -environment_variables: {} - -model_list: - - model_name: string - litellm_params: {} - model_info: - id: string - mode: embedding - input_cost_per_token: 0 - output_cost_per_token: 0 - max_tokens: 2048 - base_model: gpt-4-1106-preview - additionalProp1: {} - -litellm_settings: - # Logging/Callback settings - success_callback: ["langfuse"] # list of success callbacks - failure_callback: ["sentry"] # list of failure callbacks - callbacks: ["otel"] # list of callbacks - runs on success and failure - service_callbacks: ["datadog", "prometheus"] # logs redis, postgres failures on datadog, prometheus - turn_off_message_logging: boolean # prevent the messages and responses from being logged to on your callbacks, but request metadata will still be logged. - redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging. 
- langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging - - request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout - - set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION - json_logs: boolean # if true, logs will be in json format - - # Fallbacks, reliability - default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad. - content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors - context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors - - - - # Caching settings - cache: true - cache_params: # set cache params for redis - type: redis # type of cache to initialize - - # Optional - Redis Settings - host: "localhost" # The host address for the Redis cache. Required if type is "redis". - port: 6379 # The port number for the Redis cache. Required if type is "redis". - password: "your_password" # The password for the Redis cache. Required if type is "redis". - namespace: "litellm.caching.caching" # namespace for redis cache - - # Optional - Redis Cluster Settings - redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] - - # Optional - Redis Sentinel Settings - service_name: "mymaster" - sentinel_nodes: [["localhost", 26379]] - - # Optional - Qdrant Semantic Cache Settings - qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list - qdrant_collection_name: test_collection - qdrant_quantization_config: binary - similarity_threshold: 0.8 # similarity threshold for semantic cache - - # Optional - S3 Cache Settings - s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. 
This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 - s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket - - # Common Cache settings - # Optional - Supported call types for caching - supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] - # /chat/completions, /completions, /embeddings, /audio/transcriptions - mode: default_off # if default_off, you need to opt in to caching on a per call basis - ttl: 600 # ttl for caching - - -callback_settings: - otel: - message_logging: boolean # OTEL logging callback specific settings - -general_settings: - completion_model: string - disable_spend_logs: boolean # turn off writing each transaction to the db - disable_master_key_return: boolean # turn off returning master key on UI (checked on '/user/info' endpoint) - disable_retry_on_max_parallel_request_limit_error: boolean # turn off retries when max parallel request limit is reached - disable_reset_budget: boolean # turn off reset budget scheduled task - disable_adding_master_key_hash_to_db: boolean # turn off storing master key hash in db, for spend tracking - enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims - enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param - allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. (currently JWT-Auth only) - key_management_system: google_kms # either google_kms or azure_kms - master_key: string - - # Database Settings - database_url: string - database_connection_pool_limit: 0 # default 100 - database_connection_timeout: 0 # default 60s - allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work - - custom_auth: string - max_parallel_requests: 0 # the max parallel requests allowed per deployment - global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up - infer_model_from_keys: true - background_health_checks: true - health_check_interval: 300 - alerting: ["slack", "email"] - alerting_threshold: 0 - use_client_credentials_pass_through_routes: boolean # use client credentials for all pass through routes like "/vertex-ai", /bedrock/. When this is True Virtual Key auth will not be applied on these endpoints -``` - -### litellm_settings - Reference - -| Name | Type | Description | -|------|------|-------------| -| success_callback | array of strings | List of success callbacks. [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | -| failure_callback | array of strings | List of failure callbacks [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | -| callbacks | array of strings | List of callbacks - runs on success and failure [Doc Proxy logging callbacks](logging), [Doc Metrics](prometheus) | -| service_callbacks | array of strings | System health monitoring - Logs redis, postgres failures on specified services (e.g. 
datadog, prometheus) [Doc Metrics](prometheus) | -| turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged [Proxy Logging](logging) | -| modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider | -| enable_preview_features | boolean | If true, enables preview features - e.g. Azure O1 Models with streaming support.| -| redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) | -| langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) | -| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION | -| json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) | -| default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) | -| request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) | -| content_policy_fallbacks | array of objects | Fallbacks to use when a ContentPolicyViolationError is encountered. [Further docs](./reliability#content-policy-fallbacks) | -| context_window_fallbacks | array of objects | Fallbacks to use when a ContextWindowExceededError is encountered. [Further docs](./reliability#context-window-fallbacks) | -| cache | boolean | If true, enables caching. [Further docs](./caching) | -| cache_params | object | Parameters for the cache. [Further docs](./caching) | -| cache_params.type | string | The type of cache to initialize. Can be one of ["local", "redis", "redis-semantic", "s3", "disk", "qdrant-semantic"]. Defaults to "redis". [Furher docs](./caching) | -| cache_params.host | string | The host address for the Redis cache. Required if type is "redis". | -| cache_params.port | integer | The port number for the Redis cache. Required if type is "redis". | -| cache_params.password | string | The password for the Redis cache. Required if type is "redis". | -| cache_params.namespace | string | The namespace for the Redis cache. | -| cache_params.redis_startup_nodes | array of objects | Redis Cluster Settings. [Further docs](./caching) | -| cache_params.service_name | string | Redis Sentinel Settings. [Further docs](./caching) | -| cache_params.sentinel_nodes | array of arrays | Redis Sentinel Settings. [Further docs](./caching) | -| cache_params.ttl | integer | The time (in seconds) to store entries in cache. | -| cache_params.qdrant_semantic_cache_embedding_model | string | The embedding model to use for qdrant semantic cache. | -| cache_params.qdrant_collection_name | string | The name of the collection to use for qdrant semantic cache. 
| -| cache_params.qdrant_quantization_config | string | The quantization configuration for the qdrant semantic cache. | -| cache_params.similarity_threshold | float | The similarity threshold for the semantic cache. | -| cache_params.s3_bucket_name | string | The name of the S3 bucket to use for the semantic cache. | -| cache_params.s3_region_name | string | The region name for the S3 bucket. | -| cache_params.s3_aws_access_key_id | string | The AWS access key ID for the S3 bucket. | -| cache_params.s3_aws_secret_access_key | string | The AWS secret access key for the S3 bucket. | -| cache_params.s3_endpoint_url | string | Optional - The endpoint URL for the S3 bucket. | -| cache_params.supported_call_types | array of strings | The types of calls to cache. [Further docs](./caching) | -| cache_params.mode | string | The mode of the cache. [Further docs](./caching) | - -### general_settings - Reference - -| Name | Type | Description | -|------|------|-------------| -| completion_model | string | The default model to use for completions when `model` is not specified in the request | -| disable_spend_logs | boolean | If true, turns off writing each transaction to the database | -| disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) | -| disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached | -| disable_reset_budget | boolean | If true, turns off reset budget scheduled task | -| disable_adding_master_key_hash_to_db | boolean | If true, turns off storing master key hash in db | -| enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) | -| enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)| -| allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)| -| key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) | -| master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) | -| database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) | -| database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) | -| database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) | -| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key | -| custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) | -| max_parallel_requests | integer | The max parallel requests allowed per deployment | -| global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall | -| infer_model_from_keys | boolean | If true, infers the model from the provided keys | -| background_health_checks | boolean | If true, enables background health checks. 
[Doc on health checks](health) | -| health_check_interval | integer | The interval for health checks in seconds [Doc on health checks](health) | -| alerting | array of strings | List of alerting methods [Doc on Slack Alerting](alerting) | -| alerting_threshold | integer | The threshold for triggering alerts [Doc on Slack Alerting](alerting) | -| use_client_credentials_pass_through_routes | boolean | If true, uses client credentials for all pass-through routes. [Doc on pass through routes](pass_through) | -| health_check_details | boolean | If false, hides health check details (e.g. remaining rate limit). [Doc on health checks](health) | -| public_routes | List[str] | (Enterprise Feature) Control list of public routes | -| alert_types | List[str] | Control list of alert types to send to slack (Doc on alert types)[./alerting.md] | -| enforced_params | List[str] | (Enterprise Feature) List of params that must be included in all requests to the proxy | -| enable_oauth2_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication | -| use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address | -| service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] | -| image_generation_model | str | The default model to use for image generation - ignores model set in request | -| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) | -| max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. | -| max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. | -| proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. | -| proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. | -| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. | -| alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) | -| custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) | -| allowed_ips | List[str] | List of IPs allowed to access the proxy. If not set, all IPs are allowed. | -| embedding_model | str | The default model to use for embeddings - ignores model set in request | -| default_team_disabled | boolean | If true, users cannot create 'personal' keys (keys with no team_id). | -| alert_to_webhook_url | Dict[str] | [Specify a webhook url for each alert type.](./alerting.md#set-specific-slack-channels-per-alert-type) | -| key_management_settings | List[Dict[str, Any]] | Settings for key management system (e.g. AWS KMS, Azure Key Vault) [Doc on key management](../secret.md) | -| allow_user_auth | boolean | (Deprecated) old approach for user authentication. | -| user_api_key_cache_ttl | int | The time (in seconds) to cache user api keys in memory. | -| disable_prisma_schema_update | boolean | If true, turns off automatic schema updates to DB | -| litellm_key_header_name | str | If set, allows passing LiteLLM keys as a custom header. 
[Doc on custom headers](./virtual_keys.md#custom-headers) | -| moderation_model | str | The default model to use for moderation. | -| custom_sso | str | Path to a python file that implements custom SSO logic. [Doc on custom SSO](./custom_sso.md) | -| allow_client_side_credentials | boolean | If true, allows passing client side credentials to the proxy. (Useful when testing finetuning models) [Doc on client side credentials](./virtual_keys.md#client-side-credentials) | -| admin_only_routes | List[str] | (Enterprise Feature) List of routes that are only accessible to admin users. [Doc on admin only routes](./enterprise#control-available-public-private-routes) | -| use_azure_key_vault | boolean | If true, load keys from azure key vault | -| use_google_kms | boolean | If true, load keys from google kms | -| spend_report_frequency | str | Specify how often you want a Spend Report to be sent (e.g. "1d", "2d", "30d") [More on this](./alerting.md#spend-report-frequency) | -| ui_access_mode | Literal["admin_only"] | If set, restricts access to the UI to admin users only. [Docs](./ui.md#restrict-ui-access) | -| litellm_jwtauth | Dict[str, Any] | Settings for JWT authentication. [Docs](./token_auth.md) | -| litellm_license | str | The license key for the proxy. [Docs](../enterprise.md#how-does-deployment-with-enterprise-license-work) | -| oauth2_config_mappings | Dict[str, str] | Define the OAuth2 config mappings | -| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) | -| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication | -| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). | -| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call | - -### router_settings - Reference - -```yaml -router_settings: - routing_strategy: usage-based-routing-v2 # Literal["simple-shuffle", "least-busy", "usage-based-routing","latency-based-routing"], default="simple-shuffle" - redis_host: # string - redis_password: # string - redis_port: # string - enable_pre_call_check: true # bool - Before call is made check if a call is within model context window - allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. 
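  # Note (clarification, based on the reference table below): a deployment is cooled down for
  # `cooldown_time` seconds once its failures in the current minute exceed `allowed_fails`.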
- cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails - disable_cooldowns: True # bool - Disable cooldowns for all models - enable_tag_filtering: True # bool - Use tag based routing for requests - retry_policy: { # Dict[str, int]: retry policy for different types of exceptions - "AuthenticationErrorRetries": 3, - "TimeoutErrorRetries": 3, - "RateLimitErrorRetries": 3, - "ContentPolicyViolationErrorRetries": 4, - "InternalServerErrorRetries": 4 - } - allowed_fails_policy: { - "BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment - "AuthenticationErrorAllowedFails": 10, # int - "TimeoutErrorAllowedFails": 12, # int - "RateLimitErrorAllowedFails": 10000, # int - "ContentPolicyViolationErrorAllowedFails": 15, # int - "InternalServerErrorAllowedFails": 20, # int - } - content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations - fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors -``` - -| Name | Type | Description | -|------|------|-------------| -| routing_strategy | string | The strategy used for routing requests. Options: "simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing". Default is "simple-shuffle". [More information here](../routing) | -| redis_host | string | The host address for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** | -| redis_password | string | The password for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them** | -| redis_port | string | The port number for the Redis server. **Only set this if you have multiple instances of LiteLLM Proxy and want current tpm/rpm tracking to be shared across them**| -| enable_pre_call_check | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) | -| content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) | -| fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) | -| enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) | -| cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. | -| disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) | -| retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) | -| allowed_fails | integer | The number of failures allowed before cooling down a model. [More information here](reliability) | -| allowed_fails_policy | object | Specifies the number of allowed failures for different error types before cooling down a deployment. 
[More information here](reliability) | - - -### environment variables - Reference - -| Name | Description | -|------|-------------| -| ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions -| ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions -| AISPEND_ACCOUNT_ID | Account ID for AI Spend -| AISPEND_API_KEY | API Key for AI Spend -| ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access -| ARIZE_API_KEY | API key for Arize platform integration -| ARIZE_SPACE_KEY | Space key for Arize platform -| ARGILLA_BATCH_SIZE | Batch size for Argilla logging -| ARGILLA_API_KEY | API key for Argilla platform -| ARGILLA_SAMPLING_RATE | Sampling rate for Argilla logging -| ARGILLA_DATASET_NAME | Dataset name for Argilla logging -| ARGILLA_BASE_URL | Base URL for Argilla service -| ATHINA_API_KEY | API key for Athina service -| AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key) -| AWS_ACCESS_KEY_ID | Access Key ID for AWS services -| AWS_PROFILE_NAME | AWS CLI profile name to be used -| AWS_REGION_NAME | Default AWS region for service interactions -| AWS_ROLE_NAME | Role name for AWS IAM usage -| AWS_SECRET_ACCESS_KEY | Secret Access Key for AWS services -| AWS_SESSION_NAME | Name for AWS session -| AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS -| AZURE_API_VERSION | Version of the Azure API being used -| AZURE_AUTHORITY_HOST | Azure authority host URL -| AZURE_CLIENT_ID | Client ID for Azure services -| AZURE_CLIENT_SECRET | Client secret for Azure services -| AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token -| AZURE_KEY_VAULT_URI | URI for Azure Key Vault -| AZURE_TENANT_ID | Tenant ID for Azure Active Directory -| BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service -| BRAINTRUST_API_KEY | API key for Braintrust integration -| CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI -| CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI -| CONFIG_FILE_PATH | File path for configuration file -| CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache -| DATABASE_HOST | Hostname for the database server -| DATABASE_NAME | Name of the database -| DATABASE_PASSWORD | Password for the database user -| DATABASE_PORT | Port number for database connection -| DATABASE_SCHEMA | Schema name used in the database -| DATABASE_URL | Connection URL for the database -| DATABASE_USER | Username for database connection -| DATABASE_USERNAME | Alias for database user -| DATABRICKS_API_BASE | Base URL for Databricks API -| DD_BASE_URL | Base URL for Datadog integration -| DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration -| _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration -| DD_API_KEY | API key for Datadog integration -| DD_SITE | Site URL for Datadog (e.g., datadoghq.com) -| DD_SOURCE | Source identifier for Datadog logs -| DD_ENV | Environment identifier for Datadog logs. 
Only supported for `datadog_llm_observability` callback -| DEBUG_OTEL | Enable debug mode for OpenTelemetry -| DIRECT_URL | Direct URL for service endpoint -| DISABLE_ADMIN_UI | Toggle to disable the admin UI -| DISABLE_SCHEMA_UPDATE | Toggle to disable schema updates -| DOCS_DESCRIPTION | Description text for documentation pages -| DOCS_FILTERED | Flag indicating filtered documentation -| DOCS_TITLE | Title of the documentation pages -| EMAIL_SUPPORT_CONTACT | Support contact email address -| GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket -| GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file -| GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers -| GENERIC_CLIENT_ID | Client ID for generic OAuth providers -| GENERIC_CLIENT_SECRET | Client secret for generic OAuth providers -| GENERIC_CLIENT_STATE | State parameter for generic client authentication -| GENERIC_INCLUDE_CLIENT_ID | Include client ID in requests for OAuth -| GENERIC_SCOPE | Scope settings for generic OAuth providers -| GENERIC_TOKEN_ENDPOINT | Token endpoint for generic OAuth providers -| GENERIC_USER_DISPLAY_NAME_ATTRIBUTE | Attribute for user's display name in generic auth -| GENERIC_USER_EMAIL_ATTRIBUTE | Attribute for user's email in generic auth -| GENERIC_USER_FIRST_NAME_ATTRIBUTE | Attribute for user's first name in generic auth -| GENERIC_USER_ID_ATTRIBUTE | Attribute for user ID in generic auth -| GENERIC_USER_LAST_NAME_ATTRIBUTE | Attribute for user's last name in generic auth -| GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider -| GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role -| GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth -| GALILEO_BASE_URL | Base URL for Galileo platform -| GALILEO_PASSWORD | Password for Galileo authentication -| GALILEO_PROJECT_ID | Project ID for Galileo usage -| GALILEO_USERNAME | Username for Galileo authentication -| GREENSCALE_API_KEY | API key for Greenscale service -| GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service -| GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file -| GOOGLE_CLIENT_ID | Client ID for Google OAuth -| GOOGLE_CLIENT_SECRET | Client secret for Google OAuth -| GOOGLE_KMS_RESOURCE_NAME | Name of the resource in Google KMS -| HF_API_BASE | Base URL for Hugging Face API -| HELICONE_API_KEY | API key for Helicone service -| HUGGINGFACE_API_BASE | Base URL for Hugging Face API -| IAM_TOKEN_DB_AUTH | IAM token for database authentication -| JSON_LOGS | Enable JSON formatted logging -| JWT_AUDIENCE | Expected audience for JWT tokens -| JWT_PUBLIC_KEY_URL | URL to fetch public key for JWT verification -| LAGO_API_BASE | Base URL for Lago API -| LAGO_API_CHARGE_BY | Parameter to determine charge basis in Lago -| LAGO_API_EVENT_CODE | Event code for Lago API events -| LAGO_API_KEY | API key for accessing Lago services -| LANGFUSE_DEBUG | Toggle debug mode for Langfuse -| LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs -| LANGFUSE_HOST | Host URL for Langfuse service -| LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication -| LANGFUSE_RELEASE | Release version of Langfuse integration -| LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication -| LANGSMITH_API_KEY | API key for Langsmith platform -| LANGSMITH_BASE_URL | Base URL for Langsmith service -| LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith -| LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run 
-| LANGSMITH_PROJECT | Project name for Langsmith integration -| LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging -| LANGTRACE_API_KEY | API key for Langtrace service -| LITERAL_API_KEY | API key for Literal integration -| LITERAL_API_URL | API URL for Literal service -| LITERAL_BATCH_SIZE | Batch size for Literal operations -| LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI -| LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests -| LITELLM_EMAIL | Email associated with LiteLLM account -| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM -| LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM -| LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM -| LITELLM_LICENSE | License key for LiteLLM usage -| LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM -| LITELLM_LOG | Enable detailed logging for LiteLLM -| LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development) -| LITELLM_SALT_KEY | Salt key for encryption in LiteLLM -| LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM -| LITELLM_TOKEN | Access token for LiteLLM integration -| LOGFIRE_TOKEN | Token for Logfire logging service -| MICROSOFT_CLIENT_ID | Client ID for Microsoft services -| MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services -| MICROSOFT_TENANT | Tenant ID for Microsoft Azure -| NO_DOCS | Flag to disable documentation generation -| NO_PROXY | List of addresses to bypass proxy -| OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval -| OPENAI_API_BASE | Base URL for OpenAI API -| OPENAI_API_KEY | API key for OpenAI services -| OPENAI_ORGANIZATION | Organization identifier for OpenAI -| OPENID_BASE_URL | Base URL for OpenID Connect services -| OPENID_CLIENT_ID | Client ID for OpenID Connect authentication -| OPENID_CLIENT_SECRET | Client secret for OpenID Connect authentication -| OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration -| OPENMETER_API_KEY | API key for OpenMeter services -| OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter -| OTEL_ENDPOINT | OpenTelemetry endpoint for traces -| OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry -| OTEL_EXPORTER | Exporter type for OpenTelemetry -| OTEL_HEADERS | Headers for OpenTelemetry requests -| OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry -| OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing -| PREDIBASE_API_BASE | Base URL for Predibase API -| PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service -| PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service -| PROMETHEUS_URL | URL for Prometheus service -| PROMPTLAYER_API_KEY | API key for PromptLayer integration -| PROXY_ADMIN_ID | Admin identifier for proxy server -| PROXY_BASE_URL | Base URL for proxy service -| PROXY_LOGOUT_URL | URL for logging out of the proxy service -| PROXY_MASTER_KEY | Master key for proxy authentication -| QDRANT_API_BASE | Base URL for Qdrant API -| QDRANT_API_KEY | API key for Qdrant service -| QDRANT_URL | Connection URL for Qdrant database -| REDIS_HOST | Hostname for Redis server -| REDIS_PASSWORD | Password for Redis service -| REDIS_PORT | Port number for Redis server -| SERVER_ROOT_PATH | Root path for the server application -| SET_VERBOSE | Flag to enable verbose logging -| SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly) -| 
SLACK_WEBHOOK_URL | Webhook URL for Slack integration -| SMTP_HOST | Hostname for the SMTP server -| SMTP_PASSWORD | Password for SMTP authentication -| SMTP_PORT | Port number for SMTP server -| SMTP_SENDER_EMAIL | Email address used as the sender in SMTP transactions -| SMTP_SENDER_LOGO | Logo used in emails sent via SMTP -| SMTP_TLS | Flag to enable or disable TLS for SMTP connections -| SMTP_USERNAME | Username for SMTP authentication -| SPEND_LOGS_URL | URL for retrieving spend logs -| SSL_CERTIFICATE | Path to the SSL certificate file -| SSL_VERIFY | Flag to enable or disable SSL certificate verification -| SUPABASE_KEY | API key for Supabase service -| SUPABASE_URL | Base URL for Supabase instance -| TEST_EMAIL_ADDRESS | Email address used for testing purposes -| UI_LOGO_PATH | Path to the logo image used in the UI -| UI_PASSWORD | Password for accessing the UI -| UI_USERNAME | Username for accessing the UI -| UPSTREAM_LANGFUSE_DEBUG | Flag to enable debugging for upstream Langfuse -| UPSTREAM_LANGFUSE_HOST | Host URL for upstream Langfuse service -| UPSTREAM_LANGFUSE_PUBLIC_KEY | Public key for upstream Langfuse authentication -| UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse -| UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication -| USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption -| WEBHOOK_URL | URL for receiving webhooks from external services ## Extras diff --git a/docs/my-website/docs/proxy/db_info.md b/docs/my-website/docs/proxy/db_info.md new file mode 100644 index 000000000..8429f6360 --- /dev/null +++ b/docs/my-website/docs/proxy/db_info.md @@ -0,0 +1,71 @@ +# What is stored in the DB + +The LiteLLM Proxy uses a PostgreSQL database to store various information. Here's are the main features the DB is used for: +- Virtual Keys, Organizations, Teams, Users, Budgets, and more. +- Per request Usage Tracking + +## Link to DB Schema + +You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/main/schema.prisma) + +## DB Tables + +### Organizations, Teams, Users, End Users + +| Table Name | Description | Row Insert Frequency | +|------------|-------------|---------------------| +| LiteLLM_OrganizationTable | Manages organization-level configurations. Tracks organization spend, model access, and metadata. Links to budget configurations and teams. | Low | +| LiteLLM_TeamTable | Handles team-level settings within organizations. Manages team members, admins, and their roles. Controls team-specific budgets, rate limits, and model access. | Low | +| LiteLLM_UserTable | Stores user information and their settings. Tracks individual user spend, model access, and rate limits. Manages user roles and team memberships. | Low | +| LiteLLM_EndUserTable | Manages end-user configurations. Controls model access and regional requirements. Tracks end-user spend. | Low | +| LiteLLM_TeamMembership | Tracks user participation in teams. Manages team-specific user budgets and spend. | Low | +| LiteLLM_OrganizationMembership | Manages user roles within organizations. Tracks organization-specific user permissions and spend. | Low | +| LiteLLM_InvitationLink | Handles user invitations. Manages invitation status and expiration. Tracks who created and accepted invitations. | Low | +| LiteLLM_UserNotifications | Handles model access requests. Tracks user requests for model access. Manages approval status. 
| Low | + +### Authentication + +| Table Name | Description | Row Insert Frequency | +|------------|-------------|---------------------| +| LiteLLM_VerificationToken | Manages Virtual Keys and their permissions. Controls token-specific budgets, rate limits, and model access. Tracks key-specific spend and metadata. | **Medium** - stores all Virtual Keys | + +### Model (LLM) Management + +| Table Name | Description | Row Insert Frequency | +|------------|-------------|---------------------| +| LiteLLM_ProxyModelTable | Stores model configurations. Defines available models and their parameters. Contains model-specific information and settings. | Low - Configuration only | + +### Budget Management + +| Table Name | Description | Row Insert Frequency | +|------------|-------------|---------------------| +| LiteLLM_BudgetTable | Stores budget and rate limit configurations for organizations, keys, and end users. Tracks max budgets, soft budgets, TPM/RPM limits, and model-specific budgets. Handles budget duration and reset timing. | Low - Configuration only | + + +### Tracking & Logging + +| Table Name | Description | Row Insert Frequency | +|------------|-------------|---------------------| +| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request** | +| LiteLLM_ErrorLogs | Captures failed requests and errors. Stores exception details and request information. Helps with debugging and monitoring. | **Medium - on errors only** | +| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** | + +## Disable `LiteLLM_SpendLogs` & `LiteLLM_ErrorLogs` + +You can disable spend_logs and error_logs by setting `disable_spend_logs` and `disable_error_logs` to `True` on the `general_settings` section of your proxy_config.yaml file. + +```yaml +general_settings: + disable_spend_logs: True # Disable writing spend logs to DB + disable_error_logs: True # Disable writing error logs to DB +``` + +### What is the impact of disabling these logs? + +When disabling spend logs (`disable_spend_logs: True`): +- You **will not** be able to view Usage on the LiteLLM UI +- You **will** continue seeing cost metrics on s3, Prometheus, Langfuse (any other Logging integration you are using) + +When disabling error logs (`disable_error_logs: True`): +- You **will not** be able to view Errors on the LiteLLM UI +- You **will** continue seeing error logs in your application logs and any other logging integrations you are using diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 0287af2a2..ea8df446e 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Image from '@theme/IdealImage'; -# 🐳 Docker, Deployment +# Docker, Deployment You can find the Dockerfile to build litellm proxy [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile) @@ -688,8 +688,35 @@ Provide an ssl certificate when starting litellm proxy server Use this if you want to run the proxy with hypercorn to support http/2 -**Usage** -Pass the `--run_hypercorn` flag when starting the proxy +Step 1. 
Build your custom docker image with hypercorn + +```shell +# Use the provided base image +FROM ghcr.io/berriai/litellm:main-latest + +# Set the working directory to /app +WORKDIR /app + +# Copy the configuration file into the container at /app +COPY config.yaml . + +# Make sure your docker/entrypoint.sh is executable +RUN chmod +x ./docker/entrypoint.sh + +# Expose the necessary port +EXPOSE 4000/tcp + +# 👉 Key Change: Install hypercorn +RUN pip install hypercorn + +# Override the CMD instruction with your desired command and arguments +# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD +# CMD ["--port", "4000", "--config", "config.yaml"] + +CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"] +``` + +Step 2. Pass the `--run_hypercorn` flag when starting the proxy ```shell docker run \ @@ -699,7 +726,7 @@ docker run \ -e SERVER_ROOT_PATH="/api/v1"\ -e DATABASE_URL=postgresql://:@:/ \ -e LITELLM_MASTER_KEY="sk-1234"\ - ghcr.io/berriai/litellm:main-latest \ + your_custom_docker_image \ --config /app/config.yaml --run_hypercorn ``` diff --git a/docs/my-website/docs/proxy/docker_quick_start.md b/docs/my-website/docs/proxy/docker_quick_start.md index 37b251e5a..1343f47b1 100644 --- a/docs/my-website/docs/proxy/docker_quick_start.md +++ b/docs/my-website/docs/proxy/docker_quick_start.md @@ -1,3 +1,7 @@ + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Getting Started - E2E Tutorial End-to-End tutorial for LiteLLM Proxy to: @@ -9,7 +13,11 @@ End-to-End tutorial for LiteLLM Proxy to: ## Pre-Requisites -- Install LiteLLM Docker Image +- Install LiteLLM Docker Image ** OR ** LiteLLM CLI (pip package) + + + + ``` docker pull ghcr.io/berriai/litellm:main-latest @@ -17,6 +25,18 @@ docker pull ghcr.io/berriai/litellm:main-latest [**See all docker images**](https://github.com/orgs/BerriAI/packages) + + + + +```shell +$ pip install 'litellm[proxy]' +``` + + + + + ## 1. Add a model Control LiteLLM Proxy with a config.yaml file. @@ -58,6 +78,11 @@ LiteLLM Proxy is 100% OpenAI-compatible. Test your azure model via the `/chat/co Save your config.yaml from step 1. as `litellm_config.yaml`. + + + + + ```bash docker run \ -v $(pwd)/litellm_config.yaml:/app/config.yaml \ @@ -70,6 +95,20 @@ docker run \ # RUNNING on http://0.0.0.0:4000 ``` + + + + +```shell +$ litellm --config /app/config.yaml --detailed_debug +``` + + + + + + + Confirm your config.yaml got mounted correctly ```bash diff --git a/docs/my-website/docs/proxy/ip_address.md b/docs/my-website/docs/proxy/ip_address.md index 31ffd98a4..80d5561da 100644 --- a/docs/my-website/docs/proxy/ip_address.md +++ b/docs/my-website/docs/proxy/ip_address.md @@ -1,5 +1,5 @@ -# ✨ IP Address Filtering +# IP Address Filtering :::info diff --git a/docs/my-website/docs/proxy/load_balancing.md b/docs/my-website/docs/proxy/load_balancing.md index 20b803777..dc5724066 100644 --- a/docs/my-website/docs/proxy/load_balancing.md +++ b/docs/my-website/docs/proxy/load_balancing.md @@ -1,4 +1,4 @@ -# Multiple Instances +# Proxy - Load Balancing Load balance multiple instances of the same model The proxy will handle routing requests (using LiteLLM's Router). 
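As a sketch (assuming two Azure deployments of the same model; the deployment names, endpoints, and key variables below are placeholders), load balancing is configured by listing multiple deployments under the same `model_name`:

```yaml
model_list:
  - model_name: gpt-3.5-turbo             # both entries share one alias, so the router balances across them
    litellm_params:
      model: azure/<deployment-1>         # placeholder deployment
      api_base: https://<endpoint-1>.openai.azure.com/
      api_key: os.environ/AZURE_API_KEY_1
      rpm: 6                              # requests-per-minute budget used by the router
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: azure/<deployment-2>         # placeholder deployment
      api_base: https://<endpoint-2>.openai.azure.com/
      api_key: os.environ/AZURE_API_KEY_2
      rpm: 6
```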
**Set `rpm` in the config if you want maximize throughput** diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index 94faa7734..c9c16ac46 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -4,6 +4,7 @@ Log Proxy input, output, and exceptions using: - Langfuse - OpenTelemetry +- GCS and s3 Buckets - Custom Callbacks - Langsmith - DataDog @@ -47,7 +48,19 @@ A number of these headers could be useful for troubleshooting, but the `x-litellm-call-id` is the one that is most useful for tracking a request across components in your system, including in logging tools. -## Redacting UserAPIKeyInfo + +## Logging Features + +### Conditional Logging by Virtual Keys, Teams + +Use this to: +1. Conditionally enable logging for some virtual keys/teams +2. Set different logging providers for different virtual keys/teams + +[👉 **Get Started** - Team/Key Based Logging](team_logging) + + +### Redacting UserAPIKeyInfo Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. @@ -59,17 +72,58 @@ litellm_settings: redact_user_api_key_info: true ``` + +### Redact Messages, Response Content + +Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to your logging provider, but request metadata will still be logged. + + +Example config.yaml +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo +litellm_settings: + success_callback: ["langfuse"] + turn_off_message_logging: True # 👈 Key Change +``` + +If you have this feature turned on, you can override it for specific requests by +setting a request header `LiteLLM-Disable-Message-Redaction: true`. + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'LiteLLM-Disable-Message-Redaction: true' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] +}' +``` + Removes any field with `user_api_key_*` from metadata. -## What gets logged? StandardLoggingPayload +## What gets logged? Found under `kwargs["standard_logging_object"]`. This is a standard payload, logged for every response. ```python + class StandardLoggingPayload(TypedDict): id: str + trace_id: str # Trace multiple LLM calls belonging to same overall request (e.g. 
fallbacks/retries) call_type: str response_cost: float + response_cost_failure_debug_info: Optional[ + StandardLoggingModelCostFailureDebugInformation + ] + status: StandardLoggingPayloadStatus total_tokens: int prompt_tokens: int completion_tokens: int @@ -84,13 +138,13 @@ class StandardLoggingPayload(TypedDict): metadata: StandardLoggingMetadata cache_hit: Optional[bool] cache_key: Optional[str] - saved_cache_cost: Optional[float] - request_tags: list + saved_cache_cost: float + request_tags: list end_user: Optional[str] - requester_ip_address: Optional[str] # IP address of requester - requester_metadata: Optional[dict] # metadata passed in request in the "metadata" field + requester_ip_address: Optional[str] messages: Optional[Union[str, list, dict]] response: Optional[Union[str, list, dict]] + error_str: Optional[str] model_parameters: dict hidden_params: StandardLoggingHiddenParams @@ -99,15 +153,51 @@ class StandardLoggingHiddenParams(TypedDict): cache_key: Optional[str] api_base: Optional[str] response_cost: Optional[str] - additional_headers: Optional[dict] + additional_headers: Optional[StandardLoggingAdditionalHeaders] +class StandardLoggingAdditionalHeaders(TypedDict, total=False): + x_ratelimit_limit_requests: int + x_ratelimit_limit_tokens: int + x_ratelimit_remaining_requests: int + x_ratelimit_remaining_tokens: int + +class StandardLoggingMetadata(StandardLoggingUserAPIKeyMetadata): + """ + Specific metadata k,v pairs logged to integration for easier cost tracking + """ + + spend_logs_metadata: Optional[ + dict + ] # special param to log k,v pairs to spendlogs for a call + requester_ip_address: Optional[str] + requester_metadata: Optional[dict] class StandardLoggingModelInformation(TypedDict): model_map_key: str model_map_value: Optional[ModelInfo] + + +StandardLoggingPayloadStatus = Literal["success", "failure"] + +class StandardLoggingModelCostFailureDebugInformation(TypedDict, total=False): + """ + Debug information, if cost tracking fails. + + Avoid logging sensitive information like response or optional params + """ + + error_str: Required[str] + traceback_str: Required[str] + model: str + cache_hit: Optional[bool] + custom_llm_provider: Optional[str] + base_model: Optional[str] + call_type: str + custom_pricing: Optional[bool] ``` -## Logging Proxy Input/Output - Langfuse + +## Langfuse We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment @@ -259,73 +349,8 @@ print(response) -### Team based Logging to Langfuse -[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging) - - -### Redacting Messages, Response Content from Langfuse Logging - -Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged. - -```yaml -model_list: - - model_name: gpt-3.5-turbo - litellm_params: - model: gpt-3.5-turbo -litellm_settings: - success_callback: ["langfuse"] - turn_off_message_logging: True -``` - -If you have this feature turned on, you can override it for specific requests by -setting a request header `LiteLLM-Disable-Message-Redaction: true`. 
- -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Content-Type: application/json' \ - --header 'LiteLLM-Disable-Message-Redaction: true' \ - --data '{ - "model": "gpt-3.5-turbo", - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ] -}' -``` - - -### LiteLLM-specific Tags on Langfuse - `cache_hit`, `cache_key` +### LiteLLM Tags - `cache_hit`, `cache_key` Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields @@ -360,7 +385,7 @@ litellm_settings: langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] ``` -### 🔧 Debugging - Viewing RAW CURL sent from LiteLLM to provider +### View POST sent from LiteLLM to provider Use this when you want to view the RAW curl request sent from LiteLLM to the LLM API @@ -463,7 +488,7 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma -## Logging Proxy Input/Output in OpenTelemetry format +## OpenTelemetry :::info @@ -745,7 +770,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ ** 🎉 Expect to see this trace logged in your OTEL collector** -### Redacting Messages, Response Content from OTEL Logging +### Redacting Messages, Response Content Set `message_logging=False` for `otel`, no messages / response will be logged @@ -759,7 +784,8 @@ callback_settings: message_logging: False ``` -### Context propagation across Services `Traceparent HTTP Header` +### Traceparent Header +##### Context propagation across Services `Traceparent HTTP Header` ❓ Use this when you want to **pass information about the incoming request in a distributed tracing system** @@ -809,7 +835,7 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector -### Forwarding `Traceparent HTTP Header` to LLM APIs +##### Forwarding `Traceparent HTTP Header` to LLM APIs Use this if you want to forward the traceparent headers to your self hosted LLMs like vLLM @@ -826,6 +852,151 @@ litellm_settings: forward_traceparent_to_llm_provider: True ``` +## Google Cloud Storage Buckets + +Log LLM Logs to [Google Cloud Storage Buckets](https://cloud.google.com/storage?hl=en) + +:::info + +✨ This is an Enterprise only feature [Get Started with Enterprise here](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) + +::: + + +| Property | Details | +|----------|---------| +| Description | Log LLM Input/Output to cloud storage buckets | +| Load Test Benchmarks | [Benchmarks](https://docs.litellm.ai/docs/benchmarks) | +| Google Docs on Cloud Storage | [Google Cloud Storage](https://cloud.google.com/storage?hl=en) | + + + +#### Usage + +1. Add `gcs_bucket` to LiteLLM Config.yaml +```yaml +model_list: +- litellm_params: + api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/ + api_key: my-fake-key + model: openai/my-fake-model + model_name: fake-openai-endpoint + +litellm_settings: + callbacks: ["gcs_bucket"] # 👈 KEY CHANGE # 👈 KEY CHANGE +``` + +2. Set required env variables + +```shell +GCS_BUCKET_NAME="" +GCS_PATH_SERVICE_ACCOUNT="/Users/ishaanjaffer/Downloads/adroit-crow-413218-a956eef1a2a8.json" # Add path to service account.json +``` + +3. Start Proxy + +``` +litellm --config /path/to/config.yaml +``` + +4. Test it! 
+ +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data ' { + "model": "fake-openai-endpoint", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + } +' +``` + + +#### Expected Logs on GCS Buckets + + + +#### Fields Logged on GCS Buckets + +[**The standard logging object is logged on GCS Bucket**](../proxy/logging) + + +#### Getting `service_account.json` from Google Cloud Console + +1. Go to [Google Cloud Console](https://console.cloud.google.com/) +2. Search for IAM & Admin +3. Click on Service Accounts +4. Select a Service Account +5. Click on 'Keys' -> Add Key -> Create New Key -> JSON +6. Save the JSON file and add the path to `GCS_PATH_SERVICE_ACCOUNT` + + +## s3 Buckets + +We will use the `--config` to set + +- `litellm.success_callback = ["s3"]` + +This will log all successfull LLM calls to s3 Bucket + +**Step 1** Set AWS Credentials in .env + +```shell +AWS_ACCESS_KEY_ID = "" +AWS_SECRET_ACCESS_KEY = "" +AWS_REGION_NAME = "" +``` + +**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo +litellm_settings: + success_callback: ["s3"] + s3_callback_params: + s3_bucket_name: logs-bucket-litellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 + s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to + s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets +``` + +**Step 3**: Start the proxy, make a test request + +Start proxy + +```shell +litellm --config config.yaml --debug +``` + +Test Request + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data ' { + "model": "Azure OpenAI GPT-4 East", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + }' +``` + +Your logs should be available on the specified s3 Bucket + + ## Custom Callback Class [Async] Use this when you want to run custom callbacks in `python` @@ -1054,7 +1225,7 @@ class MyCustomHandler(CustomLogger): {'mode': 'embedding', 'input_cost_per_token': 0.002} ``` -### Logging responses from proxy +##### Logging responses from proxy Both `/chat/completions` and `/embeddings` responses are available as `response_obj` @@ -1216,7 +1387,7 @@ litellm_settings: Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API -## Logging LLM IO to Langsmith +## Langsmith 1. Set `success_callback: ["langsmith"]` on litellm config.yaml @@ -1261,7 +1432,7 @@ Expect to see your log on Langfuse -## Logging LLM IO to Arize AI +## Arize AI 1. Set `success_callback: ["arize"]` on litellm config.yaml @@ -1309,7 +1480,7 @@ Expect to see your log on Langfuse -## Logging LLM IO to Langtrace +## Langtrace 1. Set `success_callback: ["langtrace"]` on litellm config.yaml @@ -1351,7 +1522,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ ' ``` -## Logging LLM IO to Galileo +## Galileo [BETA] @@ -1372,7 +1543,7 @@ export GALILEO_USERNAME="" export GALILEO_PASSWORD="" ``` -### Quick Start +#### Quick Start 1. 
Add to Config.yaml @@ -1413,7 +1584,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ 🎉 That's it - Expect to see your Logs on your Galileo Dashboard -## Logging Proxy Cost + Usage - OpenMeter +## OpenMeter Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md) @@ -1425,7 +1596,7 @@ export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud export OPENMETER_API_KEY="" ``` -### Quick Start +##### Quick Start 1. Add to Config.yaml @@ -1466,7 +1637,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ -## Logging Proxy Input/Output - DataDog +## DataDog LiteLLM Supports logging to the following Datdog Integrations: - `datadog` [Datadog Logs](https://docs.datadoghq.com/logs/) @@ -1543,7 +1714,7 @@ Expected output on Datadog -## Logging Proxy Input/Output - DynamoDB +## DynamoDB We will use the `--config` to set @@ -1669,7 +1840,7 @@ Your logs should be available on DynamoDB } ``` -## Logging Proxy Input/Output - Sentry +## Sentry If api calls fail (llm/database) you can log those to Sentry: @@ -1711,7 +1882,7 @@ Test Request litellm --test ``` -## Logging Proxy Input/Output Athina +## Athina [Athina](https://athina.ai/) allows you to log LLM Input/Output for monitoring, analytics, and observability. @@ -1758,7 +1929,10 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ }' ``` -## (BETA) Moderation with Azure Content Safety + + \ No newline at end of file diff --git a/docs/my-website/docs/proxy/pass_through.md b/docs/my-website/docs/proxy/pass_through.md index bad23f0de..7ae8ba7c9 100644 --- a/docs/my-website/docs/proxy/pass_through.md +++ b/docs/my-website/docs/proxy/pass_through.md @@ -1,6 +1,6 @@ import Image from '@theme/IdealImage'; -# ➡️ Create Pass Through Endpoints +# Create Pass Through Endpoints Add pass through routes to LiteLLM Proxy diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md index 66c719e5d..9dacedaab 100644 --- a/docs/my-website/docs/proxy/prod.md +++ b/docs/my-website/docs/proxy/prod.md @@ -1,5 +1,6 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +import Image from '@theme/IdealImage'; # ⚡ Best Practices for Production @@ -22,6 +23,7 @@ general_settings: # OPTIONAL Best Practices disable_spend_logs: True # turn off writing each transaction to the db. We recommend doing this is you don't need to see Usage on the LiteLLM UI and are tracking metrics via Prometheus + disable_error_logs: True # turn off writing LLM Exceptions to DB allow_requests_on_db_unavailable: True # Only USE when running LiteLLM on your VPC. Allow requests to still be processed even if the DB is unavailable. We recommend doing this if you're running LiteLLM on VPC that cannot be accessed from the public internet. litellm_settings: @@ -101,18 +103,51 @@ general_settings: allow_requests_on_db_unavailable: True ``` -## 6. Disable spend_logs if you're not using the LiteLLM UI +## 6. Disable spend_logs & error_logs if not using the LiteLLM UI -By default LiteLLM will write every request to the `LiteLLM_SpendLogs` table. This is used for viewing Usage on the LiteLLM UI. +By default, LiteLLM writes several types of logs to the database: +- Every LLM API request to the `LiteLLM_SpendLogs` table +- LLM Exceptions to the `LiteLLM_LogsErrors` table -If you're not viewing Usage on the LiteLLM UI (most users use Prometheus when this is disabled), you can disable spend_logs by setting `disable_spend_logs` to `True`. 
+If you're not viewing these logs on the LiteLLM UI (most users use Prometheus for monitoring), you can disable them by setting the following flags to `True`: ```yaml general_settings: - disable_spend_logs: True + disable_spend_logs: True # Disable writing spend logs to DB + disable_error_logs: True # Disable writing error logs to DB ``` -## 7. Set LiteLLM Salt Key +[More information about what the Database is used for here](db_info) + +## 7. Use Helm PreSync Hook for Database Migrations [BETA] + +To ensure only one service manages database migrations, use our [Helm PreSync hook for Database Migrations](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm-helm/templates/migrations-job.yaml). This ensures migrations are handled during `helm upgrade` or `helm install`, while LiteLLM pods explicitly disable migrations. + + +1. **Helm PreSync Hook**: + - The Helm PreSync hook is configured in the chart to run database migrations during deployments. + - The hook always sets `DISABLE_SCHEMA_UPDATE=false`, ensuring migrations are executed reliably. + + Reference Settings to set on ArgoCD for `values.yaml` + + ```yaml + db: + useExisting: true # use existing Postgres DB + url: postgresql://ishaanjaffer0324:3rnwpOBau6hT@ep-withered-mud-a5dkdpke.us-east-2.aws.neon.tech/test-argo-cd?sslmode=require # url of existing Postgres DB + ``` + +2. **LiteLLM Pods**: + - Set `DISABLE_SCHEMA_UPDATE=true` in LiteLLM pod configurations to prevent them from running migrations. + + Example configuration for LiteLLM pod: + ```yaml + env: + - name: DISABLE_SCHEMA_UPDATE + value: "true" + ``` + + +## 8. Set LiteLLM Salt Key If you plan on using the DB, set a salt key for encrypting/decrypting variables in the DB. diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 58dc3dae3..f19101b36 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -192,3 +192,13 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das |----------------------|--------------------------------------| | `litellm_llm_api_failed_requests_metric` | **deprecated** use `litellm_proxy_failed_requests_metric` | | `litellm_requests_metric` | **deprecated** use `litellm_proxy_total_requests_metric` | + + +## FAQ + +### What are `_created` vs. `_total` metrics? + +- `_created` metrics are metrics that are created when the proxy starts +- `_total` metrics are metrics that are incremented for each request + +You should consume the `_total` metrics for your counting purposes \ No newline at end of file diff --git a/docs/my-website/docs/proxy/provider_budget_routing.md b/docs/my-website/docs/proxy/provider_budget_routing.md new file mode 100644 index 000000000..1cb75d667 --- /dev/null +++ b/docs/my-website/docs/proxy/provider_budget_routing.md @@ -0,0 +1,191 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Provider Budget Routing +Use this to set budgets for LLM Providers - example $100/day for OpenAI, $100/day for Azure. 
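Once a provider crosses its budget, the proxy rejects requests routed to it with a `429` (see the expected failure response in the Quick Start below). A minimal sketch of handling that from an OpenAI-SDK client — `http://localhost:4000` and the `sk-1234` master key are the placeholder values used throughout these docs:

```python
import openai

# Point the OpenAI SDK at the LiteLLM proxy
client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")

try:
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi, my name is test request"}],
    )
    print(resp.choices[0].message.content)
except openai.RateLimitError as e:
    # Once the provider's budget is exhausted the proxy returns a 429 with
    # "crossed budget for provider: ..."; the OpenAI SDK raises RateLimitError for it.
    print("Provider budget exceeded:", e)
```
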
+ +## Quick Start + +Set provider budgets in your `proxy_config.yaml` file +### Proxy Config setup +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: os.environ/OPENAI_API_KEY + +router_settings: + provider_budget_config: + openai: + budget_limit: 0.000000000001 # float of $ value budget for time period + time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo + azure: + budget_limit: 100 + time_period: 1d + anthropic: + budget_limit: 100 + time_period: 10d + vertex_ai: + budget_limit: 100 + time_period: 12d + gemini: + budget_limit: 100 + time_period: 12d + + # OPTIONAL: Set Redis Host, Port, and Password if using multiple instance of LiteLLM + redis_host: os.environ/REDIS_HOST + redis_port: os.environ/REDIS_PORT + redis_password: os.environ/REDIS_PASSWORD + +general_settings: + master_key: sk-1234 +``` + +### Make a test request + +We expect the first request to succeed, and the second request to fail since we cross the budget for `openai` + + +**[Langchain, OpenAI SDK Usage Examples](../proxy/user_keys#request-format)** + + + + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4o", + "messages": [ + {"role": "user", "content": "hi my name is test request"} + ] + }' +``` + + + + +Expect this to fail since since `ishaan@berri.ai` in the request is PII + +```shell +curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4o", + "messages": [ + {"role": "user", "content": "hi my name is test request"} + ] + }' +``` + +Expected response on failure + +```json +{ + "error": { + "message": "No deployments available - crossed budget for provider: Exceeded budget for provider openai: 0.0007350000000000001 >= 1e-12", + "type": "None", + "param": "None", + "code": "429" + } +} +``` + + + + + + + + +## How provider budget routing works + +1. **Budget Tracking**: + - Uses Redis to track spend for each provider + - Tracks spend over specified time periods (e.g., "1d", "30d") + - Automatically resets spend after time period expires + +2. **Routing Logic**: + - Routes requests to providers under their budget limits + - Skips providers that have exceeded their budget + - If all providers exceed budget, raises an error + +3. **Supported Time Periods**: + - Seconds: "Xs" (e.g., "30s") + - Minutes: "Xm" (e.g., "10m") + - Hours: "Xh" (e.g., "24h") + - Days: "Xd" (e.g., "1d", "30d") + - Months: "Xmo" (e.g., "1mo", "2mo") + +4. **Requirements**: + - Redis required for tracking spend across instances + - Provider names must be litellm provider names. See [Supported Providers](https://docs.litellm.ai/docs/providers) + +## Monitoring Provider Remaining Budget + +LiteLLM will emit the following metric on Prometheus to track the remaining budget for each provider + +This metric indicates the remaining budget for a provider in dollars (USD) + +``` +litellm_provider_remaining_budget_metric{api_provider="openai"} 10 +``` + +## Multi-instance setup + +If you are using a multi-instance setup, you will need to set the Redis host, port, and password in the `proxy_config.yaml` file. Redis is used to sync the spend across LiteLLM instances. 
+ +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: os.environ/OPENAI_API_KEY + +router_settings: + provider_budget_config: + openai: + budget_limit: 0.000000000001 # float of $ value budget for time period + time_period: 1d # can be 1d, 2d, 30d, 1mo, 2mo + + # 👇 Add this: Set Redis Host, Port, and Password if using multiple instance of LiteLLM + redis_host: os.environ/REDIS_HOST + redis_port: os.environ/REDIS_PORT + redis_password: os.environ/REDIS_PASSWORD + +general_settings: + master_key: sk-1234 +``` + +## Spec for provider_budget_config + +The `provider_budget_config` is a dictionary where: +- **Key**: Provider name (string) - Must be a valid [LiteLLM provider name](https://docs.litellm.ai/docs/providers) +- **Value**: Budget configuration object with the following parameters: + - `budget_limit`: Float value representing the budget in USD + - `time_period`: Duration string in one of the following formats: + - Seconds: `"Xs"` (e.g., "30s") + - Minutes: `"Xm"` (e.g., "10m") + - Hours: `"Xh"` (e.g., "24h") + - Days: `"Xd"` (e.g., "1d", "30d") + - Months: `"Xmo"` (e.g., "1mo", "2mo") + +Example structure: +```yaml +provider_budget_config: + openai: + budget_limit: 100.0 # $100 USD + time_period: "1d" # 1 day period + azure: + budget_limit: 500.0 # $500 USD + time_period: "30d" # 30 day period + anthropic: + budget_limit: 200.0 # $200 USD + time_period: "1mo" # 1 month period + gemini: + budget_limit: 50.0 # $50 USD + time_period: "24h" # 24 hour period +``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/reliability.md b/docs/my-website/docs/proxy/reliability.md index 9a3ba4ec6..1e6d0e26c 100644 --- a/docs/my-website/docs/proxy/reliability.md +++ b/docs/my-website/docs/proxy/reliability.md @@ -2,7 +2,7 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Fallbacks, Load Balancing, Retries +# Proxy - Fallbacks, Retries - Quick Start [load balancing](#test---load-balancing) - Quick Start [client side fallbacks](#test---client-side-fallbacks) @@ -748,4 +748,19 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ "max_tokens": 300, "mock_testing_fallbacks": true }' +``` + +### Disable Fallbacks per key + +You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata. + +```bash +curl -L -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{ + "metadata": { + "disable_fallbacks": true + } +}' ``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/self_serve.md b/docs/my-website/docs/proxy/self_serve.md index e04aa4b44..494d9e60d 100644 --- a/docs/my-website/docs/proxy/self_serve.md +++ b/docs/my-website/docs/proxy/self_serve.md @@ -217,4 +217,10 @@ litellm_settings: max_parallel_requests: 1000 # (Optional[int], optional): Max number of requests that can be made in parallel. Defaults to None. tpm_limit: 1000 #(Optional[int], optional): Tpm limit. Defaults to None. rpm_limit: 1000 #(Optional[int], optional): Rpm limit. Defaults to None. -``` \ No newline at end of file + + key_generation_settings: # Restricts who can generate keys. 
[Further docs](./virtual_keys.md#restricting-key-generation) + team_key_generation: + allowed_team_member_roles: ["admin"] + personal_key_generation: # maps to 'Default Team' on UI + allowed_user_roles: ["proxy_admin"] +``` diff --git a/docs/my-website/docs/proxy/team_based_routing.md b/docs/my-website/docs/proxy/team_based_routing.md index 89b18ec63..bda286f4a 100644 --- a/docs/my-website/docs/proxy/team_based_routing.md +++ b/docs/my-website/docs/proxy/team_based_routing.md @@ -1,4 +1,4 @@ -# 👥 Team-based Routing +# Team-based Routing ## Routing Route calls to different model groups based on the team-id diff --git a/docs/my-website/docs/proxy/team_logging.md b/docs/my-website/docs/proxy/team_logging.md index e2fcfa4b5..25b367994 100644 --- a/docs/my-website/docs/proxy/team_logging.md +++ b/docs/my-website/docs/proxy/team_logging.md @@ -2,7 +2,7 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 👥📊 Team/Key Based Logging +# Team/Key Based Logging Allow each key/team to use their own Langfuse Project / custom callbacks @@ -11,15 +11,13 @@ Allow each key/team to use their own Langfuse Project / custom callbacks Team 1 -> Logs to Langfuse Project 1 Team 2 -> Logs to Langfuse Project 2 Team 3 -> Disabled Logging (for GDPR compliance) - ``` ## Team Based Logging -[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md) -## Logging / Caching +### Setting Team Logging via `config.yaml` Turn on/off logging and caching for a specific team id. @@ -281,6 +279,51 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ }' ``` + + + + +1. Create Virtual Key to log to a specific Langsmith Project + + ```bash + curl -X POST 'http://0.0.0.0:4000/key/generate' \ + -H 'Authorization: Bearer sk-1234' \ + -H 'Content-Type: application/json' \ + -d '{ + "metadata": { + "logging": [{ + "callback_name": "langsmith", # "otel", "gcs_bucket" + "callback_type": "success", # "success", "failure", "success_and_failure" + "callback_vars": { + "langsmith_api_key": "os.environ/LANGSMITH_API_KEY", # API Key for Langsmith logging + "langsmith_project": "pr-brief-resemblance-72", # project name on langsmith + "langsmith_base_url": "https://api.smith.langchain.com" + } + }] + } + }' + + ``` + +2. Test it - `/chat/completions` request + + Use the virtual key from step 3 to make a `/chat/completions` request + + You should see your logs on your Langsmith project on a successful request + + ```shell + curl -i http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-Fxq5XSyWKeXDKfPdqXZhPg" \ + -d '{ + "model": "fake-openai-endpoint", + "messages": [ + {"role": "user", "content": "Hello, Claude"} + ], + "user": "hello", + }' + ``` + diff --git a/docs/my-website/docs/proxy/ui.md b/docs/my-website/docs/proxy/ui.md index e18a9e2e5..5e6e9f52f 100644 --- a/docs/my-website/docs/proxy/ui.md +++ b/docs/my-website/docs/proxy/ui.md @@ -64,7 +64,7 @@ Allow others to create/delete their own keys. Features here are behind a commercial license in our `/enterprise` folder. [**See Code**](https://github.com/BerriAI/litellm/tree/main/enterprise) -### Setup SSO/Auth for UI +### SSO for UI #### Step 1: Set upperbounds for keys Control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key. 
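These upperbounds map to the module-level `litellm.upperbound_key_generate_params` setting (typed as `LiteLLM_UpperboundKeyGenerateParams`, which this PR also imports in `litellm/__init__.py`). A rough Python-level sketch of what the config resolves to — only the `max_budget` / `budget_duration` params mentioned above are shown, and the values are illustrative:

```python
import litellm
from litellm.proxy._types import LiteLLM_UpperboundKeyGenerateParams

# Equivalent of setting `litellm_settings.upperbound_key_generate_params` in the
# proxy config: the proxy treats these as the maximum values a /key/generate
# request may ask for.
litellm.upperbound_key_generate_params = LiteLLM_UpperboundKeyGenerateParams(
    max_budget=100,         # upper bound on a key's max_budget ($100)
    budget_duration="30d",  # upper bound on a key's budget_duration
)
```
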
@@ -88,12 +88,6 @@ litellm_settings: #### Step 2: Setup Oauth Client -:::tip - -Looking for how to use Oauth 2.0 for /chat, /completions API requests to the proxy? [Follow this doc](oauth2) - -::: - @@ -196,6 +190,13 @@ GENERIC_SCOPE = "openid profile email" # default scope openid is sometimes not e +### Default Login, Logout URLs + +Some SSO providers require a specific redirect url for login and logout. You can input the following values. + +- Login: `/sso/key/generate` +- Logout: `` + #### Step 3. Set `PROXY_BASE_URL` in your .env Set this in your .env (so the proxy can set the correct redirect url) @@ -216,9 +217,9 @@ export ALLOWED_EMAIL_DOMAINS="berri.ai" This will check if the user email we receive from SSO contains this domain, before allowing access. -### Set Admin view w/ SSO +### Set Proxy Admin -You just need to set Proxy Admin ID +Set a Proxy Admin when SSO is enabled. Once SSO is enabled, the `user_id` for users is retrieved from the SSO provider. In order to set a Proxy Admin, you need to copy the `user_id` from the UI and set it in your `.env` as `PROXY_ADMIN_ID`. #### Step 1: Copy your ID from the UI @@ -256,7 +257,7 @@ general_settings: default_team_disabled: true # OR you can set env var PROXY_DEFAULT_TEAM_DISABLED="true" ``` -### Sign in with Username, Password when SSO is on +### Use Username, Password when SSO is on If you need to access the UI via username/password when SSO is on navigate to `/fallback/login`. This route will allow you to sign in with your username/password credentials. diff --git a/docs/my-website/docs/proxy/virtual_keys.md b/docs/my-website/docs/proxy/virtual_keys.md index d3e67e3ec..5bbb6b2a0 100644 --- a/docs/my-website/docs/proxy/virtual_keys.md +++ b/docs/my-website/docs/proxy/virtual_keys.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 🔑 Virtual Keys +# Virtual Keys Track Spend, and control model access via virtual keys for the proxy :::info @@ -811,6 +811,78 @@ litellm_settings: team_id: "core-infra" ``` +### Restricting Key Generation + +Use this to control who can generate keys. Useful when letting others create keys on the UI. 
+ +```yaml +litellm_settings: + key_generation_settings: + team_key_generation: + allowed_team_member_roles: ["admin"] + required_params: ["tags"] # require team admins to set tags for cost-tracking when generating a team key + personal_key_generation: # maps to 'Default Team' on UI + allowed_user_roles: ["proxy_admin"] +``` + +#### Spec + +```python +class TeamUIKeyGenerationConfig(TypedDict): + allowed_team_member_roles: List[str] + required_params: List[str] # require params on `/key/generate` to be set if a team key (team_id in request) is being generated + + +class PersonalUIKeyGenerationConfig(TypedDict): + allowed_user_roles: List[LitellmUserRoles] + required_params: List[str] # require params on `/key/generate` to be set if a personal key (no team_id in request) is being generated + + +class StandardKeyGenerationConfig(TypedDict, total=False): + team_key_generation: TeamUIKeyGenerationConfig + personal_key_generation: PersonalUIKeyGenerationConfig + + +class LitellmUserRoles(str, enum.Enum): + """ + Admin Roles: + PROXY_ADMIN: admin over the platform + PROXY_ADMIN_VIEW_ONLY: can login, view all own keys, view all spend + ORG_ADMIN: admin over a specific organization, can create teams, users only within their organization + + Internal User Roles: + INTERNAL_USER: can login, view/create/delete their own keys, view their spend + INTERNAL_USER_VIEW_ONLY: can login, view their own keys, view their own spend + + + Team Roles: + TEAM: used for JWT auth + + + Customer Roles: + CUSTOMER: External users -> these are customers + + """ + + # Admin Roles + PROXY_ADMIN = "proxy_admin" + PROXY_ADMIN_VIEW_ONLY = "proxy_admin_viewer" + + # Organization admins + ORG_ADMIN = "org_admin" + + # Internal User Roles + INTERNAL_USER = "internal_user" + INTERNAL_USER_VIEW_ONLY = "internal_user_viewer" + + # Team Roles + TEAM = "team" + + # Customer Roles - External users of proxy + CUSTOMER = "customer" +``` + + ## **Next Steps - Set Budgets, Rate Limits per Virtual Key** [Follow this doc to set budgets, rate limiters per virtual key with LiteLLM](users) diff --git a/docs/my-website/docs/rerank.md b/docs/my-website/docs/rerank.md index 8179e6b81..d25b552fb 100644 --- a/docs/my-website/docs/rerank.md +++ b/docs/my-website/docs/rerank.md @@ -113,4 +113,5 @@ curl http://0.0.0.0:4000/rerank \ |-------------|--------------------| | Cohere | [Usage](#quick-start) | | Together AI| [Usage](../docs/providers/togetherai) | -| Azure AI| [Usage](../docs/providers/azure_ai) | \ No newline at end of file +| Azure AI| [Usage](../docs/providers/azure_ai) | +| Jina AI| [Usage](../docs/providers/jina_ai) | \ No newline at end of file diff --git a/docs/my-website/docs/router_architecture.md b/docs/my-website/docs/router_architecture.md new file mode 100644 index 000000000..13e9e411c --- /dev/null +++ b/docs/my-website/docs/router_architecture.md @@ -0,0 +1,24 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Router Architecture (Fallbacks / Retries) + +## High Level architecture + + + +### Request Flow + +1. **User Sends Request**: The process begins when a user sends a request to the LiteLLM Router endpoint. All unified endpoints (`.completion`, `.embeddings`, etc) are supported by LiteLLM Router. + +2. **function_with_fallbacks**: The initial request is sent to the `function_with_fallbacks` function. This function wraps the initial request in a try-except block, to handle any exceptions - doing fallbacks if needed. 
This request is then sent to the `function_with_retries` function. + + +3. **function_with_retries**: The `function_with_retries` function wraps the request in a try-except block and passes the initial request to a base litellm unified function (`litellm.completion`, `litellm.embeddings`, etc) to handle LLM API calling. `function_with_retries` handles any exceptions - doing retries on the model group if needed (i.e. if the request fails, it will retry on an available model within the model group). + +4. **litellm.completion**: The `litellm.completion` function is a base function that handles the LLM API calling. It is used by `function_with_retries` to make the actual request to the LLM API. + +## Legend + +**model_group**: A group of LLM API deployments that share the same `model_name`, are part of the same `model_group`, and can be load balanced across. \ No newline at end of file diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index ec692147b..87fad7437 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -281,7 +281,7 @@ Picks the deployment with the lowest response time. It caches, and updates the response times for deployments based on when a request was sent and received from a deployment. -[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_lowest_latency_routing.py) +[**How to test**](https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_lowest_latency_routing.py) ```python from litellm import Router @@ -567,7 +567,7 @@ print(response) Picks a deployment with the least number of ongoing calls, it's handling. -[**How to test**](https://github.com/BerriAI/litellm/blob/main/litellm/tests/test_least_busy_routing.py) +[**How to test**](https://github.com/BerriAI/litellm/blob/main/tests/local_testing/test_least_busy_routing.py) ```python from litellm import Router @@ -1035,7 +1035,7 @@ print(f"response: {response}") ### [Advanced]: Custom Retries, Cooldowns based on Error Type -- Use `RetryPolicy` if you want to set a `num_retries` based on the Exception receieved +- Use `RetryPolicy` if you want to set a `num_retries` based on the Exception received - Use `AllowedFailsPolicy` to set a custom number of `allowed_fails`/minute before cooling down a deployment [**See All Exception Types**](https://github.com/BerriAI/litellm/blob/ccda616f2f881375d4e8586c76fe4662909a7d22/litellm/types/router.py#L436) @@ -1891,3 +1891,22 @@ router = Router( debug_level="DEBUG" # defaults to INFO ) ``` + +## Router General Settings + +### Usage + +```python +router = Router(model_list=..., router_general_settings=RouterGeneralSettings(async_only_mode=True)) +``` + +### Spec +```python +class RouterGeneralSettings(BaseModel): + async_only_mode: bool = Field( + default=False + ) # this will only initialize async clients. Good for memory utils + pass_through_all_models: bool = Field( + default=False + ) # if passed a model not llm_router model list, pass through the request to litellm.acompletion/embedding +``` \ No newline at end of file diff --git a/docs/my-website/docs/secret.md b/docs/my-website/docs/secret.md index db5ec6910..113a11750 100644 --- a/docs/my-website/docs/secret.md +++ b/docs/my-website/docs/secret.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Secret Manager LiteLLM supports reading secrets from Azure Key Vault, Google Secret Manager @@ -59,14 +62,36 @@ os.environ["AWS_REGION_NAME"] = "" # us-east-1, us-east-2, us-west-1, us-west-2 ``` 2. 
Enable AWS Secret Manager in config. + + + + ```yaml general_settings: master_key: os.environ/litellm_master_key key_management_system: "aws_secret_manager" # 👈 KEY CHANGE key_management_settings: hosted_keys: ["litellm_master_key"] # 👈 Specify which env keys you stored on AWS + ``` + + + + +This will only store virtual keys in AWS Secret Manager. No keys will be read from AWS Secret Manager. + +```yaml +general_settings: + key_management_system: "aws_secret_manager" # 👈 KEY CHANGE + key_management_settings: + store_virtual_keys: true # OPTIONAL. Defaults to False, when True will store virtual keys in secret manager + prefix_for_stored_virtual_keys: "litellm/" # OPTIONAL. If set, this prefix will be used for stored virtual keys in the secret manager + access_mode: "write_only" # Literal["read_only", "write_only", "read_and_write"] +``` + + + 3. Run proxy ```bash @@ -181,16 +206,14 @@ litellm --config /path/to/config.yaml Use encrypted keys from Google KMS on the proxy -### Usage with LiteLLM Proxy Server - -## Step 1. Add keys to env +Step 1. Add keys to env ``` export GOOGLE_APPLICATION_CREDENTIALS="/path/to/credentials.json" export GOOGLE_KMS_RESOURCE_NAME="projects/*/locations/*/keyRings/*/cryptoKeys/*" export PROXY_DATABASE_URL_ENCRYPTED=b'\n$\x00D\xac\xb4/\x8e\xc...' ``` -## Step 2: Update Config +Step 2: Update Config ```yaml general_settings: @@ -199,7 +222,7 @@ general_settings: master_key: sk-1234 ``` -## Step 3: Start + test proxy +Step 3: Start + test proxy ``` $ litellm --config /path/to/config.yaml @@ -215,3 +238,24 @@ $ litellm --test + + +## All Secret Manager Settings + +All settings related to secret management + +```yaml +general_settings: + key_management_system: "aws_secret_manager" # REQUIRED + key_management_settings: + + # Storing Virtual Keys Settings + store_virtual_keys: true # OPTIONAL. Defaults to False, when True will store virtual keys in secret manager + prefix_for_stored_virtual_keys: "litellm/" # OPTIONAL.I f set, this prefix will be used for stored virtual keys in the secret manager + + # Access Mode Settings + access_mode: "write_only" # OPTIONAL. Literal["read_only", "write_only", "read_and_write"]. Defaults to "read_only" + + # Hosted Keys Settings + hosted_keys: ["litellm_master_key"] # OPTIONAL. Specify which env keys you stored on AWS +``` \ No newline at end of file diff --git a/docs/my-website/docs/text_completion.md b/docs/my-website/docs/text_completion.md new file mode 100644 index 000000000..8be40dfdc --- /dev/null +++ b/docs/my-website/docs/text_completion.md @@ -0,0 +1,174 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Text Completion + +### Usage + + + +```python +from litellm import text_completion + +response = text_completion( + model="gpt-3.5-turbo-instruct", + prompt="Say this is a test", + max_tokens=7 +) +``` + + + + +1. Define models on config.yaml + +```yaml +model_list: + - model_name: gpt-3.5-turbo-instruct + litellm_params: + model: text-completion-openai/gpt-3.5-turbo-instruct # The `text-completion-openai/` prefix will call openai.completions.create + api_key: os.environ/OPENAI_API_KEY + - model_name: text-davinci-003 + litellm_params: + model: text-completion-openai/text-davinci-003 + api_key: os.environ/OPENAI_API_KEY +``` + +2. 
Start litellm proxy server + +``` +litellm --config config.yaml +``` + + + + +```python +from openai import OpenAI + +# set base_url to your proxy server +# set api_key to send to proxy server +client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") + +response = client.completions.create( + model="gpt-3.5-turbo-instruct", + prompt="Say this is a test", + max_tokens=7 +) + +print(response) +``` + + + + +```shell +curl --location 'http://0.0.0.0:4000/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --data '{ + "model": "gpt-3.5-turbo-instruct", + "prompt": "Say this is a test", + "max_tokens": 7 + }' +``` + + + + + + +## Input Params + +LiteLLM accepts and translates the [OpenAI Text Completion params](https://platform.openai.com/docs/api-reference/completions) across all supported providers. + +### Required Fields + +- `model`: *string* - ID of the model to use +- `prompt`: *string or array* - The prompt(s) to generate completions for + +### Optional Fields + +- `best_of`: *integer* - Generates best_of completions server-side and returns the "best" one +- `echo`: *boolean* - Echo back the prompt in addition to the completion. +- `frequency_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency. +- `logit_bias`: *map* - Modify the likelihood of specified tokens appearing in the completion +- `logprobs`: *integer* - Include the log probabilities on the logprobs most likely tokens. Max value of 5 +- `max_tokens`: *integer* - The maximum number of tokens to generate. +- `n`: *integer* - How many completions to generate for each prompt. +- `presence_penalty`: *number* - Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far. +- `seed`: *integer* - If specified, system will attempt to make deterministic samples +- `stop`: *string or array* - Up to 4 sequences where the API will stop generating tokens +- `stream`: *boolean* - Whether to stream back partial progress. Defaults to false +- `suffix`: *string* - The suffix that comes after a completion of inserted text +- `temperature`: *number* - What sampling temperature to use, between 0 and 2. +- `top_p`: *number* - An alternative to sampling with temperature, called nucleus sampling. 
+- `user`: *string* - A unique identifier representing your end-user + +## Output Format +Here's the exact JSON output format you can expect from completion calls: + + +[**Follows OpenAI's output format**](https://platform.openai.com/docs/api-reference/completions/object) + + + + + +```python +{ + "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", + "object": "text_completion", + "created": 1589478378, + "model": "gpt-3.5-turbo-instruct", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "text": "\n\nThis is indeed a test", + "index": 0, + "logprobs": null, + "finish_reason": "length" + } + ], + "usage": { + "prompt_tokens": 5, + "completion_tokens": 7, + "total_tokens": 12 + } +} + +``` + + + +```python +{ + "id": "cmpl-7iA7iJjj8V2zOkCGvWF2hAkDWBQZe", + "object": "text_completion", + "created": 1690759702, + "choices": [ + { + "text": "This", + "index": 0, + "logprobs": null, + "finish_reason": null + } + ], + "model": "gpt-3.5-turbo-instruct" + "system_fingerprint": "fp_44709d6fcb", +} + +``` + + + + + +## **Supported Providers** + +| Provider | Link to Usage | +|-------------|--------------------| +| OpenAI | [Usage](../docs/providers/text_completion_openai) | +| Azure OpenAI| [Usage](../docs/providers/azure) | + + diff --git a/docs/my-website/docs/wildcard_routing.md b/docs/my-website/docs/wildcard_routing.md new file mode 100644 index 000000000..80926d73e --- /dev/null +++ b/docs/my-website/docs/wildcard_routing.md @@ -0,0 +1,140 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Provider specific Wildcard routing + +**Proxy all models from a provider** + +Use this if you want to **proxy all models from a specific provider without defining them on the config.yaml** + +## Step 1. Define provider specific routing + + + + +```python +from litellm import Router + +router = Router( + model_list=[ + { + "model_name": "anthropic/*", + "litellm_params": { + "model": "anthropic/*", + "api_key": os.environ["ANTHROPIC_API_KEY"] + } + }, + { + "model_name": "groq/*", + "litellm_params": { + "model": "groq/*", + "api_key": os.environ["GROQ_API_KEY"] + } + }, + { + "model_name": "fo::*:static::*", # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*" + "litellm_params": { + "model": "openai/fo::*:static::*", + "api_key": os.environ["OPENAI_API_KEY"] + } + } + ] +) +``` + + + + +**Step 1** - define provider specific routing on config.yaml +```yaml +model_list: + # provider specific wildcard routing + - model_name: "anthropic/*" + litellm_params: + model: "anthropic/*" + api_key: os.environ/ANTHROPIC_API_KEY + - model_name: "groq/*" + litellm_params: + model: "groq/*" + api_key: os.environ/GROQ_API_KEY + - model_name: "fo::*:static::*" # all requests matching this pattern will be routed to this deployment, example: model="fo::hi::static::hi" will be routed to deployment: "openai/fo::*:static::*" + litellm_params: + model: "openai/fo::*:static::*" + api_key: os.environ/OPENAI_API_KEY +``` + + + +## [PROXY-Only] Step 2 - Run litellm proxy + +```shell +$ litellm --config /path/to/config.yaml +``` + +## Step 3 - Test it + + + + +```python +from litellm import Router + +router = Router(model_list=...) 
+ +# Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*` +resp = completion(model="anthropic/claude-3-sonnet-20240229", messages=[{"role": "user", "content": "Hello, Claude!"}]) +print(resp) + +# Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*` +resp = completion(model="groq/llama3-8b-8192", messages=[{"role": "user", "content": "Hello, Groq!"}]) +print(resp) + +# Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*` +resp = completion(model="fo::hi::static::hi", messages=[{"role": "user", "content": "Hello, Claude!"}]) +print(resp) +``` + + + + +Test with `anthropic/` - all models with `anthropic/` prefix will get routed to `anthropic/*` +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "anthropic/claude-3-sonnet-20240229", + "messages": [ + {"role": "user", "content": "Hello, Claude!"} + ] + }' +``` + +Test with `groq/` - all models with `groq/` prefix will get routed to `groq/*` +```shell +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "groq/llama3-8b-8192", + "messages": [ + {"role": "user", "content": "Hello, Claude!"} + ] + }' +``` + +Test with `fo::*::static::*` - all requests matching this pattern will be routed to `openai/fo::*:static::*` +```shell +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "fo::hi::static::hi", + "messages": [ + {"role": "user", "content": "Hello, Claude!"} + ] + }' +``` + + + diff --git a/docs/my-website/docusaurus.config.js b/docs/my-website/docusaurus.config.js index 7dc9c487f..73d500b14 100644 --- a/docs/my-website/docusaurus.config.js +++ b/docs/my-website/docusaurus.config.js @@ -113,7 +113,7 @@ const config = { { sidebarId: 'tutorialSidebar', position: 'left', - label: '🚀 Hosted', + label: 'Hosted', to: "docs/hosted" }, { diff --git a/docs/my-website/img/mlflow_tracing.png b/docs/my-website/img/mlflow_tracing.png new file mode 100644 index 000000000..aee1fb79e Binary files /dev/null and b/docs/my-website/img/mlflow_tracing.png differ diff --git a/docs/my-website/img/otel_debug_trace.png b/docs/my-website/img/otel_debug_trace.png new file mode 100644 index 000000000..94fe5742f Binary files /dev/null and b/docs/my-website/img/otel_debug_trace.png differ diff --git a/docs/my-website/img/router_architecture.png b/docs/my-website/img/router_architecture.png new file mode 100644 index 000000000..195834185 Binary files /dev/null and b/docs/my-website/img/router_architecture.png differ diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 18ad940f8..e6a028d83 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -20,61 +20,72 @@ const sidebars = { { type: "doc", id: "index" }, // NEW { type: "category", - label: "💥 LiteLLM Proxy Server", + label: "LiteLLM Proxy Server", link: { type: "generated-index", - title: "💥 LiteLLM Proxy Server (LLM Gateway)", + title: "LiteLLM Proxy Server (LLM Gateway)", description: `OpenAI Proxy Server (LLM Gateway) to call 100+ LLMs in a unified interface & track spend, set budgets per virtual key/user`, slug: "/simple_proxy", }, items: [ - "proxy/quick_start", - "proxy/docker_quick_start", - "proxy/deploy", + "proxy/docker_quick_start", + { + "type": 
"category", + "label": "Config.yaml", + "items": ["proxy/configs", "proxy/config_management", "proxy/config_settings"] + }, + { + type: "category", + label: "Setup & Deployment", + items: [ + "proxy/deploy", + "proxy/prod", + "proxy/cli", + "proxy/model_management", + "proxy/health", + "proxy/debugging", + "proxy/pass_through", + ], + }, "proxy/demo", - "proxy/prod", { type: "category", label: "Architecture", - items: ["proxy/architecture"], + items: ["proxy/architecture", "proxy/db_info", "router_architecture"], }, { type: "link", - label: "📖 All Endpoints (Swagger)", + label: "All Endpoints (Swagger)", href: "https://litellm-api.up.railway.app/", }, "proxy/enterprise", - "proxy/user_keys", - "proxy/configs", - "proxy/response_headers", - "proxy/reliability", { type: "category", - label: "🔑 Authentication", - items: ["proxy/virtual_keys", "proxy/token_auth", "proxy/service_accounts", "proxy/access_control","proxy/ip_address"], - }, - { - type: "category", - label: "💸 Spend Tracking + Budgets", - items: ["proxy/cost_tracking", "proxy/users", "proxy/custom_pricing", "proxy/team_budgets", "proxy/billing", "proxy/customers"], - }, - { - type: "category", - label: "Routing", - items: ["proxy/load_balancing", "proxy/tag_routing", "proxy/team_based_routing", "proxy/customer_routing",], - }, - { - type: "category", - label: "Use with Provider SDKs", + label: "Making LLM Requests", items: [ + "proxy/user_keys", + "proxy/response_headers", "pass_through/vertex_ai", "pass_through/google_ai_studio", "pass_through/cohere", - "anthropic_completion", + "pass_through/anthropic_completion", "pass_through/bedrock", "pass_through/langfuse" ], }, + { + type: "category", + label: "Authentication", + items: [ + "proxy/virtual_keys", + "proxy/token_auth", + "proxy/service_accounts", + "proxy/access_control", + "proxy/ip_address", + "proxy/email", + "proxy/multiple_admins", + ], + }, { type: "category", label: "Admin UI", @@ -86,12 +97,22 @@ const sidebars = { }, { type: "category", - label: "🪢 Logging, Alerting, Metrics", - items: ["proxy/logging", "proxy/bucket", "proxy/team_logging","proxy/streaming_logging", "proxy/alerting", "proxy/prometheus",], + label: "Spend Tracking + Budgets", + items: ["proxy/cost_tracking", "proxy/users", "proxy/custom_pricing", "proxy/team_budgets", "proxy/billing", "proxy/customers"], + }, + { + type: "link", + label: "Load Balancing, Routing, Fallbacks", + href: "https://docs.litellm.ai/docs/routing-load-balancing", }, { type: "category", - label: "🛡️ [Beta] Guardrails", + label: "Logging, Alerting, Metrics", + items: ["proxy/logging", "proxy/team_logging","proxy/alerting", "proxy/prometheus",], + }, + { + type: "category", + label: "[Beta] Guardrails", items: [ "proxy/guardrails/quick_start", "proxy/guardrails/aporia_api", @@ -106,27 +127,20 @@ const sidebars = { }, { type: "category", - label: "Secret Manager - storing LLM API Keys", + label: "Secret Managers", items: [ "secret", "oidc" ] }, "proxy/caching", - "proxy/pass_through", - "proxy/email", - "proxy/multiple_admins", - "proxy/model_management", - "proxy/health", - "proxy/debugging", "proxy/call_hooks", - "proxy/rules", - "proxy/cli", + "proxy/rules", ] }, { type: "category", - label: "💯 Supported Models & Providers", + label: "Supported Models & Providers", link: { type: "generated-index", title: "Providers", @@ -183,7 +197,6 @@ const sidebars = { "providers/openrouter", "providers/palm", "providers/sambanova", - // "providers/custom_openai_proxy", "providers/custom_llm_server", "providers/petals", @@ -191,27 +204,19 @@ 
const sidebars = { }, { type: "category", - label: "Chat Completions (litellm.completion + PROXY)", - link: { - type: "generated-index", - title: "Chat Completions", - description: "Details on the completion() function", - slug: "/completion", - }, + label: "Guides", items: [ - "completion/input", + "exception_mapping", "completion/provider_specific_params", - "completion/json_mode", - "completion/prompt_caching", + "guides/finetuned_models", "completion/audio", "completion/vision", + "completion/json_mode", + "completion/prompt_caching", "completion/predict_outputs", "completion/prefix", "completion/drop_params", "completion/prompt_formatting", - "completion/output", - "completion/usage", - "exception_mapping", "completion/stream", "completion/message_trimming", "completion/function_call", @@ -219,21 +224,45 @@ const sidebars = { "completion/batching", "completion/mock_requests", "completion/reliable_completions", - ], + + ] }, { type: "category", - label: "Supported Endpoints - /images, /audio/speech, /assistants etc", + label: "Supported Endpoints", items: [ + { + type: "category", + label: "Chat", + link: { + type: "generated-index", + title: "Chat Completions", + description: "Details on the completion() function", + slug: "/completion", + }, + items: [ + "completion/input", + "completion/output", + "completion/usage", + ], + }, + "text_completion", "embedding/supported_embedding", "image_generation", - "audio_transcription", - "text_to_speech", + { + type: "category", + label: "Audio", + "items": [ + "audio_transcription", + "text_to_speech", + ] + }, "rerank", "assistants", "batches", "realtime", "fine_tuning", + "moderation", { type: "link", label: "Use LiteLLM Proxy with Vertex, Bedrock SDK", @@ -241,11 +270,20 @@ const sidebars = { }, ], }, - "routing", - "scheduler", { type: "category", - label: "🚅 LiteLLM Python SDK", + label: "Routing, Loadbalancing & Fallbacks", + link: { + type: "generated-index", + title: "Routing, Loadbalancing & Fallbacks", + description: "Learn how to load balance, route, and set fallbacks for your LLM requests", + slug: "/routing-load-balancing", + }, + items: ["routing", "scheduler", "proxy/load_balancing", "proxy/reliability", "proxy/tag_routing", "proxy/provider_budget_routing", "proxy/team_based_routing", "proxy/customer_routing", "wildcard_routing"], + }, + { + type: "category", + label: "LiteLLM Python SDK", items: [ "set_keys", "completion/token_usage", @@ -266,6 +304,7 @@ const sidebars = { type: "category", label: "Load Testing", items: [ + "benchmarks", "load_test", "load_test_advanced", "load_test_sdk", diff --git a/enterprise/utils.py b/enterprise/utils.py index f0af1d676..cc97661d7 100644 --- a/enterprise/utils.py +++ b/enterprise/utils.py @@ -2,7 +2,9 @@ from typing import Optional, List from litellm._logging import verbose_logger from litellm.proxy.proxy_server import PrismaClient, HTTPException +from litellm.llms.custom_httpx.http_handler import HTTPHandler import collections +import httpx from datetime import datetime @@ -114,7 +116,6 @@ async def ui_get_spend_by_tags( def _forecast_daily_cost(data: list): - import requests # type: ignore from datetime import datetime, timedelta if len(data) == 0: @@ -136,17 +137,17 @@ def _forecast_daily_cost(data: list): print("last entry date", last_entry_date) - # Assuming today_date is a datetime object - today_date = datetime.now() - # Calculate the last day of the month last_day_of_todays_month = datetime( today_date.year, today_date.month % 12 + 1, 1 ) - timedelta(days=1) + print("last day of 
todays month", last_day_of_todays_month) # Calculate the remaining days in the month remaining_days = (last_day_of_todays_month - last_entry_date).days + print("remaining days", remaining_days) + current_spend_this_month = 0 series = {} for entry in data: @@ -176,13 +177,19 @@ def _forecast_daily_cost(data: list): "Content-Type": "application/json", } - response = requests.post( - url="https://trend-api-production.up.railway.app/forecast", - json=payload, - headers=headers, - ) - # check the status code - response.raise_for_status() + client = HTTPHandler() + + try: + response = client.post( + url="https://trend-api-production.up.railway.app/forecast", + json=payload, + headers=headers, + ) + except httpx.HTTPStatusError as e: + raise HTTPException( + status_code=500, + detail={"error": f"Error getting forecast: {e.response.text}"}, + ) json_response = response.json() forecast_data = json_response["forecast"] @@ -206,13 +213,3 @@ def _forecast_daily_cost(data: list): f"Predicted Spend for { today_month } 2024, ${total_predicted_spend}" ) return {"response": response_data, "predicted_spend": predicted_spend} - - # print(f"Date: {entry['date']}, Spend: {entry['spend']}, Response: {response.text}") - - -# _forecast_daily_cost( -# [ -# {"date": "2022-01-01", "spend": 100}, - -# ] -# ) diff --git a/litellm/__init__.py b/litellm/__init__.py index f388bf17a..43f91fe58 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -24,6 +24,7 @@ from litellm.proxy._types import ( KeyManagementSettings, LiteLLM_UpperboundKeyGenerateParams, ) +from litellm.types.utils import StandardKeyGenerationConfig import httpx import dotenv from enum import Enum @@ -57,6 +58,7 @@ _custom_logger_compatible_callbacks_literal = Literal[ "gcs_bucket", "opik", "argilla", + "mlflow", ] logged_real_time_event_types: Optional[Union[List[str], Literal["*"]]] = None _known_custom_logger_compatible_callbacks: List = list( @@ -66,6 +68,7 @@ callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = langfuse_default_tags: Optional[List[str]] = None langsmith_batch_size: Optional[int] = None argilla_batch_size: Optional[int] = None +datadog_use_v1: Optional[bool] = False # if you want to use v1 datadog logged payload argilla_transformation_object: Optional[Dict[str, Any]] = None _async_input_callback: List[Callable] = ( [] @@ -132,7 +135,7 @@ use_client: bool = False ssl_verify: Union[str, bool] = True ssl_certificate: Optional[str] = None disable_streaming_logging: bool = False -in_memory_llm_clients_cache: dict = {} +in_memory_llm_clients_cache: InMemoryCache = InMemoryCache() safe_memory_mode: bool = False enable_azure_ad_token_refresh: Optional[bool] = False ### DEFAULT AZURE API VERSION ### @@ -272,6 +275,7 @@ s3_callback_params: Optional[Dict] = None generic_logger_headers: Optional[Dict] = None default_key_generate_params: Optional[Dict] = None upperbound_key_generate_params: Optional[LiteLLM_UpperboundKeyGenerateParams] = None +key_generation_settings: Optional[StandardKeyGenerationConfig] = None default_internal_user_params: Optional[Dict] = None default_team_settings: Optional[List] = None max_user_budget: Optional[float] = None @@ -279,15 +283,23 @@ default_max_internal_user_budget: Optional[float] = None max_internal_user_budget: Optional[float] = None internal_user_budget_duration: Optional[str] = None max_end_user_budget: Optional[float] = None +disable_end_user_cost_tracking: Optional[bool] = None #### REQUEST PRIORITIZATION #### priority_reservation: Optional[Dict[str, float]] = None #### 
RELIABILITY #### REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. + +#### Networking settings #### request_timeout: float = 6000 # time in seconds +force_ipv4: bool = ( + False # when True, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6. +) module_level_aclient = AsyncHTTPHandler( timeout=request_timeout, client_alias="module level aclient" ) module_level_client = HTTPHandler(timeout=request_timeout) + +#### RETRIES #### num_retries: Optional[int] = None # per model endpoint max_fallbacks: Optional[int] = None default_fallbacks: Optional[List] = None @@ -304,7 +316,7 @@ secret_manager_client: Optional[Any] = ( ) _google_kms_resource_name: Optional[str] = None _key_management_system: Optional[KeyManagementSystem] = None -_key_management_settings: Optional[KeyManagementSettings] = None +_key_management_settings: KeyManagementSettings = KeyManagementSettings() #### PII MASKING #### output_parse_pii: bool = False ############################################# @@ -375,6 +387,7 @@ open_ai_text_completion_models: List = [] cohere_models: List = [] cohere_chat_models: List = [] mistral_chat_models: List = [] +text_completion_codestral_models: List = [] anthropic_models: List = [] empower_models: List = [] openrouter_models: List = [] @@ -401,6 +414,19 @@ deepinfra_models: List = [] perplexity_models: List = [] watsonx_models: List = [] gemini_models: List = [] +xai_models: List = [] +deepseek_models: List = [] +azure_ai_models: List = [] +voyage_models: List = [] +databricks_models: List = [] +cloudflare_models: List = [] +codestral_models: List = [] +friendliai_models: List = [] +palm_models: List = [] +groq_models: List = [] +azure_models: List = [] +anyscale_models: List = [] +cerebras_models: List = [] def add_known_models(): @@ -477,6 +503,34 @@ def add_known_models(): # ignore the 'up-to', '-to-' model names -> not real models. just for cost tracking based on model params. 
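# ---------------------------------------------------------------------------
# Illustrative sketch (not applied by this patch): the per-provider model
# lists declared above (xai_models, deepseek_models, ...) are populated by the
# provider checks that follow in add_known_models() and are also exposed via
# models_by_provider. A minimal usage sketch, assuming litellm is importable;
# no specific model names are assumed:
import litellm

print(litellm.xai_models)                              # models flagged litellm_provider == "xai"
print(litellm.models_by_provider.get("deepseek", []))  # new provider entry in the mapping
# ---------------------------------------------------------------------------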
if "-to-" not in key: fireworks_ai_embedding_models.append(key) + elif value.get("litellm_provider") == "text-completion-codestral": + text_completion_codestral_models.append(key) + elif value.get("litellm_provider") == "xai": + xai_models.append(key) + elif value.get("litellm_provider") == "deepseek": + deepseek_models.append(key) + elif value.get("litellm_provider") == "azure_ai": + azure_ai_models.append(key) + elif value.get("litellm_provider") == "voyage": + voyage_models.append(key) + elif value.get("litellm_provider") == "databricks": + databricks_models.append(key) + elif value.get("litellm_provider") == "cloudflare": + cloudflare_models.append(key) + elif value.get("litellm_provider") == "codestral": + codestral_models.append(key) + elif value.get("litellm_provider") == "friendliai": + friendliai_models.append(key) + elif value.get("litellm_provider") == "palm": + palm_models.append(key) + elif value.get("litellm_provider") == "groq": + groq_models.append(key) + elif value.get("litellm_provider") == "azure": + azure_models.append(key) + elif value.get("litellm_provider") == "anyscale": + anyscale_models.append(key) + elif value.get("litellm_provider") == "cerebras": + cerebras_models.append(key) add_known_models() @@ -722,6 +776,20 @@ model_list = ( + vertex_language_models + watsonx_models + gemini_models + + text_completion_codestral_models + + xai_models + + deepseek_models + + azure_ai_models + + voyage_models + + databricks_models + + cloudflare_models + + codestral_models + + friendliai_models + + palm_models + + groq_models + + azure_models + + anyscale_models + + cerebras_models ) @@ -778,6 +846,7 @@ class LlmProviders(str, Enum): FIREWORKS_AI = "fireworks_ai" FRIENDLIAI = "friendliai" WATSONX = "watsonx" + WATSONX_TEXT = "watsonx_text" TRITON = "triton" PREDIBASE = "predibase" DATABRICKS = "databricks" @@ -794,6 +863,7 @@ provider_list: List[Union[LlmProviders, str]] = list(LlmProviders) models_by_provider: dict = { "openai": open_ai_chat_completion_models + open_ai_text_completion_models, + "text-completion-openai": open_ai_text_completion_models, "cohere": cohere_models + cohere_chat_models, "cohere_chat": cohere_chat_models, "anthropic": anthropic_models, @@ -817,6 +887,23 @@ models_by_provider: dict = { "watsonx": watsonx_models, "gemini": gemini_models, "fireworks_ai": fireworks_ai_models + fireworks_ai_embedding_models, + "aleph_alpha": aleph_alpha_models, + "text-completion-codestral": text_completion_codestral_models, + "xai": xai_models, + "deepseek": deepseek_models, + "mistral": mistral_chat_models, + "azure_ai": azure_ai_models, + "voyage": voyage_models, + "databricks": databricks_models, + "cloudflare": cloudflare_models, + "codestral": codestral_models, + "nlp_cloud": nlp_cloud_models, + "friendliai": friendliai_models, + "palm": palm_models, + "groq": groq_models, + "azure": azure_models, + "anyscale": anyscale_models, + "cerebras": cerebras_models, } # mapping for those models which have larger equivalents @@ -886,10 +973,11 @@ from .utils import ( supports_response_schema, supports_parallel_function_calling, supports_vision, + supports_audio_input, + supports_audio_output, supports_system_messages, get_litellm_params, acreate, - get_model_list, get_max_tokens, get_model_info, register_prompt_template, @@ -984,10 +1072,11 @@ from .llms.bedrock.common_utils import ( AmazonAnthropicClaude3Config, AmazonCohereConfig, AmazonLlamaConfig, - AmazonStabilityConfig, AmazonMistralConfig, AmazonBedrockGlobalConfig, ) +from 
.llms.bedrock.image.amazon_stability1_transformation import AmazonStabilityConfig +from .llms.bedrock.image.amazon_stability3_transformation import AmazonStability3Config from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config from .llms.bedrock.embed.amazon_titan_multimodal_transformation import ( AmazonTitanMultimodalEmbeddingG1Config, @@ -1045,7 +1134,9 @@ from .llms.AzureOpenAI.azure import ( from .llms.AzureOpenAI.chat.gpt_transformation import AzureOpenAIConfig from .llms.hosted_vllm.chat.transformation import HostedVLLMChatConfig +from .llms.deepseek.chat.transformation import DeepSeekChatConfig from .llms.lm_studio.chat.transformation import LMStudioChatConfig +from .llms.lm_studio.embed.transformation import LmStudioEmbeddingConfig from .llms.perplexity.chat.transformation import PerplexityChatConfig from .llms.AzureOpenAI.chat.o1_transformation import AzureOpenAIO1Config from .llms.watsonx.completion.handler import IBMWatsonXAIConfig diff --git a/litellm/_redis.py b/litellm/_redis.py index c058a0d3a..d905f1c9d 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -12,13 +12,13 @@ import json # s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation import os -from typing import List, Optional, Union +from typing import Dict, List, Optional, Union import redis # type: ignore import redis.asyncio as async_redis # type: ignore import litellm -from litellm import get_secret +from litellm import get_secret, get_secret_str from ._logging import verbose_logger @@ -141,6 +141,13 @@ def _get_redis_client_logic(**env_overrides): if _sentinel_nodes is not None and isinstance(_sentinel_nodes, str): redis_kwargs["sentinel_nodes"] = json.loads(_sentinel_nodes) + _sentinel_password: Optional[str] = redis_kwargs.get( + "sentinel_password", None + ) or get_secret_str("REDIS_SENTINEL_PASSWORD") + + if _sentinel_password is not None: + redis_kwargs["sentinel_password"] = _sentinel_password + _service_name: Optional[str] = redis_kwargs.get("service_name", None) or get_secret( # type: ignore "REDIS_SERVICE_NAME" ) @@ -217,6 +224,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis: def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis: sentinel_nodes = redis_kwargs.get("sentinel_nodes") + sentinel_password = redis_kwargs.get("sentinel_password") service_name = redis_kwargs.get("service_name") if not sentinel_nodes or not service_name: @@ -227,7 +235,11 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis: verbose_logger.debug("init_redis_sentinel: sentinel nodes are being initialized.") # Set up the Sentinel client - sentinel = async_redis.Sentinel(sentinel_nodes, socket_timeout=0.1) + sentinel = async_redis.Sentinel( + sentinel_nodes, + socket_timeout=0.1, + password=sentinel_password, + ) # Return the master instance for the given service @@ -301,12 +313,13 @@ def get_redis_async_client(**env_overrides) -> async_redis.Redis: def get_redis_connection_pool(**env_overrides): redis_kwargs = _get_redis_client_logic(**env_overrides) + verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs) if "url" in redis_kwargs and redis_kwargs["url"] is not None: return async_redis.BlockingConnectionPool.from_url( timeout=5, url=redis_kwargs["url"] ) connection_class = async_redis.Connection - if "ssl" in redis_kwargs and redis_kwargs["ssl"] is not None: + if "ssl" in redis_kwargs: connection_class = async_redis.SSLConnection redis_kwargs.pop("ssl", None) 
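# ---------------------------------------------------------------------------
# Illustrative sketch (not applied by this patch): with sentinel_password now
# plumbed through _get_redis_client_logic() and _init_async_redis_sentinel(),
# a password-protected Redis Sentinel deployment can be configured from env
# vars. REDIS_SERVICE_NAME and REDIS_SENTINEL_PASSWORD are read in the code
# above; the REDIS_SENTINEL_NODES variable name and node format are assumptions.
import os

os.environ["REDIS_SENTINEL_NODES"] = '[["sentinel-1", 26379], ["sentinel-2", 26379]]'
os.environ["REDIS_SERVICE_NAME"] = "mymaster"
os.environ["REDIS_SENTINEL_PASSWORD"] = "s3cr3t"

from litellm._redis import get_redis_async_client

redis_client = get_redis_async_client()  # resolves the Sentinel master, authenticating with the password
# ---------------------------------------------------------------------------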
redis_kwargs["connection_class"] = connection_class diff --git a/litellm/caching/base_cache.py b/litellm/caching/base_cache.py index a50e09bf9..7109951d1 100644 --- a/litellm/caching/base_cache.py +++ b/litellm/caching/base_cache.py @@ -8,6 +8,7 @@ Has 4 methods: - async_get_cache """ +from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Optional if TYPE_CHECKING: @@ -18,7 +19,7 @@ else: Span = Any -class BaseCache: +class BaseCache(ABC): def __init__(self, default_ttl: int = 60): self.default_ttl = default_ttl @@ -37,6 +38,10 @@ class BaseCache: async def async_set_cache(self, key, value, **kwargs): raise NotImplementedError + @abstractmethod + async def async_set_cache_pipeline(self, cache_list, **kwargs): + pass + def get_cache(self, key, **kwargs): raise NotImplementedError diff --git a/litellm/caching/caching.py b/litellm/caching/caching.py index 5fd972a76..17c09b997 100644 --- a/litellm/caching/caching.py +++ b/litellm/caching/caching.py @@ -233,19 +233,18 @@ class Cache: if self.namespace is not None and isinstance(self.cache, RedisCache): self.cache.namespace = self.namespace - def get_cache_key(self, *args, **kwargs) -> str: + def get_cache_key(self, **kwargs) -> str: """ Get the cache key for the given arguments. Args: - *args: args to litellm.completion() or embedding() **kwargs: kwargs to litellm.completion() or embedding() Returns: str: The cache key generated from the arguments, or None if no cache key could be generated. """ cache_key = "" - verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs) + # verbose_logger.debug("\nGetting Cache key. Kwargs: %s", kwargs) preset_cache_key = self._get_preset_cache_key_from_kwargs(**kwargs) if preset_cache_key is not None: @@ -521,7 +520,7 @@ class Cache: return cached_response return cached_result - def get_cache(self, *args, **kwargs): + def get_cache(self, **kwargs): """ Retrieves the cached result for the given arguments. @@ -533,13 +532,13 @@ class Cache: The cached result if it exists, otherwise None. """ try: # never block execution - if self.should_use_cache(*args, **kwargs) is not True: + if self.should_use_cache(**kwargs) is not True: return messages = kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: - cache_key = self.get_cache_key(*args, **kwargs) + cache_key = self.get_cache_key(**kwargs) if cache_key is not None: cache_control_args = kwargs.get("cache", {}) max_age = cache_control_args.get( @@ -553,29 +552,28 @@ class Cache: print_verbose(f"An exception occurred: {traceback.format_exc()}") return None - async def async_get_cache(self, *args, **kwargs): + async def async_get_cache(self, **kwargs): """ Async get cache implementation. 
Used for embedding calls in async wrapper """ + try: # never block execution - if self.should_use_cache(*args, **kwargs) is not True: + if self.should_use_cache(**kwargs) is not True: return kwargs.get("messages", []) if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: - cache_key = self.get_cache_key(*args, **kwargs) + cache_key = self.get_cache_key(**kwargs) if cache_key is not None: cache_control_args = kwargs.get("cache", {}) max_age = cache_control_args.get( "s-max-age", cache_control_args.get("s-maxage", float("inf")) ) - cached_result = await self.cache.async_get_cache( - cache_key, *args, **kwargs - ) + cached_result = await self.cache.async_get_cache(cache_key, **kwargs) return self._get_cache_logic( cached_result=cached_result, max_age=max_age ) @@ -583,7 +581,7 @@ class Cache: print_verbose(f"An exception occurred: {traceback.format_exc()}") return None - def _add_cache_logic(self, result, *args, **kwargs): + def _add_cache_logic(self, result, **kwargs): """ Common implementation across sync + async add_cache functions """ @@ -591,7 +589,7 @@ class Cache: if "cache_key" in kwargs: cache_key = kwargs["cache_key"] else: - cache_key = self.get_cache_key(*args, **kwargs) + cache_key = self.get_cache_key(**kwargs) if cache_key is not None: if isinstance(result, BaseModel): result = result.model_dump_json() @@ -613,7 +611,7 @@ class Cache: except Exception as e: raise e - def add_cache(self, result, *args, **kwargs): + def add_cache(self, result, **kwargs): """ Adds a result to the cache. @@ -625,41 +623,42 @@ class Cache: None """ try: - if self.should_use_cache(*args, **kwargs) is not True: + if self.should_use_cache(**kwargs) is not True: return cache_key, cached_data, kwargs = self._add_cache_logic( - result=result, *args, **kwargs + result=result, **kwargs ) self.cache.set_cache(cache_key, cached_data, **kwargs) except Exception as e: verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}") - async def async_add_cache(self, result, *args, **kwargs): + async def async_add_cache(self, result, **kwargs): """ Async implementation of add_cache """ try: - if self.should_use_cache(*args, **kwargs) is not True: + if self.should_use_cache(**kwargs) is not True: return if self.type == "redis" and self.redis_flush_size is not None: # high traffic - fill in results in memory and then flush - await self.batch_cache_write(result, *args, **kwargs) + await self.batch_cache_write(result, **kwargs) else: cache_key, cached_data, kwargs = self._add_cache_logic( - result=result, *args, **kwargs + result=result, **kwargs ) + await self.cache.async_set_cache(cache_key, cached_data, **kwargs) except Exception as e: verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}") - async def async_add_cache_pipeline(self, result, *args, **kwargs): + async def async_add_cache_pipeline(self, result, **kwargs): """ Async implementation of add_cache for Embedding calls Does a bulk write, to prevent using too many clients """ try: - if self.should_use_cache(*args, **kwargs) is not True: + if self.should_use_cache(**kwargs) is not True: return # set default ttl if not set @@ -668,29 +667,27 @@ class Cache: cache_list = [] for idx, i in enumerate(kwargs["input"]): - preset_cache_key = self.get_cache_key(*args, **{**kwargs, "input": i}) + preset_cache_key = self.get_cache_key(**{**kwargs, "input": i}) kwargs["cache_key"] = preset_cache_key embedding_response = result.data[idx] cache_key, cached_data, kwargs = self._add_cache_logic( result=embedding_response, - *args, **kwargs, 
) cache_list.append((cache_key, cached_data)) - async_set_cache_pipeline = getattr( - self.cache, "async_set_cache_pipeline", None - ) - if async_set_cache_pipeline: - await async_set_cache_pipeline(cache_list=cache_list, **kwargs) - else: - tasks = [] - for val in cache_list: - tasks.append(self.cache.async_set_cache(val[0], val[1], **kwargs)) - await asyncio.gather(*tasks) + + await self.cache.async_set_cache_pipeline(cache_list=cache_list, **kwargs) + # if async_set_cache_pipeline: + # await async_set_cache_pipeline(cache_list=cache_list, **kwargs) + # else: + # tasks = [] + # for val in cache_list: + # tasks.append(self.cache.async_set_cache(val[0], val[1], **kwargs)) + # await asyncio.gather(*tasks) except Exception as e: verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}") - def should_use_cache(self, *args, **kwargs): + def should_use_cache(self, **kwargs): """ Returns true if we should use the cache for LLM API calls @@ -708,10 +705,8 @@ class Cache: return True return False - async def batch_cache_write(self, result, *args, **kwargs): - cache_key, cached_data, kwargs = self._add_cache_logic( - result=result, *args, **kwargs - ) + async def batch_cache_write(self, result, **kwargs): + cache_key, cached_data, kwargs = self._add_cache_logic(result=result, **kwargs) await self.cache.batch_cache_write(cache_key, cached_data, **kwargs) async def ping(self): diff --git a/litellm/caching/caching_handler.py b/litellm/caching/caching_handler.py index f4e7d8476..11ae600b7 100644 --- a/litellm/caching/caching_handler.py +++ b/litellm/caching/caching_handler.py @@ -137,7 +137,7 @@ class LLMCachingHandler: if litellm.cache is not None and self._is_call_type_supported_by_cache( original_function=original_function ): - print_verbose("Checking Cache") + verbose_logger.debug("Checking Cache") cached_result = await self._retrieve_from_cache( call_type=call_type, kwargs=kwargs, @@ -145,7 +145,7 @@ class LLMCachingHandler: ) if cached_result is not None and not isinstance(cached_result, list): - print_verbose("Cache Hit!") + verbose_logger.debug("Cache Hit!") cache_hit = True end_time = datetime.datetime.now() model, _, _, _ = litellm.get_llm_provider( @@ -215,6 +215,7 @@ class LLMCachingHandler: final_embedding_cached_response=final_embedding_cached_response, embedding_all_elements_cache_hit=embedding_all_elements_cache_hit, ) + verbose_logger.debug(f"CACHE RESULT: {cached_result}") return CachingHandlerResponse( cached_result=cached_result, final_embedding_cached_response=final_embedding_cached_response, @@ -233,12 +234,19 @@ class LLMCachingHandler: from litellm.utils import CustomStreamWrapper args = args or () + new_kwargs = kwargs.copy() + new_kwargs.update( + convert_args_to_kwargs( + self.original_function, + args, + ) + ) cached_result: Optional[Any] = None if litellm.cache is not None and self._is_call_type_supported_by_cache( original_function=original_function ): print_verbose("Checking Cache") - cached_result = litellm.cache.get_cache(*args, **kwargs) + cached_result = litellm.cache.get_cache(**new_kwargs) if cached_result is not None: if "detail" in cached_result: # implies an error occurred @@ -475,14 +483,21 @@ class LLMCachingHandler: if litellm.cache is None: return None + new_kwargs = kwargs.copy() + new_kwargs.update( + convert_args_to_kwargs( + self.original_function, + args, + ) + ) cached_result: Optional[Any] = None if call_type == CallTypes.aembedding.value and isinstance( - kwargs["input"], list + new_kwargs["input"], list ): tasks = [] - for idx, i in 
enumerate(kwargs["input"]): + for idx, i in enumerate(new_kwargs["input"]): preset_cache_key = litellm.cache.get_cache_key( - *args, **{**kwargs, "input": i} + **{**new_kwargs, "input": i} ) tasks.append(litellm.cache.async_get_cache(cache_key=preset_cache_key)) cached_result = await asyncio.gather(*tasks) @@ -493,9 +508,9 @@ class LLMCachingHandler: cached_result = None else: if litellm.cache._supports_async() is True: - cached_result = await litellm.cache.async_get_cache(*args, **kwargs) + cached_result = await litellm.cache.async_get_cache(**new_kwargs) else: # for s3 caching. [NOT RECOMMENDED IN PROD - this will slow down responses since boto3 is sync] - cached_result = litellm.cache.get_cache(*args, **kwargs) + cached_result = litellm.cache.get_cache(**new_kwargs) return cached_result def _convert_cached_result_to_model_response( @@ -580,6 +595,7 @@ class LLMCachingHandler: model_response_object=EmbeddingResponse(), response_type="embedding", ) + elif ( call_type == CallTypes.arerank.value or call_type == CallTypes.rerank.value ) and isinstance(cached_result, dict): @@ -603,6 +619,13 @@ class LLMCachingHandler: response_type="audio_transcription", hidden_params=hidden_params, ) + + if ( + hasattr(cached_result, "_hidden_params") + and cached_result._hidden_params is not None + and isinstance(cached_result._hidden_params, dict) + ): + cached_result._hidden_params["cache_hit"] = True return cached_result def _convert_cached_stream_response( @@ -658,12 +681,19 @@ class LLMCachingHandler: Raises: None """ - kwargs.update(convert_args_to_kwargs(result, original_function, kwargs, args)) + + new_kwargs = kwargs.copy() + new_kwargs.update( + convert_args_to_kwargs( + original_function, + args, + ) + ) if litellm.cache is None: return # [OPTIONAL] ADD TO CACHE if self._should_store_result_in_cache( - original_function=original_function, kwargs=kwargs + original_function=original_function, kwargs=new_kwargs ): if ( isinstance(result, litellm.ModelResponse) @@ -673,29 +703,29 @@ class LLMCachingHandler: ): if ( isinstance(result, EmbeddingResponse) - and isinstance(kwargs["input"], list) + and isinstance(new_kwargs["input"], list) and litellm.cache is not None and not isinstance( litellm.cache.cache, S3Cache ) # s3 doesn't support bulk writing. Exclude. 
): asyncio.create_task( - litellm.cache.async_add_cache_pipeline(result, **kwargs) + litellm.cache.async_add_cache_pipeline(result, **new_kwargs) ) elif isinstance(litellm.cache.cache, S3Cache): threading.Thread( target=litellm.cache.add_cache, args=(result,), - kwargs=kwargs, + kwargs=new_kwargs, ).start() else: asyncio.create_task( litellm.cache.async_add_cache( - result.model_dump_json(), **kwargs + result.model_dump_json(), **new_kwargs ) ) else: - asyncio.create_task(litellm.cache.async_add_cache(result, **kwargs)) + asyncio.create_task(litellm.cache.async_add_cache(result, **new_kwargs)) def sync_set_cache( self, @@ -706,16 +736,20 @@ class LLMCachingHandler: """ Sync internal method to add the result to the cache """ - kwargs.update( - convert_args_to_kwargs(result, self.original_function, kwargs, args) + new_kwargs = kwargs.copy() + new_kwargs.update( + convert_args_to_kwargs( + self.original_function, + args, + ) ) if litellm.cache is None: return if self._should_store_result_in_cache( - original_function=self.original_function, kwargs=kwargs + original_function=self.original_function, kwargs=new_kwargs ): - litellm.cache.add_cache(result, **kwargs) + litellm.cache.add_cache(result, **new_kwargs) return @@ -865,9 +899,7 @@ class LLMCachingHandler: def convert_args_to_kwargs( - result: Any, original_function: Callable, - kwargs: Dict[str, Any], args: Optional[Tuple[Any, ...]] = None, ) -> Dict[str, Any]: # Get the signature of the original function diff --git a/litellm/caching/disk_cache.py b/litellm/caching/disk_cache.py index 2c086ed50..94f82926d 100644 --- a/litellm/caching/disk_cache.py +++ b/litellm/caching/disk_cache.py @@ -24,7 +24,6 @@ class DiskCache(BaseCache): self.disk_cache = dc.Cache(disk_cache_dir) def set_cache(self, key, value, **kwargs): - print_verbose("DiskCache: set_cache") if "ttl" in kwargs: self.disk_cache.set(key, value, expire=kwargs["ttl"]) else: @@ -33,10 +32,10 @@ class DiskCache(BaseCache): async def async_set_cache(self, key, value, **kwargs): self.set_cache(key=key, value=value, **kwargs) - async def async_set_cache_pipeline(self, cache_list, ttl=None): + async def async_set_cache_pipeline(self, cache_list, **kwargs): for cache_key, cache_value in cache_list: - if ttl is not None: - self.set_cache(key=cache_key, value=cache_value, ttl=ttl) + if "ttl" in kwargs: + self.set_cache(key=cache_key, value=cache_value, ttl=kwargs["ttl"]) else: self.set_cache(key=cache_key, value=cache_value) diff --git a/litellm/caching/dual_cache.py b/litellm/caching/dual_cache.py index ddcd02abe..a6c218c01 100644 --- a/litellm/caching/dual_cache.py +++ b/litellm/caching/dual_cache.py @@ -314,7 +314,8 @@ class DualCache(BaseCache): f"LiteLLM Cache: Excepton async add_cache: {str(e)}" ) - async def async_batch_set_cache( + # async_batch_set_cache + async def async_set_cache_pipeline( self, cache_list: list, local_only: bool = False, **kwargs ): """ diff --git a/litellm/caching/qdrant_semantic_cache.py b/litellm/caching/qdrant_semantic_cache.py index be67001f6..acaa8e918 100644 --- a/litellm/caching/qdrant_semantic_cache.py +++ b/litellm/caching/qdrant_semantic_cache.py @@ -9,6 +9,7 @@ Has 4 methods: """ import ast +import asyncio import json from typing import Any @@ -422,3 +423,9 @@ class QdrantSemanticCache(BaseCache): async def _collection_info(self): return self.collection_info + + async def async_set_cache_pipeline(self, cache_list, **kwargs): + tasks = [] + for val in cache_list: + tasks.append(self.async_set_cache(val[0], val[1], **kwargs)) + await 
asyncio.gather(*tasks) diff --git a/litellm/caching/redis_cache.py b/litellm/caching/redis_cache.py index 40bb49f44..ba5c3a695 100644 --- a/litellm/caching/redis_cache.py +++ b/litellm/caching/redis_cache.py @@ -20,6 +20,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Tuple import litellm from litellm._logging import print_verbose, verbose_logger from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs +from litellm.types.caching import RedisPipelineIncrementOperation from litellm.types.services import ServiceLoggerPayload, ServiceTypes from litellm.types.utils import all_litellm_params @@ -404,7 +405,7 @@ class RedisCache(BaseCache): parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs), ) ) - return results + return None except Exception as e: ## LOGGING ## end_time = time.time() @@ -890,3 +891,92 @@ class RedisCache(BaseCache): def delete_cache(self, key): self.redis_client.delete(key) + + async def _pipeline_increment_helper( + self, + pipe: pipeline, + increment_list: List[RedisPipelineIncrementOperation], + ) -> Optional[List[float]]: + """Helper function for pipeline increment operations""" + # Iterate through each increment operation and add commands to pipeline + for increment_op in increment_list: + cache_key = self.check_and_fix_namespace(key=increment_op["key"]) + print_verbose( + f"Increment ASYNC Redis Cache PIPELINE: key: {cache_key}\nValue {increment_op['increment_value']}\nttl={increment_op['ttl']}" + ) + pipe.incrbyfloat(cache_key, increment_op["increment_value"]) + if increment_op["ttl"] is not None: + _td = timedelta(seconds=increment_op["ttl"]) + pipe.expire(cache_key, _td) + # Execute the pipeline and return results + results = await pipe.execute() + print_verbose(f"Increment ASYNC Redis Cache PIPELINE: results: {results}") + return results + + async def async_increment_pipeline( + self, increment_list: List[RedisPipelineIncrementOperation], **kwargs + ) -> Optional[List[float]]: + """ + Use Redis Pipelines for bulk increment operations + Args: + increment_list: List of RedisPipelineIncrementOperation dicts containing: + - key: str + - increment_value: float + - ttl_seconds: int + """ + # don't waste a network request if there's nothing to increment + if len(increment_list) == 0: + return None + + from redis.asyncio import Redis + + _redis_client: Redis = self.init_async_client() # type: ignore + start_time = time.time() + + print_verbose( + f"Increment Async Redis Cache Pipeline: increment list: {increment_list}" + ) + + try: + async with _redis_client as redis_client: + async with redis_client.pipeline(transaction=True) as pipe: + results = await self._pipeline_increment_helper( + pipe, increment_list + ) + + print_verbose(f"pipeline increment results: {results}") + + ## LOGGING ## + end_time = time.time() + _duration = end_time - start_time + asyncio.create_task( + self.service_logger_obj.async_service_success_hook( + service=ServiceTypes.REDIS, + duration=_duration, + call_type="async_increment_pipeline", + start_time=start_time, + end_time=end_time, + parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs), + ) + ) + return results + except Exception as e: + ## LOGGING ## + end_time = time.time() + _duration = end_time - start_time + asyncio.create_task( + self.service_logger_obj.async_service_failure_hook( + service=ServiceTypes.REDIS, + duration=_duration, + error=e, + call_type="async_increment_pipeline", + start_time=start_time, + end_time=end_time, + parent_otel_span=_get_parent_otel_span_from_kwargs(kwargs), 
+ ) + ) + verbose_logger.error( + "LiteLLM Redis Caching: async increment_pipeline() - Got exception from REDIS %s", + str(e), + ) + raise e diff --git a/litellm/caching/redis_semantic_cache.py b/litellm/caching/redis_semantic_cache.py index 444a3259f..e3098f085 100644 --- a/litellm/caching/redis_semantic_cache.py +++ b/litellm/caching/redis_semantic_cache.py @@ -9,6 +9,7 @@ Has 4 methods: """ import ast +import asyncio import json from typing import Any @@ -331,3 +332,9 @@ class RedisSemanticCache(BaseCache): async def _index_info(self): return await self.index.ainfo() + + async def async_set_cache_pipeline(self, cache_list, **kwargs): + tasks = [] + for val in cache_list: + tasks.append(self.async_set_cache(val[0], val[1], **kwargs)) + await asyncio.gather(*tasks) diff --git a/litellm/caching/s3_cache.py b/litellm/caching/s3_cache.py index c22347a7f..6be16e289 100644 --- a/litellm/caching/s3_cache.py +++ b/litellm/caching/s3_cache.py @@ -10,6 +10,7 @@ Has 4 methods: """ import ast +import asyncio import json from typing import Any, Optional @@ -153,3 +154,9 @@ class S3Cache(BaseCache): async def disconnect(self): pass + + async def async_set_cache_pipeline(self, cache_list, **kwargs): + tasks = [] + for val in cache_list: + tasks.append(self.async_set_cache(val[0], val[1], **kwargs)) + await asyncio.gather(*tasks) diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 0be7f1d38..50bed6fe9 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -28,6 +28,9 @@ from litellm.llms.azure_ai.cost_calculator import ( from litellm.llms.AzureOpenAI.cost_calculation import ( cost_per_token as azure_openai_cost_per_token, ) +from litellm.llms.bedrock.image.cost_calculator import ( + cost_calculator as bedrock_image_cost_calculator, +) from litellm.llms.cohere.cost_calculator import ( cost_per_query as cohere_rerank_cost_per_query, ) @@ -43,6 +46,9 @@ from litellm.llms.OpenAI.cost_calculation import ( from litellm.llms.OpenAI.cost_calculation import cost_per_token as openai_cost_per_token from litellm.llms.OpenAI.cost_calculation import cost_router as openai_cost_router from litellm.llms.together_ai.cost_calculator import get_model_params_and_category +from litellm.llms.vertex_ai_and_google_ai_studio.image_generation.cost_calculator import ( + cost_calculator as vertex_ai_image_cost_calculator, +) from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.rerank import RerankResponse from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS @@ -168,7 +174,6 @@ def cost_per_token( # noqa: PLR0915 model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) - model_without_prefix = model model_parts = model.split("/", 1) if len(model_parts) > 1: @@ -451,7 +456,6 @@ def _select_model_name_for_cost_calc( if base_model is not None: return base_model - return_model = model if isinstance(completion_response, str): return return_model @@ -521,12 +525,13 @@ def completion_cost( # noqa: PLR0915 custom_llm_provider=None, region_name=None, # used for bedrock pricing ### IMAGE GEN ### - size=None, + size: Optional[str] = None, quality=None, n=None, # number of images ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, custom_cost_per_second: Optional[float] = None, + optional_params: Optional[dict] = None, ) -> float: """ Calculate the cost of a given completion call fot GPT-3.5-turbo, llama2, any litellm supported llm. 
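# ---------------------------------------------------------------------------
# Illustrative sketch (not applied by this patch): async_set_cache_pipeline is
# now an abstract method on BaseCache, and backends without a native bulk
# write (QdrantSemanticCache, RedisSemanticCache, S3Cache above) satisfy it by
# fanning out to async_set_cache. A hypothetical custom cache can follow the
# same pattern:
import asyncio

from litellm.caching.base_cache import BaseCache


class MyCustomCache(BaseCache):
    async def async_set_cache(self, key, value, **kwargs):
        ...  # write a single entry to the backing store

    async def async_set_cache_pipeline(self, cache_list, **kwargs):
        # no bulk endpoint available: mirror the asyncio.gather() pattern used above
        await asyncio.gather(
            *(self.async_set_cache(key, value, **kwargs) for key, value in cache_list)
        )
# ---------------------------------------------------------------------------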
@@ -616,7 +621,8 @@ def completion_cost( # noqa: PLR0915 f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} " ) model = _select_model_name_for_cost_calc( - model=model, completion_response=completion_response + model=model, + completion_response=completion_response, ) hidden_params = getattr(completion_response, "_hidden_params", None) if hidden_params is not None: @@ -664,10 +670,22 @@ def completion_cost( # noqa: PLR0915 ): ### IMAGE GENERATION COST CALCULATION ### if custom_llm_provider == "vertex_ai": - # https://cloud.google.com/vertex-ai/generative-ai/pricing - # Vertex Charges Flat $0.20 per image - return 0.020 - + if isinstance(completion_response, ImageResponse): + return vertex_ai_image_cost_calculator( + model=model, + image_response=completion_response, + ) + elif custom_llm_provider == "bedrock": + if isinstance(completion_response, ImageResponse): + return bedrock_image_cost_calculator( + model=model, + size=size, + image_response=completion_response, + optional_params=optional_params, + ) + raise TypeError( + "completion_response must be of type ImageResponse for bedrock image cost calculation" + ) if size is None: size = "1024-x-1024" # openai default # fix size to match naming convention @@ -677,9 +695,9 @@ def completion_cost( # noqa: PLR0915 image_gen_model_name_with_quality = image_gen_model_name if quality is not None: image_gen_model_name_with_quality = f"{quality}/{image_gen_model_name}" - size = size.split("-x-") - height = int(size[0]) # if it's 1024-x-1024 vs. 1024x1024 - width = int(size[1]) + size_parts = size.split("-x-") + height = int(size_parts[0]) # if it's 1024-x-1024 vs. 1024x1024 + width = int(size_parts[1]) verbose_logger.debug(f"image_gen_model_name: {image_gen_model_name}") verbose_logger.debug( f"image_gen_model_name_with_quality: {image_gen_model_name_with_quality}" @@ -839,11 +857,14 @@ def response_cost_calculator( if isinstance(response_object, BaseModel): response_object._hidden_params["optional_params"] = optional_params if isinstance(response_object, ImageResponse): + if base_model is not None: + model = base_model response_cost = completion_cost( completion_response=response_object, model=model, call_type=call_type, custom_llm_provider=custom_llm_provider, + optional_params=optional_params, ) else: if custom_pricing is True: # override defaults if custom pricing is set diff --git a/litellm/integrations/SlackAlerting/slack_alerting.py b/litellm/integrations/SlackAlerting/slack_alerting.py index 85d54a337..d585e235b 100644 --- a/litellm/integrations/SlackAlerting/slack_alerting.py +++ b/litellm/integrations/SlackAlerting/slack_alerting.py @@ -423,7 +423,7 @@ class SlackAlerting(CustomBatchLogger): latency_cache_keys = [(key, 0) for key in latency_keys] failed_request_cache_keys = [(key, 0) for key in failed_request_keys] combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys - await self.internal_usage_cache.async_batch_set_cache( + await self.internal_usage_cache.async_set_cache_pipeline( cache_list=combined_metrics_cache_keys ) diff --git a/litellm/integrations/custom_batch_logger.py b/litellm/integrations/custom_batch_logger.py index aa7f0bba2..7ef63d25c 100644 --- a/litellm/integrations/custom_batch_logger.py +++ b/litellm/integrations/custom_batch_logger.py @@ -21,6 +21,7 @@ class CustomBatchLogger(CustomLogger): self, flush_lock: Optional[asyncio.Lock] = None, batch_size: Optional[int] = DEFAULT_BATCH_SIZE, + flush_interval: Optional[int] = DEFAULT_FLUSH_INTERVAL_SECONDS, 
**kwargs, ) -> None: """ @@ -28,7 +29,7 @@ class CustomBatchLogger(CustomLogger): flush_lock (Optional[asyncio.Lock], optional): Lock to use when flushing the queue. Defaults to None. Only used for custom loggers that do batching """ self.log_queue: List = [] - self.flush_interval = DEFAULT_FLUSH_INTERVAL_SECONDS # 10 seconds + self.flush_interval = flush_interval or DEFAULT_FLUSH_INTERVAL_SECONDS self.batch_size: int = batch_size or DEFAULT_BATCH_SIZE self.last_flush_time = time.time() self.flush_lock = flush_lock diff --git a/litellm/integrations/datadog/datadog.py b/litellm/integrations/datadog/datadog.py index 40044ce9f..482c2bc10 100644 --- a/litellm/integrations/datadog/datadog.py +++ b/litellm/integrations/datadog/datadog.py @@ -32,9 +32,11 @@ from litellm.llms.custom_httpx.http_handler import ( get_async_httpx_client, httpxSpecialProvider, ) +from litellm.proxy._types import UserAPIKeyAuth +from litellm.types.integrations.datadog import * from litellm.types.services import ServiceLoggerPayload +from litellm.types.utils import StandardLoggingPayload -from .types import DD_ERRORS, DatadogPayload, DataDogStatus from .utils import make_json_serializable DD_MAX_BATCH_SIZE = 1000 # max number of logs DD API can accept @@ -106,20 +108,20 @@ class DataDogLogger(CustomBatchLogger): verbose_logger.debug( "Datadog: Logging - Enters logging function for model %s", kwargs ) - dd_payload = self.create_datadog_logging_payload( - kwargs=kwargs, - response_obj=response_obj, - start_time=start_time, - end_time=end_time, - ) + await self._log_async_event(kwargs, response_obj, start_time, end_time) - self.log_queue.append(dd_payload) + except Exception as e: + verbose_logger.exception( + f"Datadog Layer Error - {str(e)}\n{traceback.format_exc()}" + ) + pass + + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): + try: verbose_logger.debug( - f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..." + "Datadog: Logging - Enters logging function for model %s", kwargs ) - - if len(self.log_queue) >= self.batch_size: - await self.async_send_batch() + await self._log_async_event(kwargs, response_obj, start_time, end_time) except Exception as e: verbose_logger.exception( @@ -181,12 +183,20 @@ class DataDogLogger(CustomBatchLogger): verbose_logger.debug( "Datadog: Logging - Enters logging function for model %s", kwargs ) - dd_payload = self.create_datadog_logging_payload( - kwargs=kwargs, - response_obj=response_obj, - start_time=start_time, - end_time=end_time, - ) + if litellm.datadog_use_v1 is True: + dd_payload = self._create_v0_logging_payload( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + ) + else: + dd_payload = self.create_datadog_logging_payload( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + ) response = self.sync_client.post( url=self.intake_url, @@ -215,6 +225,22 @@ class DataDogLogger(CustomBatchLogger): pass pass + async def _log_async_event(self, kwargs, response_obj, start_time, end_time): + dd_payload = self.create_datadog_logging_payload( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + ) + + self.log_queue.append(dd_payload) + verbose_logger.debug( + f"Datadog, event added to queue. Will flush in {self.flush_interval} seconds..." 
+ ) + + if len(self.log_queue) >= self.batch_size: + await self.async_send_batch() + def create_datadog_logging_payload( self, kwargs: Union[dict, Any], @@ -236,73 +262,29 @@ class DataDogLogger(CustomBatchLogger): """ import json - litellm_params = kwargs.get("litellm_params", {}) - metadata = ( - litellm_params.get("metadata", {}) or {} - ) # if litellm_params['metadata'] == None - messages = kwargs.get("messages") - optional_params = kwargs.get("optional_params", {}) - call_type = kwargs.get("call_type", "litellm.completion") - cache_hit = kwargs.get("cache_hit", False) - usage = response_obj["usage"] - id = response_obj.get("id", str(uuid.uuid4())) - usage = dict(usage) - try: - response_time = (end_time - start_time).total_seconds() * 1000 - except Exception: - response_time = None + standard_logging_object: Optional[StandardLoggingPayload] = kwargs.get( + "standard_logging_object", None + ) + if standard_logging_object is None: + raise ValueError("standard_logging_object not found in kwargs") - try: - response_obj = dict(response_obj) - except Exception: - response_obj = response_obj - - # Clean Metadata before logging - never log raw metadata - # the raw metadata can contain circular references which leads to infinite recursion - # we clean out all extra litellm metadata params before logging - clean_metadata = {} - if isinstance(metadata, dict): - for key, value in metadata.items(): - # clean litellm metadata before logging - if key in [ - "endpoint", - "caching_groups", - "previous_models", - ]: - continue - else: - clean_metadata[key] = value + status = DataDogStatus.INFO + if standard_logging_object.get("status") == "failure": + status = DataDogStatus.ERROR # Build the initial payload - payload = { - "id": id, - "call_type": call_type, - "cache_hit": cache_hit, - "start_time": start_time, - "end_time": end_time, - "response_time": response_time, - "model": kwargs.get("model", ""), - "user": kwargs.get("user", ""), - "model_parameters": optional_params, - "spend": kwargs.get("response_cost", 0), - "messages": messages, - "response": response_obj, - "usage": usage, - "metadata": clean_metadata, - } - - make_json_serializable(payload) - json_payload = json.dumps(payload) + make_json_serializable(standard_logging_object) + json_payload = json.dumps(standard_logging_object) verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload) dd_payload = DatadogPayload( - ddsource=os.getenv("DD_SOURCE", "litellm"), - ddtags="", - hostname="", + ddsource=self._get_datadog_source(), + ddtags=self._get_datadog_tags(), + hostname=self._get_datadog_hostname(), message=json_payload, - service="litellm-server", - status=DataDogStatus.INFO, + service=self._get_datadog_service(), + status=status, ) return dd_payload @@ -382,3 +364,140 @@ class DataDogLogger(CustomBatchLogger): No user has asked for this so far, this might be spammy on datatdog. 
If need arises we can implement this """ return + + async def async_post_call_failure_hook( + self, + request_data: dict, + original_exception: Exception, + user_api_key_dict: UserAPIKeyAuth, + ): + """ + Handles Proxy Errors (not-related to LLM API), ex: Authentication Errors + """ + import json + + _exception_payload = DatadogProxyFailureHookJsonMessage( + exception=str(original_exception), + error_class=str(original_exception.__class__.__name__), + status_code=getattr(original_exception, "status_code", None), + traceback=traceback.format_exc(), + user_api_key_dict=user_api_key_dict.model_dump(), + ) + + json_payload = json.dumps(_exception_payload) + verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload) + dd_payload = DatadogPayload( + ddsource=self._get_datadog_source(), + ddtags=self._get_datadog_tags(), + hostname=self._get_datadog_hostname(), + message=json_payload, + service=self._get_datadog_service(), + status=DataDogStatus.ERROR, + ) + + self.log_queue.append(dd_payload) + + def _create_v0_logging_payload( + self, + kwargs: Union[dict, Any], + response_obj: Any, + start_time: datetime.datetime, + end_time: datetime.datetime, + ) -> DatadogPayload: + """ + Note: This is our V1 Version of DataDog Logging Payload + + + (Not Recommended) If you want this to get logged set `litellm.datadog_use_v1 = True` + """ + import json + + litellm_params = kwargs.get("litellm_params", {}) + metadata = ( + litellm_params.get("metadata", {}) or {} + ) # if litellm_params['metadata'] == None + messages = kwargs.get("messages") + optional_params = kwargs.get("optional_params", {}) + call_type = kwargs.get("call_type", "litellm.completion") + cache_hit = kwargs.get("cache_hit", False) + usage = response_obj["usage"] + id = response_obj.get("id", str(uuid.uuid4())) + usage = dict(usage) + try: + response_time = (end_time - start_time).total_seconds() * 1000 + except Exception: + response_time = None + + try: + response_obj = dict(response_obj) + except Exception: + response_obj = response_obj + + # Clean Metadata before logging - never log raw metadata + # the raw metadata can contain circular references which leads to infinite recursion + # we clean out all extra litellm metadata params before logging + clean_metadata = {} + if isinstance(metadata, dict): + for key, value in metadata.items(): + # clean litellm metadata before logging + if key in [ + "endpoint", + "caching_groups", + "previous_models", + ]: + continue + else: + clean_metadata[key] = value + + # Build the initial payload + payload = { + "id": id, + "call_type": call_type, + "cache_hit": cache_hit, + "start_time": start_time, + "end_time": end_time, + "response_time": response_time, + "model": kwargs.get("model", ""), + "user": kwargs.get("user", ""), + "model_parameters": optional_params, + "spend": kwargs.get("response_cost", 0), + "messages": messages, + "response": response_obj, + "usage": usage, + "metadata": clean_metadata, + } + + make_json_serializable(payload) + json_payload = json.dumps(payload) + + verbose_logger.debug("Datadog: Logger - Logging payload = %s", json_payload) + + dd_payload = DatadogPayload( + ddsource=self._get_datadog_source(), + ddtags=self._get_datadog_tags(), + hostname=self._get_datadog_hostname(), + message=json_payload, + service=self._get_datadog_service(), + status=DataDogStatus.INFO, + ) + return dd_payload + + @staticmethod + def _get_datadog_tags(): + return f"env:{os.getenv('DD_ENV', 'unknown')},service:{os.getenv('DD_SERVICE', 'litellm')},version:{os.getenv('DD_VERSION', 
'unknown')}" + + @staticmethod + def _get_datadog_source(): + return os.getenv("DD_SOURCE", "litellm") + + @staticmethod + def _get_datadog_service(): + return os.getenv("DD_SERVICE", "litellm-server") + + @staticmethod + def _get_datadog_hostname(): + return "" + + @staticmethod + def _get_datadog_env(): + return os.getenv("DD_ENV", "unknown") diff --git a/litellm/integrations/gcs_bucket/gcs_bucket.py b/litellm/integrations/gcs_bucket/gcs_bucket.py index 111730d1f..83b831904 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket.py @@ -1,3 +1,4 @@ +import asyncio import json import os import uuid @@ -10,10 +11,12 @@ from pydantic import BaseModel, Field import litellm from litellm._logging import verbose_logger +from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.proxy._types import CommonProxyErrors, SpendLogsMetadata, SpendLogsPayload +from litellm.types.integrations.gcs_bucket import * from litellm.types.utils import ( StandardCallbackDynamicParams, StandardLoggingMetadata, @@ -27,12 +30,8 @@ else: IAM_AUTH_KEY = "IAM_AUTH" - - -class GCSLoggingConfig(TypedDict): - bucket_name: str - vertex_instance: VertexBase - path_service_account: Optional[str] +GCS_DEFAULT_BATCH_SIZE = 2048 +GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20 class GCSBucketLogger(GCSBucketBase): @@ -41,6 +40,21 @@ class GCSBucketLogger(GCSBucketBase): super().__init__(bucket_name=bucket_name) self.vertex_instances: Dict[str, VertexBase] = {} + + # Init Batch logging settings + self.log_queue: List[GCSLogQueueItem] = [] + self.batch_size = int(os.getenv("GCS_BATCH_SIZE", GCS_DEFAULT_BATCH_SIZE)) + self.flush_interval = int( + os.getenv("GCS_FLUSH_INTERVAL", GCS_DEFAULT_FLUSH_INTERVAL_SECONDS) + ) + asyncio.create_task(self.periodic_flush()) + self.flush_lock = asyncio.Lock() + super().__init__( + flush_lock=self.flush_lock, + batch_size=self.batch_size, + flush_interval=self.flush_interval, + ) + if premium_user is not True: raise ValueError( f"GCS Bucket logging is a premium feature. Please upgrade to use it. 
{CommonProxyErrors.not_premium_user.value}" @@ -60,54 +74,23 @@ class GCSBucketLogger(GCSBucketBase): kwargs, response_obj, ) - gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config( - kwargs - ) - headers = await self.construct_request_headers( - vertex_instance=gcs_logging_config["vertex_instance"], - service_account_json=gcs_logging_config["path_service_account"], - ) - bucket_name = gcs_logging_config["bucket_name"] - logging_payload: Optional[StandardLoggingPayload] = kwargs.get( "standard_logging_object", None ) - if logging_payload is None: raise ValueError("standard_logging_object not found in kwargs") - json_logged_payload = json.dumps(logging_payload, default=str) - - # Get the current date - current_date = datetime.now().strftime("%Y-%m-%d") - - # Modify the object_name to include the date-based folder - object_name = f"{current_date}/{response_obj['id']}" - try: - response = await self.async_httpx_client.post( - headers=headers, - url=f"https://storage.googleapis.com/upload/storage/v1/b/{bucket_name}/o?uploadType=media&name={object_name}", - data=json_logged_payload, + # Add to logging queue - this will be flushed periodically + self.log_queue.append( + GCSLogQueueItem( + payload=logging_payload, kwargs=kwargs, response_obj=response_obj ) - except httpx.HTTPStatusError as e: - raise Exception(f"GCS Bucket logging error: {e.response.text}") + ) - if response.status_code != 200: - verbose_logger.error("GCS Bucket logging error: %s", str(response.text)) - - verbose_logger.debug("GCS Bucket response %s", response) - verbose_logger.debug("GCS Bucket status code %s", response.status_code) - verbose_logger.debug("GCS Bucket response.text %s", response.text) except Exception as e: verbose_logger.exception(f"GCS Bucket logging error: {str(e)}") async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - from litellm.proxy.proxy_server import premium_user - - if premium_user is not True: - raise ValueError( - f"GCS Bucket logging is a premium feature. Please upgrade to use it. 
{CommonProxyErrors.not_premium_user.value}" - ) try: verbose_logger.debug( "GCS Logger: async_log_failure_event logging kwargs: %s, response_obj: %s", @@ -115,51 +98,138 @@ class GCSBucketLogger(GCSBucketBase): response_obj, ) - gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config( - kwargs - ) - headers = await self.construct_request_headers( - vertex_instance=gcs_logging_config["vertex_instance"], - service_account_json=gcs_logging_config["path_service_account"], - ) - bucket_name = gcs_logging_config["bucket_name"] - logging_payload: Optional[StandardLoggingPayload] = kwargs.get( "standard_logging_object", None ) - if logging_payload is None: raise ValueError("standard_logging_object not found in kwargs") - _litellm_params = kwargs.get("litellm_params") or {} - metadata = _litellm_params.get("metadata") or {} - - json_logged_payload = json.dumps(logging_payload, default=str) - - # Get the current date - current_date = datetime.now().strftime("%Y-%m-%d") - - # Modify the object_name to include the date-based folder - object_name = f"{current_date}/failure-{uuid.uuid4().hex}" - - if "gcs_log_id" in metadata: - object_name = metadata["gcs_log_id"] - - response = await self.async_httpx_client.post( - headers=headers, - url=f"https://storage.googleapis.com/upload/storage/v1/b/{bucket_name}/o?uploadType=media&name={object_name}", - data=json_logged_payload, + # Add to logging queue - this will be flushed periodically + self.log_queue.append( + GCSLogQueueItem( + payload=logging_payload, kwargs=kwargs, response_obj=response_obj + ) ) - if response.status_code != 200: - verbose_logger.error("GCS Bucket logging error: %s", str(response.text)) - - verbose_logger.debug("GCS Bucket response %s", response) - verbose_logger.debug("GCS Bucket status code %s", response.status_code) - verbose_logger.debug("GCS Bucket response.text %s", response.text) except Exception as e: verbose_logger.exception(f"GCS Bucket logging error: {str(e)}") + async def async_send_batch(self): + """ + Process queued logs in batch - sends logs to GCS Bucket + + + GCS Bucket does not have a Batch endpoint to batch upload logs + + Instead, we + - collect the logs to flush every `GCS_FLUSH_INTERVAL` seconds + - during async_send_batch, we make 1 POST request per log to GCS Bucket + + """ + if not self.log_queue: + return + + try: + for log_item in self.log_queue: + logging_payload = log_item["payload"] + kwargs = log_item["kwargs"] + response_obj = log_item.get("response_obj", None) or {} + + gcs_logging_config: GCSLoggingConfig = ( + await self.get_gcs_logging_config(kwargs) + ) + headers = await self.construct_request_headers( + vertex_instance=gcs_logging_config["vertex_instance"], + service_account_json=gcs_logging_config["path_service_account"], + ) + bucket_name = gcs_logging_config["bucket_name"] + object_name = self._get_object_name( + kwargs, logging_payload, response_obj + ) + await self._log_json_data_on_gcs( + headers=headers, + bucket_name=bucket_name, + object_name=object_name, + logging_payload=logging_payload, + ) + + # Clear the queue after processing + self.log_queue.clear() + + except Exception as e: + verbose_logger.exception(f"GCS Bucket batch logging error: {str(e)}") + + def _get_object_name( + self, kwargs: Dict, logging_payload: StandardLoggingPayload, response_obj: Any + ) -> str: + """ + Get the object name to use for the current payload + """ + current_date = datetime.now().strftime("%Y-%m-%d") + if logging_payload.get("error_str", None) is not None: + object_name = 
f"{current_date}/failure-{uuid.uuid4().hex}" + else: + object_name = f"{current_date}/{response_obj.get('id', '')}" + + # used for testing + _litellm_params = kwargs.get("litellm_params", None) or {} + _metadata = _litellm_params.get("metadata", None) or {} + if "gcs_log_id" in _metadata: + object_name = _metadata["gcs_log_id"] + + return object_name + + def _handle_folders_in_bucket_name( + self, + bucket_name: str, + object_name: str, + ) -> Tuple[str, str]: + """ + Handles when the user passes a bucket name with a folder postfix + + + Example: + - Bucket name: "my-bucket/my-folder/dev" + - Object name: "my-object" + - Returns: bucket_name="my-bucket", object_name="my-folder/dev/my-object" + + """ + if "/" in bucket_name: + bucket_name, prefix = bucket_name.split("/", 1) + object_name = f"{prefix}/{object_name}" + return bucket_name, object_name + return bucket_name, object_name + + async def _log_json_data_on_gcs( + self, + headers: Dict[str, str], + bucket_name: str, + object_name: str, + logging_payload: StandardLoggingPayload, + ): + """ + Helper function to make POST request to GCS Bucket in the specified bucket. + """ + json_logged_payload = json.dumps(logging_payload, default=str) + + bucket_name, object_name = self._handle_folders_in_bucket_name( + bucket_name=bucket_name, + object_name=object_name, + ) + + response = await self.async_httpx_client.post( + headers=headers, + url=f"https://storage.googleapis.com/upload/storage/v1/b/{bucket_name}/o?uploadType=media&name={object_name}", + data=json_logged_payload, + ) + + if response.status_code != 200: + verbose_logger.error("GCS Bucket logging error: %s", str(response.text)) + + verbose_logger.debug("GCS Bucket response %s", response) + verbose_logger.debug("GCS Bucket status code %s", response.status_code) + verbose_logger.debug("GCS Bucket response.text %s", response.text) + async def get_gcs_logging_config( self, kwargs: Optional[Dict[str, Any]] = {} ) -> GCSLoggingConfig: @@ -267,6 +337,11 @@ class GCSBucketLogger(GCSBucketBase): service_account_json=gcs_logging_config["path_service_account"], ) bucket_name = gcs_logging_config["bucket_name"] + bucket_name, object_name = self._handle_folders_in_bucket_name( + bucket_name=bucket_name, + object_name=object_name, + ) + url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{object_name}?alt=media" # Send the GET request to download the object @@ -302,6 +377,11 @@ class GCSBucketLogger(GCSBucketBase): service_account_json=gcs_logging_config["path_service_account"], ) bucket_name = gcs_logging_config["bucket_name"] + bucket_name, object_name = self._handle_folders_in_bucket_name( + bucket_name=bucket_name, + object_name=object_name, + ) + url = f"https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{object_name}" # Send the DELETE request to delete the object diff --git a/litellm/integrations/gcs_bucket/gcs_bucket_base.py b/litellm/integrations/gcs_bucket/gcs_bucket_base.py index 56df3aa80..9615b9b21 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket_base.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket_base.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field import litellm from litellm._logging import verbose_logger -from litellm.integrations.custom_logger import CustomLogger +from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.llms.custom_httpx.http_handler import ( get_async_httpx_client, httpxSpecialProvider, @@ -21,8 +21,8 @@ else: VertexBase = Any -class GCSBucketBase(CustomLogger): - def __init__(self, 
bucket_name: Optional[str] = None) -> None: +class GCSBucketBase(CustomBatchLogger): + def __init__(self, bucket_name: Optional[str] = None, **kwargs) -> None: self.async_httpx_client = get_async_httpx_client( llm_provider=httpxSpecialProvider.LoggingCallback ) @@ -30,6 +30,7 @@ class GCSBucketBase(CustomLogger): _bucket_name = bucket_name or os.getenv("GCS_BUCKET_NAME") self.path_service_account_json: Optional[str] = _path_service_account self.BUCKET_NAME: Optional[str] = _bucket_name + super().__init__(**kwargs) async def construct_request_headers( self, diff --git a/litellm/integrations/langfuse/langfuse.py b/litellm/integrations/langfuse/langfuse.py index 18892871e..73485a0bd 100644 --- a/litellm/integrations/langfuse/langfuse.py +++ b/litellm/integrations/langfuse/langfuse.py @@ -3,8 +3,9 @@ import copy import os import traceback +import types from collections.abc import MutableMapping, MutableSequence, MutableSet -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional, cast from packaging.version import Version from pydantic import BaseModel @@ -355,17 +356,28 @@ class LangFuseLogger: ) ) - def _prepare_metadata(self, metadata) -> Any: + def is_base_type(self, value: Any) -> bool: + # Check if the value is of a base type + base_types = (int, float, str, bool, list, dict, tuple) + return isinstance(value, base_types) + + def _prepare_metadata(self, metadata: Optional[dict]) -> Any: try: - return copy.deepcopy(metadata) # Avoid modifying the original metadata - except (TypeError, copy.Error) as e: - verbose_logger.warning(f"Langfuse Layer Error - {e}") + if metadata is None: + return None + + # Filter out function types from the metadata + sanitized_metadata = {k: v for k, v in metadata.items() if not callable(v)} + + return copy.deepcopy(sanitized_metadata) + except Exception as e: + verbose_logger.debug(f"Langfuse Layer Error - {e}, metadata: {metadata}") new_metadata: Dict[str, Any] = {} # if metadata is not a MutableMapping, return an empty dict since we can't call items() on it if not isinstance(metadata, MutableMapping): - verbose_logger.warning( + verbose_logger.debug( "Langfuse Layer Logging - metadata is not a MutableMapping, returning empty dict" ) return new_metadata @@ -373,25 +385,40 @@ class LangFuseLogger: for key, value in metadata.items(): try: if isinstance(value, MutableMapping): - new_metadata[key] = self._prepare_metadata(value) - elif isinstance(value, (MutableSequence, MutableSet)): - new_metadata[key] = type(value)( - *( - ( - self._prepare_metadata(v) - if isinstance(v, MutableMapping) - else copy.deepcopy(v) - ) - for v in value + new_metadata[key] = self._prepare_metadata(cast(dict, value)) + elif isinstance(value, MutableSequence): + # For lists or other mutable sequences + new_metadata[key] = list( + ( + self._prepare_metadata(cast(dict, v)) + if isinstance(v, MutableMapping) + else copy.deepcopy(v) ) + for v in value + ) + elif isinstance(value, MutableSet): + # For sets specifically, create a new set by passing an iterable + new_metadata[key] = set( + ( + self._prepare_metadata(cast(dict, v)) + if isinstance(v, MutableMapping) + else copy.deepcopy(v) + ) + for v in value ) elif isinstance(value, BaseModel): new_metadata[key] = value.model_dump() + elif self.is_base_type(value): + new_metadata[key] = value else: - new_metadata[key] = copy.deepcopy(value) + verbose_logger.debug( + f"Langfuse Layer Error - Unsupported metadata type: {type(value)} for key: {key}" + ) + continue + except (TypeError, 
copy.Error): - verbose_logger.warning( - f"Langfuse Layer Error - Couldn't copy metadata key: {key} - {traceback.format_exc()}" + verbose_logger.debug( + f"Langfuse Layer Error - Couldn't copy metadata key: {key}, type of key: {type(key)}, type of value: {type(value)} - {traceback.format_exc()}" ) return new_metadata diff --git a/litellm/integrations/langsmith.py b/litellm/integrations/langsmith.py index 951393445..4abd2a2c3 100644 --- a/litellm/integrations/langsmith.py +++ b/litellm/integrations/langsmith.py @@ -23,34 +23,8 @@ from litellm.llms.custom_httpx.http_handler import ( get_async_httpx_client, httpxSpecialProvider, ) -from litellm.types.utils import StandardLoggingPayload - - -class LangsmithInputs(BaseModel): - model: Optional[str] = None - messages: Optional[List[Any]] = None - stream: Optional[bool] = None - call_type: Optional[str] = None - litellm_call_id: Optional[str] = None - completion_start_time: Optional[datetime] = None - temperature: Optional[float] = None - max_tokens: Optional[int] = None - custom_llm_provider: Optional[str] = None - input: Optional[List[Any]] = None - log_event_type: Optional[str] = None - original_response: Optional[Any] = None - response_cost: Optional[float] = None - - # LiteLLM Virtual Key specific fields - user_api_key: Optional[str] = None - user_api_key_user_id: Optional[str] = None - user_api_key_team_alias: Optional[str] = None - - -class LangsmithCredentialsObject(TypedDict): - LANGSMITH_API_KEY: str - LANGSMITH_PROJECT: str - LANGSMITH_BASE_URL: str +from litellm.types.integrations.langsmith import * +from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload def is_serializable(value): @@ -93,15 +67,16 @@ class LangsmithLogger(CustomBatchLogger): ) if _batch_size: self.batch_size = int(_batch_size) + self.log_queue: List[LangsmithQueueObject] = [] asyncio.create_task(self.periodic_flush()) self.flush_lock = asyncio.Lock() super().__init__(**kwargs, flush_lock=self.flush_lock) def get_credentials_from_env( self, - langsmith_api_key: Optional[str], - langsmith_project: Optional[str], - langsmith_base_url: Optional[str], + langsmith_api_key: Optional[str] = None, + langsmith_project: Optional[str] = None, + langsmith_base_url: Optional[str] = None, ) -> LangsmithCredentialsObject: _credentials_api_key = langsmith_api_key or os.getenv("LANGSMITH_API_KEY") @@ -132,42 +107,19 @@ class LangsmithLogger(CustomBatchLogger): LANGSMITH_PROJECT=_credentials_project, ) - def _prepare_log_data( # noqa: PLR0915 - self, kwargs, response_obj, start_time, end_time + def _prepare_log_data( + self, + kwargs, + response_obj, + start_time, + end_time, + credentials: LangsmithCredentialsObject, ): - import json - from datetime import datetime as dt - try: _litellm_params = kwargs.get("litellm_params", {}) or {} metadata = _litellm_params.get("metadata", {}) or {} - new_metadata = {} - for key, value in metadata.items(): - if ( - isinstance(value, list) - or isinstance(value, str) - or isinstance(value, int) - or isinstance(value, float) - ): - new_metadata[key] = value - elif isinstance(value, BaseModel): - new_metadata[key] = value.model_dump_json() - elif isinstance(value, dict): - for k, v in value.items(): - if isinstance(v, dt): - value[k] = v.isoformat() - new_metadata[key] = value - - metadata = new_metadata - - kwargs["user_api_key"] = metadata.get("user_api_key", None) - kwargs["user_api_key_user_id"] = metadata.get("user_api_key_user_id", None) - kwargs["user_api_key_team_alias"] = metadata.get( - 
"user_api_key_team_alias", None - ) - project_name = metadata.get( - "project_name", self.default_credentials["LANGSMITH_PROJECT"] + "project_name", credentials["LANGSMITH_PROJECT"] ) run_name = metadata.get("run_name", self.langsmith_default_run_name) run_id = metadata.get("id", None) @@ -175,16 +127,10 @@ class LangsmithLogger(CustomBatchLogger): trace_id = metadata.get("trace_id", None) session_id = metadata.get("session_id", None) dotted_order = metadata.get("dotted_order", None) - tags = metadata.get("tags", []) or [] verbose_logger.debug( f"Langsmith Logging - project_name: {project_name}, run_name {run_name}" ) - # filter out kwargs to not include any dicts, langsmith throws an erros when trying to log kwargs - # logged_kwargs = LangsmithInputs(**kwargs) - # kwargs = logged_kwargs.model_dump() - - # new_kwargs = {} # Ensure everything in the payload is converted to str payload: Optional[StandardLoggingPayload] = kwargs.get( "standard_logging_object", None @@ -193,7 +139,6 @@ class LangsmithLogger(CustomBatchLogger): if payload is None: raise Exception("Error logging request payload. Payload=none.") - new_kwargs = payload metadata = payload[ "metadata" ] # ensure logged metadata is json serializable @@ -201,12 +146,12 @@ class LangsmithLogger(CustomBatchLogger): data = { "name": run_name, "run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain" - "inputs": new_kwargs, - "outputs": new_kwargs["response"], + "inputs": payload, + "outputs": payload["response"], "session_name": project_name, - "start_time": new_kwargs["startTime"], - "end_time": new_kwargs["endTime"], - "tags": tags, + "start_time": payload["startTime"], + "end_time": payload["endTime"], + "tags": payload["request_tags"], "extra": metadata, } @@ -243,37 +188,6 @@ class LangsmithLogger(CustomBatchLogger): except Exception: raise - def _send_batch(self): - if not self.log_queue: - return - - langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"] - langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"] - - url = f"{langsmith_api_base}/runs/batch" - - headers = {"x-api-key": langsmith_api_key} - - try: - response = requests.post( - url=url, - json=self.log_queue, - headers=headers, - ) - - if response.status_code >= 300: - verbose_logger.error( - f"Langsmith Error: {response.status_code} - {response.text}" - ) - else: - verbose_logger.debug( - f"Batch of {len(self.log_queue)} runs successfully created" - ) - - self.log_queue.clear() - except Exception: - verbose_logger.exception("Langsmith Layer Error - Error sending batch.") - def log_success_event(self, kwargs, response_obj, start_time, end_time): try: sampling_rate = ( @@ -295,8 +209,20 @@ class LangsmithLogger(CustomBatchLogger): kwargs, response_obj, ) - data = self._prepare_log_data(kwargs, response_obj, start_time, end_time) - self.log_queue.append(data) + credentials = self._get_credentials_to_use_for_request(kwargs=kwargs) + data = self._prepare_log_data( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + credentials=credentials, + ) + self.log_queue.append( + LangsmithQueueObject( + data=data, + credentials=credentials, + ) + ) verbose_logger.debug( f"Langsmith, event added to queue. Will flush in {self.flush_interval} seconds..." 
) @@ -323,8 +249,20 @@ class LangsmithLogger(CustomBatchLogger): kwargs, response_obj, ) - data = self._prepare_log_data(kwargs, response_obj, start_time, end_time) - self.log_queue.append(data) + credentials = self._get_credentials_to_use_for_request(kwargs=kwargs) + data = self._prepare_log_data( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + credentials=credentials, + ) + self.log_queue.append( + LangsmithQueueObject( + data=data, + credentials=credentials, + ) + ) verbose_logger.debug( "Langsmith logging: queue length %s, batch size %s", len(self.log_queue), @@ -349,8 +287,20 @@ class LangsmithLogger(CustomBatchLogger): return # Skip logging verbose_logger.info("Langsmith Failure Event Logging!") try: - data = self._prepare_log_data(kwargs, response_obj, start_time, end_time) - self.log_queue.append(data) + credentials = self._get_credentials_to_use_for_request(kwargs=kwargs) + data = self._prepare_log_data( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + credentials=credentials, + ) + self.log_queue.append( + LangsmithQueueObject( + data=data, + credentials=credentials, + ) + ) verbose_logger.debug( "Langsmith logging: queue length %s, batch size %s", len(self.log_queue), @@ -365,31 +315,58 @@ class LangsmithLogger(CustomBatchLogger): async def async_send_batch(self): """ - sends runs to /batch endpoint + Handles sending batches of runs to Langsmith - Sends runs from self.log_queue + self.log_queue contains LangsmithQueueObjects + Each LangsmithQueueObject has the following: + - "credentials" - credentials to use for the request (langsmith_api_key, langsmith_project, langsmith_base_url) + - "data" - data to log on to langsmith for the request + + + This function + - groups the queue objects by credentials + - loops through each unique credentials and sends batches to Langsmith + + + This was added to support key/team based logging on langsmith + """ + if not self.log_queue: + return + + batch_groups = self._group_batches_by_credentials() + for batch_group in batch_groups.values(): + await self._log_batch_on_langsmith( + credentials=batch_group.credentials, + queue_objects=batch_group.queue_objects, + ) + + async def _log_batch_on_langsmith( + self, + credentials: LangsmithCredentialsObject, + queue_objects: List[LangsmithQueueObject], + ): + """ + Logs a batch of runs to Langsmith + sends runs to /batch endpoint for the given credentials + + Args: + credentials: LangsmithCredentialsObject + queue_objects: List[LangsmithQueueObject] Returns: None Raises: Does not raise an exception, will only verbose_logger.exception() """ - if not self.log_queue: - return - - langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"] - + langsmith_api_base = credentials["LANGSMITH_BASE_URL"] + langsmith_api_key = credentials["LANGSMITH_API_KEY"] url = f"{langsmith_api_base}/runs/batch" - - langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"] - headers = {"x-api-key": langsmith_api_key} + elements_to_log = [queue_object["data"] for queue_object in queue_objects] try: response = await self.async_httpx_client.post( url=url, - json={ - "post": self.log_queue, - }, + json={"post": elements_to_log}, headers=headers, ) response.raise_for_status() @@ -411,6 +388,74 @@ class LangsmithLogger(CustomBatchLogger): f"Langsmith Layer Error - {traceback.format_exc()}" ) + def _group_batches_by_credentials(self) -> Dict[CredentialsKey, BatchGroup]: + """Groups queue objects by credentials using a proper 
key structure""" + log_queue_by_credentials: Dict[CredentialsKey, BatchGroup] = {} + + for queue_object in self.log_queue: + credentials = queue_object["credentials"] + key = CredentialsKey( + api_key=credentials["LANGSMITH_API_KEY"], + project=credentials["LANGSMITH_PROJECT"], + base_url=credentials["LANGSMITH_BASE_URL"], + ) + + if key not in log_queue_by_credentials: + log_queue_by_credentials[key] = BatchGroup( + credentials=credentials, queue_objects=[] + ) + + log_queue_by_credentials[key].queue_objects.append(queue_object) + + return log_queue_by_credentials + + def _get_credentials_to_use_for_request( + self, kwargs: Dict[str, Any] + ) -> LangsmithCredentialsObject: + """ + Handles key/team based logging + + If standard_callback_dynamic_params are provided, use those credentials. + + Otherwise, use the default credentials. + """ + standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = ( + kwargs.get("standard_callback_dynamic_params", None) + ) + if standard_callback_dynamic_params is not None: + credentials = self.get_credentials_from_env( + langsmith_api_key=standard_callback_dynamic_params.get( + "langsmith_api_key", None + ), + langsmith_project=standard_callback_dynamic_params.get( + "langsmith_project", None + ), + langsmith_base_url=standard_callback_dynamic_params.get( + "langsmith_base_url", None + ), + ) + else: + credentials = self.default_credentials + return credentials + + def _send_batch(self): + """Calls async_send_batch in an event loop""" + if not self.log_queue: + return + + try: + # Try to get the existing event loop + loop = asyncio.get_event_loop() + if loop.is_running(): + # If we're already in an event loop, create a task + asyncio.create_task(self.async_send_batch()) + else: + # If no event loop is running, run the coroutine directly + loop.run_until_complete(self.async_send_batch()) + except RuntimeError: + # If we can't get an event loop, create a new one + asyncio.run(self.async_send_batch()) + def get_run_by_id(self, run_id): langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"] diff --git a/litellm/integrations/mlflow.py b/litellm/integrations/mlflow.py new file mode 100644 index 000000000..7268350d1 --- /dev/null +++ b/litellm/integrations/mlflow.py @@ -0,0 +1,247 @@ +import json +import threading +from typing import Optional + +from litellm._logging import verbose_logger +from litellm.integrations.custom_logger import CustomLogger + + +class MlflowLogger(CustomLogger): + def __init__(self): + from mlflow.tracking import MlflowClient + + self._client = MlflowClient() + + self._stream_id_to_span = {} + self._lock = threading.Lock() # lock for _stream_id_to_span + + def log_success_event(self, kwargs, response_obj, start_time, end_time): + self._handle_success(kwargs, response_obj, start_time, end_time) + + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + self._handle_success(kwargs, response_obj, start_time, end_time) + + def _handle_success(self, kwargs, response_obj, start_time, end_time): + """ + Log the success event as an MLflow span. + Note that this method is called asynchronously in the background thread. 
+ """ + from mlflow.entities import SpanStatusCode + + try: + verbose_logger.debug("MLflow logging start for success event") + + if kwargs.get("stream"): + self._handle_stream_event(kwargs, response_obj, start_time, end_time) + else: + span = self._start_span_or_trace(kwargs, start_time) + end_time_ns = int(end_time.timestamp() * 1e9) + self._end_span_or_trace( + span=span, + outputs=response_obj, + status=SpanStatusCode.OK, + end_time_ns=end_time_ns, + ) + except Exception: + verbose_logger.debug("MLflow Logging Error", stack_info=True) + + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + self._handle_failure(kwargs, response_obj, start_time, end_time) + + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): + self._handle_failure(kwargs, response_obj, start_time, end_time) + + def _handle_failure(self, kwargs, response_obj, start_time, end_time): + """ + Log the failure event as an MLflow span. + Note that this method is called *synchronously* unlike the success handler. + """ + from mlflow.entities import SpanEvent, SpanStatusCode + + try: + span = self._start_span_or_trace(kwargs, start_time) + + end_time_ns = int(end_time.timestamp() * 1e9) + + # Record exception info as event + if exception := kwargs.get("exception"): + span.add_event(SpanEvent.from_exception(exception)) + + self._end_span_or_trace( + span=span, + outputs=response_obj, + status=SpanStatusCode.ERROR, + end_time_ns=end_time_ns, + ) + + except Exception as e: + verbose_logger.debug(f"MLflow Logging Error - {e}", stack_info=True) + + def _handle_stream_event(self, kwargs, response_obj, start_time, end_time): + """ + Handle the success event for a streaming response. For streaming calls, + log_success_event handle is triggered for every chunk of the stream. + We create a single span for the entire stream request as follows: + + 1. For the first chunk, start a new span and store it in the map. + 2. For subsequent chunks, add the chunk as an event to the span. + 3. For the final chunk, end the span and remove the span from the map. + """ + from mlflow.entities import SpanStatusCode + + litellm_call_id = kwargs.get("litellm_call_id") + + if litellm_call_id not in self._stream_id_to_span: + with self._lock: + # Check again after acquiring lock + if litellm_call_id not in self._stream_id_to_span: + # Start a new span for the first chunk of the stream + span = self._start_span_or_trace(kwargs, start_time) + self._stream_id_to_span[litellm_call_id] = span + + # Add chunk as event to the span + span = self._stream_id_to_span[litellm_call_id] + self._add_chunk_events(span, response_obj) + + # If this is the final chunk, end the span. The final chunk + # has complete_streaming_response that gathers the full response. 
+ if final_response := kwargs.get("complete_streaming_response"): + end_time_ns = int(end_time.timestamp() * 1e9) + self._end_span_or_trace( + span=span, + outputs=final_response, + status=SpanStatusCode.OK, + end_time_ns=end_time_ns, + ) + + # Remove the stream_id from the map + with self._lock: + self._stream_id_to_span.pop(litellm_call_id) + + def _add_chunk_events(self, span, response_obj): + from mlflow.entities import SpanEvent + + try: + for choice in response_obj.choices: + span.add_event( + SpanEvent( + name="streaming_chunk", + attributes={"delta": json.dumps(choice.delta.model_dump())}, + ) + ) + except Exception: + verbose_logger.debug("Error adding chunk events to span", stack_info=True) + + def _construct_input(self, kwargs): + """Construct span inputs with optional parameters""" + inputs = {"messages": kwargs.get("messages")} + for key in ["functions", "tools", "stream", "tool_choice", "user"]: + if value := kwargs.get("optional_params", {}).pop(key, None): + inputs[key] = value + return inputs + + def _extract_attributes(self, kwargs): + """ + Extract span attributes from kwargs. + + With the latest version of litellm, the standard_logging_object contains + canonical information for logging. If it is not present, we extract + subset of attributes from other kwargs. + """ + attributes = { + "litellm_call_id": kwargs.get("litellm_call_id"), + "call_type": kwargs.get("call_type"), + "model": kwargs.get("model"), + } + standard_obj = kwargs.get("standard_logging_object") + if standard_obj: + attributes.update( + { + "api_base": standard_obj.get("api_base"), + "cache_hit": standard_obj.get("cache_hit"), + "usage": { + "completion_tokens": standard_obj.get("completion_tokens"), + "prompt_tokens": standard_obj.get("prompt_tokens"), + "total_tokens": standard_obj.get("total_tokens"), + }, + "raw_llm_response": standard_obj.get("response"), + "response_cost": standard_obj.get("response_cost"), + "saved_cache_cost": standard_obj.get("saved_cache_cost"), + } + ) + else: + litellm_params = kwargs.get("litellm_params", {}) + attributes.update( + { + "model": kwargs.get("model"), + "cache_hit": kwargs.get("cache_hit"), + "custom_llm_provider": kwargs.get("custom_llm_provider"), + "api_base": litellm_params.get("api_base"), + "response_cost": kwargs.get("response_cost"), + } + ) + return attributes + + def _get_span_type(self, call_type: Optional[str]) -> str: + from mlflow.entities import SpanType + + if call_type in ["completion", "acompletion"]: + return SpanType.LLM + elif call_type == "embeddings": + return SpanType.EMBEDDING + else: + return SpanType.LLM + + def _start_span_or_trace(self, kwargs, start_time): + """ + Start an MLflow span or a trace. + + If there is an active span, we start a new span as a child of + that span. Otherwise, we start a new trace. 
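`_start_span_or_trace` above either attaches to the caller's active MLflow span or opens a new trace. A small usage sketch of that branching, using the same `mlflow.get_current_active_span()` call the logger relies on; `mlflow.start_span` is MLflow's fluent tracing API and is assumed to be available in the installed version, and the span name is arbitrary:

```python
import mlflow

# No active span here, so _start_span_or_trace() would fall through to
# MlflowClient.start_trace() and open a brand-new trace for the LLM call.
print(mlflow.get_current_active_span())  # -> None

# Inside a span, the logger would instead call MlflowClient.start_span() and
# parent the LiteLLM span under the caller's span via request_id / parent_id.
with mlflow.start_span(name="agent-step"):  # assumed fluent API usage
    active = mlflow.get_current_active_span()
    print(active.request_id, active.span_id)  # the ids the logger reuses as parent info
```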
+ """ + import mlflow + + call_type = kwargs.get("call_type", "completion") + span_name = f"litellm-{call_type}" + span_type = self._get_span_type(call_type) + start_time_ns = int(start_time.timestamp() * 1e9) + + inputs = self._construct_input(kwargs) + attributes = self._extract_attributes(kwargs) + + if active_span := mlflow.get_current_active_span(): # type: ignore + return self._client.start_span( + name=span_name, + request_id=active_span.request_id, + parent_id=active_span.span_id, + span_type=span_type, + inputs=inputs, + attributes=attributes, + start_time_ns=start_time_ns, + ) + else: + return self._client.start_trace( + name=span_name, + span_type=span_type, + inputs=inputs, + attributes=attributes, + start_time_ns=start_time_ns, + ) + + def _end_span_or_trace(self, span, outputs, end_time_ns, status): + """End an MLflow span or a trace.""" + if span.parent_id is None: + self._client.end_trace( + request_id=span.request_id, + outputs=outputs, + status=status, + end_time_ns=end_time_ns, + ) + else: + self._client.end_span( + request_id=span.request_id, + span_id=span.span_id, + outputs=outputs, + status=status, + end_time_ns=end_time_ns, + ) diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py index a1d4b781a..30a280e57 100644 --- a/litellm/integrations/opentelemetry.py +++ b/litellm/integrations/opentelemetry.py @@ -2,20 +2,23 @@ import os from dataclasses import dataclass from datetime import datetime from functools import wraps -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import litellm from litellm._logging import verbose_logger from litellm.integrations.custom_logger import CustomLogger from litellm.types.services import ServiceLoggerPayload from litellm.types.utils import ( + ChatCompletionMessageToolCall, EmbeddingResponse, + Function, ImageResponse, ModelResponse, StandardLoggingPayload, ) if TYPE_CHECKING: + from opentelemetry.sdk.trace.export import SpanExporter as _SpanExporter from opentelemetry.trace import Span as _Span from litellm.proxy._types import ( @@ -24,10 +27,12 @@ if TYPE_CHECKING: from litellm.proxy.proxy_server import UserAPIKeyAuth as _UserAPIKeyAuth Span = _Span + SpanExporter = _SpanExporter UserAPIKeyAuth = _UserAPIKeyAuth ManagementEndpointLoggingPayload = _ManagementEndpointLoggingPayload else: Span = Any + SpanExporter = Any UserAPIKeyAuth = Any ManagementEndpointLoggingPayload = Any @@ -44,7 +49,6 @@ LITELLM_REQUEST_SPAN_NAME = "litellm_request" @dataclass class OpenTelemetryConfig: - from opentelemetry.sdk.trace.export import SpanExporter exporter: Union[str, SpanExporter] = "console" endpoint: Optional[str] = None @@ -77,7 +81,7 @@ class OpenTelemetryConfig: class OpenTelemetry(CustomLogger): def __init__( self, - config: OpenTelemetryConfig = OpenTelemetryConfig.from_env(), + config: Optional[OpenTelemetryConfig] = None, callback_name: Optional[str] = None, **kwargs, ): @@ -85,6 +89,9 @@ class OpenTelemetry(CustomLogger): from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider + if config is None: + config = OpenTelemetryConfig.from_env() + self.config = config self.OTEL_EXPORTER = self.config.exporter self.OTEL_ENDPOINT = self.config.endpoint @@ -319,8 +326,8 @@ class OpenTelemetry(CustomLogger): span.end(end_time=self._to_ns(end_time)) - # if parent_otel_span is not None: - # parent_otel_span.end(end_time=self._to_ns(datetime.now())) + if parent_otel_span is not None: + 
parent_otel_span.end(end_time=self._to_ns(datetime.now())) def _handle_failure(self, kwargs, response_obj, start_time, end_time): from opentelemetry.trace import Status, StatusCode @@ -398,6 +405,28 @@ class OpenTelemetry(CustomLogger): except Exception: return "" + @staticmethod + def _tool_calls_kv_pair( + tool_calls: List[ChatCompletionMessageToolCall], + ) -> Dict[str, Any]: + from litellm.proxy._types import SpanAttributes + + kv_pairs: Dict[str, Any] = {} + for idx, tool_call in enumerate(tool_calls): + _function = tool_call.get("function") + if not _function: + continue + + keys = Function.__annotations__.keys() + for key in keys: + _value = _function.get(key) + if _value: + kv_pairs[ + f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.{key}" + ] = _value + + return kv_pairs + def set_attributes( # noqa: PLR0915 self, span: Span, kwargs, response_obj: Optional[Any] ): @@ -592,18 +621,13 @@ class OpenTelemetry(CustomLogger): message = choice.get("message") tool_calls = message.get("tool_calls") if tool_calls: - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.name", - value=tool_calls[0].get("function").get("name"), - ) - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_COMPLETIONS}.{idx}.function_call.arguments", - value=tool_calls[0] - .get("function") - .get("arguments"), - ) + kv_pairs = OpenTelemetry._tool_calls_kv_pair(tool_calls) # type: ignore + for key, value in kv_pairs.items(): + self.safe_set_attribute( + span=span, + key=key, + value=value, + ) except Exception as e: verbose_logger.exception( @@ -700,10 +724,10 @@ class OpenTelemetry(CustomLogger): TraceContextTextMapPropagator, ) - verbose_logger.debug("OpenTelemetry: GOT A TRACEPARENT {}".format(_traceparent)) propagator = TraceContextTextMapPropagator() - _parent_context = propagator.extract(carrier={"traceparent": _traceparent}) - verbose_logger.debug("OpenTelemetry: PARENT CONTEXT {}".format(_parent_context)) + carrier = {"traceparent": _traceparent} + _parent_context = propagator.extract(carrier=carrier) + return _parent_context def _get_span_context(self, kwargs): diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index cbeb4d336..1460a1d7f 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -18,6 +18,7 @@ from litellm.integrations.custom_logger import CustomLogger from litellm.proxy._types import UserAPIKeyAuth from litellm.types.integrations.prometheus import * from litellm.types.utils import StandardLoggingPayload +from litellm.utils import get_end_user_id_for_cost_tracking class PrometheusLogger(CustomLogger): @@ -228,6 +229,13 @@ class PrometheusLogger(CustomLogger): "api_key_alias", ], ) + # llm api provider budget metrics + self.litellm_provider_remaining_budget_metric = Gauge( + "litellm_provider_remaining_budget_metric", + "Remaining budget for provider - used when you set provider budget limits", + labelnames=["api_provider"], + ) + # Get all keys _logged_llm_labels = [ "litellm_model_name", @@ -357,8 +365,7 @@ class PrometheusLogger(CustomLogger): model = kwargs.get("model", "") litellm_params = kwargs.get("litellm_params", {}) or {} _metadata = litellm_params.get("metadata", {}) - proxy_server_request = litellm_params.get("proxy_server_request") or {} - end_user_id = proxy_server_request.get("body", {}).get("user", None) + end_user_id = get_end_user_id_for_cost_tracking(litellm_params) user_id = standard_logging_payload["metadata"]["user_api_key_user_id"] 
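The `_tool_calls_kv_pair` helper added to the OpenTelemetry logger above flattens every tool call, not just the first, into indexed span-attribute keys. A self-contained sketch of that flattening, with a plain string prefix standing in for `SpanAttributes.LLM_COMPLETIONS` and plain dicts standing in for `ChatCompletionMessageToolCall` objects:

```python
from typing import Any, Dict, List

# Assumed prefix for illustration; in the diff the key prefix comes from
# SpanAttributes.LLM_COMPLETIONS.
PREFIX = "gen_ai.completion"


def tool_calls_to_attributes(tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Flatten tool calls into {prefix.<idx>.function_call.<field>: value} pairs."""
    kv_pairs: Dict[str, Any] = {}
    for idx, tool_call in enumerate(tool_calls):
        function = tool_call.get("function") or {}
        for key in ("name", "arguments"):  # the function fields being exported
            value = function.get(key)
            if value:
                kv_pairs[f"{PREFIX}.{idx}.function_call.{key}"] = value
    return kv_pairs


print(tool_calls_to_attributes([
    {"function": {"name": "get_weather", "arguments": '{"city": "Paris"}'}},
    {"function": {"name": "get_time", "arguments": "{}"}},
]))
# {'gen_ai.completion.0.function_call.name': 'get_weather', ...,
#  'gen_ai.completion.1.function_call.name': 'get_time', ...}
```

Compared with the removed code, which only read `tool_calls[0]`, indexing by `idx` means parallel tool calls are no longer silently dropped from the span.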
user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"] user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"] @@ -657,13 +664,11 @@ class PrometheusLogger(CustomLogger): # unpack kwargs model = kwargs.get("model", "") - litellm_params = kwargs.get("litellm_params", {}) or {} standard_logging_payload: StandardLoggingPayload = kwargs.get( "standard_logging_object", {} ) - proxy_server_request = litellm_params.get("proxy_server_request") or {} - - end_user_id = proxy_server_request.get("body", {}).get("user", None) + litellm_params = kwargs.get("litellm_params", {}) or {} + end_user_id = get_end_user_id_for_cost_tracking(litellm_params) user_id = standard_logging_payload["metadata"]["user_api_key_user_id"] user_api_key = standard_logging_payload["metadata"]["user_api_key_hash"] user_api_key_alias = standard_logging_payload["metadata"]["user_api_key_alias"] @@ -1130,6 +1135,19 @@ class PrometheusLogger(CustomLogger): litellm_model_name, model_id, api_base, api_provider, exception_status ).inc() + def track_provider_remaining_budget( + self, provider: str, spend: float, budget_limit: float + ): + """ + Track provider remaining budget in Prometheus + """ + self.litellm_provider_remaining_budget_metric.labels(provider).set( + self._safe_get_remaining_budget( + max_budget=budget_limit, + spend=spend, + ) + ) + def _safe_get_remaining_budget( self, max_budget: Optional[float], spend: Optional[float] ) -> float: diff --git a/litellm/litellm_core_utils/README.md b/litellm/litellm_core_utils/README.md new file mode 100644 index 000000000..649404129 --- /dev/null +++ b/litellm/litellm_core_utils/README.md @@ -0,0 +1,12 @@ +## Folder Contents + +This folder contains general-purpose utilities that are used in multiple places in the codebase. + +Core files: +- `streaming_handler.py`: The core streaming logic + streaming related helper utils +- `core_helpers.py`: code used in `types/` - e.g. `map_finish_reason`. +- `exception_mapping_utils.py`: utils for mapping exceptions to openai-compatible error types. +- `default_encoding.py`: code for loading the default encoding (tiktoken) +- `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name. +- `duration_parser.py`: code for parsing durations - e.g. "1d", "1mo", "10s" + diff --git a/litellm/litellm_core_utils/core_helpers.py b/litellm/litellm_core_utils/core_helpers.py index cddca61ee..816dff81e 100644 --- a/litellm/litellm_core_utils/core_helpers.py +++ b/litellm/litellm_core_utils/core_helpers.py @@ -3,6 +3,8 @@ import os from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple, Union +import httpx + from litellm._logging import verbose_logger if TYPE_CHECKING: @@ -99,3 +101,28 @@ def _get_parent_otel_span_from_kwargs( "Error in _get_parent_otel_span_from_kwargs: " + str(e) ) return None + + +def process_response_headers(response_headers: Union[httpx.Headers, dict]) -> dict: + from litellm.types.utils import OPENAI_RESPONSE_HEADERS + + openai_headers = {} + processed_headers = {} + additional_headers = {} + + for k, v in response_headers.items(): + if k in OPENAI_RESPONSE_HEADERS: # return openai-compatible headers + openai_headers[k] = v + if k.startswith( + "llm_provider-" + ): # return raw provider headers (incl. 
openai-compatible ones) + processed_headers[k] = v + else: + additional_headers["{}-{}".format("llm_provider", k)] = v + + additional_headers = { + **openai_headers, + **processed_headers, + **additional_headers, + } + return additional_headers diff --git a/litellm/litellm_core_utils/default_encoding.py b/litellm/litellm_core_utils/default_encoding.py new file mode 100644 index 000000000..e09332582 --- /dev/null +++ b/litellm/litellm_core_utils/default_encoding.py @@ -0,0 +1,21 @@ +import os + +import litellm + +try: + # New and recommended way to access resources + from importlib import resources + + filename = str(resources.files(litellm).joinpath("llms/tokenizers")) +except (ImportError, AttributeError): + # Old way to access resources, which setuptools deprecated some time ago + import pkg_resources # type: ignore + + filename = pkg_resources.resource_filename(__name__, "llms/tokenizers") + +os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv( + "CUSTOM_TIKTOKEN_CACHE_DIR", filename +) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071 +import tiktoken + +encoding = tiktoken.get_encoding("cl100k_base") diff --git a/litellm/litellm_core_utils/duration_parser.py b/litellm/litellm_core_utils/duration_parser.py new file mode 100644 index 000000000..c8c6bea83 --- /dev/null +++ b/litellm/litellm_core_utils/duration_parser.py @@ -0,0 +1,92 @@ +""" +Helper utilities for parsing durations - 1s, 1d, 10d, 30d, 1mo, 2mo + +duration_in_seconds is used in diff parts of the code base, example +- Router - Provider budget routing +- Proxy - Key, Team Generation +""" + +import re +import time +from datetime import datetime, timedelta +from typing import Tuple + + +def _extract_from_regex(duration: str) -> Tuple[int, str]: + match = re.match(r"(\d+)(mo|[smhd]?)", duration) + + if not match: + raise ValueError("Invalid duration format") + + value, unit = match.groups() + value = int(value) + + return value, unit + + +def get_last_day_of_month(year, month): + # Handle December case + if month == 12: + return 31 + # Next month is January, so subtract a day from March 1st + next_month = datetime(year=year, month=month + 1, day=1) + last_day_of_month = (next_month - timedelta(days=1)).day + return last_day_of_month + + +def duration_in_seconds(duration: str) -> int: + """ + Parameters: + - duration: + - "s" - seconds + - "m" - minutes + - "h" - hours + - "d" - days + - "mo" - months + + Returns time in seconds till when budget needs to be reset + """ + value, unit = _extract_from_regex(duration=duration) + + if unit == "s": + return value + elif unit == "m": + return value * 60 + elif unit == "h": + return value * 3600 + elif unit == "d": + return value * 86400 + elif unit == "mo": + now = time.time() + current_time = datetime.fromtimestamp(now) + + if current_time.month == 12: + target_year = current_time.year + 1 + target_month = 1 + else: + target_year = current_time.year + target_month = current_time.month + value + + # Determine the day to set for next month + target_day = current_time.day + last_day_of_target_month = get_last_day_of_month(target_year, target_month) + + if target_day > last_day_of_target_month: + target_day = last_day_of_target_month + + next_month = datetime( + year=target_year, + month=target_month, + day=target_day, + hour=current_time.hour, + minute=current_time.minute, + second=current_time.second, + microsecond=current_time.microsecond, + ) + + # Calculate the duration until the first day of the next month + duration_until_next_month = next_month - 
current_time + return int(duration_until_next_month.total_seconds()) + + else: + raise ValueError(f"Unsupported duration unit, passed duration: {duration}") diff --git a/litellm/litellm_core_utils/exception_mapping_utils.py b/litellm/litellm_core_utils/exception_mapping_utils.py index a4a30fc31..3fb276611 100644 --- a/litellm/litellm_core_utils/exception_mapping_utils.py +++ b/litellm/litellm_core_utils/exception_mapping_utils.py @@ -239,7 +239,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ContextWindowExceededError: {exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif ( @@ -251,7 +251,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif "A timeout occurred" in error_str: @@ -271,7 +271,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ContentPolicyViolationError: {exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif ( @@ -283,7 +283,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif "Web server is returning an unknown error" in error_str: @@ -299,7 +299,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"RateLimitError: {exception_provider} - {message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif ( @@ -311,7 +311,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AuthenticationError: {exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif "Mistral API raised a streaming error" in error_str: @@ -335,7 +335,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 401: @@ -344,7 +344,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AuthenticationError: {exception_provider} - {message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 404: @@ -353,7 +353,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NotFoundError: {exception_provider} - {message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif 
original_exception.status_code == 408: @@ -516,7 +516,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {error_str}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "input is too long" in error_str: exception_mapping_worked = True @@ -524,7 +524,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif exception_type == "ModelError": exception_mapping_worked = True @@ -532,7 +532,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Request was throttled" in error_str: exception_mapping_worked = True @@ -540,7 +540,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {error_str}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif hasattr(original_exception, "status_code"): if original_exception.status_code == 401: @@ -549,7 +549,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( original_exception.status_code == 400 @@ -560,7 +560,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 422: exception_mapping_worked = True @@ -568,7 +568,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -583,7 +583,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -591,7 +591,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -599,7 +599,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) exception_mapping_worked = True raise APIError( @@ -631,7 +631,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{custom_llm_provider}Exception: Authentication Error - {error_str}", 
llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif "token_quota_reached" in error_str: @@ -640,7 +640,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{custom_llm_provider}Exception: Rate Limit Errror - {error_str}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "The server received an invalid response from an upstream server." @@ -750,7 +750,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {error_str}\n. Enable 'litellm.modify_params=True' (for PROXY do: `litellm_settings::modify_params: True`) to insert a dummy assistant message and fix this error.", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Malformed input request" in error_str: exception_mapping_worked = True @@ -758,7 +758,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {error_str}", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "A conversation must start with a user message." in error_str: exception_mapping_worked = True @@ -766,7 +766,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {error_str}\n. Pass in default user message via `completion(..,user_continue_message=)` or enable `litellm.modify_params=True`.\nFor Proxy: do via `litellm_settings::modify_params: True` or user_continue_message under `litellm_params`", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "Unable to locate credentials" in error_str @@ -778,7 +778,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException Invalid Authentication - {error_str}", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "AccessDeniedException" in error_str: exception_mapping_worked = True @@ -786,7 +786,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException PermissionDeniedError - {error_str}", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "throttlingException" in error_str @@ -797,7 +797,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException: Rate Limit Error - {error_str}", model=model, llm_provider="bedrock", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "Connect timeout on endpoint URL" in error_str @@ -836,7 +836,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", llm_provider="bedrock", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 400: exception_mapping_worked = True @@ -844,7 +844,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", llm_provider="bedrock", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif 
original_exception.status_code == 404: exception_mapping_worked = True @@ -852,7 +852,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", llm_provider="bedrock", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -868,7 +868,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 429: @@ -877,7 +877,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 503: @@ -886,7 +886,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BedrockException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 504: # gateway timeout error @@ -907,7 +907,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"litellm.BadRequestError: SagemakerException - {error_str}", model=model, llm_provider="sagemaker", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "Input validation error: `best_of` must be > 0 and <= 2" @@ -918,7 +918,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message="SagemakerException - the value of 'n' must be > 0 and <= 2 for sagemaker endpoints", model=model, llm_provider="sagemaker", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "`inputs` tokens + `max_new_tokens` must be <=" in error_str @@ -929,7 +929,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {error_str}", model=model, llm_provider="sagemaker", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif hasattr(original_exception, "status_code"): if original_exception.status_code == 500: @@ -951,7 +951,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 400: exception_mapping_worked = True @@ -959,7 +959,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 404: exception_mapping_worked = True @@ -967,7 +967,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, 
"response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -986,7 +986,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 429: @@ -995,7 +995,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 503: @@ -1004,7 +1004,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"SagemakerException - {original_exception.message}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 504: # gateway timeout error @@ -1124,10 +1124,13 @@ def exception_type( # type: ignore # noqa: PLR0915 ), ), ) - elif "500 Internal Server Error" in error_str: + elif ( + "500 Internal Server Error" in error_str + or "The model is overloaded." in error_str + ): exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"litellm.ServiceUnavailableError: VertexAIException - {error_str}", + raise litellm.InternalServerError( + message=f"litellm.InternalServerError: VertexAIException - {error_str}", model=model, llm_provider="vertex_ai", litellm_debug_info=extra_information, @@ -1214,7 +1217,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message="GeminiException - Invalid api key", model=model, llm_provider="palm", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if ( "504 Deadline expired before operation could complete." in error_str @@ -1232,7 +1235,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"GeminiException - {error_str}", model=model, llm_provider="palm", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if ( "500 An internal error has occurred." 
in error_str @@ -1259,7 +1262,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"GeminiException - {error_str}", model=model, llm_provider="palm", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes elif custom_llm_provider == "cloudflare": @@ -1269,7 +1272,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"Cloudflare Exception - {original_exception.message}", llm_provider="cloudflare", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if "must have required property" in error_str: exception_mapping_worked = True @@ -1277,7 +1280,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"Cloudflare Exception - {original_exception.message}", llm_provider="cloudflare", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( custom_llm_provider == "cohere" or custom_llm_provider == "cohere_chat" @@ -1291,7 +1294,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "too many tokens" in error_str: exception_mapping_worked = True @@ -1299,7 +1302,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", model=model, llm_provider="cohere", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif hasattr(original_exception, "status_code"): if ( @@ -1311,7 +1314,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -1326,7 +1329,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "CohereConnectionError" in exception_type @@ -1336,7 +1339,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "invalid type:" in error_str: exception_mapping_worked = True @@ -1344,7 +1347,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Unexpected server error" in error_str: exception_mapping_worked = True @@ -1352,7 +1355,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"CohereException - {original_exception.message}", llm_provider="cohere", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) else: if hasattr(original_exception, "status_code"): @@ -1372,7 +1375,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=error_str, model=model, llm_provider="huggingface", - 
response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "A valid user token is required" in error_str: exception_mapping_worked = True @@ -1380,7 +1383,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=error_str, llm_provider="huggingface", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Rate limit reached" in error_str: exception_mapping_worked = True @@ -1388,7 +1391,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=error_str, llm_provider="huggingface", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: @@ -1397,7 +1400,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 400: exception_mapping_worked = True @@ -1405,7 +1408,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"HuggingfaceException - {original_exception.message}", model=model, llm_provider="huggingface", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -1420,7 +1423,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 503: exception_mapping_worked = True @@ -1428,7 +1431,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"HuggingfaceException - {original_exception.message}", llm_provider="huggingface", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) else: exception_mapping_worked = True @@ -1447,7 +1450,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AI21Exception - {original_exception.message}", model=model, llm_provider="ai21", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if "Bad or missing API token." 
in original_exception.message: exception_mapping_worked = True @@ -1455,7 +1458,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AI21Exception - {original_exception.message}", model=model, llm_provider="ai21", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 401: @@ -1464,7 +1467,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AI21Exception - {original_exception.message}", llm_provider="ai21", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: exception_mapping_worked = True @@ -1479,7 +1482,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AI21Exception - {original_exception.message}", model=model, llm_provider="ai21", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -1487,7 +1490,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AI21Exception - {original_exception.message}", llm_provider="ai21", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) else: exception_mapping_worked = True @@ -1506,7 +1509,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {error_str}", model=model, llm_provider="nlp_cloud", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "value is not a valid" in error_str: exception_mapping_worked = True @@ -1514,7 +1517,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {error_str}", model=model, llm_provider="nlp_cloud", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) else: exception_mapping_worked = True @@ -1539,7 +1542,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {original_exception.message}", llm_provider="nlp_cloud", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( original_exception.status_code == 401 @@ -1550,7 +1553,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {original_exception.message}", llm_provider="nlp_cloud", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( original_exception.status_code == 522 @@ -1571,7 +1574,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {original_exception.message}", llm_provider="nlp_cloud", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( original_exception.status_code == 500 @@ -1594,7 +1597,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NLPCloudException - {original_exception.message}", model=model, llm_provider="nlp_cloud", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) else: exception_mapping_worked = True @@ -1620,7 +1623,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", model=model, llm_provider="together_ai", - response=original_exception.response, + response=getattr(original_exception, "response", 
None), ) elif ( "error" in error_response @@ -1631,7 +1634,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", llm_provider="together_ai", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "error" in error_response @@ -1642,7 +1645,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", model=model, llm_provider="together_ai", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "A timeout occurred" in error_str: exception_mapping_worked = True @@ -1661,7 +1664,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", model=model, llm_provider="together_ai", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif ( "error_type" in error_response @@ -1672,7 +1675,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", model=model, llm_provider="together_ai", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) if hasattr(original_exception, "status_code"): if original_exception.status_code == 408: @@ -1688,7 +1691,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {error_response['error']}", model=model, llm_provider="together_ai", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -1696,7 +1699,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"TogetherAIException - {original_exception.message}", llm_provider="together_ai", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 524: exception_mapping_worked = True @@ -1724,7 +1727,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AlephAlphaException - {original_exception.message}", llm_provider="aleph_alpha", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "InvalidToken" in error_str or "No token provided" in error_str: exception_mapping_worked = True @@ -1732,7 +1735,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AlephAlphaException - {original_exception.message}", llm_provider="aleph_alpha", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif hasattr(original_exception, "status_code"): verbose_logger.debug( @@ -1751,7 +1754,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AlephAlphaException - {original_exception.message}", llm_provider="aleph_alpha", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -1759,7 +1762,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AlephAlphaException - {original_exception.message}", llm_provider="aleph_alpha", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -1767,7 +1770,7 @@ def exception_type( # 
type: ignore # noqa: PLR0915 message=f"AlephAlphaException - {original_exception.message}", llm_provider="aleph_alpha", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) raise original_exception raise original_exception @@ -1784,7 +1787,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", model=model, llm_provider="ollama", - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Failed to establish a new connection" in error_str: exception_mapping_worked = True @@ -1792,7 +1795,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"OllamaException: {original_exception}", llm_provider="ollama", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Invalid response object from API" in error_str: exception_mapping_worked = True @@ -1800,7 +1803,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"OllamaException: {original_exception}", llm_provider="ollama", model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), ) elif "Read timed out" in error_str: exception_mapping_worked = True @@ -1834,6 +1837,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif "This model's maximum context length is" in error_str: exception_mapping_worked = True @@ -1842,6 +1846,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif "DeploymentNotFound" in error_str: exception_mapping_worked = True @@ -1850,6 +1855,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif ( ( @@ -1870,6 +1876,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif "invalid_request_error" in error_str: exception_mapping_worked = True @@ -1878,6 +1885,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif ( "The api_key client option must be set either by passing api_key to the client or by setting" @@ -1889,6 +1897,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider=custom_llm_provider, model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif "Connection error" in error_str: exception_mapping_worked = True @@ -1907,6 +1916,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 401: exception_mapping_worked = True @@ -1915,6 +1925,7 @@ def exception_type( # type: ignore # noqa: PLR0915 llm_provider="azure", model=model, litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 408: 
exception_mapping_worked = True @@ -1931,6 +1942,7 @@ def exception_type( # type: ignore # noqa: PLR0915 model=model, llm_provider="azure", litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -1939,6 +1951,7 @@ def exception_type( # type: ignore # noqa: PLR0915 model=model, llm_provider="azure", litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 503: exception_mapping_worked = True @@ -1947,6 +1960,7 @@ def exception_type( # type: ignore # noqa: PLR0915 model=model, llm_provider="azure", litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 504: # gateway timeout error exception_mapping_worked = True @@ -1986,7 +2000,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"{exception_provider} - {error_str}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 401: @@ -1995,7 +2009,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AuthenticationError: {exception_provider} - {error_str}", llm_provider=custom_llm_provider, model=model, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 404: @@ -2004,7 +2018,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"NotFoundError: {exception_provider} - {error_str}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 408: @@ -2021,7 +2035,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"BadRequestError: {exception_provider} - {error_str}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 429: @@ -2030,7 +2044,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"RateLimitError: {exception_provider} - {error_str}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 503: @@ -2039,7 +2053,7 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"ServiceUnavailableError: {exception_provider} - {error_str}", model=model, llm_provider=custom_llm_provider, - response=original_exception.response, + response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) elif original_exception.status_code == 504: # gateway timeout error diff --git a/litellm/litellm_core_utils/get_supported_openai_params.py b/litellm/litellm_core_utils/get_supported_openai_params.py index bb94d54d5..05b4b9c48 100644 --- a/litellm/litellm_core_utils/get_supported_openai_params.py +++ b/litellm/litellm_core_utils/get_supported_openai_params.py @@ -161,17 +161,7 @@ def get_supported_openai_params( # noqa: PLR0915 elif custom_llm_provider == "huggingface": return 
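The recurring change in the exception-mapping hunks above swaps direct `original_exception.response` access for `getattr(original_exception, "response", None)`. A minimal sketch of the difference, using a hypothetical provider exception class that is not part of litellm:

class ProviderError(Exception):
    """Hypothetical SDK error: some instances carry a .response, others do not."""

    def __init__(self, message, response=None):
        super().__init__(message)
        if response is not None:
            self.response = response  # only set when the SDK attached one


err = ProviderError("rate limited")       # no .response attribute on this instance
# err.response                            # direct access would raise AttributeError
print(getattr(err, "response", None))     # safe: falls back to None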
litellm.HuggingfaceConfig().get_supported_openai_params() elif custom_llm_provider == "together_ai": - return [ - "stream", - "temperature", - "max_tokens", - "top_p", - "stop", - "frequency_penalty", - "tools", - "tool_choice", - "response_format", - ] + return litellm.TogetherAIConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "ai21": return [ "stream", diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 2ab905e85..298e28974 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -28,6 +28,7 @@ from litellm.caching.caching_handler import LLMCachingHandler from litellm.cost_calculator import _select_model_name_for_cost_calc from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_logger import CustomLogger +from litellm.integrations.mlflow import MlflowLogger from litellm.litellm_core_utils.redact_messages import ( redact_message_input_output_from_custom_logger, redact_message_input_output_from_logging, @@ -201,6 +202,7 @@ class Logging: start_time, litellm_call_id: str, function_id: str, + litellm_trace_id: Optional[str] = None, dynamic_input_callbacks: Optional[ List[Union[str, Callable, CustomLogger]] ] = None, @@ -238,6 +240,7 @@ class Logging: self.start_time = start_time # log the call start time self.call_type = call_type self.litellm_call_id = litellm_call_id + self.litellm_trace_id = litellm_trace_id self.function_id = function_id self.streaming_chunks: List[Any] = [] # for generating complete stream response self.sync_streaming_chunks: List[Any] = ( @@ -274,6 +277,11 @@ class Logging: self.completion_start_time: Optional[datetime.datetime] = None self._llm_caching_handler: Optional[LLMCachingHandler] = None + self.model_call_details = { + "litellm_trace_id": litellm_trace_id, + "litellm_call_id": litellm_call_id, + } + def process_dynamic_callbacks(self): """ Initializes CustomLogger compatible callbacks in self.dynamic_* callbacks @@ -381,21 +389,23 @@ class Logging: self.logger_fn = litellm_params.get("logger_fn", None) verbose_logger.debug(f"self.optional_params: {self.optional_params}") - self.model_call_details = { - "model": self.model, - "messages": self.messages, - "optional_params": self.optional_params, - "litellm_params": self.litellm_params, - "start_time": self.start_time, - "stream": self.stream, - "user": user, - "call_type": str(self.call_type), - "litellm_call_id": self.litellm_call_id, - "completion_start_time": self.completion_start_time, - "standard_callback_dynamic_params": self.standard_callback_dynamic_params, - **self.optional_params, - **additional_params, - } + self.model_call_details.update( + { + "model": self.model, + "messages": self.messages, + "optional_params": self.optional_params, + "litellm_params": self.litellm_params, + "start_time": self.start_time, + "stream": self.stream, + "user": user, + "call_type": str(self.call_type), + "litellm_call_id": self.litellm_call_id, + "completion_start_time": self.completion_start_time, + "standard_callback_dynamic_params": self.standard_callback_dynamic_params, + **self.optional_params, + **additional_params, + } + ) ## check if stream options is set ## - used by CustomStreamWrapper for easy instrumentation if "stream_options" in additional_params: @@ -554,6 +564,7 @@ class Logging: message=f"Model Call Details pre-call: {details_to_log}", level="info", ) + elif isinstance(callback, CustomLogger): # custom logger class 
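The logging changes above seed `model_call_details` in the constructor (so `litellm_trace_id` is present from the start) and later merge additional fields with `.update()` instead of reassigning the dict. A minimal sketch of that pattern, with an illustrative `MiniLogging` class rather than the real `Logging` object:

from typing import Optional


class MiniLogging:
    """Illustrative stand-in for the Logging object, not the real class."""

    def __init__(self, litellm_call_id: str, litellm_trace_id: Optional[str] = None):
        # seeded at construction time, as in the diff above
        self.model_call_details = {
            "litellm_trace_id": litellm_trace_id,
            "litellm_call_id": litellm_call_id,
        }

    def update_environment_variables(self, model: str, **extra):
        # .update() merges new keys without discarding what __init__ stored
        self.model_call_details.update({"model": model, **extra})


log = MiniLogging(litellm_call_id="call-1", litellm_trace_id="trace-9")
log.update_environment_variables(model="gpt-4o", stream=False)
assert log.model_call_details["litellm_trace_id"] == "trace-9"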
callback.log_pre_api_call( model=self.model, @@ -923,19 +934,10 @@ class Logging: status="success", ) ) - if self.dynamic_success_callbacks is not None and isinstance( - self.dynamic_success_callbacks, list - ): - callbacks = self.dynamic_success_callbacks - ## keep the internal functions ## - for callback in litellm.success_callback: - if ( - isinstance(callback, CustomLogger) - and "_PROXY_" in callback.__class__.__name__ - ): - callbacks.append(callback) - else: - callbacks = litellm.success_callback + callbacks = get_combined_callback_list( + dynamic_success_callbacks=self.dynamic_success_callbacks, + global_callbacks=litellm.success_callback, + ) ## REDACT MESSAGES ## result = redact_message_input_output_from_logging( @@ -1249,6 +1251,7 @@ class Logging: end_time=end_time, print_verbose=print_verbose, ) + if ( callback == "openmeter" and self.model_call_details.get("litellm_params", {}).get( @@ -1356,8 +1359,11 @@ class Logging: and customLogger is not None ): # custom logger functions print_verbose( - "success callbacks: Running Custom Callback Function" + "success callbacks: Running Custom Callback Function - {}".format( + callback + ) ) + customLogger.log_event( kwargs=self.model_call_details, response_obj=result, @@ -1454,21 +1460,10 @@ class Logging: status="success", ) ) - if self.dynamic_async_success_callbacks is not None and isinstance( - self.dynamic_async_success_callbacks, list - ): - callbacks = self.dynamic_async_success_callbacks - ## keep the internal functions ## - for callback in litellm._async_success_callback: - callback_name = "" - if isinstance(callback, CustomLogger): - callback_name = callback.__class__.__name__ - if callable(callback): - callback_name = callback.__name__ - if "_PROXY_" in callback_name: - callbacks.append(callback) - else: - callbacks = litellm._async_success_callback + callbacks = get_combined_callback_list( + dynamic_success_callbacks=self.dynamic_async_success_callbacks, + global_callbacks=litellm._async_success_callback, + ) result = redact_message_input_output_from_logging( model_call_details=( @@ -1735,21 +1730,10 @@ class Logging: start_time=start_time, end_time=end_time, ) - callbacks = [] # init this to empty incase it's not created - - if self.dynamic_failure_callbacks is not None and isinstance( - self.dynamic_failure_callbacks, list - ): - callbacks = self.dynamic_failure_callbacks - ## keep the internal functions ## - for callback in litellm.failure_callback: - if ( - isinstance(callback, CustomLogger) - and "_PROXY_" in callback.__class__.__name__ - ): - callbacks.append(callback) - else: - callbacks = litellm.failure_callback + callbacks = get_combined_callback_list( + dynamic_success_callbacks=self.dynamic_failure_callbacks, + global_callbacks=litellm.failure_callback, + ) result = None # result sent to all loggers, init this to None incase it's not created @@ -1932,21 +1916,10 @@ class Logging: end_time=end_time, ) - callbacks = [] # init this to empty incase it's not created - - if self.dynamic_async_failure_callbacks is not None and isinstance( - self.dynamic_async_failure_callbacks, list - ): - callbacks = self.dynamic_async_failure_callbacks - ## keep the internal functions ## - for callback in litellm._async_failure_callback: - if ( - isinstance(callback, CustomLogger) - and "_PROXY_" in callback.__class__.__name__ - ): - callbacks.append(callback) - else: - callbacks = litellm._async_failure_callback + callbacks = get_combined_callback_list( + dynamic_success_callbacks=self.dynamic_async_failure_callbacks, + 
global_callbacks=litellm._async_failure_callback, + ) result = None # result sent to all loggers, init this to None incase it's not created for callback in callbacks: @@ -2338,6 +2311,15 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 _in_memory_loggers.append(_otel_logger) return _otel_logger # type: ignore + elif logging_integration == "mlflow": + for callback in _in_memory_loggers: + if isinstance(callback, MlflowLogger): + return callback # type: ignore + + _mlflow_logger = MlflowLogger() + _in_memory_loggers.append(_mlflow_logger) + return _mlflow_logger # type: ignore + def get_custom_logger_compatible_class( logging_integration: litellm._custom_logger_compatible_callbacks_literal, @@ -2439,6 +2421,12 @@ def get_custom_logger_compatible_class( and callback.callback_name == "langtrace" ): return callback + + elif logging_integration == "mlflow": + for callback in _in_memory_loggers: + if isinstance(callback, MlflowLogger): + return callback + return None @@ -2774,11 +2762,6 @@ def get_standard_logging_object_payload( metadata=metadata ) - if litellm.cache is not None: - cache_key = litellm.cache.get_cache_key(**kwargs) - else: - cache_key = None - saved_cache_cost: float = 0.0 if cache_hit is True: @@ -2811,6 +2794,7 @@ def get_standard_logging_object_payload( payload: StandardLoggingPayload = StandardLoggingPayload( id=str(id), + trace_id=kwargs.get("litellm_trace_id"), # type: ignore call_type=call_type or "", cache_hit=cache_hit, status=status, @@ -2820,7 +2804,7 @@ def get_standard_logging_object_payload( completionStartTime=completion_start_time_float, model=kwargs.get("model", "") or "", metadata=clean_metadata, - cache_key=cache_key, + cache_key=clean_hidden_params["cache_key"], response_cost=response_cost, total_tokens=usage.total_tokens, prompt_tokens=usage.prompt_tokens, @@ -2927,3 +2911,11 @@ def modify_integration(integration_name, integration_params): if integration_name == "supabase": if "table_name" in integration_params: Supabase.supabase_table_name = integration_params["table_name"] + + +def get_combined_callback_list( + dynamic_success_callbacks: Optional[List], global_callbacks: List +) -> List: + if dynamic_success_callbacks is None: + return global_callbacks + return list(set(dynamic_success_callbacks + global_callbacks)) diff --git a/litellm/litellm_core_utils/rules.py b/litellm/litellm_core_utils/rules.py new file mode 100644 index 000000000..beeb012d0 --- /dev/null +++ b/litellm/litellm_core_utils/rules.py @@ -0,0 +1,50 @@ +from typing import Optional + +import litellm + + +class Rules: + """ + Fail calls based on the input or llm api output + + Example usage: + import litellm + def my_custom_rule(input): # receives the model response + if "i don't think i can answer" in input: # trigger fallback if the model refuses to answer + return False + return True + + litellm.post_call_rules = [my_custom_rule] # have these be functions that can be called to fail a call + + response = litellm.completion(model="gpt-3.5-turbo", messages=[{"role": "user", + "content": "Hey, how's it going?"}], fallbacks=["openrouter/mythomax"]) + """ + + def __init__(self) -> None: + pass + + def pre_call_rules(self, input: str, model: str): + for rule in litellm.pre_call_rules: + if callable(rule): + decision = rule(input) + if decision is False: + raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore + return True + + def post_call_rules(self, input: Optional[str], model: str) -> bool: + if 
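For reference, the behavior of the new `get_combined_callback_list` helper added above, shown standalone: dynamic callbacks, when present, are merged with the global list and de-duplicated via `set()`, so ordering is not preserved. The function body is copied from the diff; the sample callback names are illustrative.

from typing import List, Optional


def get_combined_callback_list(
    dynamic_success_callbacks: Optional[List], global_callbacks: List
) -> List:
    if dynamic_success_callbacks is None:
        return global_callbacks
    return list(set(dynamic_success_callbacks + global_callbacks))


print(get_combined_callback_list(None, ["langfuse"]))                   # ['langfuse']
print(sorted(get_combined_callback_list(["s3"], ["langfuse", "s3"])))   # ['langfuse', 's3']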
input is None: + return True + for rule in litellm.post_call_rules: + if callable(rule): + decision = rule(input) + if isinstance(decision, bool): + if decision is False: + raise litellm.APIResponseValidationError(message="LLM Response failed post-call-rule check", llm_provider="", model=model) # type: ignore + elif isinstance(decision, dict): + decision_val = decision.get("decision", True) + decision_message = decision.get( + "message", "LLM Response failed post-call-rule check" + ) + if decision_val is False: + raise litellm.APIResponseValidationError(message=decision_message, llm_provider="", model=model) # type: ignore + return True diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py new file mode 100644 index 000000000..483121c38 --- /dev/null +++ b/litellm/litellm_core_utils/streaming_handler.py @@ -0,0 +1,2020 @@ +import asyncio +import json +import threading +import time +import traceback +import uuid +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Callable, List, Optional + +import httpx +from pydantic import BaseModel + +import litellm +from litellm import verbose_logger +from litellm.litellm_core_utils.redact_messages import ( + LiteLLMLoggingObject, + redact_message_input_output_from_logging, +) +from litellm.types.utils import Delta +from litellm.types.utils import GenericStreamingChunk as GChunk +from litellm.types.utils import ( + ModelResponse, + ModelResponseStream, + StreamingChoices, + Usage, +) + +from ..exceptions import OpenAIError +from .core_helpers import map_finish_reason, process_response_headers +from .default_encoding import encoding +from .exception_mapping_utils import exception_type +from .rules import Rules + +MAX_THREADS = 100 + +# Create a ThreadPoolExecutor +executor = ThreadPoolExecutor(max_workers=MAX_THREADS) + + +def print_verbose(print_statement): + try: + if litellm.set_verbose: + print(print_statement) # noqa + except Exception: + pass + + +class CustomStreamWrapper: + def __init__( + self, + completion_stream, + model, + logging_obj: Any, + custom_llm_provider: Optional[str] = None, + stream_options=None, + make_call: Optional[Callable] = None, + _response_headers: Optional[dict] = None, + ): + self.model = model + self.make_call = make_call + self.custom_llm_provider = custom_llm_provider + self.logging_obj: LiteLLMLoggingObject = logging_obj + self.completion_stream = completion_stream + self.sent_first_chunk = False + self.sent_last_chunk = False + self.system_fingerprint: Optional[str] = None + self.received_finish_reason: Optional[str] = None + self.intermittent_finish_reason: Optional[str] = ( + None # finish reasons that show up mid-stream + ) + self.special_tokens = [ + "<|assistant|>", + "<|system|>", + "<|user|>", + "", + "", + "<|im_end|>", + "<|im_start|>", + ] + self.holding_chunk = "" + self.complete_response = "" + self.response_uptil_now = "" + _model_info = ( + self.logging_obj.model_call_details.get("litellm_params", {}).get( + "model_info", {} + ) + or {} + ) + self._hidden_params = { + "model_id": (_model_info.get("id", None)), + } # returned as x-litellm-model-id response header in proxy + + self._hidden_params["additional_headers"] = process_response_headers( + _response_headers or {} + ) # GUARANTEE OPENAI HEADERS IN RESPONSE + + self._response_headers = _response_headers + self.response_id = None + self.logging_loop = None + self.rules = Rules() + self.stream_options = stream_options or getattr( + logging_obj, "stream_options", None + ) 
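As the dict branch in `post_call_rules` above shows, a rule may return either a bool or a `{"decision": ..., "message": ...}` dict. A hypothetical rule using the dict form; the registration line is commented out and assumes the `litellm.post_call_rules` hook described in the docstring:

def no_refusals(output: str):
    # hypothetical rule: trigger a fallback when the model refuses to answer
    if "i can't help with that" in output.lower():
        return {"decision": False, "message": "Model refused - retry on a fallback"}
    return {"decision": True}


# hypothetical registration, mirroring the docstring in the new rules.py:
# litellm.post_call_rules = [no_refusals]
print(no_refusals("Sure, here you go."))   # {'decision': True}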
+ self.messages = getattr(logging_obj, "messages", None) + self.sent_stream_usage = False + self.send_stream_usage = ( + True if self.check_send_stream_usage(self.stream_options) else False + ) + self.tool_call = False + self.chunks: List = ( + [] + ) # keep track of the returned chunks - used for calculating the input/output tokens for stream options + self.is_function_call = self.check_is_function_call(logging_obj=logging_obj) + + def __iter__(self): + return self + + def __aiter__(self): + return self + + def check_send_stream_usage(self, stream_options: Optional[dict]): + return ( + stream_options is not None + and stream_options.get("include_usage", False) is True + ) + + def check_is_function_call(self, logging_obj) -> bool: + if hasattr(logging_obj, "optional_params") and isinstance( + logging_obj.optional_params, dict + ): + if ( + "litellm_param_is_function_call" in logging_obj.optional_params + and logging_obj.optional_params["litellm_param_is_function_call"] + is True + ): + return True + + return False + + def process_chunk(self, chunk: str): + """ + NLP Cloud streaming returns the entire response, for each chunk. Process this, to only return the delta. + """ + try: + chunk = chunk.strip() + self.complete_response = self.complete_response.strip() + + if chunk.startswith(self.complete_response): + # Remove last_sent_chunk only if it appears at the start of the new chunk + chunk = chunk[len(self.complete_response) :] + + self.complete_response += chunk + return chunk + except Exception as e: + raise e + + def safety_checker(self) -> None: + """ + Fixes - https://github.com/BerriAI/litellm/issues/5158 + + if the model enters a loop and starts repeating the same chunk again, break out of loop and raise an internalservererror - allows for retries. + + Raises - InternalServerError, if LLM enters infinite loop while streaming + """ + if len(self.chunks) >= litellm.REPEATED_STREAMING_CHUNK_LIMIT: + # Get the last n chunks + last_chunks = self.chunks[-litellm.REPEATED_STREAMING_CHUNK_LIMIT :] + + # Extract the relevant content from the chunks + last_contents = [chunk.choices[0].delta.content for chunk in last_chunks] + + # Check if all extracted contents are identical + if all(content == last_contents[0] for content in last_contents): + if ( + last_contents[0] is not None + and isinstance(last_contents[0], str) + and len(last_contents[0]) > 2 + ): # ignore empty content - https://github.com/BerriAI/litellm/issues/5158#issuecomment-2287156946 + # All last n chunks are identical + raise litellm.InternalServerError( + message="The model is repeating the same chunk = {}.".format( + last_contents[0] + ), + model="", + llm_provider="", + ) + + def check_special_tokens(self, chunk: str, finish_reason: Optional[str]): + """ + Output parse / special tokens for sagemaker + hf streaming. 
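The `check_send_stream_usage` helper above only reports usage when the caller opts in explicitly, and it uses an `is True` comparison, so truthy-but-non-boolean values do not count. A standalone sketch of that check:

from typing import Optional


def check_send_stream_usage(stream_options: Optional[dict]) -> bool:
    return (
        stream_options is not None
        and stream_options.get("include_usage", False) is True
    )


print(check_send_stream_usage(None))                      # False
print(check_send_stream_usage({"include_usage": True}))   # True
print(check_send_stream_usage({"include_usage": "yes"}))  # False - the `is True` check is strict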
+ """ + hold = False + if ( + self.custom_llm_provider != "huggingface" + and self.custom_llm_provider != "sagemaker" + ): + return hold, chunk + + if finish_reason: + for token in self.special_tokens: + if token in chunk: + chunk = chunk.replace(token, "") + return hold, chunk + + if self.sent_first_chunk is True: + return hold, chunk + + curr_chunk = self.holding_chunk + chunk + curr_chunk = curr_chunk.strip() + + for token in self.special_tokens: + if len(curr_chunk) < len(token) and curr_chunk in token: + hold = True + self.holding_chunk = curr_chunk + elif len(curr_chunk) >= len(token): + if token in curr_chunk: + self.holding_chunk = curr_chunk.replace(token, "") + hold = True + else: + pass + + if hold is False: # reset + self.holding_chunk = "" + return hold, curr_chunk + + def handle_anthropic_text_chunk(self, chunk): + """ + For old anthropic models - claude-1, claude-2. + + Claude-3 is handled from within Anthropic.py VIA ModelResponseIterator() + """ + str_line = chunk + if isinstance(chunk, bytes): # Handle binary data + str_line = chunk.decode("utf-8") # Convert bytes to string + text = "" + is_finished = False + finish_reason = None + if str_line.startswith("data:"): + data_json = json.loads(str_line[5:]) + type_chunk = data_json.get("type", None) + if type_chunk == "completion": + text = data_json.get("completion") + finish_reason = data_json.get("stop_reason") + if finish_reason is not None: + is_finished = True + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif "error" in str_line: + raise ValueError(f"Unable to parse response. Original response: {str_line}") + else: + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + + def handle_predibase_chunk(self, chunk): + try: + if not isinstance(chunk, str): + chunk = chunk.decode( + "utf-8" + ) # DO NOT REMOVE this: This is required for HF inference API + Streaming + text = "" + is_finished = False + finish_reason = "" + print_verbose(f"chunk: {chunk}") + if chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) + print_verbose(f"data json: {data_json}") + if "token" in data_json and "text" in data_json["token"]: + text = data_json["token"]["text"] + if data_json.get("details", False) and data_json["details"].get( + "finish_reason", False + ): + is_finished = True + finish_reason = data_json["details"]["finish_reason"] + elif data_json.get( + "generated_text", False + ): # if full generated text exists, then stream is complete + text = "" # don't return the final bos token + is_finished = True + finish_reason = "stop" + elif data_json.get("error", False): + raise Exception(data_json.get("error")) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif "error" in chunk: + raise ValueError(chunk) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception as e: + raise e + + def handle_huggingface_chunk(self, chunk): + try: + if not isinstance(chunk, str): + chunk = chunk.decode( + "utf-8" + ) # DO NOT REMOVE this: This is required for HF inference API + Streaming + text = "" + is_finished = False + finish_reason = "" + print_verbose(f"chunk: {chunk}") + if chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) + print_verbose(f"data json: {data_json}") + if "token" in data_json and "text" in data_json["token"]: + text = data_json["token"]["text"] + if data_json.get("details", False) and data_json["details"].get( + 
"finish_reason", False + ): + is_finished = True + finish_reason = data_json["details"]["finish_reason"] + elif data_json.get( + "generated_text", False + ): # if full generated text exists, then stream is complete + text = "" # don't return the final bos token + is_finished = True + finish_reason = "stop" + elif data_json.get("error", False): + raise Exception(data_json.get("error")) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif "error" in chunk: + raise ValueError(chunk) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception as e: + raise e + + def handle_ai21_chunk(self, chunk): # fake streaming + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + text = data_json["completions"][0]["data"]["text"] + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_maritalk_chunk(self, chunk): # fake streaming + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + text = data_json["answer"] + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_nlp_cloud_chunk(self, chunk): + text = "" + is_finished = False + finish_reason = "" + try: + if "dolphin" in self.model: + chunk = self.process_chunk(chunk=chunk) + else: + data_json = json.loads(chunk) + chunk = data_json["generated_text"] + text = chunk + if "[DONE]" in text: + text = text.replace("[DONE]", "") + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_aleph_alpha_chunk(self, chunk): + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + text = data_json["completions"][0]["completion"] + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_cohere_chunk(self, chunk): + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + text = "" + is_finished = False + finish_reason = "" + index: Optional[int] = None + if "index" in data_json: + index = data_json.get("index") + if "text" in data_json: + text = data_json["text"] + elif "is_finished" in data_json: + is_finished = data_json["is_finished"] + finish_reason = data_json["finish_reason"] + else: + raise Exception(data_json) + return { + "index": index, + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. 
Original response: {chunk}") + + def handle_cohere_chat_chunk(self, chunk): + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + print_verbose(f"chunk: {chunk}") + try: + text = "" + is_finished = False + finish_reason = "" + if "text" in data_json: + text = data_json["text"] + elif "is_finished" in data_json and data_json["is_finished"] is True: + is_finished = data_json["is_finished"] + finish_reason = data_json["finish_reason"] + else: + return + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_azure_chunk(self, chunk): + is_finished = False + finish_reason = "" + text = "" + print_verbose(f"chunk: {chunk}") + if "data: [DONE]" in chunk: + text = "" + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) # chunk.startswith("data:"): + try: + if len(data_json["choices"]) > 0: + delta = data_json["choices"][0]["delta"] + text = "" if delta is None else delta.get("content", "") + if data_json["choices"][0].get("finish_reason", None): + is_finished = True + finish_reason = data_json["choices"][0]["finish_reason"] + print_verbose( + f"text: {text}; is_finished: {is_finished}; finish_reason: {finish_reason}" + ) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError( + f"Unable to parse response. Original response: {chunk}" + ) + elif "error" in chunk: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + else: + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + + def handle_replicate_chunk(self, chunk): + try: + text = "" + is_finished = False + finish_reason = "" + if "output" in chunk: + text = chunk["output"] + if "status" in chunk: + if chunk["status"] == "succeeded": + is_finished = True + finish_reason = "stop" + elif chunk.get("error", None): + raise Exception(chunk["error"]) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + except Exception: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + + def handle_openai_chat_completion_chunk(self, chunk): + try: + print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") + str_line = chunk + text = "" + is_finished = False + finish_reason = None + logprobs = None + usage = None + if str_line and str_line.choices and len(str_line.choices) > 0: + if ( + str_line.choices[0].delta is not None + and str_line.choices[0].delta.content is not None + ): + text = str_line.choices[0].delta.content + else: # function/tool calling chunk - when content is None. 
in this case we just return the original chunk from openai + pass + if str_line.choices[0].finish_reason: + is_finished = True + finish_reason = str_line.choices[0].finish_reason + + # checking for logprobs + if ( + hasattr(str_line.choices[0], "logprobs") + and str_line.choices[0].logprobs is not None + ): + logprobs = str_line.choices[0].logprobs + else: + logprobs = None + + usage = getattr(str_line, "usage", None) + + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + "logprobs": logprobs, + "original_chunk": str_line, + "usage": usage, + } + except Exception as e: + raise e + + def handle_azure_text_completion_chunk(self, chunk): + try: + print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") + text = "" + is_finished = False + finish_reason = None + choices = getattr(chunk, "choices", []) + if len(choices) > 0: + text = choices[0].text + if choices[0].finish_reason is not None: + is_finished = True + finish_reason = choices[0].finish_reason + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + + except Exception as e: + raise e + + def handle_openai_text_completion_chunk(self, chunk): + try: + print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") + text = "" + is_finished = False + finish_reason = None + usage = None + choices = getattr(chunk, "choices", []) + if len(choices) > 0: + text = choices[0].text + if choices[0].finish_reason is not None: + is_finished = True + finish_reason = choices[0].finish_reason + usage = getattr(chunk, "usage", None) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + "usage": usage, + } + + except Exception as e: + raise e + + def handle_baseten_chunk(self, chunk): + try: + chunk = chunk.decode("utf-8") + if len(chunk) > 0: + if chunk.startswith("data:"): + data_json = json.loads(chunk[5:]) + if "token" in data_json and "text" in data_json["token"]: + return data_json["token"]["text"] + else: + return "" + data_json = json.loads(chunk) + if "model_output" in data_json: + if ( + isinstance(data_json["model_output"], dict) + and "data" in data_json["model_output"] + and isinstance(data_json["model_output"]["data"], list) + ): + return data_json["model_output"]["data"][0] + elif isinstance(data_json["model_output"], str): + return data_json["model_output"] + elif "completion" in data_json and isinstance( + data_json["completion"], str + ): + return data_json["completion"] + else: + raise ValueError( + f"Unable to parse response. 
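The OpenAI chat handler above reads `choices[0].delta.content`, `finish_reason`, `logprobs`, and `usage` off the SDK chunk object. A sketch using a mocked chunk built from `SimpleNamespace`, so no OpenAI SDK is required; the values are made up:

from types import SimpleNamespace

# mocked chunk shaped like an OpenAI streaming response
chunk = SimpleNamespace(
    choices=[
        SimpleNamespace(
            delta=SimpleNamespace(content="Hi"), finish_reason=None, logprobs=None
        )
    ],
    usage=None,
)

text, is_finished, finish_reason = "", False, None
if chunk.choices:
    first = chunk.choices[0]
    if first.delta is not None and first.delta.content is not None:
        text = first.delta.content
    if first.finish_reason:
        is_finished, finish_reason = True, first.finish_reason

print(text, is_finished, finish_reason, getattr(chunk, "usage", None))  # Hi False None None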
Original response: {chunk}" + ) + else: + return "" + else: + return "" + except Exception as e: + verbose_logger.exception( + "litellm.CustomStreamWrapper.handle_baseten_chunk(): Exception occured - {}".format( + str(e) + ) + ) + return "" + + def handle_cloudlfare_stream(self, chunk): + try: + print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n") + chunk = chunk.decode("utf-8") + str_line = chunk + text = "" + is_finished = False + finish_reason = None + + if "[DONE]" in chunk: + return {"text": text, "is_finished": True, "finish_reason": "stop"} + elif str_line.startswith("data:"): + data_json = json.loads(str_line[5:]) + print_verbose(f"delta content: {data_json}") + text = data_json["response"] + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + else: + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + + except Exception as e: + raise e + + def handle_ollama_stream(self, chunk): + try: + if isinstance(chunk, dict): + json_chunk = chunk + else: + json_chunk = json.loads(chunk) + if "error" in json_chunk: + raise Exception(f"Ollama Error - {json_chunk}") + + text = "" + is_finished = False + finish_reason = None + if json_chunk["done"] is True: + text = "" + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif json_chunk["response"]: + print_verbose(f"delta content: {json_chunk}") + text = json_chunk["response"] + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + else: + raise Exception(f"Ollama Error - {json_chunk}") + except Exception as e: + raise e + + def handle_ollama_chat_stream(self, chunk): + # for ollama_chat/ provider + try: + if isinstance(chunk, dict): + json_chunk = chunk + else: + json_chunk = json.loads(chunk) + if "error" in json_chunk: + raise Exception(f"Ollama Error - {json_chunk}") + + text = "" + is_finished = False + finish_reason = None + if json_chunk["done"] is True: + text = "" + is_finished = True + finish_reason = "stop" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + elif "message" in json_chunk: + print_verbose(f"delta content: {json_chunk}") + text = json_chunk["message"]["content"] + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + } + else: + raise Exception(f"Ollama Error - {json_chunk}") + except Exception as e: + raise e + + def handle_watsonx_stream(self, chunk): + try: + if isinstance(chunk, dict): + parsed_response = chunk + elif isinstance(chunk, (str, bytes)): + if isinstance(chunk, bytes): + chunk = chunk.decode("utf-8") + if "generated_text" in chunk: + response = chunk.replace("data: ", "").strip() + parsed_response = json.loads(response) + else: + return { + "text": "", + "is_finished": False, + "prompt_tokens": 0, + "completion_tokens": 0, + } + else: + print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") + raise ValueError( + f"Unable to parse response. 
Original response: {chunk}" + ) + results = parsed_response.get("results", []) + if len(results) > 0: + text = results[0].get("generated_text", "") + finish_reason = results[0].get("stop_reason") + is_finished = finish_reason != "not_finished" + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + "prompt_tokens": results[0].get("input_token_count", 0), + "completion_tokens": results[0].get("generated_token_count", 0), + } + return {"text": "", "is_finished": False} + except Exception as e: + raise e + + def handle_triton_stream(self, chunk): + try: + if isinstance(chunk, dict): + parsed_response = chunk + elif isinstance(chunk, (str, bytes)): + if isinstance(chunk, bytes): + chunk = chunk.decode("utf-8") + if "text_output" in chunk: + response = chunk.replace("data: ", "").strip() + parsed_response = json.loads(response) + else: + return { + "text": "", + "is_finished": False, + "prompt_tokens": 0, + "completion_tokens": 0, + } + else: + print_verbose(f"chunk: {chunk} (Type: {type(chunk)})") + raise ValueError( + f"Unable to parse response. Original response: {chunk}" + ) + text = parsed_response.get("text_output", "") + finish_reason = parsed_response.get("stop_reason") + is_finished = parsed_response.get("is_finished", False) + return { + "text": text, + "is_finished": is_finished, + "finish_reason": finish_reason, + "prompt_tokens": parsed_response.get("input_token_count", 0), + "completion_tokens": parsed_response.get("generated_token_count", 0), + } + return {"text": "", "is_finished": False} + except Exception as e: + raise e + + def handle_clarifai_completion_chunk(self, chunk): + try: + if isinstance(chunk, dict): + parsed_response = chunk + elif isinstance(chunk, (str, bytes)): + if isinstance(chunk, bytes): + parsed_response = chunk.decode("utf-8") + else: + parsed_response = chunk + else: + raise ValueError("Unable to parse streaming chunk") + if isinstance(parsed_response, dict): + data_json = parsed_response + else: + data_json = json.loads(parsed_response) + text = ( + data_json.get("outputs", "")[0] + .get("data", "") + .get("text", "") + .get("raw", "") + ) + len( + encoding.encode( + data_json.get("outputs", "")[0] + .get("input", "") + .get("data", "") + .get("text", "") + .get("raw", "") + ) + ) + len(encoding.encode(text)) + return { + "text": text, + "is_finished": True, + } + except Exception as e: + verbose_logger.exception( + "litellm.CustomStreamWrapper.handle_clarifai_chunk(): Exception occured - {}".format( + str(e) + ) + ) + return "" + + def model_response_creator( + self, chunk: Optional[dict] = None, hidden_params: Optional[dict] = None + ): + _model = self.model + _received_llm_provider = self.custom_llm_provider + _logging_obj_llm_provider = self.logging_obj.model_call_details.get("custom_llm_provider", None) # type: ignore + if ( + _received_llm_provider == "openai" + and _received_llm_provider != _logging_obj_llm_provider + ): + _model = "{}/{}".format(_logging_obj_llm_provider, _model) + if chunk is None: + chunk = {} + else: + # pop model keyword + chunk.pop("model", None) + + model_response = ModelResponse( + stream=True, model=_model, stream_options=self.stream_options, **chunk + ) + if self.response_id is not None: + model_response.id = self.response_id + else: + self.response_id = model_response.id # type: ignore + if self.system_fingerprint is not None: + model_response.system_fingerprint = self.system_fingerprint + if hidden_params is not None: + model_response._hidden_params = hidden_params + 
model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider + model_response._hidden_params["created_at"] = time.time() + model_response._hidden_params = { + **model_response._hidden_params, + **self._hidden_params, + } + + if ( + len(model_response.choices) > 0 + and getattr(model_response.choices[0], "delta") is not None + ): + # do nothing, if object instantiated + pass + else: + model_response.choices = [StreamingChoices(finish_reason=None)] + return model_response + + def is_delta_empty(self, delta: Delta) -> bool: + is_empty = True + if delta.content is not None: + is_empty = False + elif delta.tool_calls is not None: + is_empty = False + elif delta.function_call is not None: + is_empty = False + return is_empty + + def return_processed_chunk_logic( # noqa + self, + completion_obj: dict, + model_response: ModelResponseStream, + response_obj: dict, + ): + + print_verbose( + f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}" + ) + if ( + "content" in completion_obj + and ( + isinstance(completion_obj["content"], str) + and len(completion_obj["content"]) > 0 + ) + or ( + "tool_calls" in completion_obj + and completion_obj["tool_calls"] is not None + and len(completion_obj["tool_calls"]) > 0 + ) + or ( + "function_call" in completion_obj + and completion_obj["function_call"] is not None + ) + ): # cannot set content of an OpenAI Object to be an empty string + self.safety_checker() + hold, model_response_str = self.check_special_tokens( + chunk=completion_obj["content"], + finish_reason=model_response.choices[0].finish_reason, + ) # filter out bos/eos tokens from openai-compatible hf endpoints + print_verbose(f"hold - {hold}, model_response_str - {model_response_str}") + if hold is False: + ## check if openai/azure chunk + original_chunk = response_obj.get("original_chunk", None) + if original_chunk: + model_response.id = original_chunk.id + self.response_id = original_chunk.id + if len(original_chunk.choices) > 0: + choices = [] + for choice in original_chunk.choices: + try: + if isinstance(choice, BaseModel): + choice_json = choice.model_dump() + choice_json.pop( + "finish_reason", None + ) # for mistral etc. which return a value in their last chunk (not-openai compatible). 
+ print_verbose(f"choice_json: {choice_json}") + choices.append(StreamingChoices(**choice_json)) + except Exception: + choices.append(StreamingChoices()) + print_verbose(f"choices in streaming: {choices}") + setattr(model_response, "choices", choices) + else: + return + model_response.system_fingerprint = ( + original_chunk.system_fingerprint + ) + setattr( + model_response, + "citations", + getattr(original_chunk, "citations", None), + ) + print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") + if self.sent_first_chunk is False: + model_response.choices[0].delta["role"] = "assistant" + self.sent_first_chunk = True + elif self.sent_first_chunk is True and hasattr( + model_response.choices[0].delta, "role" + ): + _initial_delta = model_response.choices[0].delta.model_dump() + _initial_delta.pop("role", None) + model_response.choices[0].delta = Delta(**_initial_delta) + print_verbose( + f"model_response.choices[0].delta: {model_response.choices[0].delta}" + ) + else: + ## else + completion_obj["content"] = model_response_str + if self.sent_first_chunk is False: + completion_obj["role"] = "assistant" + self.sent_first_chunk = True + + model_response.choices[0].delta = Delta(**completion_obj) + _index: Optional[int] = completion_obj.get("index") + if _index is not None: + model_response.choices[0].index = _index + print_verbose(f"returning model_response: {model_response}") + return model_response + else: + return + elif self.received_finish_reason is not None: + if self.sent_last_chunk is True: + # Bedrock returns the guardrail trace in the last chunk - we want to return this here + if self.custom_llm_provider == "bedrock" and "trace" in model_response: + return model_response + + # Default - return StopIteration + raise StopIteration + # flush any remaining holding chunk + if len(self.holding_chunk) > 0: + if model_response.choices[0].delta.content is None: + model_response.choices[0].delta.content = self.holding_chunk + else: + model_response.choices[0].delta.content = ( + self.holding_chunk + model_response.choices[0].delta.content + ) + self.holding_chunk = "" + # if delta is None + _is_delta_empty = self.is_delta_empty(delta=model_response.choices[0].delta) + + if _is_delta_empty: + # get any function call arguments + model_response.choices[0].finish_reason = map_finish_reason( + finish_reason=self.received_finish_reason + ) # ensure consistent output to openai + + self.sent_last_chunk = True + + return model_response + elif ( + model_response.choices[0].delta.tool_calls is not None + or model_response.choices[0].delta.function_call is not None + ): + if self.sent_first_chunk is False: + model_response.choices[0].delta["role"] = "assistant" + self.sent_first_chunk = True + return model_response + elif ( + len(model_response.choices) > 0 + and hasattr(model_response.choices[0].delta, "audio") + and model_response.choices[0].delta.audio is not None + ): + return model_response + else: + if hasattr(model_response, "usage"): + self.chunks.append(model_response) + return + + def chunk_creator(self, chunk): # type: ignore # noqa: PLR0915 + model_response = self.model_response_creator() + response_obj: dict = {} + try: + # return this for all models + completion_obj = {"content": ""} + from litellm.types.utils import GenericStreamingChunk as GChunk + + if ( + isinstance(chunk, dict) + and generic_chunk_has_all_required_fields( + chunk=chunk + ) # check if chunk is a generic streaming chunk + ) or ( + self.custom_llm_provider + and ( + self.custom_llm_provider == "anthropic" + or 
self.custom_llm_provider in litellm._custom_providers + ) + ): + + if self.received_finish_reason is not None: + if "provider_specific_fields" not in chunk: + raise StopIteration + anthropic_response_obj: GChunk = chunk + completion_obj["content"] = anthropic_response_obj["text"] + if anthropic_response_obj["is_finished"]: + self.received_finish_reason = anthropic_response_obj[ + "finish_reason" + ] + + if anthropic_response_obj["finish_reason"]: + self.intermittent_finish_reason = anthropic_response_obj[ + "finish_reason" + ] + + if anthropic_response_obj["usage"] is not None: + model_response.usage = litellm.Usage( + **anthropic_response_obj["usage"] + ) + + if ( + "tool_use" in anthropic_response_obj + and anthropic_response_obj["tool_use"] is not None + ): + completion_obj["tool_calls"] = [anthropic_response_obj["tool_use"]] + + if ( + "provider_specific_fields" in anthropic_response_obj + and anthropic_response_obj["provider_specific_fields"] is not None + ): + for key, value in anthropic_response_obj[ + "provider_specific_fields" + ].items(): + setattr(model_response, key, value) + + response_obj = anthropic_response_obj + elif ( + self.custom_llm_provider + and self.custom_llm_provider == "anthropic_text" + ): + response_obj = self.handle_anthropic_text_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": + response_obj = self.handle_clarifai_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.model == "replicate" or self.custom_llm_provider == "replicate": + response_obj = self.handle_replicate_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": + response_obj = self.handle_huggingface_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "predibase": + response_obj = self.handle_predibase_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif ( + self.custom_llm_provider and self.custom_llm_provider == "baseten" + ): # baseten doesn't provide streaming + completion_obj["content"] = self.handle_baseten_chunk(chunk) + elif ( + self.custom_llm_provider and self.custom_llm_provider == "ai21" + ): # ai21 doesn't provide streaming + response_obj = self.handle_ai21_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": + response_obj = self.handle_maritalk_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider and self.custom_llm_provider == "vllm": + completion_obj["content"] = chunk[0].outputs[0].text + elif ( + self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha" + ): # aleph alpha 
doesn't provide streaming + response_obj = self.handle_aleph_alpha_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "nlp_cloud": + try: + response_obj = self.handle_nlp_cloud_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + except Exception as e: + if self.received_finish_reason: + raise e + else: + if self.sent_first_chunk is False: + raise Exception("An unknown error occurred with the stream") + self.received_finish_reason = "stop" + elif self.custom_llm_provider == "vertex_ai": + import proto # type: ignore + + if hasattr(chunk, "candidates") is True: + try: + try: + completion_obj["content"] = chunk.text + except Exception as e: + if "Part has no text." in str(e): + ## check for function calling + function_call = ( + chunk.candidates[0].content.parts[0].function_call + ) + + args_dict = {} + + # Check if it's a RepeatedComposite instance + for key, val in function_call.args.items(): + if isinstance( + val, + proto.marshal.collections.repeated.RepeatedComposite, + ): + # If so, convert to list + args_dict[key] = [v for v in val] + else: + args_dict[key] = val + + try: + args_str = json.dumps(args_dict) + except Exception as e: + raise e + _delta_obj = litellm.utils.Delta( + content=None, + tool_calls=[ + { + "id": f"call_{str(uuid.uuid4())}", + "function": { + "arguments": args_str, + "name": function_call.name, + }, + "type": "function", + } + ], + ) + _streaming_response = StreamingChoices(delta=_delta_obj) + _model_response = ModelResponse(stream=True) + _model_response.choices = [_streaming_response] + response_obj = {"original_chunk": _model_response} + else: + raise e + if ( + hasattr(chunk.candidates[0], "finish_reason") + and chunk.candidates[0].finish_reason.name + != "FINISH_REASON_UNSPECIFIED" + ): # every non-final chunk in vertex ai has this + self.received_finish_reason = chunk.candidates[ + 0 + ].finish_reason.name + except Exception: + if chunk.candidates[0].finish_reason.name == "SAFETY": + raise Exception( + f"The response was blocked by VertexAI. 
{str(chunk)}" + ) + else: + completion_obj["content"] = str(chunk) + elif self.custom_llm_provider == "cohere": + response_obj = self.handle_cohere_chunk(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cohere_chat": + response_obj = self.handle_cohere_chat_chunk(chunk) + if response_obj is None: + return + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + + elif self.custom_llm_provider == "petals": + if len(self.completion_stream) == 0: + if self.received_finish_reason is not None: + raise StopIteration + else: + self.received_finish_reason = "stop" + chunk_size = 30 + new_chunk = self.completion_stream[:chunk_size] + completion_obj["content"] = new_chunk + self.completion_stream = self.completion_stream[chunk_size:] + elif self.custom_llm_provider == "palm": + # fake streaming + response_obj = {} + if len(self.completion_stream) == 0: + if self.received_finish_reason is not None: + raise StopIteration + else: + self.received_finish_reason = "stop" + chunk_size = 30 + new_chunk = self.completion_stream[:chunk_size] + completion_obj["content"] = new_chunk + self.completion_stream = self.completion_stream[chunk_size:] + elif self.custom_llm_provider == "ollama": + response_obj = self.handle_ollama_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "ollama_chat": + response_obj = self.handle_ollama_chat_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cloudflare": + response_obj = self.handle_cloudlfare_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "watsonx": + response_obj = self.handle_watsonx_stream(chunk) + completion_obj["content"] = response_obj["text"] + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "triton": + response_obj = self.handle_triton_stream(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "text-completion-openai": + response_obj = self.handle_openai_text_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + if response_obj["usage"] is not None: + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].prompt_tokens, + completion_tokens=response_obj["usage"].completion_tokens, + total_tokens=response_obj["usage"].total_tokens, + ) + elif self.custom_llm_provider == "text-completion-codestral": + 
response_obj = litellm.MistralTextCompletionConfig()._chunk_parser( + chunk + ) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + if "usage" in response_obj is not None: + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].prompt_tokens, + completion_tokens=response_obj["usage"].completion_tokens, + total_tokens=response_obj["usage"].total_tokens, + ) + elif self.custom_llm_provider == "azure_text": + response_obj = self.handle_azure_text_completion_chunk(chunk) + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + elif self.custom_llm_provider == "cached_response": + response_obj = { + "text": chunk.choices[0].delta.content, + "is_finished": True, + "finish_reason": chunk.choices[0].finish_reason, + "original_chunk": chunk, + "tool_calls": ( + chunk.choices[0].delta.tool_calls + if hasattr(chunk.choices[0].delta, "tool_calls") + else None + ), + } + + completion_obj["content"] = response_obj["text"] + if response_obj["tool_calls"] is not None: + completion_obj["tool_calls"] = response_obj["tool_calls"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if hasattr(chunk, "id"): + model_response.id = chunk.id + self.response_id = chunk.id + if hasattr(chunk, "system_fingerprint"): + self.system_fingerprint = chunk.system_fingerprint + if response_obj["is_finished"]: + self.received_finish_reason = response_obj["finish_reason"] + else: # openai / azure chat model + if self.custom_llm_provider == "azure": + if hasattr(chunk, "model"): + # for azure, we need to pass the model from the orignal chunk + self.model = chunk.model + response_obj = self.handle_openai_chat_completion_chunk(chunk) + if response_obj is None: + return + completion_obj["content"] = response_obj["text"] + print_verbose(f"completion obj content: {completion_obj['content']}") + if response_obj["is_finished"]: + if response_obj["finish_reason"] == "error": + raise Exception( + "{} raised a streaming error - finish_reason: error, no content string given. 
Received Chunk={}".format( + self.custom_llm_provider, response_obj + ) + ) + self.received_finish_reason = response_obj["finish_reason"] + if response_obj.get("original_chunk", None) is not None: + if hasattr(response_obj["original_chunk"], "id"): + model_response.id = response_obj["original_chunk"].id + self.response_id = model_response.id + if hasattr(response_obj["original_chunk"], "system_fingerprint"): + model_response.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint + self.system_fingerprint = response_obj[ + "original_chunk" + ].system_fingerprint + if response_obj["logprobs"] is not None: + model_response.choices[0].logprobs = response_obj["logprobs"] + + if response_obj["usage"] is not None: + if isinstance(response_obj["usage"], dict): + model_response.usage = litellm.Usage( + prompt_tokens=response_obj["usage"].get( + "prompt_tokens", None + ) + or None, + completion_tokens=response_obj["usage"].get( + "completion_tokens", None + ) + or None, + total_tokens=response_obj["usage"].get("total_tokens", None) + or None, + ) + elif isinstance(response_obj["usage"], BaseModel): + model_response.usage = litellm.Usage( + **response_obj["usage"].model_dump() + ) + + model_response.model = self.model + print_verbose( + f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" + ) + ## FUNCTION CALL PARSING + if ( + response_obj is not None + and response_obj.get("original_chunk", None) is not None + ): # function / tool calling branch - only set for openai/azure compatible endpoints + # enter this branch when no content has been passed in response + original_chunk = response_obj.get("original_chunk", None) + model_response.id = original_chunk.id + self.response_id = original_chunk.id + if original_chunk.choices and len(original_chunk.choices) > 0: + delta = original_chunk.choices[0].delta + if delta is not None and ( + delta.function_call is not None or delta.tool_calls is not None + ): + try: + model_response.system_fingerprint = ( + original_chunk.system_fingerprint + ) + ## AZURE - check if arguments is not None + if ( + original_chunk.choices[0].delta.function_call + is not None + ): + if ( + getattr( + original_chunk.choices[0].delta.function_call, + "arguments", + ) + is None + ): + original_chunk.choices[ + 0 + ].delta.function_call.arguments = "" + elif original_chunk.choices[0].delta.tool_calls is not None: + if isinstance( + original_chunk.choices[0].delta.tool_calls, list + ): + for t in original_chunk.choices[0].delta.tool_calls: + if hasattr(t, "functions") and hasattr( + t.functions, "arguments" + ): + if ( + getattr( + t.function, + "arguments", + ) + is None + ): + t.function.arguments = "" + _json_delta = delta.model_dump() + print_verbose(f"_json_delta: {_json_delta}") + if "role" not in _json_delta or _json_delta["role"] is None: + _json_delta["role"] = ( + "assistant" # mistral's api returns role as None + ) + if "tool_calls" in _json_delta and isinstance( + _json_delta["tool_calls"], list + ): + for tool in _json_delta["tool_calls"]: + if ( + isinstance(tool, dict) + and "function" in tool + and isinstance(tool["function"], dict) + and ("type" not in tool or tool["type"] is None) + ): + # if function returned but type set to None - mistral's api returns type: None + tool["type"] = "function" + model_response.choices[0].delta = Delta(**_json_delta) + except Exception as e: + verbose_logger.exception( + "litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format( + str(e) + ) + ) + 
model_response.choices[0].delta = Delta() + elif ( + delta is not None and getattr(delta, "audio", None) is not None + ): + model_response.choices[0].delta.audio = delta.audio + else: + try: + delta = ( + dict() + if original_chunk.choices[0].delta is None + else dict(original_chunk.choices[0].delta) + ) + print_verbose(f"original delta: {delta}") + model_response.choices[0].delta = Delta(**delta) + print_verbose( + f"new delta: {model_response.choices[0].delta}" + ) + except Exception: + model_response.choices[0].delta = Delta() + else: + if ( + self.stream_options is not None + and self.stream_options["include_usage"] is True + ): + return model_response + return + print_verbose( + f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}" + ) + print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") + + ## CHECK FOR TOOL USE + if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0: + if self.is_function_call is True: # user passed in 'functions' param + completion_obj["function_call"] = completion_obj["tool_calls"][0][ + "function" + ] + completion_obj["tool_calls"] = None + + self.tool_call = True + + ## RETURN ARG + return self.return_processed_chunk_logic( + completion_obj=completion_obj, + model_response=model_response, # type: ignore + response_obj=response_obj, + ) + + except StopIteration: + raise StopIteration + except Exception as e: + traceback.format_exc() + e.message = str(e) + raise exception_type( + model=self.model, + custom_llm_provider=self.custom_llm_provider, + original_exception=e, + ) + + def set_logging_event_loop(self, loop): + """ + import litellm, asyncio + + loop = asyncio.get_event_loop() # 👈 gets the current event loop + + response = litellm.completion(.., stream=True) + + response.set_logging_event_loop(loop=loop) # 👈 enables async_success callbacks for sync logging + + for chunk in response: + ... + """ + self.logging_loop = loop + + def run_success_logging_and_cache_storage(self, processed_chunk, cache_hit: bool): + """ + Runs success logging in a thread and adds the response to the cache + """ + if litellm.disable_streaming_logging is True: + """ + [NOT RECOMMENDED] + Set this via `litellm.disable_streaming_logging = True`. + + Disables streaming logging. 
+ """ + return + ## ASYNC LOGGING + # Create an event loop for the new thread + if self.logging_loop is not None: + future = asyncio.run_coroutine_threadsafe( + self.logging_obj.async_success_handler( + processed_chunk, None, None, cache_hit + ), + loop=self.logging_loop, + ) + future.result() + else: + asyncio.run( + self.logging_obj.async_success_handler( + processed_chunk, None, None, cache_hit + ) + ) + ## SYNC LOGGING + self.logging_obj.success_handler(processed_chunk, None, None, cache_hit) + + ## Sync store in cache + if self.logging_obj._llm_caching_handler is not None: + self.logging_obj._llm_caching_handler._sync_add_streaming_response_to_cache( + processed_chunk + ) + + def finish_reason_handler(self): + model_response = self.model_response_creator() + _finish_reason = self.received_finish_reason or self.intermittent_finish_reason + if _finish_reason is not None: + model_response.choices[0].finish_reason = _finish_reason + else: + model_response.choices[0].finish_reason = "stop" + + ## if tool use + if ( + model_response.choices[0].finish_reason == "stop" and self.tool_call + ): # don't overwrite for other - potential error finish reasons + model_response.choices[0].finish_reason = "tool_calls" + return model_response + + def __next__(self): # noqa: PLR0915 + cache_hit = False + if ( + self.custom_llm_provider is not None + and self.custom_llm_provider == "cached_response" + ): + cache_hit = True + try: + if self.completion_stream is None: + self.fetch_sync_stream() + while True: + if ( + isinstance(self.completion_stream, str) + or isinstance(self.completion_stream, bytes) + or isinstance(self.completion_stream, ModelResponse) + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose( + f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}; custom_llm_provider: {self.custom_llm_provider}" + ) + response: Optional[ModelResponse] = self.chunk_creator(chunk=chunk) + print_verbose(f"PROCESSED CHUNK POST CHUNK CREATOR: {response}") + + if response is None: + continue + ## LOGGING + threading.Thread( + target=self.run_success_logging_and_cache_storage, + args=(response, cache_hit), + ).start() # log response + choice = response.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += choice.delta.get("content", "") or "" + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + # HANDLE STREAM OPTIONS + self.chunks.append(response) + if hasattr( + response, "usage" + ): # remove usage from chunk, only send on final chunk + # Convert the object to a dictionary + obj_dict = response.dict() + + # Remove an attribute (e.g., 'attr2') + if "usage" in obj_dict: + del obj_dict["usage"] + + # Create a new object without the removed attribute + response = self.model_response_creator( + chunk=obj_dict, hidden_params=response._hidden_params + ) + # add usage as hidden param + if self.sent_last_chunk is True and self.stream_options is None: + usage = calculate_total_usage(chunks=self.chunks) + response._hidden_params["usage"] = usage + # RETURN RESULT + return response + + except StopIteration: + if self.sent_last_chunk is True: + complete_streaming_response = litellm.stream_chunk_builder( + chunks=self.chunks, messages=self.messages + ) + response = self.model_response_creator() + if complete_streaming_response is not None: + setattr( + response, + "usage", + getattr(complete_streaming_response, "usage"), + ) + + ## LOGGING + 
threading.Thread( + target=self.logging_obj.success_handler, + args=(response, None, None, cache_hit), + ).start() # log response + + if self.sent_stream_usage is False and self.send_stream_usage is True: + self.sent_stream_usage = True + return response + raise # Re-raise StopIteration + else: + self.sent_last_chunk = True + processed_chunk = self.finish_reason_handler() + if self.stream_options is None: # add usage as hidden param + usage = calculate_total_usage(chunks=self.chunks) + processed_chunk._hidden_params["usage"] = usage + ## LOGGING + threading.Thread( + target=self.run_success_logging_and_cache_storage, + args=(processed_chunk, cache_hit), + ).start() # log response + return processed_chunk + except Exception as e: + traceback_exception = traceback.format_exc() + # LOG FAILURE - handle streaming failure logging in the _next_ object, remove `handle_failure` once it's deprecated + threading.Thread( + target=self.logging_obj.failure_handler, args=(e, traceback_exception) + ).start() + if isinstance(e, OpenAIError): + raise e + else: + raise exception_type( + model=self.model, + original_exception=e, + custom_llm_provider=self.custom_llm_provider, + ) + + def fetch_sync_stream(self): + if self.completion_stream is None and self.make_call is not None: + # Call make_call to get the completion stream + self.completion_stream = self.make_call(client=litellm.module_level_client) + self._stream_iter = self.completion_stream.__iter__() + + return self.completion_stream + + async def fetch_stream(self): + if self.completion_stream is None and self.make_call is not None: + # Call make_call to get the completion stream + self.completion_stream = await self.make_call( + client=litellm.module_level_aclient + ) + self._stream_iter = self.completion_stream.__aiter__() + + return self.completion_stream + + async def __anext__(self): # noqa: PLR0915 + cache_hit = False + if ( + self.custom_llm_provider is not None + and self.custom_llm_provider == "cached_response" + ): + cache_hit = True + try: + if self.completion_stream is None: + await self.fetch_stream() + + if ( + self.custom_llm_provider == "openai" + or self.custom_llm_provider == "azure" + or self.custom_llm_provider == "custom_openai" + or self.custom_llm_provider == "text-completion-openai" + or self.custom_llm_provider == "text-completion-codestral" + or self.custom_llm_provider == "azure_text" + or self.custom_llm_provider == "anthropic" + or self.custom_llm_provider == "anthropic_text" + or self.custom_llm_provider == "huggingface" + or self.custom_llm_provider == "ollama" + or self.custom_llm_provider == "ollama_chat" + or self.custom_llm_provider == "vertex_ai" + or self.custom_llm_provider == "vertex_ai_beta" + or self.custom_llm_provider == "sagemaker" + or self.custom_llm_provider == "sagemaker_chat" + or self.custom_llm_provider == "gemini" + or self.custom_llm_provider == "replicate" + or self.custom_llm_provider == "cached_response" + or self.custom_llm_provider == "predibase" + or self.custom_llm_provider == "databricks" + or self.custom_llm_provider == "bedrock" + or self.custom_llm_provider == "triton" + or self.custom_llm_provider == "watsonx" + or self.custom_llm_provider in litellm.openai_compatible_providers + or self.custom_llm_provider in litellm._custom_providers + ): + async for chunk in self.completion_stream: + if chunk == "None" or chunk is None: + raise Exception + elif ( + self.custom_llm_provider == "gemini" + and hasattr(chunk, "parts") + and len(chunk.parts) == 0 + ): + continue + # chunk_creator() does 
logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. + # __anext__ also calls async_success_handler, which does logging + print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") + + processed_chunk: Optional[ModelResponse] = self.chunk_creator( + chunk=chunk + ) + print_verbose( + f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + ## LOGGING + executor.submit( + self.logging_obj.success_handler, + result=processed_chunk, + start_time=None, + end_time=None, + cache_hit=cache_hit, + ) + + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + + if self.logging_obj._llm_caching_handler is not None: + asyncio.create_task( + self.logging_obj._llm_caching_handler._add_streaming_response_to_cache( + processed_chunk=processed_chunk, + ) + ) + + choice = processed_chunk.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += choice.delta.get("content", "") or "" + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + self.chunks.append(processed_chunk) + if hasattr( + processed_chunk, "usage" + ): # remove usage from chunk, only send on final chunk + # Convert the object to a dictionary + obj_dict = processed_chunk.dict() + + # Remove an attribute (e.g., 'attr2') + if "usage" in obj_dict: + del obj_dict["usage"] + + # Create a new object without the removed attribute + processed_chunk = self.model_response_creator(chunk=obj_dict) + print_verbose(f"final returned processed chunk: {processed_chunk}") + return processed_chunk + raise StopAsyncIteration + else: # temporary patch for non-aiohttp async calls + # example - boto3 bedrock llms + while True: + if isinstance(self.completion_stream, str) or isinstance( + self.completion_stream, bytes + ): + chunk = self.completion_stream + else: + chunk = next(self.completion_stream) + if chunk is not None and chunk != b"": + print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") + processed_chunk: Optional[ModelResponse] = self.chunk_creator( + chunk=chunk + ) + print_verbose( + f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" + ) + if processed_chunk is None: + continue + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk, None, None, cache_hit), + ).start() # log processed_chunk + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + + choice = processed_chunk.choices[0] + if isinstance(choice, StreamingChoices): + self.response_uptil_now += ( + choice.delta.get("content", "") or "" + ) + else: + self.response_uptil_now += "" + self.rules.post_call_rules( + input=self.response_uptil_now, model=self.model + ) + # RETURN RESULT + self.chunks.append(processed_chunk) + return processed_chunk + except (StopAsyncIteration, StopIteration): + if self.sent_last_chunk is True: + # log the final chunk with accurate streaming values + complete_streaming_response = litellm.stream_chunk_builder( + chunks=self.chunks, messages=self.messages + ) + response = self.model_response_creator() + if complete_streaming_response is not None: + setattr( + response, + "usage", + getattr(complete_streaming_response, "usage"), + ) + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(response, None, None, cache_hit), + ).start() # log response 
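+                # Note: success_handler runs synchronously in a worker thread, while
+                # async_success_handler is scheduled as a task on the running event loop.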
+ asyncio.create_task( + self.logging_obj.async_success_handler( + response, cache_hit=cache_hit + ) + ) + if self.sent_stream_usage is False and self.send_stream_usage is True: + self.sent_stream_usage = True + return response + raise StopAsyncIteration # Re-raise StopIteration + else: + self.sent_last_chunk = True + processed_chunk = self.finish_reason_handler() + ## LOGGING + threading.Thread( + target=self.logging_obj.success_handler, + args=(processed_chunk, None, None, cache_hit), + ).start() # log response + asyncio.create_task( + self.logging_obj.async_success_handler( + processed_chunk, cache_hit=cache_hit + ) + ) + return processed_chunk + except httpx.TimeoutException as e: # if httpx read timeout error occues + traceback_exception = traceback.format_exc() + ## ADD DEBUG INFORMATION - E.G. LITELLM REQUEST TIMEOUT + traceback_exception += "\nLiteLLM Default Request Timeout - {}".format( + litellm.request_timeout + ) + if self.logging_obj is not None: + ## LOGGING + threading.Thread( + target=self.logging_obj.failure_handler, + args=(e, traceback_exception), + ).start() # log response + # Handle any exceptions that might occur during streaming + asyncio.create_task( + self.logging_obj.async_failure_handler(e, traceback_exception) + ) + raise e + except Exception as e: + traceback_exception = traceback.format_exc() + if self.logging_obj is not None: + ## LOGGING + threading.Thread( + target=self.logging_obj.failure_handler, + args=(e, traceback_exception), + ).start() # log response + # Handle any exceptions that might occur during streaming + asyncio.create_task( + self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore + ) + ## Map to OpenAI Exception + raise exception_type( + model=self.model, + custom_llm_provider=self.custom_llm_provider, + original_exception=e, + completion_kwargs={}, + extra_kwargs={}, + ) + + +def calculate_total_usage(chunks: List[ModelResponse]) -> Usage: + """Assume most recent usage chunk has total usage uptil then.""" + prompt_tokens: int = 0 + completion_tokens: int = 0 + for chunk in chunks: + if "usage" in chunk: + if "prompt_tokens" in chunk["usage"]: + prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0 + if "completion_tokens" in chunk["usage"]: + completion_tokens = chunk["usage"].get("completion_tokens", 0) or 0 + + returned_usage_chunk = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + return returned_usage_chunk + + +def generic_chunk_has_all_required_fields(chunk: dict) -> bool: + """ + Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk. + + :param chunk: The dictionary to check. + :return: True if all required fields are present, False otherwise. + """ + _all_fields = GChunk.__annotations__ + + decision = all(key in _all_fields for key in chunk) + return decision diff --git a/litellm/litellm_core_utils/streaming_utils.py b/litellm/litellm_core_utils/streaming_utils.py deleted file mode 100644 index c41b4f64c..000000000 --- a/litellm/litellm_core_utils/streaming_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -from litellm.types.utils import GenericStreamingChunk as GChunk - - -def generic_chunk_has_all_required_fields(chunk: dict) -> bool: - """ - Checks if the provided chunk dictionary contains all required fields for GenericStreamingChunk. - - :param chunk: The dictionary to check. - :return: True if all required fields are present, False otherwise. 
- """ - _all_fields = GChunk.__annotations__ - - decision = all(key in _all_fields for key in chunk) - return decision diff --git a/litellm/llms/AzureOpenAI/azure.py b/litellm/llms/AzureOpenAI/azure.py index 39dea14e2..24303ef2f 100644 --- a/litellm/llms/AzureOpenAI/azure.py +++ b/litellm/llms/AzureOpenAI/azure.py @@ -12,7 +12,11 @@ from typing_extensions import overload import litellm from litellm.caching.caching import DualCache from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.types.utils import EmbeddingResponse from litellm.utils import ( CustomStreamWrapper, @@ -977,7 +981,10 @@ class AzureChatCompletion(BaseLLM): else: _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0) - async_handler = AsyncHTTPHandler(**_params) # type: ignore + async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.AZURE, + params=_params, + ) else: async_handler = client # type: ignore @@ -1521,7 +1528,8 @@ class AzureChatCompletion(BaseLLM): prompt: Optional[str] = None, ) -> dict: client_session = ( - litellm.aclient_session or httpx.AsyncClient() + litellm.aclient_session + or get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE).client ) # handle dall-e-2 calls if "gateway.ai.cloudflare.com" in api_base: diff --git a/litellm/llms/OpenAI/chat/gpt_transformation.py b/litellm/llms/OpenAI/chat/gpt_transformation.py index 14ebb4a53..c0c7e14dd 100644 --- a/litellm/llms/OpenAI/chat/gpt_transformation.py +++ b/litellm/llms/OpenAI/chat/gpt_transformation.py @@ -3,7 +3,7 @@ Support for gpt model family """ import types -from typing import Optional, Union +from typing import List, Optional, Union import litellm from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage @@ -163,3 +163,8 @@ class OpenAIGPTConfig: model=model, drop_params=drop_params, ) + + def _transform_messages( + self, messages: List[AllMessageValues] + ) -> List[AllMessageValues]: + return messages diff --git a/litellm/llms/OpenAI/chat/o1_handler.py b/litellm/llms/OpenAI/chat/o1_handler.py index 55dfe3715..5ff53a896 100644 --- a/litellm/llms/OpenAI/chat/o1_handler.py +++ b/litellm/llms/OpenAI/chat/o1_handler.py @@ -17,22 +17,6 @@ from litellm.utils import CustomStreamWrapper class OpenAIO1ChatCompletion(OpenAIChatCompletion): - async def mock_async_streaming( - self, - response: Any, - model: Optional[str], - logging_obj: Any, - ): - model_response = await response - completion_stream = MockResponseIterator(model_response=model_response) - streaming_response = CustomStreamWrapper( - completion_stream=completion_stream, - model=model, - custom_llm_provider="openai", - logging_obj=logging_obj, - ) - return streaming_response - def completion( self, model_response: ModelResponse, @@ -54,7 +38,7 @@ class OpenAIO1ChatCompletion(OpenAIChatCompletion): custom_llm_provider: Optional[str] = None, drop_params: Optional[bool] = None, ): - stream: Optional[bool] = optional_params.pop("stream", False) + # stream: Optional[bool] = optional_params.pop("stream", False) response = super().completion( model_response, timeout, @@ -76,20 +60,4 @@ class OpenAIO1ChatCompletion(OpenAIChatCompletion): drop_params, ) - if stream is True: - if asyncio.iscoroutine(response): - return self.mock_async_streaming( - response=response, model=model, logging_obj=logging_obj # 
type: ignore - ) - - completion_stream = MockResponseIterator(model_response=response) - streaming_response = CustomStreamWrapper( - completion_stream=completion_stream, - model=model, - custom_llm_provider="openai", - logging_obj=logging_obj, - ) - - return streaming_response - else: - return response + return response diff --git a/litellm/llms/OpenAI/chat/o1_transformation.py b/litellm/llms/OpenAI/chat/o1_transformation.py index d9def117f..2dd70afbb 100644 --- a/litellm/llms/OpenAI/chat/o1_transformation.py +++ b/litellm/llms/OpenAI/chat/o1_transformation.py @@ -108,7 +108,9 @@ class OpenAIO1Config(OpenAIGPTConfig): return True return False - def o1_prompt_factory(self, messages: List[AllMessageValues]): + def _transform_messages( + self, messages: List[AllMessageValues] + ) -> List[AllMessageValues]: """ Handles limitations of O-1 model family. - modalities: image => drop param (if user opts in to dropping param) diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index 008296fe7..057340b51 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -15,8 +15,10 @@ from pydantic import BaseModel from typing_extensions import overload, override import litellm +from litellm import LlmProviders from litellm._logging import verbose_logger from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.llms.custom_httpx.http_handler import _DEFAULT_TTL_FOR_HTTPX_CLIENTS from litellm.secret_managers.main import get_secret_str from litellm.types.utils import ProviderField from litellm.utils import ( @@ -24,6 +26,7 @@ from litellm.utils import ( CustomStreamWrapper, Message, ModelResponse, + ProviderConfigManager, TextCompletionResponse, Usage, convert_to_model_response_object, @@ -560,8 +563,9 @@ class OpenAIChatCompletion(BaseLLM): _cache_key = f"hashed_api_key={hashed_api_key},api_base={api_base},timeout={timeout},max_retries={max_retries},organization={organization},is_async={is_async}" - if _cache_key in litellm.in_memory_llm_clients_cache: - return litellm.in_memory_llm_clients_cache[_cache_key] + _cached_client = litellm.in_memory_llm_clients_cache.get_cache(_cache_key) + if _cached_client: + return _cached_client if is_async: _new_client: Union[OpenAI, AsyncOpenAI] = AsyncOpenAI( api_key=api_key, @@ -582,7 +586,11 @@ class OpenAIChatCompletion(BaseLLM): ) ## SAVE CACHE KEY - litellm.in_memory_llm_clients_cache[_cache_key] = _new_client + litellm.in_memory_llm_clients_cache.set_cache( + key=_cache_key, + value=_new_client, + ttl=_DEFAULT_TTL_FOR_HTTPX_CLIENTS, + ) return _new_client else: @@ -701,13 +709,11 @@ class OpenAIChatCompletion(BaseLLM): messages=messages, custom_llm_provider=custom_llm_provider, ) - if ( - litellm.openAIO1Config.is_model_o1_reasoning_model(model=model) - and messages is not None - ): - messages = litellm.openAIO1Config.o1_prompt_factory( - messages=messages, + if messages is not None and custom_llm_provider is not None: + provider_config = ProviderConfigManager.get_provider_config( + model=model, provider=LlmProviders(custom_llm_provider) ) + messages = provider_config._transform_messages(messages) for _ in range( 2 diff --git a/litellm/llms/anthropic/chat/handler.py b/litellm/llms/anthropic/chat/handler.py index da95ac075..be46051c6 100644 --- a/litellm/llms/anthropic/chat/handler.py +++ b/litellm/llms/anthropic/chat/handler.py @@ -44,8 +44,8 @@ from litellm.types.llms.openai import ( ChatCompletionToolCallFunctionChunk, ChatCompletionUsageBlock, ) -from litellm.types.utils import 
GenericStreamingChunk, PromptTokensDetailsWrapper -from litellm.utils import CustomStreamWrapper, ModelResponse, Usage +from litellm.types.utils import GenericStreamingChunk +from litellm.utils import CustomStreamWrapper, ModelResponse from ...base import BaseLLM from ..common_utils import AnthropicError, process_anthropic_headers @@ -58,6 +58,7 @@ def validate_environment( user_headers, model, messages: List[AllMessageValues], + is_vertex_request: bool, tools: Optional[List[AllAnthropicToolsValues]], anthropic_version: Optional[str] = None, ): @@ -71,12 +72,14 @@ def validate_environment( prompt_caching_set = AnthropicConfig().is_cache_control_set(messages=messages) computer_tool_used = AnthropicConfig().is_computer_tool_used(tools=tools) - + pdf_used = AnthropicConfig().is_pdf_used(messages=messages) headers = AnthropicConfig().get_anthropic_headers( anthropic_version=anthropic_version, computer_tool_used=computer_tool_used, prompt_caching_set=prompt_caching_set, + pdf_used=pdf_used, api_key=api_key, + is_vertex_request=is_vertex_request, ) if user_headers is not None and isinstance(user_headers, dict): @@ -93,6 +96,7 @@ async def make_call( messages: list, logging_obj, timeout: Optional[Union[float, httpx.Timeout]], + json_mode: bool, ) -> Tuple[Any, httpx.Headers]: if client is None: client = litellm.module_level_aclient @@ -118,7 +122,9 @@ async def make_call( raise AnthropicError(status_code=500, message=str(e)) completion_stream = ModelResponseIterator( - streaming_response=response.aiter_lines(), sync_stream=False + streaming_response=response.aiter_lines(), + sync_stream=False, + json_mode=json_mode, ) # LOGGING @@ -141,6 +147,7 @@ def make_sync_call( messages: list, logging_obj, timeout: Optional[Union[float, httpx.Timeout]], + json_mode: bool, ) -> Tuple[Any, httpx.Headers]: if client is None: client = litellm.module_level_client # re-use a module level client @@ -174,7 +181,7 @@ def make_sync_call( ) completion_stream = ModelResponseIterator( - streaming_response=response.iter_lines(), sync_stream=True + streaming_response=response.iter_lines(), sync_stream=True, json_mode=json_mode ) # LOGGING @@ -192,131 +199,6 @@ class AnthropicChatCompletion(BaseLLM): def __init__(self) -> None: super().__init__() - def _process_response( - self, - model: str, - response: Union[requests.Response, httpx.Response], - model_response: ModelResponse, - stream: bool, - logging_obj: litellm.litellm_core_utils.litellm_logging.Logging, # type: ignore - optional_params: dict, - api_key: str, - data: Union[dict, str], - messages: List, - print_verbose, - encoding, - json_mode: bool, - ) -> ModelResponse: - _hidden_params: Dict = {} - _hidden_params["additional_headers"] = process_anthropic_headers( - dict(response.headers) - ) - ## LOGGING - logging_obj.post_call( - input=messages, - api_key=api_key, - original_response=response.text, - additional_args={"complete_input_dict": data}, - ) - print_verbose(f"raw model_response: {response.text}") - ## RESPONSE OBJECT - try: - completion_response = response.json() - except Exception as e: - response_headers = getattr(response, "headers", None) - raise AnthropicError( - message="Unable to get json response - {}, Original Response: {}".format( - str(e), response.text - ), - status_code=response.status_code, - headers=response_headers, - ) - if "error" in completion_response: - response_headers = getattr(response, "headers", None) - raise AnthropicError( - message=str(completion_response["error"]), - status_code=response.status_code, - 
headers=response_headers, - ) - else: - text_content = "" - tool_calls: List[ChatCompletionToolCallChunk] = [] - for idx, content in enumerate(completion_response["content"]): - if content["type"] == "text": - text_content += content["text"] - ## TOOL CALLING - elif content["type"] == "tool_use": - tool_calls.append( - ChatCompletionToolCallChunk( - id=content["id"], - type="function", - function=ChatCompletionToolCallFunctionChunk( - name=content["name"], - arguments=json.dumps(content["input"]), - ), - index=idx, - ) - ) - - _message = litellm.Message( - tool_calls=tool_calls, - content=text_content or None, - ) - - ## HANDLE JSON MODE - anthropic returns single function call - if json_mode and len(tool_calls) == 1: - json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( - "arguments" - ) - if json_mode_content_str is not None: - args = json.loads(json_mode_content_str) - values: Optional[dict] = args.get("values") - if values is not None: - _message = litellm.Message(content=json.dumps(values)) - completion_response["stop_reason"] = "stop" - model_response.choices[0].message = _message # type: ignore - model_response._hidden_params["original_response"] = completion_response[ - "content" - ] # allow user to access raw anthropic tool calling response - - model_response.choices[0].finish_reason = map_finish_reason( - completion_response["stop_reason"] - ) - - ## CALCULATING USAGE - prompt_tokens = completion_response["usage"]["input_tokens"] - completion_tokens = completion_response["usage"]["output_tokens"] - _usage = completion_response["usage"] - cache_creation_input_tokens: int = 0 - cache_read_input_tokens: int = 0 - - model_response.created = int(time.time()) - model_response.model = model - if "cache_creation_input_tokens" in _usage: - cache_creation_input_tokens = _usage["cache_creation_input_tokens"] - prompt_tokens += cache_creation_input_tokens - if "cache_read_input_tokens" in _usage: - cache_read_input_tokens = _usage["cache_read_input_tokens"] - prompt_tokens += cache_read_input_tokens - - prompt_tokens_details = PromptTokensDetailsWrapper( - cached_tokens=cache_read_input_tokens - ) - total_tokens = prompt_tokens + completion_tokens - usage = Usage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=total_tokens, - prompt_tokens_details=prompt_tokens_details, - cache_creation_input_tokens=cache_creation_input_tokens, - cache_read_input_tokens=cache_read_input_tokens, - ) - - setattr(model_response, "usage", usage) # type: ignore - - model_response._hidden_params = _hidden_params - return model_response - async def acompletion_stream_function( self, model: str, @@ -333,6 +215,7 @@ class AnthropicChatCompletion(BaseLLM): stream, _is_function_call, data: dict, + json_mode: bool, optional_params=None, litellm_params=None, logger_fn=None, @@ -349,6 +232,7 @@ class AnthropicChatCompletion(BaseLLM): messages=messages, logging_obj=logging_obj, timeout=timeout, + json_mode=json_mode, ) streamwrapper = CustomStreamWrapper( completion_stream=completion_stream, @@ -411,7 +295,7 @@ class AnthropicChatCompletion(BaseLLM): headers=error_headers, ) - return self._process_response( + return AnthropicConfig._process_response( model=model, response=response, model_response=model_response, @@ -439,30 +323,32 @@ class AnthropicChatCompletion(BaseLLM): logging_obj, optional_params: dict, timeout: Union[float, httpx.Timeout], + litellm_params: dict, acompletion=None, - litellm_params=None, logger_fn=None, headers={}, client=None, ): + 
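# Note: stream/json_mode/is_vertex_request are popped from a deep copy of optional_params
+        # up front, because validate_environment() below now needs is_vertex_request to
+        # decide which Anthropic beta headers to send.
+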
optional_params = copy.deepcopy(optional_params) + stream = optional_params.pop("stream", None) + json_mode: bool = optional_params.pop("json_mode", False) + is_vertex_request: bool = optional_params.pop("is_vertex_request", False) + _is_function_call = False + messages = copy.deepcopy(messages) headers = validate_environment( api_key, headers, model, messages=messages, tools=optional_params.get("tools"), + is_vertex_request=is_vertex_request, ) - _is_function_call = False - messages = copy.deepcopy(messages) - optional_params = copy.deepcopy(optional_params) - stream = optional_params.pop("stream", None) - json_mode: bool = optional_params.pop("json_mode", False) - is_vertex_request: bool = optional_params.pop("is_vertex_request", False) data = AnthropicConfig()._transform_request( model=model, messages=messages, optional_params=optional_params, + litellm_params=litellm_params, headers=headers, _is_function_call=_is_function_call, is_vertex_request=is_vertex_request, @@ -499,6 +385,7 @@ class AnthropicChatCompletion(BaseLLM): optional_params=optional_params, stream=stream, _is_function_call=_is_function_call, + json_mode=json_mode, litellm_params=litellm_params, logger_fn=logger_fn, headers=headers, @@ -546,6 +433,7 @@ class AnthropicChatCompletion(BaseLLM): messages=messages, logging_obj=logging_obj, timeout=timeout, + json_mode=json_mode, ) return CustomStreamWrapper( completion_stream=completion_stream, @@ -583,7 +471,7 @@ class AnthropicChatCompletion(BaseLLM): headers=error_headers, ) - return self._process_response( + return AnthropicConfig._process_response( model=model, response=response, model_response=model_response, @@ -604,11 +492,14 @@ class AnthropicChatCompletion(BaseLLM): class ModelResponseIterator: - def __init__(self, streaming_response, sync_stream: bool): + def __init__( + self, streaming_response, sync_stream: bool, json_mode: Optional[bool] = False + ): self.streaming_response = streaming_response self.response_iterator = self.streaming_response self.content_blocks: List[ContentBlockDelta] = [] self.tool_index = -1 + self.json_mode = json_mode def check_empty_tool_call_args(self) -> bool: """ @@ -770,6 +661,8 @@ class ModelResponseIterator: status_code=500, # it looks like Anthropic API does not return a status code in the chunk error - default to 500 ) + text, tool_use = self._handle_json_mode_chunk(text=text, tool_use=tool_use) + returned_chunk = GenericStreamingChunk( text=text, tool_use=tool_use, @@ -784,6 +677,34 @@ class ModelResponseIterator: except json.JSONDecodeError: raise ValueError(f"Failed to decode JSON from chunk: {chunk}") + def _handle_json_mode_chunk( + self, text: str, tool_use: Optional[ChatCompletionToolCallChunk] + ) -> Tuple[str, Optional[ChatCompletionToolCallChunk]]: + """ + If JSON mode is enabled, convert the tool call to a message. 
+ + Anthropic returns the JSON schema as part of the tool call + OpenAI returns the JSON schema as part of the content, this handles placing it in the content + + Args: + text: str + tool_use: Optional[ChatCompletionToolCallChunk] + Returns: + Tuple[str, Optional[ChatCompletionToolCallChunk]] + + text: The text to use in the content + tool_use: The ChatCompletionToolCallChunk to use in the chunk response + """ + if self.json_mode is True and tool_use is not None: + message = AnthropicConfig._convert_tool_response_to_message( + tool_calls=[tool_use] + ) + if message is not None: + text = message.content or "" + tool_use = None + + return text, tool_use + # Sync iterator def __iter__(self): return self @@ -858,3 +779,32 @@ class ModelResponseIterator: raise StopAsyncIteration except ValueError as e: raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + + def convert_str_chunk_to_generic_chunk(self, chunk: str) -> GenericStreamingChunk: + """ + Convert a string chunk to a GenericStreamingChunk + + Note: This is used for Anthropic pass through streaming logging + + We can move __anext__, and __next__ to use this function since it's common logic. + Did not migrate them to minmize changes made in 1 PR. + """ + str_line = chunk + if isinstance(chunk, bytes): # Handle binary data + str_line = chunk.decode("utf-8") # Convert bytes to string + index = str_line.find("data:") + if index != -1: + str_line = str_line[index:] + + if str_line.startswith("data:"): + data_json = json.loads(str_line[5:]) + return self.chunk_parser(chunk=data_json) + else: + return GenericStreamingChunk( + text="", + is_finished=False, + finish_reason="", + usage=None, + index=0, + tool_use=None, + ) diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index ec3285473..feb5b8646 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -1,12 +1,20 @@ +import json +import time import types -from typing import List, Literal, Optional, Tuple, Union +from re import A +from typing import Dict, List, Literal, Optional, Tuple, Union + +import httpx +import requests import litellm +from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.llms.prompt_templates.factory import anthropic_messages_pt from litellm.types.llms.anthropic import ( AllAnthropicToolsValues, AnthropicComputerTool, AnthropicHostedTools, + AnthropicInputSchema, AnthropicMessageRequestBase, AnthropicMessagesRequest, AnthropicMessagesTool, @@ -17,12 +25,23 @@ from litellm.types.llms.openai import ( AllMessageValues, ChatCompletionCachedContent, ChatCompletionSystemMessage, + ChatCompletionToolCallChunk, + ChatCompletionToolCallFunctionChunk, ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk, + ChatCompletionUsageBlock, +) +from litellm.types.utils import Message as LitellmMessage +from litellm.types.utils import PromptTokensDetailsWrapper +from litellm.utils import ( + CustomStreamWrapper, + ModelResponse, + Usage, + add_dummy_tool, + has_tool_call_blocks, ) -from litellm.utils import add_dummy_tool, has_tool_call_blocks -from ..common_utils import AnthropicError +from ..common_utils import AnthropicError, process_anthropic_headers class AnthropicConfig: @@ -90,6 +109,7 @@ class AnthropicConfig: "extra_headers", "parallel_tool_calls", "response_format", + "user", ] def get_cache_control_headers(self) -> dict: @@ -104,6 +124,8 @@ class AnthropicConfig: anthropic_version: Optional[str] = None, 
computer_tool_used: bool = False, prompt_caching_set: bool = False, + pdf_used: bool = False, + is_vertex_request: bool = False, ) -> dict: import json @@ -112,14 +134,21 @@ class AnthropicConfig: betas.append("prompt-caching-2024-07-31") if computer_tool_used: betas.append("computer-use-2024-10-22") + if pdf_used: + betas.append("pdfs-2024-09-25") headers = { "anthropic-version": anthropic_version or "2023-06-01", "x-api-key": api_key, "accept": "application/json", "content-type": "application/json", } - if len(betas) > 0: + + # Don't send any beta headers to Vertex, Vertex has failed requests when they are sent + if is_vertex_request is True: + pass + elif len(betas) > 0: headers["anthropic-beta"] = ",".join(betas) + return headers def _map_tool_choice( @@ -156,15 +185,17 @@ class AnthropicConfig: returned_tool: Optional[AllAnthropicToolsValues] = None if tool["type"] == "function" or tool["type"] == "custom": + _input_schema: dict = tool["function"].get( + "parameters", + { + "type": "object", + "properties": {}, + }, + ) + input_schema: AnthropicInputSchema = AnthropicInputSchema(**_input_schema) _tool = AnthropicMessagesTool( name=tool["function"]["name"], - input_schema=tool["function"].get( - "parameters", - { - "type": "object", - "properties": {}, - }, - ), + input_schema=input_schema, ) _description = tool["function"].get("description") @@ -240,6 +271,28 @@ class AnthropicConfig: anthropic_tools.append(new_tool) return anthropic_tools + def _map_stop_sequences( + self, stop: Optional[Union[str, List[str]]] + ) -> Optional[List[str]]: + new_stop: Optional[List[str]] = None + if isinstance(stop, str): + if ( + stop == "\n" + ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences + return new_stop + new_stop = [stop] + elif isinstance(stop, list): + new_v = [] + for v in stop: + if ( + v == "\n" + ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences + continue + new_v.append(v) + if len(new_v) > 0: + new_stop = new_v + return new_stop + def map_openai_params( self, non_default_params: dict, @@ -265,26 +318,10 @@ class AnthropicConfig: optional_params["tool_choice"] = _tool_choice if param == "stream" and value is True: optional_params["stream"] = value - if param == "stop": - if isinstance(value, str): - if ( - value == "\n" - ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences - continue - value = [value] - elif isinstance(value, list): - new_v = [] - for v in value: - if ( - v == "\n" - ) and litellm.drop_params is True: # anthropic doesn't allow whitespace characters as stop-sequences - continue - new_v.append(v) - if len(new_v) > 0: - value = new_v - else: - continue - optional_params["stop_sequences"] = value + if param == "stop" and (isinstance(value, str) or isinstance(value, list)): + _value = self._map_stop_sequences(value) + if _value is not None: + optional_params["stop_sequences"] = _value if param == "temperature": optional_params["temperature"] = value if param == "top_p": @@ -301,21 +338,15 @@ class AnthropicConfig: - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. 
""" - _tool_choice = None _tool_choice = {"name": "json_tool_call", "type": "tool"} - - _tool = AnthropicMessagesTool( - name="json_tool_call", - input_schema={ - "type": "object", - "properties": {"values": json_schema}, # type: ignore - }, + _tool = self._create_json_tool_call_for_response_format( + json_schema=json_schema, ) - optional_params["tools"] = [_tool] optional_params["tool_choice"] = _tool_choice optional_params["json_mode"] = True - + if param == "user": + optional_params["metadata"] = {"user_id": value} ## VALIDATE REQUEST """ Anthropic doesn't support tool calling without `tools=` param specified. @@ -338,6 +369,34 @@ class AnthropicConfig: return optional_params + def _create_json_tool_call_for_response_format( + self, + json_schema: Optional[dict] = None, + ) -> AnthropicMessagesTool: + """ + Handles creating a tool call for getting responses in JSON format. + + Args: + json_schema (Optional[dict]): The JSON schema the response should be in + + Returns: + AnthropicMessagesTool: The tool call to send to Anthropic API to get responses in JSON format + """ + _input_schema: AnthropicInputSchema = AnthropicInputSchema( + type="object", + ) + + if json_schema is None: + # Anthropic raises a 400 BadRequest error if properties is passed as None + # see usage with additionalProperties (Example 5) https://github.com/anthropics/anthropic-cookbook/blob/main/tool_use/extracting_structured_json.ipynb + _input_schema["additionalProperties"] = True + _input_schema["properties"] = {} + else: + _input_schema["properties"] = {"values": json_schema} + + _tool = AnthropicMessagesTool(name="json_tool_call", input_schema=_input_schema) + return _tool + def is_cache_control_set(self, messages: List[AllMessageValues]) -> bool: """ Return if {"cache_control": ..} in message content block @@ -365,6 +424,22 @@ class AnthropicConfig: return True return False + def is_pdf_used(self, messages: List[AllMessageValues]) -> bool: + """ + Set to true if media passed into messages. 
+ + """ + for message in messages: + if ( + "content" in message + and message["content"] is not None + and isinstance(message["content"], list) + ): + for content in message["content"]: + if "type" in content: + return True + return False + def translate_system_message( self, messages: List[AllMessageValues] ) -> List[AnthropicSystemMessageContent]: @@ -423,6 +498,7 @@ class AnthropicConfig: model: str, messages: List[AllMessageValues], optional_params: dict, + litellm_params: dict, headers: dict, _is_function_call: bool, is_vertex_request: bool, @@ -460,6 +536,15 @@ class AnthropicConfig: if "tools" in optional_params: _is_function_call = True + ## Handle user_id in metadata + _litellm_metadata = litellm_params.get("metadata", None) + if ( + _litellm_metadata + and isinstance(_litellm_metadata, dict) + and "user_id" in _litellm_metadata + ): + optional_params["metadata"] = {"user_id": _litellm_metadata["user_id"]} + data = { "messages": anthropic_messages, **optional_params, @@ -467,3 +552,162 @@ class AnthropicConfig: if not is_vertex_request: data["model"] = model return data + + @staticmethod + def _process_response( + model: str, + response: Union[requests.Response, httpx.Response], + model_response: ModelResponse, + stream: bool, + logging_obj: litellm.litellm_core_utils.litellm_logging.Logging, # type: ignore + optional_params: dict, + api_key: str, + data: Union[dict, str], + messages: List, + print_verbose, + encoding, + json_mode: bool, + ) -> ModelResponse: + _hidden_params: Dict = {} + _hidden_params["additional_headers"] = process_anthropic_headers( + dict(response.headers) + ) + ## LOGGING + logging_obj.post_call( + input=messages, + api_key=api_key, + original_response=response.text, + additional_args={"complete_input_dict": data}, + ) + print_verbose(f"raw model_response: {response.text}") + ## RESPONSE OBJECT + try: + completion_response = response.json() + except Exception as e: + response_headers = getattr(response, "headers", None) + raise AnthropicError( + message="Unable to get json response - {}, Original Response: {}".format( + str(e), response.text + ), + status_code=response.status_code, + headers=response_headers, + ) + if "error" in completion_response: + response_headers = getattr(response, "headers", None) + raise AnthropicError( + message=str(completion_response["error"]), + status_code=response.status_code, + headers=response_headers, + ) + else: + text_content = "" + tool_calls: List[ChatCompletionToolCallChunk] = [] + for idx, content in enumerate(completion_response["content"]): + if content["type"] == "text": + text_content += content["text"] + ## TOOL CALLING + elif content["type"] == "tool_use": + tool_calls.append( + ChatCompletionToolCallChunk( + id=content["id"], + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=content["name"], + arguments=json.dumps(content["input"]), + ), + index=idx, + ) + ) + + _message = litellm.Message( + tool_calls=tool_calls, + content=text_content or None, + ) + + ## HANDLE JSON MODE - anthropic returns single function call + if json_mode and len(tool_calls) == 1: + json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( + "arguments" + ) + if json_mode_content_str is not None: + _converted_message = ( + AnthropicConfig._convert_tool_response_to_message( + tool_calls=tool_calls, + ) + ) + if _converted_message is not None: + completion_response["stop_reason"] = "stop" + _message = _converted_message + model_response.choices[0].message = _message # type: ignore + 
model_response._hidden_params["original_response"] = completion_response[ + "content" + ] # allow user to access raw anthropic tool calling response + + model_response.choices[0].finish_reason = map_finish_reason( + completion_response["stop_reason"] + ) + + ## CALCULATING USAGE + prompt_tokens = completion_response["usage"]["input_tokens"] + completion_tokens = completion_response["usage"]["output_tokens"] + _usage = completion_response["usage"] + cache_creation_input_tokens: int = 0 + cache_read_input_tokens: int = 0 + + model_response.created = int(time.time()) + model_response.model = model + if "cache_creation_input_tokens" in _usage: + cache_creation_input_tokens = _usage["cache_creation_input_tokens"] + prompt_tokens += cache_creation_input_tokens + if "cache_read_input_tokens" in _usage: + cache_read_input_tokens = _usage["cache_read_input_tokens"] + prompt_tokens += cache_read_input_tokens + + prompt_tokens_details = PromptTokensDetailsWrapper( + cached_tokens=cache_read_input_tokens + ) + total_tokens = prompt_tokens + completion_tokens + usage = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + prompt_tokens_details=prompt_tokens_details, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + + setattr(model_response, "usage", usage) # type: ignore + + model_response._hidden_params = _hidden_params + return model_response + + @staticmethod + def _convert_tool_response_to_message( + tool_calls: List[ChatCompletionToolCallChunk], + ) -> Optional[LitellmMessage]: + """ + In JSON mode, Anthropic API returns JSON schema as a tool call, we need to convert it to a message to follow the OpenAI format + + """ + ## HANDLE JSON MODE - anthropic returns single function call + json_mode_content_str: Optional[str] = tool_calls[0]["function"].get( + "arguments" + ) + try: + if json_mode_content_str is not None: + args = json.loads(json_mode_content_str) + if ( + isinstance(args, dict) + and (values := args.get("values")) is not None + ): + _message = litellm.Message(content=json.dumps(values)) + return _message + else: + # a lot of the times the `values` key is not present in the tool response + # relevant issue: https://github.com/BerriAI/litellm/issues/6741 + _message = litellm.Message(content=json.dumps(args)) + return _message + except json.JSONDecodeError: + # json decode error does occur, return the original tool response str + return litellm.Message(content=json_mode_content_str) + return None diff --git a/litellm/llms/anthropic/completion.py b/litellm/llms/anthropic/completion.py index 89a50db6a..dc06401d6 100644 --- a/litellm/llms/anthropic/completion.py +++ b/litellm/llms/anthropic/completion.py @@ -13,7 +13,11 @@ import httpx import requests import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.utils import CustomStreamWrapper, ModelResponse, Usage from ..base import BaseLLM @@ -162,7 +166,10 @@ class AnthropicTextCompletion(BaseLLM): client=None, ): if client is None: - client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0)) + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.ANTHROPIC, + params={"timeout": httpx.Timeout(timeout=600.0, connect=5.0)}, + ) response = await client.post(api_base, headers=headers, data=json.dumps(data)) @@ -198,7 +205,10 @@ 
class AnthropicTextCompletion(BaseLLM): client=None, ): if client is None: - client = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0)) + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.ANTHROPIC, + params={"timeout": httpx.Timeout(timeout=600.0, connect=5.0)}, + ) response = await client.post(api_base, headers=headers, data=json.dumps(data)) diff --git a/litellm/llms/azure_ai/chat/transformation.py b/litellm/llms/azure_ai/chat/transformation.py index 9767282fb..d8924fbb9 100644 --- a/litellm/llms/azure_ai/chat/transformation.py +++ b/litellm/llms/azure_ai/chat/transformation.py @@ -3,7 +3,10 @@ from typing import List, Optional, Tuple import litellm from litellm._logging import verbose_logger from litellm.llms.OpenAI.openai import OpenAIConfig -from litellm.llms.prompt_templates.common_utils import convert_content_list_to_str +from litellm.llms.prompt_templates.common_utils import ( + _audio_or_image_in_message_content, + convert_content_list_to_str, +) from litellm.secret_managers.main import get_secret_str from litellm.types.llms.openai import AllMessageValues from litellm.types.utils import ProviderField @@ -27,8 +30,21 @@ class AzureAIStudioConfig(OpenAIConfig): ), ] - def _transform_messages(self, messages: List[AllMessageValues]) -> List: + def _transform_messages( + self, + messages: List[AllMessageValues], + ) -> List: + """ + - Azure AI Studio doesn't support content as a list. This handles: + 1. Transforms list content to a string. + 2. If message contains an image or audio, send as is (user-intended) + """ for message in messages: + + # Do nothing if the message contains an image or audio + if _audio_or_image_in_message_content(message): + continue + texts = convert_content_list_to_str(message=message) if texts: message["content"] = texts diff --git a/litellm/llms/azure_ai/embed/handler.py b/litellm/llms/azure_ai/embed/handler.py index 638a77479..2946a84dd 100644 --- a/litellm/llms/azure_ai/embed/handler.py +++ b/litellm/llms/azure_ai/embed/handler.py @@ -74,7 +74,10 @@ class AzureAIEmbedding(OpenAIChatCompletion): client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, ) -> EmbeddingResponse: if client is None or not isinstance(client, AsyncHTTPHandler): - client = AsyncHTTPHandler(timeout=timeout, concurrent_limit=1) + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.AZURE_AI, + params={"timeout": timeout}, + ) url = "{}/images/embeddings".format(api_base) diff --git a/litellm/llms/azure_ai/rerank/handler.py b/litellm/llms/azure_ai/rerank/handler.py index a67c893f2..60edfd296 100644 --- a/litellm/llms/azure_ai/rerank/handler.py +++ b/litellm/llms/azure_ai/rerank/handler.py @@ -4,6 +4,7 @@ import httpx from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.llms.cohere.rerank import CohereRerank +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.types.rerank import RerankResponse @@ -73,6 +74,7 @@ class AzureAIRerank(CohereRerank): return_documents: Optional[bool] = True, max_chunks_per_doc: Optional[int] = None, _is_async: Optional[bool] = False, + client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, ) -> RerankResponse: if headers is None: diff --git a/litellm/llms/base_aws_llm.py b/litellm/llms/base_aws_llm.py index 70e3defc7..9f3a58a8b 100644 --- a/litellm/llms/base_aws_llm.py +++ b/litellm/llms/base_aws_llm.py @@ -1,16 +1,28 @@ import hashlib import json import os -from typing import Dict, List, Optional, Tuple 
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import httpx +from pydantic import BaseModel from litellm._logging import verbose_logger from litellm.caching.caching import DualCache, InMemoryCache -from litellm.secret_managers.main import get_secret +from litellm.secret_managers.main import get_secret, get_secret_str from .base import BaseLLM +if TYPE_CHECKING: + from botocore.credentials import Credentials +else: + Credentials = Any + + +class Boto3CredentialsInfo(BaseModel): + credentials: Credentials + aws_region_name: str + aws_bedrock_runtime_endpoint: Optional[str] + class AwsAuthError(Exception): def __init__(self, status_code, message): @@ -311,3 +323,74 @@ class BaseAWSLLM(BaseLLM): proxy_endpoint_url = endpoint_url return endpoint_url, proxy_endpoint_url + + def _get_boto_credentials_from_optional_params( + self, optional_params: dict + ) -> Boto3CredentialsInfo: + """ + Get boto3 credentials from optional params + + Args: + optional_params (dict): Optional parameters for the model call + + Returns: + Credentials: Boto3 credentials object + """ + try: + import boto3 + from botocore.auth import SigV4Auth + from botocore.awsrequest import AWSRequest + from botocore.credentials import Credentials + except ImportError: + raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.") + ## CREDENTIALS ## + # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them + aws_secret_access_key = optional_params.pop("aws_secret_access_key", None) + aws_access_key_id = optional_params.pop("aws_access_key_id", None) + aws_session_token = optional_params.pop("aws_session_token", None) + aws_region_name = optional_params.pop("aws_region_name", None) + aws_role_name = optional_params.pop("aws_role_name", None) + aws_session_name = optional_params.pop("aws_session_name", None) + aws_profile_name = optional_params.pop("aws_profile_name", None) + aws_web_identity_token = optional_params.pop("aws_web_identity_token", None) + aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None) + aws_bedrock_runtime_endpoint = optional_params.pop( + "aws_bedrock_runtime_endpoint", None + ) # https://bedrock-runtime.{region_name}.amazonaws.com + + ### SET REGION NAME ### + if aws_region_name is None: + # check env # + litellm_aws_region_name = get_secret_str("AWS_REGION_NAME", None) + + if litellm_aws_region_name is not None and isinstance( + litellm_aws_region_name, str + ): + aws_region_name = litellm_aws_region_name + + standard_aws_region_name = get_secret_str("AWS_REGION", None) + if standard_aws_region_name is not None and isinstance( + standard_aws_region_name, str + ): + aws_region_name = standard_aws_region_name + + if aws_region_name is None: + aws_region_name = "us-west-2" + + credentials: Credentials = self.get_credentials( + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + aws_session_token=aws_session_token, + aws_region_name=aws_region_name, + aws_session_name=aws_session_name, + aws_profile_name=aws_profile_name, + aws_role_name=aws_role_name, + aws_web_identity_token=aws_web_identity_token, + aws_sts_endpoint=aws_sts_endpoint, + ) + + return Boto3CredentialsInfo( + credentials=credentials, + aws_region_name=aws_region_name, + aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint, + ) diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index 6c08758dd..23ee97a47 100644 --- 
a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -458,7 +458,7 @@ class AmazonConverseConfig: """ Abbreviations of regions AWS Bedrock supports for cross region inference """ - return ["us", "eu"] + return ["us", "eu", "apac"] def _get_base_model(self, model: str) -> str: """ diff --git a/litellm/llms/bedrock/common_utils.py b/litellm/llms/bedrock/common_utils.py index 1ae74e535..332b1e2b3 100644 --- a/litellm/llms/bedrock/common_utils.py +++ b/litellm/llms/bedrock/common_utils.py @@ -484,73 +484,6 @@ class AmazonMistralConfig: } -class AmazonStabilityConfig: - """ - Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0 - - Supported Params for the Amazon / Stable Diffusion models: - - - `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt) - - - `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed) - - - `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run. - - - `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divible by 64. - Engine-specific dimension validation: - - - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512. - - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152 - - SDXL v1.0: same as SDXL v0.9 - - SD v1.6: must be between 320x320 and 1536x1536 - - - `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divible by 64. - Engine-specific dimension validation: - - - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512. 
- - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152 - - SDXL v1.0: same as SDXL v0.9 - - SD v1.6: must be between 320x320 and 1536x1536 - """ - - cfg_scale: Optional[int] = None - seed: Optional[float] = None - steps: Optional[List[str]] = None - width: Optional[int] = None - height: Optional[int] = None - - def __init__( - self, - cfg_scale: Optional[int] = None, - seed: Optional[float] = None, - steps: Optional[List[str]] = None, - width: Optional[int] = None, - height: Optional[int] = None, - ) -> None: - locals_ = locals() - for key, value in locals_.items(): - if key != "self" and value is not None: - setattr(self.__class__, key, value) - - @classmethod - def get_config(cls): - return { - k: v - for k, v in cls.__dict__.items() - if not k.startswith("__") - and not isinstance( - v, - ( - types.FunctionType, - types.BuiltinFunctionType, - classmethod, - staticmethod, - ), - ) - and v is not None - } - - def add_custom_header(headers): """Closure to capture the headers and add them.""" diff --git a/litellm/llms/bedrock/image/amazon_stability1_transformation.py b/litellm/llms/bedrock/image/amazon_stability1_transformation.py new file mode 100644 index 000000000..880881e97 --- /dev/null +++ b/litellm/llms/bedrock/image/amazon_stability1_transformation.py @@ -0,0 +1,104 @@ +import types +from typing import List, Optional + +from openai.types.image import Image + +from litellm.types.utils import ImageResponse + + +class AmazonStabilityConfig: + """ + Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0 + + Supported Params for the Amazon / Stable Diffusion models: + + - `cfg_scale` (integer): Default `7`. Between [ 0 .. 35 ]. How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt) + + - `seed` (float): Default: `0`. Between [ 0 .. 4294967295 ]. Random noise seed (omit this option or use 0 for a random seed) + + - `steps` (array of strings): Default `30`. Between [ 10 .. 50 ]. Number of diffusion steps to run. + + - `width` (integer): Default: `512`. multiple of 64 >= 128. Width of the image to generate, in pixels, in an increment divible by 64. + Engine-specific dimension validation: + + - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512. + - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152 + - SDXL v1.0: same as SDXL v0.9 + - SD v1.6: must be between 320x320 and 1536x1536 + + - `height` (integer): Default: `512`. multiple of 64 >= 128. Height of the image to generate, in pixels, in an increment divible by 64. + Engine-specific dimension validation: + + - SDXL Beta: must be between 128x128 and 512x896 (or 896x512); only one dimension can be greater than 512. 
+ - SDXL v0.9: must be one of 1024x1024, 1152x896, 1216x832, 1344x768, 1536x640, 640x1536, 768x1344, 832x1216, or 896x1152 + - SDXL v1.0: same as SDXL v0.9 + - SD v1.6: must be between 320x320 and 1536x1536 + """ + + cfg_scale: Optional[int] = None + seed: Optional[float] = None + steps: Optional[List[str]] = None + width: Optional[int] = None + height: Optional[int] = None + + def __init__( + self, + cfg_scale: Optional[int] = None, + seed: Optional[float] = None, + steps: Optional[List[str]] = None, + width: Optional[int] = None, + height: Optional[int] = None, + ) -> None: + locals_ = locals() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + @classmethod + def get_supported_openai_params(cls, model: Optional[str] = None) -> List: + return ["size"] + + @classmethod + def map_openai_params( + cls, + non_default_params: dict, + optional_params: dict, + ): + _size = non_default_params.get("size") + if _size is not None: + width, height = _size.split("x") + optional_params["width"] = int(width) + optional_params["height"] = int(height) + + return optional_params + + @classmethod + def transform_response_dict_to_openai_response( + cls, model_response: ImageResponse, response_dict: dict + ) -> ImageResponse: + image_list: List[Image] = [] + for artifact in response_dict["artifacts"]: + _image = Image(b64_json=artifact["base64"]) + image_list.append(_image) + + model_response.data = image_list + + return model_response diff --git a/litellm/llms/bedrock/image/amazon_stability3_transformation.py b/litellm/llms/bedrock/image/amazon_stability3_transformation.py new file mode 100644 index 000000000..2c90b3a12 --- /dev/null +++ b/litellm/llms/bedrock/image/amazon_stability3_transformation.py @@ -0,0 +1,100 @@ +import types +from typing import List, Optional + +from openai.types.image import Image + +from litellm.types.llms.bedrock import ( + AmazonStability3TextToImageRequest, + AmazonStability3TextToImageResponse, +) +from litellm.types.utils import ImageResponse + + +class AmazonStability3Config: + """ + Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=stability.stable-diffusion-xl-v0 + + Stability API Ref: https://platform.stability.ai/docs/api-reference#tag/Generate/paths/~1v2beta~1stable-image~1generate~1sd3/post + """ + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + @classmethod + def get_supported_openai_params(cls, model: Optional[str] = None) -> List: + """ + No additional OpenAI params are mapped for stability 3 + """ + return [] + + @classmethod + def _is_stability_3_model(cls, model: Optional[str] = None) -> bool: + """ + Returns True if the model is a Stability 3 model + + Stability 3 models follow this pattern: + sd3-large + sd3-large-turbo + sd3-medium + sd3.5-large + sd3.5-large-turbo + + Stability ultra models + stable-image-ultra-v1 + """ + if model: + if "sd3" in model or "sd3.5" in model: + return True + if "stable-image-ultra-v1" in model: + return True + return False + + 
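As a quick check on the branch logic above: `_is_stability_3_model` keys off the substrings `sd3` and `stable-image-ultra-v1`, and anything else is treated as a Stability v1 model and sent with the older `text_prompts` request shape. A small sketch of how it classifies model IDs; the Bedrock model ID strings are illustrative assumptions, not an exhaustive list:

import litellm

# Sketch only: example Bedrock model IDs.
print(litellm.AmazonStability3Config._is_stability_3_model("stability.sd3-large-v1:0"))           # True
print(litellm.AmazonStability3Config._is_stability_3_model("stability.stable-image-ultra-v1:0"))  # True
print(litellm.AmazonStability3Config._is_stability_3_model("stability.stable-diffusion-xl-v1"))   # False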
@classmethod + def transform_request_body( + cls, prompt: str, optional_params: dict + ) -> AmazonStability3TextToImageRequest: + """ + Transform the request body for the Stability 3 models + """ + data = AmazonStability3TextToImageRequest(prompt=prompt, **optional_params) + return data + + @classmethod + def map_openai_params(cls, non_default_params: dict, optional_params: dict) -> dict: + """ + Map the OpenAI params to the Bedrock params + + No OpenAI params are mapped for Stability 3, so directly return the optional_params + """ + return optional_params + + @classmethod + def transform_response_dict_to_openai_response( + cls, model_response: ImageResponse, response_dict: dict + ) -> ImageResponse: + """ + Transform the response dict to the OpenAI response + """ + + stability_3_response = AmazonStability3TextToImageResponse(**response_dict) + openai_images: List[Image] = [] + for _img in stability_3_response.get("images", []): + openai_images.append(Image(b64_json=_img)) + + model_response.data = openai_images + return model_response diff --git a/litellm/llms/bedrock/image/cost_calculator.py b/litellm/llms/bedrock/image/cost_calculator.py new file mode 100644 index 000000000..0a20b44cb --- /dev/null +++ b/litellm/llms/bedrock/image/cost_calculator.py @@ -0,0 +1,41 @@ +from typing import Optional + +import litellm +from litellm.types.utils import ImageResponse + + +def cost_calculator( + model: str, + image_response: ImageResponse, + size: Optional[str] = None, + optional_params: Optional[dict] = None, +) -> float: + """ + Bedrock image generation cost calculator + + Handles both Stability 1 and Stability 3 models + """ + if litellm.AmazonStability3Config()._is_stability_3_model(model=model): + pass + else: + # Stability 1 models + optional_params = optional_params or {} + + # see model_prices_and_context_window.json for details on how steps is used + # Reference pricing by steps for stability 1: https://aws.amazon.com/bedrock/pricing/ + _steps = optional_params.get("steps", 50) + steps = "max-steps" if _steps > 50 else "50-steps" + + # size is stored in model_prices_and_context_window.json as 1024-x-1024 + # current size has 1024x1024 + size = size or "1024-x-1024" + model = f"{size}/{steps}/{model}" + + _model_info = litellm.get_model_info( + model=model, + custom_llm_provider="bedrock", + ) + + output_cost_per_image: float = _model_info.get("output_cost_per_image") or 0.0 + num_images: int = len(image_response.data) + return output_cost_per_image * num_images diff --git a/litellm/llms/bedrock/image/image_handler.py b/litellm/llms/bedrock/image/image_handler.py new file mode 100644 index 000000000..31af2910f --- /dev/null +++ b/litellm/llms/bedrock/image/image_handler.py @@ -0,0 +1,304 @@ +import copy +import json +import os +from typing import TYPE_CHECKING, Any, List, Optional, Union + +import httpx +from openai.types.image import Image +from pydantic import BaseModel + +import litellm +from litellm._logging import verbose_logger +from litellm.litellm_core_utils.litellm_logging import Logging as LitellmLogging +from litellm.llms.custom_httpx.http_handler import ( + _get_httpx_client, + get_async_httpx_client, +) +from litellm.types.utils import ImageResponse + +from ...base_aws_llm import BaseAWSLLM +from ..common_utils import BedrockError + +if TYPE_CHECKING: + from botocore.awsrequest import AWSPreparedRequest +else: + AWSPreparedRequest = Any + + +class BedrockImagePreparedRequest(BaseModel): + """ + Internal/Helper class for preparing the request for bedrock image generation + """ 
+ + endpoint_url: str + prepped: AWSPreparedRequest + body: bytes + data: dict + + +class BedrockImageGeneration(BaseAWSLLM): + """ + Bedrock Image Generation handler + """ + + def image_generation( + self, + model: str, + prompt: str, + model_response: ImageResponse, + optional_params: dict, + logging_obj: LitellmLogging, + timeout: Optional[Union[float, httpx.Timeout]], + aimg_generation: bool = False, + api_base: Optional[str] = None, + extra_headers: Optional[dict] = None, + ): + prepared_request = self._prepare_request( + model=model, + optional_params=optional_params, + api_base=api_base, + extra_headers=extra_headers, + logging_obj=logging_obj, + prompt=prompt, + ) + + if aimg_generation is True: + return self.async_image_generation( + prepared_request=prepared_request, + timeout=timeout, + model=model, + logging_obj=logging_obj, + prompt=prompt, + model_response=model_response, + ) + + client = _get_httpx_client() + try: + response = client.post(url=prepared_request.endpoint_url, headers=prepared_request.prepped.headers, data=prepared_request.body) # type: ignore + response.raise_for_status() + except httpx.HTTPStatusError as err: + error_code = err.response.status_code + raise BedrockError(status_code=error_code, message=err.response.text) + except httpx.TimeoutException: + raise BedrockError(status_code=408, message="Timeout error occurred.") + ### FORMAT RESPONSE TO OPENAI FORMAT ### + model_response = self._transform_response_dict_to_openai_response( + model_response=model_response, + model=model, + logging_obj=logging_obj, + prompt=prompt, + response=response, + data=prepared_request.data, + ) + return model_response + + async def async_image_generation( + self, + prepared_request: BedrockImagePreparedRequest, + timeout: Optional[Union[float, httpx.Timeout]], + model: str, + logging_obj: LitellmLogging, + prompt: str, + model_response: ImageResponse, + ) -> ImageResponse: + """ + Asynchronous handler for bedrock image generation + + Awaits the response from the bedrock image generation endpoint + """ + async_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.BEDROCK, + params={"timeout": timeout}, + ) + + try: + response = await async_client.post(url=prepared_request.endpoint_url, headers=prepared_request.prepped.headers, data=prepared_request.body) # type: ignore + response.raise_for_status() + except httpx.HTTPStatusError as err: + error_code = err.response.status_code + raise BedrockError(status_code=error_code, message=err.response.text) + except httpx.TimeoutException: + raise BedrockError(status_code=408, message="Timeout error occurred.") + + ### FORMAT RESPONSE TO OPENAI FORMAT ### + model_response = self._transform_response_dict_to_openai_response( + model=model, + logging_obj=logging_obj, + prompt=prompt, + response=response, + data=prepared_request.data, + model_response=model_response, + ) + return model_response + + def _prepare_request( + self, + model: str, + optional_params: dict, + api_base: Optional[str], + extra_headers: Optional[dict], + logging_obj: LitellmLogging, + prompt: str, + ) -> BedrockImagePreparedRequest: + """ + Prepare the request body, headers, and endpoint URL for the Bedrock Image Generation API + + Args: + model (str): The model to use for the image generation + optional_params (dict): The optional parameters for the image generation + api_base (Optional[str]): The base URL for the Bedrock API + extra_headers (Optional[dict]): The extra headers to include in the request + logging_obj (LitellmLogging): The logging object to 
use for logging + prompt (str): The prompt to use for the image generation + Returns: + BedrockImagePreparedRequest: The prepared request object + + The BedrockImagePreparedRequest contains: + endpoint_url (str): The endpoint URL for the Bedrock Image Generation API + prepped (httpx.Request): The prepared request object + body (bytes): The request body + """ + try: + import boto3 + from botocore.auth import SigV4Auth + from botocore.awsrequest import AWSRequest + from botocore.credentials import Credentials + except ImportError: + raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.") + boto3_credentials_info = self._get_boto_credentials_from_optional_params( + optional_params + ) + + ### SET RUNTIME ENDPOINT ### + modelId = model + _, proxy_endpoint_url = self.get_runtime_endpoint( + api_base=api_base, + aws_bedrock_runtime_endpoint=boto3_credentials_info.aws_bedrock_runtime_endpoint, + aws_region_name=boto3_credentials_info.aws_region_name, + ) + proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke" + sigv4 = SigV4Auth( + boto3_credentials_info.credentials, + "bedrock", + boto3_credentials_info.aws_region_name, + ) + + data = self._get_request_body( + model=model, prompt=prompt, optional_params=optional_params + ) + + # Make POST Request + body = json.dumps(data).encode("utf-8") + + headers = {"Content-Type": "application/json"} + if extra_headers is not None: + headers = {"Content-Type": "application/json", **extra_headers} + request = AWSRequest( + method="POST", url=proxy_endpoint_url, data=body, headers=headers + ) + sigv4.add_auth(request) + if ( + extra_headers is not None and "Authorization" in extra_headers + ): # prevent sigv4 from overwriting the auth header + request.headers["Authorization"] = extra_headers["Authorization"] + prepped = request.prepare() + + ## LOGGING + logging_obj.pre_call( + input=prompt, + api_key="", + additional_args={ + "complete_input_dict": data, + "api_base": proxy_endpoint_url, + "headers": prepped.headers, + }, + ) + return BedrockImagePreparedRequest( + endpoint_url=proxy_endpoint_url, + prepped=prepped, + body=body, + data=data, + ) + + def _get_request_body( + self, + model: str, + prompt: str, + optional_params: dict, + ) -> dict: + """ + Get the request body for the Bedrock Image Generation API + + Checks the model/provider and transforms the request body accordingly + + Returns: + dict: The request body to use for the Bedrock Image Generation API + """ + provider = model.split(".")[0] + inference_params = copy.deepcopy(optional_params) + inference_params.pop( + "user", None + ) # make sure user is not passed in for bedrock call + data = {} + if provider == "stability": + if litellm.AmazonStability3Config._is_stability_3_model(model): + request_body = litellm.AmazonStability3Config.transform_request_body( + prompt=prompt, optional_params=optional_params + ) + return dict(request_body) + else: + prompt = prompt.replace(os.linesep, " ") + ## LOAD CONFIG + config = litellm.AmazonStabilityConfig.get_config() + for k, v in config.items(): + if ( + k not in inference_params + ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in + inference_params[k] = v + data = { + "text_prompts": [{"text": prompt, "weight": 1}], + **inference_params, + } + else: + raise BedrockError( + status_code=422, message=f"Unsupported model={model}, passed in" + ) + return data + + def _transform_response_dict_to_openai_response( + self, + model_response: ImageResponse, + model: str, + 
logging_obj: LitellmLogging, + prompt: str, + response: httpx.Response, + data: dict, + ) -> ImageResponse: + """ + Transforms the Image Generation response from Bedrock to OpenAI format + """ + + ## LOGGING + if logging_obj is not None: + logging_obj.post_call( + input=prompt, + api_key="", + original_response=response.text, + additional_args={"complete_input_dict": data}, + ) + verbose_logger.debug("raw model_response: %s", response.text) + response_dict = response.json() + if response_dict is None: + raise ValueError("Error in response object format, got None") + + config_class = ( + litellm.AmazonStability3Config + if litellm.AmazonStability3Config._is_stability_3_model(model=model) + else litellm.AmazonStabilityConfig + ) + config_class.transform_response_dict_to_openai_response( + model_response=model_response, + response_dict=response_dict, + ) + + return model_response diff --git a/litellm/llms/bedrock/image_generation.py b/litellm/llms/bedrock/image_generation.py deleted file mode 100644 index 65038d12e..000000000 --- a/litellm/llms/bedrock/image_generation.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Handles image gen calls to Bedrock's `/invoke` endpoint -""" - -import copy -import json -import os -from typing import Any, List - -from openai.types.image import Image - -import litellm -from litellm.types.utils import ImageResponse - -from .common_utils import BedrockError, init_bedrock_client - - -def image_generation( - model: str, - prompt: str, - model_response: ImageResponse, - optional_params: dict, - logging_obj: Any, - timeout=None, - aimg_generation=False, -): - """ - Bedrock Image Gen endpoint support - """ - ### BOTO3 INIT ### - # pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them - aws_secret_access_key = optional_params.pop("aws_secret_access_key", None) - aws_access_key_id = optional_params.pop("aws_access_key_id", None) - aws_region_name = optional_params.pop("aws_region_name", None) - aws_role_name = optional_params.pop("aws_role_name", None) - aws_session_name = optional_params.pop("aws_session_name", None) - aws_bedrock_runtime_endpoint = optional_params.pop( - "aws_bedrock_runtime_endpoint", None - ) - aws_web_identity_token = optional_params.pop("aws_web_identity_token", None) - - # use passed in BedrockRuntime.Client if provided, otherwise create a new one - client = init_bedrock_client( - aws_access_key_id=aws_access_key_id, - aws_secret_access_key=aws_secret_access_key, - aws_region_name=aws_region_name, - aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint, - aws_web_identity_token=aws_web_identity_token, - aws_role_name=aws_role_name, - aws_session_name=aws_session_name, - timeout=timeout, - ) - - ### FORMAT IMAGE GENERATION INPUT ### - modelId = model - provider = model.split(".")[0] - inference_params = copy.deepcopy(optional_params) - inference_params.pop( - "user", None - ) # make sure user is not passed in for bedrock call - data = {} - if provider == "stability": - prompt = prompt.replace(os.linesep, " ") - ## LOAD CONFIG - config = litellm.AmazonStabilityConfig.get_config() - for k, v in config.items(): - if ( - k not in inference_params - ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in - inference_params[k] = v - data = {"text_prompts": [{"text": prompt, "weight": 1}], **inference_params} - else: - raise BedrockError( - status_code=422, message=f"Unsupported model={model}, passed in" - ) - - body = json.dumps(data).encode("utf-8") - 
## LOGGING - request_str = f""" - response = client.invoke_model( - body={body}, # type: ignore - modelId={modelId}, - accept="application/json", - contentType="application/json", - )""" # type: ignore - logging_obj.pre_call( - input=prompt, - api_key="", # boto3 is used for init. - additional_args={ - "complete_input_dict": {"model": modelId, "texts": prompt}, - "request_str": request_str, - }, - ) - try: - response = client.invoke_model( - body=body, - modelId=modelId, - accept="application/json", - contentType="application/json", - ) - response_body = json.loads(response.get("body").read()) - ## LOGGING - logging_obj.post_call( - input=prompt, - api_key="", - additional_args={"complete_input_dict": data}, - original_response=json.dumps(response_body), - ) - except Exception as e: - raise BedrockError( - message=f"Embedding Error with model {model}: {e}", status_code=500 - ) - - ### FORMAT RESPONSE TO OPENAI FORMAT ### - if response_body is None: - raise Exception("Error in response object format") - - if model_response is None: - model_response = ImageResponse() - - image_list: List[Image] = [] - for artifact in response_body["artifacts"]: - _image = Image(b64_json=artifact["base64"]) - image_list.append(_image) - - model_response.data = image_list - return model_response diff --git a/litellm/llms/clarifai.py b/litellm/llms/clarifai.py index 2011c0bee..61d445423 100644 --- a/litellm/llms/clarifai.py +++ b/litellm/llms/clarifai.py @@ -9,7 +9,10 @@ import httpx import requests import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + get_async_httpx_client, +) from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage from .prompt_templates.factory import custom_prompt, prompt_factory @@ -185,7 +188,10 @@ async def async_completion( headers={}, ): - async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0)) + async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.CLARIFAI, + params={"timeout": 600.0}, + ) response = await async_handler.post( url=model, headers=headers, data=json.dumps(data) ) diff --git a/litellm/llms/cohere/embed/handler.py b/litellm/llms/cohere/embed/handler.py index 95cbec225..afeba10b5 100644 --- a/litellm/llms/cohere/embed/handler.py +++ b/litellm/llms/cohere/embed/handler.py @@ -11,7 +11,11 @@ import requests # type: ignore import litellm from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.types.llms.bedrock import CohereEmbeddingRequest from litellm.utils import Choices, Message, ModelResponse, Usage @@ -70,8 +74,12 @@ async def async_embedding( }, ) ## COMPLETION CALL + if client is None: - client = AsyncHTTPHandler(concurrent_limit=1, timeout=timeout) + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.COHERE, + params={"timeout": timeout}, + ) try: response = await client.post(api_base, headers=headers, data=json.dumps(data)) @@ -144,6 +152,11 @@ def embedding( api_key=api_key, headers=headers, encoding=encoding, + client=( + client + if client is not None and isinstance(client, AsyncHTTPHandler) + else None + ), ) ## LOGGING diff --git a/litellm/llms/cohere/rerank.py b/litellm/llms/cohere/rerank.py index 022ffc6f9..8de2dfbb4 
100644 --- a/litellm/llms/cohere/rerank.py +++ b/litellm/llms/cohere/rerank.py @@ -6,10 +6,14 @@ LiteLLM supports the re rank API format, no paramter transformation occurs from typing import Any, Dict, List, Optional, Union +import httpx + import litellm from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.llms.base import BaseLLM from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, _get_httpx_client, get_async_httpx_client, ) @@ -34,6 +38,23 @@ class CohereRerank(BaseLLM): # Merge other headers, overriding any default ones except Authorization return {**default_headers, **headers} + def ensure_rerank_endpoint(self, api_base: str) -> str: + """ + Ensures the `/v1/rerank` endpoint is appended to the given `api_base`. + If `/v1/rerank` is already present, the original URL is returned. + + :param api_base: The base API URL. + :return: A URL with `/v1/rerank` appended if missing. + """ + # Parse the base URL to ensure proper structure + url = httpx.URL(api_base) + + # Check if the URL already ends with `/v1/rerank` + if not url.path.endswith("/v1/rerank"): + url = url.copy_with(path=f"{url.path.rstrip('/')}/v1/rerank") + + return str(url) + def rerank( self, model: str, @@ -48,9 +69,10 @@ class CohereRerank(BaseLLM): return_documents: Optional[bool] = True, max_chunks_per_doc: Optional[int] = None, _is_async: Optional[bool] = False, # New parameter + client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, ) -> RerankResponse: headers = self.validate_environment(api_key=api_key, headers=headers) - + api_base = self.ensure_rerank_endpoint(api_base) request_data = RerankRequest( model=model, query=query, @@ -76,9 +98,13 @@ class CohereRerank(BaseLLM): if _is_async: return self.async_rerank(request_data=request_data, api_key=api_key, api_base=api_base, headers=headers) # type: ignore # Call async method - client = _get_httpx_client() + if client is not None and isinstance(client, HTTPHandler): + client = client + else: + client = _get_httpx_client() + response = client.post( - api_base, + url=api_base, headers=headers, json=request_data_dict, ) @@ -100,10 +126,13 @@ class CohereRerank(BaseLLM): api_key: str, api_base: str, headers: dict, + client: Optional[AsyncHTTPHandler] = None, ) -> RerankResponse: request_data_dict = request_data.dict(exclude_none=True) - client = get_async_httpx_client(llm_provider=litellm.LlmProviders.COHERE) + client = client or get_async_httpx_client( + llm_provider=litellm.LlmProviders.COHERE + ) response = await client.post( api_base, diff --git a/litellm/llms/custom_httpx/http_handler.py b/litellm/llms/custom_httpx/http_handler.py index 9e5ed782e..f4d20f8fb 100644 --- a/litellm/llms/custom_httpx/http_handler.py +++ b/litellm/llms/custom_httpx/http_handler.py @@ -4,11 +4,11 @@ import traceback from typing import TYPE_CHECKING, Any, Callable, List, Mapping, Optional, Union import httpx -from httpx import USE_CLIENT_DEFAULT +from httpx import USE_CLIENT_DEFAULT, AsyncHTTPTransport, HTTPTransport import litellm - -from .types import httpxSpecialProvider +from litellm.caching import InMemoryCache +from litellm.types.llms.custom_http import * if TYPE_CHECKING: from litellm import LlmProviders @@ -26,6 +26,63 @@ headers = { # https://www.python-httpx.org/advanced/timeouts _DEFAULT_TIMEOUT = httpx.Timeout(timeout=5.0, connect=5.0) +_DEFAULT_TTL_FOR_HTTPX_CLIENTS = 3600 # 1 hour, re-use the same httpx client for 1 hour + +import re + + +def mask_sensitive_info(error_message): + # Find the 
start of the key parameter + if isinstance(error_message, str): + key_index = error_message.find("key=") + else: + return error_message + + # If key is found + if key_index != -1: + # Find the end of the key parameter (next & or end of string) + next_param = error_message.find("&", key_index) + + if next_param == -1: + # If no more parameters, mask until the end of the string + masked_message = error_message[: key_index + 4] + "[REDACTED_API_KEY]" + else: + # Replace the key with redacted value, keeping other parameters + masked_message = ( + error_message[: key_index + 4] + + "[REDACTED_API_KEY]" + + error_message[next_param:] + ) + + return masked_message + + return error_message + + +class MaskedHTTPStatusError(httpx.HTTPStatusError): + def __init__( + self, original_error, message: Optional[str] = None, text: Optional[str] = None + ): + # Create a new error with the masked URL + masked_url = mask_sensitive_info(str(original_error.request.url)) + # Create a new error that looks like the original, but with a masked URL + + super().__init__( + message=original_error.message, + request=httpx.Request( + method=original_error.request.method, + url=masked_url, + headers=original_error.request.headers, + content=original_error.request.content, + ), + response=httpx.Response( + status_code=original_error.response.status_code, + content=original_error.response.content, + headers=original_error.response.headers, + ), + ) + self.message = message + self.text = text class AsyncHTTPHandler: @@ -60,8 +117,10 @@ class AsyncHTTPHandler: if timeout is None: timeout = _DEFAULT_TIMEOUT # Create a client with a connection pool + transport = self._create_async_transport() return httpx.AsyncClient( + transport=transport, event_hooks=event_hooks, timeout=timeout, limits=httpx.Limits( @@ -152,13 +211,16 @@ class AsyncHTTPHandler: headers=headers, ) except httpx.HTTPStatusError as e: - setattr(e, "status_code", e.response.status_code) + if stream is True: setattr(e, "message", await e.response.aread()) setattr(e, "text", await e.response.aread()) else: - setattr(e, "message", e.response.text) - setattr(e, "text", e.response.text) + setattr(e, "message", mask_sensitive_info(e.response.text)) + setattr(e, "text", mask_sensitive_info(e.response.text)) + + setattr(e, "status_code", e.response.status_code) + raise e except Exception as e: raise e @@ -297,6 +359,18 @@ class AsyncHTTPHandler: except Exception: pass + def _create_async_transport(self) -> Optional[AsyncHTTPTransport]: + """ + Create an async transport with IPv4 only if litellm.force_ipv4 is True. + Otherwise, return None. 
+ + Some users have seen httpx ConnectionError when using ipv6 - forcing ipv4 resolves the issue for them + """ + if litellm.force_ipv4: + return AsyncHTTPTransport(local_address="0.0.0.0") + else: + return None + class HTTPHandler: def __init__( @@ -316,8 +390,11 @@ class HTTPHandler: cert = os.getenv("SSL_CERTIFICATE", litellm.ssl_certificate) if client is None: + transport = self._create_sync_transport() + # Create a client with a connection pool self.client = httpx.Client( + transport=transport, timeout=timeout, limits=httpx.Limits( max_connections=concurrent_limit, @@ -381,11 +458,17 @@ class HTTPHandler: llm_provider="litellm-httpx-handler", ) except httpx.HTTPStatusError as e: - setattr(e, "status_code", e.response.status_code) + if stream is True: - setattr(e, "message", e.response.read()) + setattr(e, "message", mask_sensitive_info(e.response.read())) + setattr(e, "text", mask_sensitive_info(e.response.read())) else: - setattr(e, "message", e.response.text) + error_text = mask_sensitive_info(e.response.text) + setattr(e, "message", error_text) + setattr(e, "text", error_text) + + setattr(e, "status_code", e.response.status_code) + raise e except Exception as e: raise e @@ -427,6 +510,18 @@ class HTTPHandler: except Exception: pass + def _create_sync_transport(self) -> Optional[HTTPTransport]: + """ + Create an HTTP transport with IPv4 only if litellm.force_ipv4 is True. + Otherwise, return None. + + Some users have seen httpx ConnectionError when using ipv6 - forcing ipv4 resolves the issue for them + """ + if litellm.force_ipv4: + return HTTPTransport(local_address="0.0.0.0") + else: + return None + def get_async_httpx_client( llm_provider: Union[LlmProviders, httpxSpecialProvider], @@ -447,8 +542,9 @@ def get_async_httpx_client( pass _cache_key_name = "async_httpx_client" + _params_key_name + llm_provider - if _cache_key_name in litellm.in_memory_llm_clients_cache: - return litellm.in_memory_llm_clients_cache[_cache_key_name] + _cached_client = litellm.in_memory_llm_clients_cache.get_cache(_cache_key_name) + if _cached_client: + return _cached_client if params is not None: _new_client = AsyncHTTPHandler(**params) @@ -456,7 +552,11 @@ def get_async_httpx_client( _new_client = AsyncHTTPHandler( timeout=httpx.Timeout(timeout=600.0, connect=5.0) ) - litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client + litellm.in_memory_llm_clients_cache.set_cache( + key=_cache_key_name, + value=_new_client, + ttl=_DEFAULT_TTL_FOR_HTTPX_CLIENTS, + ) return _new_client @@ -476,13 +576,18 @@ def _get_httpx_client(params: Optional[dict] = None) -> HTTPHandler: pass _cache_key_name = "httpx_client" + _params_key_name - if _cache_key_name in litellm.in_memory_llm_clients_cache: - return litellm.in_memory_llm_clients_cache[_cache_key_name] + _cached_client = litellm.in_memory_llm_clients_cache.get_cache(_cache_key_name) + if _cached_client: + return _cached_client if params is not None: _new_client = HTTPHandler(**params) else: _new_client = HTTPHandler(timeout=httpx.Timeout(timeout=600.0, connect=5.0)) - litellm.in_memory_llm_clients_cache[_cache_key_name] = _new_client + litellm.in_memory_llm_clients_cache.set_cache( + key=_cache_key_name, + value=_new_client, + ttl=_DEFAULT_TTL_FOR_HTTPX_CLIENTS, + ) return _new_client diff --git a/litellm/llms/custom_httpx/types.py b/litellm/llms/custom_httpx/types.py deleted file mode 100644 index dc0958118..000000000 --- a/litellm/llms/custom_httpx/types.py +++ /dev/null @@ -1,10 +0,0 @@ -from enum import Enum - -import litellm - - -class 
httpxSpecialProvider(str, Enum): - LoggingCallback = "logging_callback" - GuardrailCallback = "guardrail_callback" - Caching = "caching" - Oauth2Check = "oauth2_check" diff --git a/litellm/llms/databricks/chat.py b/litellm/llms/databricks/chat.py index eb0cb341e..e752f4d98 100644 --- a/litellm/llms/databricks/chat.py +++ b/litellm/llms/databricks/chat.py @@ -393,7 +393,10 @@ class DatabricksChatCompletion(BaseLLM): if timeout is None: timeout = httpx.Timeout(timeout=600.0, connect=5.0) - self.async_handler = AsyncHTTPHandler(timeout=timeout) + self.async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.DATABRICKS, + params={"timeout": timeout}, + ) try: response = await self.async_handler.post( @@ -470,6 +473,9 @@ class DatabricksChatCompletion(BaseLLM): optional_params[k] = v stream: bool = optional_params.get("stream", None) or False + optional_params.pop( + "max_retries", None + ) # [TODO] add max retry support at llm api call level optional_params["stream"] = stream data = { @@ -607,7 +613,10 @@ class DatabricksChatCompletion(BaseLLM): response = None try: if client is None or isinstance(client, AsyncHTTPHandler): - self.async_client = AsyncHTTPHandler(timeout=timeout) # type: ignore + self.async_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.DATABRICKS, + params={"timeout": timeout}, + ) else: self.async_client = client diff --git a/litellm/llms/databricks/streaming_utils.py b/litellm/llms/databricks/streaming_utils.py index a87ab39bb..502f4a091 100644 --- a/litellm/llms/databricks/streaming_utils.py +++ b/litellm/llms/databricks/streaming_utils.py @@ -1,5 +1,5 @@ import json -from typing import Optional +from typing import List, Optional import litellm from litellm import verbose_logger @@ -10,7 +10,7 @@ from litellm.types.llms.openai import ( ChatCompletionToolCallFunctionChunk, ChatCompletionUsageBlock, ) -from litellm.types.utils import GenericStreamingChunk +from litellm.types.utils import GenericStreamingChunk, ModelResponse, Usage class ModelResponseIterator: diff --git a/litellm/llms/deepseek/chat/transformation.py b/litellm/llms/deepseek/chat/transformation.py new file mode 100644 index 000000000..5785bdd50 --- /dev/null +++ b/litellm/llms/deepseek/chat/transformation.py @@ -0,0 +1,41 @@ +""" +Translates from OpenAI's `/v1/chat/completions` to DeepSeek's `/v1/chat/completions` +""" + +import types +from typing import List, Optional, Tuple, Union + +from pydantic import BaseModel + +import litellm +from litellm.secret_managers.main import get_secret_str +from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage + +from ....utils import _remove_additional_properties, _remove_strict_from_schema +from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig +from ...prompt_templates.common_utils import ( + handle_messages_with_content_list_to_str_conversion, +) + + +class DeepSeekChatConfig(OpenAIGPTConfig): + + def _transform_messages( + self, messages: List[AllMessageValues] + ) -> List[AllMessageValues]: + """ + DeepSeek does not support content in list format. 
+ """ + messages = handle_messages_with_content_list_to_str_conversion(messages) + return super()._transform_messages(messages) + + def _get_openai_compatible_provider_info( + self, api_base: Optional[str], api_key: Optional[str] + ) -> Tuple[Optional[str], Optional[str]]: + api_base = ( + api_base + or get_secret_str("DEEPSEEK_API_BASE") + or "https://api.deepseek.com/beta" + ) # type: ignore + dynamic_api_key = api_key or get_secret_str("DEEPSEEK_API_KEY") + return api_base, dynamic_api_key diff --git a/litellm/llms/fine_tuning_apis/vertex_ai.py b/litellm/llms/fine_tuning_apis/vertex_ai.py index 11d052191..fd418103e 100644 --- a/litellm/llms/fine_tuning_apis/vertex_ai.py +++ b/litellm/llms/fine_tuning_apis/vertex_ai.py @@ -5,9 +5,14 @@ from typing import Any, Coroutine, Literal, Optional, Union import httpx from openai.types.fine_tuning.fine_tuning_job import FineTuningJob, Hyperparameters +import litellm from litellm._logging import verbose_logger from litellm.llms.base import BaseLLM -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( VertexLLM, ) @@ -26,8 +31,9 @@ class VertexFineTuningAPI(VertexLLM): def __init__(self) -> None: super().__init__() - self.async_handler = AsyncHTTPHandler( - timeout=httpx.Timeout(timeout=600.0, connect=5.0) + self.async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.VERTEX_AI, + params={"timeout": 600.0}, ) def convert_response_created_at(self, response: ResponseTuningJob): diff --git a/litellm/llms/groq/chat/handler.py b/litellm/llms/groq/chat/handler.py index f4a16abc8..1fe87844c 100644 --- a/litellm/llms/groq/chat/handler.py +++ b/litellm/llms/groq/chat/handler.py @@ -6,55 +6,68 @@ from typing import Any, Callable, Optional, Union from httpx._config import Timeout +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.types.utils import CustomStreamingDecoder from litellm.utils import ModelResponse from ...groq.chat.transformation import GroqChatConfig -from ...OpenAI.openai import OpenAIChatCompletion +from ...openai_like.chat.handler import OpenAILikeChatHandler -class GroqChatCompletion(OpenAIChatCompletion): +class GroqChatCompletion(OpenAILikeChatHandler): def __init__(self, **kwargs): super().__init__(**kwargs) def completion( self, + *, + model: str, + messages: list, + api_base: str, + custom_llm_provider: str, + custom_prompt_dict: dict, model_response: ModelResponse, - timeout: Union[float, Timeout], + print_verbose: Callable, + encoding, + api_key: Optional[str], + logging_obj, optional_params: dict, - logging_obj: Any, - model: Optional[str] = None, - messages: Optional[list] = None, - print_verbose: Optional[Callable[..., Any]] = None, - api_key: Optional[str] = None, - api_base: Optional[str] = None, - acompletion: bool = False, + acompletion=None, litellm_params=None, logger_fn=None, headers: Optional[dict] = None, - custom_prompt_dict: dict = {}, - client=None, - organization: Optional[str] = None, - custom_llm_provider: Optional[str] = None, - drop_params: Optional[bool] = None, + timeout: Optional[Union[float, Timeout]] = None, + client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, + custom_endpoint: Optional[bool] = None, + streaming_decoder: Optional[CustomStreamingDecoder] = None, + fake_stream: bool = False ): 
messages = GroqChatConfig()._transform_messages(messages) # type: ignore + + if optional_params.get("stream") is True: + fake_stream = GroqChatConfig()._should_fake_stream(optional_params) + else: + fake_stream = False + return super().completion( - model_response, - timeout, - optional_params, - logging_obj, - model, - messages, - print_verbose, - api_key, - api_base, - acompletion, - litellm_params, - logger_fn, - headers, - custom_prompt_dict, - client, - organization, - custom_llm_provider, - drop_params, + model=model, + messages=messages, + api_base=api_base, + custom_llm_provider=custom_llm_provider, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + encoding=encoding, + api_key=api_key, + logging_obj=logging_obj, + optional_params=optional_params, + acompletion=acompletion, + litellm_params=litellm_params, + logger_fn=logger_fn, + headers=headers, + timeout=timeout, + client=client, + custom_endpoint=custom_endpoint, + streaming_decoder=streaming_decoder, + fake_stream=fake_stream, ) diff --git a/litellm/llms/groq/chat/transformation.py b/litellm/llms/groq/chat/transformation.py index 4baba7657..dddc56a2c 100644 --- a/litellm/llms/groq/chat/transformation.py +++ b/litellm/llms/groq/chat/transformation.py @@ -2,6 +2,7 @@ Translate from OpenAI's `/v1/chat/completions` to Groq's `/v1/chat/completions` """ +import json import types from typing import List, Optional, Tuple, Union @@ -9,7 +10,12 @@ from pydantic import BaseModel import litellm from litellm.secret_managers.main import get_secret_str -from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage +from litellm.types.llms.openai import ( + AllMessageValues, + ChatCompletionAssistantMessage, + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, +) from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig @@ -99,3 +105,69 @@ class GroqChatConfig(OpenAIGPTConfig): ) # type: ignore dynamic_api_key = api_key or get_secret_str("GROQ_API_KEY") return api_base, dynamic_api_key + + def _should_fake_stream(self, optional_params: dict) -> bool: + """ + Groq doesn't support 'response_format' while streaming + """ + if optional_params.get("response_format") is not None: + return True + + return False + + def _create_json_tool_call_for_response_format( + self, + json_schema: dict, + ): + """ + Handles creating a tool call for getting responses in JSON format. 
+ + Args: + json_schema (Optional[dict]): The JSON schema the response should be in + + Returns: + AnthropicMessagesTool: The tool call to send to Anthropic API to get responses in JSON format + """ + return ChatCompletionToolParam( + type="function", + function=ChatCompletionToolParamFunctionChunk( + name="json_tool_call", + parameters=json_schema, + ), + ) + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool = False, + ) -> dict: + _response_format = non_default_params.get("response_format") + if _response_format is not None and isinstance(_response_format, dict): + json_schema: Optional[dict] = None + if "response_schema" in _response_format: + json_schema = _response_format["response_schema"] + elif "json_schema" in _response_format: + json_schema = _response_format["json_schema"]["schema"] + """ + When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode + - You usually want to provide a single tool + - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool + - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. + """ + if json_schema is not None: + _tool_choice = { + "type": "function", + "function": {"name": "json_tool_call"}, + } + _tool = self._create_json_tool_call_for_response_format( + json_schema=json_schema, + ) + optional_params["tools"] = [_tool] + optional_params["tool_choice"] = _tool_choice + optional_params["json_mode"] = True + non_default_params.pop("response_format", None) + return super().map_openai_params( + non_default_params, optional_params, model, drop_params + ) diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py index 907d72a60..8b45f1ae7 100644 --- a/litellm/llms/huggingface_restapi.py +++ b/litellm/llms/huggingface_restapi.py @@ -263,7 +263,11 @@ def get_hf_task_for_model(model: str) -> Tuple[hf_tasks, str]: return "text-generation-inference", model # default to tgi -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) def get_hf_task_embedding_for_model( @@ -301,7 +305,9 @@ async def async_get_hf_task_embedding_for_model( task_type, hf_tasks_embeddings ) ) - http_client = AsyncHTTPHandler(concurrent_limit=1) + http_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.HUGGINGFACE, + ) model_info = await http_client.get(url=api_base) @@ -1067,7 +1073,9 @@ class Huggingface(BaseLLM): ) ## COMPLETION CALL if client is None: - client = AsyncHTTPHandler(concurrent_limit=1) + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.HUGGINGFACE, + ) response = await client.post(api_base, headers=headers, data=json.dumps(data)) diff --git a/litellm/llms/jina_ai/embedding/transformation.py b/litellm/llms/jina_ai/embedding/transformation.py index 26ff58878..97b7b2cfa 100644 --- a/litellm/llms/jina_ai/embedding/transformation.py +++ b/litellm/llms/jina_ai/embedding/transformation.py @@ -76,4 +76,4 @@ class JinaAIEmbeddingConfig: or get_secret_str("JINA_AI_API_KEY") or get_secret_str("JINA_AI_TOKEN") ) - return LlmProviders.OPENAI_LIKE.value, api_base, dynamic_api_key + return LlmProviders.JINA_AI.value, api_base, dynamic_api_key diff --git a/litellm/llms/jina_ai/rerank/handler.py b/litellm/llms/jina_ai/rerank/handler.py 
new file mode 100644 index 000000000..a2cfdd49e --- /dev/null +++ b/litellm/llms/jina_ai/rerank/handler.py @@ -0,0 +1,96 @@ +""" +Re rank api + +LiteLLM supports the re rank API format, no paramter transformation occurs +""" + +import uuid +from typing import Any, Dict, List, Optional, Union + +import httpx +from pydantic import BaseModel + +import litellm +from litellm.llms.base import BaseLLM +from litellm.llms.custom_httpx.http_handler import ( + _get_httpx_client, + get_async_httpx_client, +) +from litellm.llms.jina_ai.rerank.transformation import JinaAIRerankConfig +from litellm.types.rerank import RerankRequest, RerankResponse + + +class JinaAIRerank(BaseLLM): + def rerank( + self, + model: str, + api_key: str, + query: str, + documents: List[Union[str, Dict[str, Any]]], + top_n: Optional[int] = None, + rank_fields: Optional[List[str]] = None, + return_documents: Optional[bool] = True, + max_chunks_per_doc: Optional[int] = None, + _is_async: Optional[bool] = False, + ) -> RerankResponse: + client = _get_httpx_client() + + request_data = RerankRequest( + model=model, + query=query, + top_n=top_n, + documents=documents, + rank_fields=rank_fields, + return_documents=return_documents, + ) + + # exclude None values from request_data + request_data_dict = request_data.dict(exclude_none=True) + + if _is_async: + return self.async_rerank(request_data_dict, api_key) # type: ignore # Call async method + + response = client.post( + "https://api.jina.ai/v1/rerank", + headers={ + "accept": "application/json", + "content-type": "application/json", + "authorization": f"Bearer {api_key}", + }, + json=request_data_dict, + ) + + if response.status_code != 200: + raise Exception(response.text) + + _json_response = response.json() + + return JinaAIRerankConfig()._transform_response(_json_response) + + async def async_rerank( # New async method + self, + request_data_dict: Dict[str, Any], + api_key: str, + ) -> RerankResponse: + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.JINA_AI + ) # Use async client + + response = await client.post( + "https://api.jina.ai/v1/rerank", + headers={ + "accept": "application/json", + "content-type": "application/json", + "authorization": f"Bearer {api_key}", + }, + json=request_data_dict, + ) + + if response.status_code != 200: + raise Exception(response.text) + + _json_response = response.json() + + return JinaAIRerankConfig()._transform_response(_json_response) + + pass diff --git a/litellm/llms/jina_ai/rerank/transformation.py b/litellm/llms/jina_ai/rerank/transformation.py new file mode 100644 index 000000000..82039a15b --- /dev/null +++ b/litellm/llms/jina_ai/rerank/transformation.py @@ -0,0 +1,36 @@ +""" +Transformation logic from Cohere's /v1/rerank format to Jina AI's `/v1/rerank` format. + +Why separate file? 
Make it easy to see how transformation works + +Docs - https://jina.ai/reranker +""" + +import uuid +from typing import List, Optional + +from litellm.types.rerank import ( + RerankBilledUnits, + RerankResponse, + RerankResponseMeta, + RerankTokens, +) + + +class JinaAIRerankConfig: + def _transform_response(self, response: dict) -> RerankResponse: + + _billed_units = RerankBilledUnits(**response.get("usage", {})) + _tokens = RerankTokens(**response.get("usage", {})) + rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens) + + _results: Optional[List[dict]] = response.get("results") + + if _results is None: + raise ValueError(f"No results found in the response={response}") + + return RerankResponse( + id=response.get("id") or str(uuid.uuid4()), + results=_results, + meta=rerank_meta, + ) # Return response diff --git a/litellm/llms/lm_studio/embed/transformation.py b/litellm/llms/lm_studio/embed/transformation.py new file mode 100644 index 000000000..17b2173a7 --- /dev/null +++ b/litellm/llms/lm_studio/embed/transformation.py @@ -0,0 +1,54 @@ +""" +Transformation logic from OpenAI /v1/embeddings format to LM Studio's `/v1/embeddings` format. + +Why separate file? Make it easy to see how transformation works + +Docs - https://lmstudio.ai/docs/basics/server +""" + +import types +from typing import List, Optional, Tuple + +from litellm import LlmProviders +from litellm.secret_managers.main import get_secret_str +from litellm.types.utils import Embedding, EmbeddingResponse, Usage + + +class LmStudioEmbeddingConfig: + """ + Reference: https://lmstudio.ai/docs/basics/server + """ + + def __init__( + self, + ) -> None: + locals_ = locals() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + def get_supported_openai_params(self) -> List[str]: + return [] + + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + return optional_params diff --git a/litellm/llms/mistral/mistral_chat_transformation.py b/litellm/llms/mistral/mistral_chat_transformation.py index 5d1a54c3a..aeb1a90fd 100644 --- a/litellm/llms/mistral/mistral_chat_transformation.py +++ b/litellm/llms/mistral/mistral_chat_transformation.py @@ -10,6 +10,7 @@ import types from typing import List, Literal, Optional, Tuple, Union from litellm.secret_managers.main import get_secret_str +from litellm.types.llms.openai import AllMessageValues class MistralConfig: @@ -148,3 +149,59 @@ class MistralConfig: or get_secret_str("MISTRAL_API_KEY") ) return api_base, dynamic_api_key + + @classmethod + def _transform_messages(cls, messages: List[AllMessageValues]): + """ + - handles scenario where content is list and not string + - content list is just text, and no images + - if image passed in, then just return as is (user-intended) + - if `name` is passed, then drop it for mistral API: https://github.com/BerriAI/litellm/issues/6696 + + Motivation: mistral api doesn't support content as a list + """ + new_messages = [] + for m in messages: + special_keys = ["role", "content", "tool_calls", "function_call"] + extra_args = {} + if isinstance(m, dict): + for k, v in m.items(): + if k not in special_keys: + extra_args[k] = v + texts = "" + _content = m.get("content") + 
if _content is not None and isinstance(_content, list): + for c in _content: + _text: Optional[str] = c.get("text") + if c["type"] == "image_url": + return messages + elif c["type"] == "text" and isinstance(_text, str): + texts += _text + elif _content is not None and isinstance(_content, str): + texts = _content + + new_m = {"role": m["role"], "content": texts, **extra_args} + + if m.get("tool_calls"): + new_m["tool_calls"] = m.get("tool_calls") + + new_m = cls._handle_name_in_message(new_m) + + new_messages.append(new_m) + return new_messages + + @classmethod + def _handle_name_in_message(cls, message: dict) -> dict: + """ + Mistral API only supports `name` in tool messages + + If role == tool, then we keep `name` + Otherwise, we drop `name` + """ + if message.get("name") is not None: + if message["role"] == "tool": + message["name"] = message.get("name") + else: + message.pop("name", None) + + return message diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 845d0e2dd..e9dd2b53f 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -14,6 +14,7 @@ import requests # type: ignore import litellm from litellm import verbose_logger +from litellm.llms.custom_httpx.http_handler import get_async_httpx_client from litellm.secret_managers.main import get_secret_str from litellm.types.utils import ModelInfo, ProviderField, StreamingChoices @@ -164,6 +165,30 @@ class OllamaConfig: "response_format", ] + def map_openai_params( + self, optional_params: dict, non_default_params: dict + ) -> dict: + for param, value in non_default_params.items(): + if param == "max_tokens": + optional_params["num_predict"] = value + if param == "stream": + optional_params["stream"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "seed": + optional_params["seed"] = value + if param == "top_p": + optional_params["top_p"] = value + if param == "frequency_penalty": + optional_params["repeat_penalty"] = value + if param == "stop": + optional_params["stop"] = value + if param == "response_format" and isinstance(value, dict): + if value["type"] == "json_object": + optional_params["format"] = "json" + + return optional_params + def _supports_function_calling(self, ollama_model_info: dict) -> bool: """ Check if the 'template' field in the ollama_model_info contains a 'tools' or 'function' key. 
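For reference, a standalone sketch of the `OllamaConfig.map_openai_params` mapping added above: OpenAI-style knobs are renamed to Ollama's option names (`max_tokens` -> `num_predict`, `frequency_penalty` -> `repeat_penalty`, `response_format={"type": "json_object"}` -> `format="json"`). Plain dicts only, so it runs without litellm; treat it as an illustration of the mapping, not the shipped code.

```python
# Illustrative re-implementation of the param mapping shown in the diff above.
def map_openai_to_ollama(non_default_params: dict, optional_params: dict) -> dict:
    passthrough = {"stream", "temperature", "seed", "top_p", "stop"}
    for param, value in non_default_params.items():
        if param == "max_tokens":
            optional_params["num_predict"] = value      # token budget
        elif param == "frequency_penalty":
            optional_params["repeat_penalty"] = value   # closest Ollama equivalent
        elif param == "response_format" and isinstance(value, dict):
            if value.get("type") == "json_object":
                optional_params["format"] = "json"      # Ollama's JSON mode flag
        elif param in passthrough:
            optional_params[param] = value              # same name on both sides
    return optional_params


print(map_openai_to_ollama(
    {"max_tokens": 128, "frequency_penalty": 1.1, "response_format": {"type": "json_object"}},
    {},
))
# -> {'num_predict': 128, 'repeat_penalty': 1.1, 'format': 'json'}
```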
@@ -185,6 +210,8 @@ class OllamaConfig: "name": "mistral" }' """ + if model.startswith("ollama/") or model.startswith("ollama_chat/"): + model = model.split("/", 1)[1] api_base = get_secret_str("OLLAMA_API_BASE") or "http://localhost:11434" try: @@ -430,7 +457,10 @@ def ollama_completion_stream(url, data, logging_obj): async def ollama_async_streaming(url, data, model_response, encoding, logging_obj): try: - client = httpx.AsyncClient() + _async_http_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.OLLAMA + ) + client = _async_http_client.client async with client.stream( url=f"{url}", json=data, method="POST", timeout=litellm.request_timeout ) as response: diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 536f766e0..ce0df139d 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -13,6 +13,7 @@ from pydantic import BaseModel import litellm from litellm import verbose_logger +from litellm.llms.custom_httpx.http_handler import get_async_httpx_client from litellm.types.llms.ollama import OllamaToolCall, OllamaToolCallFunction from litellm.types.llms.openai import ChatCompletionAssistantToolCall from litellm.types.utils import StreamingChoices @@ -445,7 +446,10 @@ async def ollama_async_streaming( url, api_key, data, model_response, encoding, logging_obj ): try: - client = httpx.AsyncClient() + _async_http_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.OLLAMA + ) + client = _async_http_client.client _request = { "url": f"{url}", "json": data, diff --git a/litellm/llms/openai_like/chat/handler.py b/litellm/llms/openai_like/chat/handler.py index 0dbc3a978..baa970304 100644 --- a/litellm/llms/openai_like/chat/handler.py +++ b/litellm/llms/openai_like/chat/handler.py @@ -17,7 +17,9 @@ import httpx # type: ignore import requests # type: ignore import litellm +from litellm import LlmProviders from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, HTTPHandler, @@ -25,9 +27,19 @@ from litellm.llms.custom_httpx.http_handler import ( ) from litellm.llms.databricks.streaming_utils import ModelResponseIterator from litellm.types.utils import CustomStreamingDecoder, ModelResponse -from litellm.utils import CustomStreamWrapper, EmbeddingResponse +from litellm.utils import ( + Choices, + CustomStreamWrapper, + EmbeddingResponse, + Message, + ProviderConfigManager, + TextCompletionResponse, + Usage, + convert_to_model_response_object, +) from ..common_utils import OpenAILikeBase, OpenAILikeError +from .transformation import OpenAILikeChatConfig async def make_call( @@ -39,16 +51,22 @@ async def make_call( messages: list, logging_obj, streaming_decoder: Optional[CustomStreamingDecoder] = None, + fake_stream: bool = False, ): if client is None: client = litellm.module_level_aclient - response = await client.post(api_base, headers=headers, data=data, stream=True) + response = await client.post( + api_base, headers=headers, data=data, stream=not fake_stream + ) if streaming_decoder is not None: completion_stream: Any = streaming_decoder.aiter_bytes( response.aiter_bytes(chunk_size=1024) ) + elif fake_stream: + model_response = ModelResponse(**response.json()) + completion_stream = MockResponseIterator(model_response=model_response) else: completion_stream = ModelResponseIterator( streaming_response=response.aiter_lines(), sync_stream=False @@ -73,11 +91,12 @@ def 
make_sync_call( messages: list, logging_obj, streaming_decoder: Optional[CustomStreamingDecoder] = None, + fake_stream: bool = False, ): if client is None: client = litellm.module_level_client # Create a new client if none provided - response = client.post(api_base, headers=headers, data=data, stream=True) + response = client.post(api_base, headers=headers, data=data, stream=not fake_stream) if response.status_code != 200: raise OpenAILikeError(status_code=response.status_code, message=response.read()) @@ -86,6 +105,9 @@ def make_sync_call( completion_stream = streaming_decoder.iter_bytes( response.iter_bytes(chunk_size=1024) ) + elif fake_stream: + model_response = ModelResponse(**response.json()) + completion_stream = MockResponseIterator(model_response=model_response) else: completion_stream = ModelResponseIterator( streaming_response=response.iter_lines(), sync_stream=True @@ -126,8 +148,8 @@ class OpenAILikeChatHandler(OpenAILikeBase): headers={}, client: Optional[AsyncHTTPHandler] = None, streaming_decoder: Optional[CustomStreamingDecoder] = None, + fake_stream: bool = False, ) -> CustomStreamWrapper: - data["stream"] = True completion_stream = await make_call( client=client, @@ -169,6 +191,7 @@ class OpenAILikeChatHandler(OpenAILikeBase): logger_fn=None, headers={}, timeout: Optional[Union[float, httpx.Timeout]] = None, + json_mode: bool = False, ) -> ModelResponse: if timeout is None: timeout = httpx.Timeout(timeout=600.0, connect=5.0) @@ -181,8 +204,6 @@ class OpenAILikeChatHandler(OpenAILikeBase): api_base, headers=headers, data=json.dumps(data), timeout=timeout ) response.raise_for_status() - - response_json = response.json() except httpx.HTTPStatusError as e: raise OpenAILikeError( status_code=e.response.status_code, @@ -193,22 +214,26 @@ class OpenAILikeChatHandler(OpenAILikeBase): except Exception as e: raise OpenAILikeError(status_code=500, message=str(e)) - logging_obj.post_call( - input=messages, - api_key="", - original_response=response_json, - additional_args={"complete_input_dict": data}, + return OpenAILikeChatConfig._transform_response( + model=model, + response=response, + model_response=model_response, + stream=stream, + logging_obj=logging_obj, + optional_params=optional_params, + api_key=api_key, + data=data, + messages=messages, + print_verbose=print_verbose, + encoding=encoding, + json_mode=json_mode, + custom_llm_provider=custom_llm_provider, + base_model=base_model, ) - response = ModelResponse(**response_json) - - response.model = custom_llm_provider + "/" + (response.model or "") - - if base_model is not None: - response._hidden_params["model"] = base_model - return response def completion( self, + *, model: str, messages: list, api_base: str, @@ -230,6 +255,7 @@ class OpenAILikeChatHandler(OpenAILikeBase): streaming_decoder: Optional[ CustomStreamingDecoder ] = None, # if openai-compatible api needs custom stream decoder - e.g. 
sagemaker + fake_stream: bool = False, ): custom_endpoint = custom_endpoint or optional_params.pop( "custom_endpoint", None @@ -243,13 +269,24 @@ class OpenAILikeChatHandler(OpenAILikeBase): headers=headers, ) - stream: bool = optional_params.get("stream", None) or False - optional_params["stream"] = stream + stream: bool = optional_params.pop("stream", None) or False + extra_body = optional_params.pop("extra_body", {}) + json_mode = optional_params.pop("json_mode", None) + optional_params.pop("max_retries", None) + if not fake_stream: + optional_params["stream"] = stream + + if messages is not None and custom_llm_provider is not None: + provider_config = ProviderConfigManager.get_provider_config( + model=model, provider=LlmProviders(custom_llm_provider) + ) + messages = provider_config._transform_messages(messages) data = { "model": model, "messages": messages, **optional_params, + **extra_body, } ## LOGGING @@ -288,6 +325,7 @@ class OpenAILikeChatHandler(OpenAILikeBase): client=client, custom_llm_provider=custom_llm_provider, streaming_decoder=streaming_decoder, + fake_stream=fake_stream, ) else: return self.acompletion_function( @@ -327,6 +365,7 @@ class OpenAILikeChatHandler(OpenAILikeBase): messages=messages, logging_obj=logging_obj, streaming_decoder=streaming_decoder, + fake_stream=fake_stream, ) # completion_stream.__iter__() return CustomStreamWrapper( @@ -344,7 +383,6 @@ class OpenAILikeChatHandler(OpenAILikeBase): ) response.raise_for_status() - response_json = response.json() except httpx.HTTPStatusError as e: raise OpenAILikeError( status_code=e.response.status_code, @@ -356,17 +394,19 @@ class OpenAILikeChatHandler(OpenAILikeBase): ) except Exception as e: raise OpenAILikeError(status_code=500, message=str(e)) - logging_obj.post_call( - input=messages, - api_key="", - original_response=response_json, - additional_args={"complete_input_dict": data}, + return OpenAILikeChatConfig._transform_response( + model=model, + response=response, + model_response=model_response, + stream=stream, + logging_obj=logging_obj, + optional_params=optional_params, + api_key=api_key, + data=data, + messages=messages, + print_verbose=print_verbose, + encoding=encoding, + json_mode=json_mode, + custom_llm_provider=custom_llm_provider, + base_model=base_model, ) - response = ModelResponse(**response_json) - - response.model = custom_llm_provider + "/" + (response.model or "") - - if base_model is not None: - response._hidden_params["model"] = base_model - - return response diff --git a/litellm/llms/openai_like/chat/transformation.py b/litellm/llms/openai_like/chat/transformation.py new file mode 100644 index 000000000..c355cf330 --- /dev/null +++ b/litellm/llms/openai_like/chat/transformation.py @@ -0,0 +1,98 @@ +""" +OpenAI-like chat completion transformation +""" + +import types +from typing import List, Optional, Tuple, Union + +import httpx +from pydantic import BaseModel + +import litellm +from litellm.secret_managers.main import get_secret_str +from litellm.types.llms.openai import AllMessageValues, ChatCompletionAssistantMessage +from litellm.types.utils import ModelResponse + +from ....utils import _remove_additional_properties, _remove_strict_from_schema +from ...OpenAI.chat.gpt_transformation import OpenAIGPTConfig + + +class OpenAILikeChatConfig(OpenAIGPTConfig): + def _get_openai_compatible_provider_info( + self, api_base: Optional[str], api_key: Optional[str] + ) -> Tuple[Optional[str], Optional[str]]: + api_base = api_base or get_secret_str("OPENAI_LIKE_API_BASE") # type: ignore + 
dynamic_api_key = ( + api_key or get_secret_str("OPENAI_LIKE_API_KEY") or "" + ) # vllm does not require an api key + return api_base, dynamic_api_key + + @staticmethod + def _convert_tool_response_to_message( + message: ChatCompletionAssistantMessage, json_mode: bool + ) -> ChatCompletionAssistantMessage: + """ + if json_mode is true, convert the returned tool call response to a content with json str + + e.g. input: + + {"role": "assistant", "tool_calls": [{"id": "call_5ms4", "type": "function", "function": {"name": "json_tool_call", "arguments": "{\"key\": \"question\", \"value\": \"What is the capital of France?\"}"}}]} + + output: + + {"role": "assistant", "content": "{\"key\": \"question\", \"value\": \"What is the capital of France?\"}"} + """ + if not json_mode: + return message + + _tool_calls = message.get("tool_calls") + + if _tool_calls is None or len(_tool_calls) != 1: + return message + + message["content"] = _tool_calls[0]["function"].get("arguments") or "" + message["tool_calls"] = None + + return message + + @staticmethod + def _transform_response( + model: str, + response: httpx.Response, + model_response: ModelResponse, + stream: bool, + logging_obj: litellm.litellm_core_utils.litellm_logging.Logging, # type: ignore + optional_params: dict, + api_key: Optional[str], + data: Union[dict, str], + messages: List, + print_verbose, + encoding, + json_mode: bool, + custom_llm_provider: str, + base_model: Optional[str], + ) -> ModelResponse: + response_json = response.json() + logging_obj.post_call( + input=messages, + api_key="", + original_response=response_json, + additional_args={"complete_input_dict": data}, + ) + + if json_mode: + for choice in response_json["choices"]: + message = OpenAILikeChatConfig._convert_tool_response_to_message( + choice.get("message"), json_mode + ) + choice["message"] = message + + returned_response = ModelResponse(**response_json) + + returned_response.model = ( + custom_llm_provider + "/" + (returned_response.model or "") + ) + + if base_model is not None: + returned_response._hidden_params["model"] = base_model + return returned_response diff --git a/litellm/llms/openai_like/embedding/handler.py b/litellm/llms/openai_like/embedding/handler.py index ce0860724..e786b5db8 100644 --- a/litellm/llms/openai_like/embedding/handler.py +++ b/litellm/llms/openai_like/embedding/handler.py @@ -45,7 +45,10 @@ class OpenAILikeEmbeddingHandler(OpenAILikeBase): response = None try: if client is None or isinstance(client, AsyncHTTPHandler): - self.async_client = AsyncHTTPHandler(timeout=timeout) # type: ignore + self.async_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.OPENAI, + params={"timeout": timeout}, + ) else: self.async_client = client @@ -62,7 +65,7 @@ class OpenAILikeEmbeddingHandler(OpenAILikeBase): except httpx.HTTPStatusError as e: raise OpenAILikeError( status_code=e.response.status_code, - message=response.text if response else str(e), + message=e.response.text if e.response else str(e), ) except httpx.TimeoutException: raise OpenAILikeError( diff --git a/litellm/llms/predibase.py b/litellm/llms/predibase.py index 96796f9dc..e80964551 100644 --- a/litellm/llms/predibase.py +++ b/litellm/llms/predibase.py @@ -19,7 +19,10 @@ import litellm.litellm_core_utils import litellm.litellm_core_utils.litellm_logging from litellm import verbose_logger from litellm.litellm_core_utils.core_helpers import map_finish_reason -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.llms.custom_httpx.http_handler 
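A small sketch of the `json_mode` post-processing in `OpenAILikeChatConfig._convert_tool_response_to_message` above: when the forced `json_tool_call` pattern was used, the single returned tool call is folded back into `content` so the caller receives a plain JSON string. Plain dicts stand in for litellm's typed message objects; this is illustrative, not the shipped method.

```python
# Illustrative version of the tool-call -> content collapse used for json_mode responses.
def tool_response_to_content(message: dict, json_mode: bool) -> dict:
    if not json_mode:
        return message
    tool_calls = message.get("tool_calls")
    if tool_calls is None or len(tool_calls) != 1:
        return message  # only collapse the forced single-tool case
    message["content"] = tool_calls[0]["function"].get("arguments") or ""
    message["tool_calls"] = None
    return message


raw = {
    "role": "assistant",
    "tool_calls": [
        {
            "id": "call_5ms4",
            "type": "function",
            "function": {"name": "json_tool_call", "arguments": '{"answer": "Paris"}'},
        }
    ],
}
print(tool_response_to_content(raw, json_mode=True)["content"])  # {"answer": "Paris"}
```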
import ( + AsyncHTTPHandler, + get_async_httpx_client, +) from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage from .base import BaseLLM @@ -549,7 +552,10 @@ class PredibaseChatCompletion(BaseLLM): headers={}, ) -> ModelResponse: - async_handler = AsyncHTTPHandler(timeout=httpx.Timeout(timeout=timeout)) + async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.PREDIBASE, + params={"timeout": timeout}, + ) try: response = await async_handler.post( api_base, headers=headers, data=json.dumps(data) diff --git a/litellm/llms/prompt_templates/common_utils.py b/litellm/llms/prompt_templates/common_utils.py index 6b4971269..24cb7b451 100644 --- a/litellm/llms/prompt_templates/common_utils.py +++ b/litellm/llms/prompt_templates/common_utils.py @@ -24,11 +24,23 @@ DEFAULT_ASSISTANT_CONTINUE_MESSAGE = ChatCompletionAssistantMessage( ) +def handle_messages_with_content_list_to_str_conversion( + messages: List[AllMessageValues], +) -> List[AllMessageValues]: + """ + Handles messages with content list conversion + """ + for message in messages: + texts = convert_content_list_to_str(message=message) + if texts: + message["content"] = texts + return messages + + def convert_content_list_to_str(message: AllMessageValues) -> str: """ - handles scenario where content is list and not string - content list is just text, and no images - - if image passed in, then just return as is (user-intended) Motivation: mistral api + azure ai don't support content as a list """ @@ -46,6 +58,19 @@ def convert_content_list_to_str(message: AllMessageValues) -> str: return texts +def _audio_or_image_in_message_content(message: AllMessageValues) -> bool: + """ + Checks if message content contains an image or audio + """ + message_content = message.get("content") + if message_content: + if message_content is not None and isinstance(message_content, list): + for c in message_content: + if c.get("type") == "image_url" or c.get("type") == "input_audio": + return True + return False + + def convert_openai_message_to_only_content_messages( messages: List[AllMessageValues], ) -> List[Dict[str, str]]: diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index aee304760..bfd35ca47 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -33,6 +33,7 @@ from litellm.types.llms.openai import ( ChatCompletionAssistantToolCall, ChatCompletionFunctionMessage, ChatCompletionImageObject, + ChatCompletionImageUrlObject, ChatCompletionTextObject, ChatCompletionToolCallFunctionChunk, ChatCompletionToolMessage, @@ -259,43 +260,6 @@ def mistral_instruct_pt(messages): return prompt -def mistral_api_pt(messages): - """ - - handles scenario where content is list and not string - - content list is just text, and no images - - if image passed in, then just return as is (user-intended) - - Motivation: mistral api doesn't support content as a list - """ - new_messages = [] - for m in messages: - special_keys = ["role", "content", "tool_calls", "function_call"] - extra_args = {} - if isinstance(m, dict): - for k, v in m.items(): - if k not in special_keys: - extra_args[k] = v - texts = "" - if m.get("content", None) is not None and isinstance(m["content"], list): - for c in m["content"]: - if c["type"] == "image_url": - return messages - elif c["type"] == "text" and isinstance(c["text"], str): - texts += c["text"] - elif m.get("content", None) is not None and isinstance(m["content"], str): - texts = m["content"] - - 
new_m = {"role": m["role"], "content": texts, **extra_args} - - if new_m["role"] == "tool" and m.get("name"): - new_m["name"] = m["name"] - if m.get("tool_calls"): - new_m["tool_calls"] = m["tool_calls"] - - new_messages.append(new_m) - return new_messages - - # Falcon prompt template - from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py#L110 def falcon_instruct_pt(messages): prompt = "" @@ -718,6 +682,27 @@ def construct_tool_use_system_prompt( return tool_use_system_prompt +def convert_generic_image_chunk_to_openai_image_obj( + image_chunk: GenericImageParsingChunk, +) -> str: + """ + Convert a generic image chunk to an OpenAI image object. + + Input: + GenericImageParsingChunk( + type="base64", + media_type="image/jpeg", + data="...", + ) + + Return: + "data:image/jpeg;base64,{base64_image}" + """ + return "data:{};{},{}".format( + image_chunk["media_type"], image_chunk["type"], image_chunk["data"] + ) + + def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsingChunk: """ Input: @@ -743,6 +728,7 @@ def convert_to_anthropic_image_obj(openai_image_url: str) -> GenericImageParsing data=base64_data, ) except Exception as e: + traceback.print_exc() if "Error: Unable to fetch image from URL" in str(e): raise e raise Exception( @@ -980,17 +966,10 @@ def _gemini_tool_call_invoke_helper( name = function_call_params.get("name", "") or "" arguments = function_call_params.get("arguments", "") arguments_dict = json.loads(arguments) - function_call: Optional[litellm.types.llms.vertex_ai.FunctionCall] = None - for k, v in arguments_dict.items(): - inferred_protocol_value = infer_protocol_value(value=v) - _field = litellm.types.llms.vertex_ai.Field( - key=k, value={inferred_protocol_value: v} - ) - _fields = litellm.types.llms.vertex_ai.FunctionCallArgs(fields=_field) - function_call = litellm.types.llms.vertex_ai.FunctionCall( - name=name, - args=_fields, - ) + function_call = litellm.types.llms.vertex_ai.FunctionCall( + name=name, + args=arguments_dict, + ) return function_call @@ -1015,54 +994,26 @@ def convert_to_gemini_tool_call_invoke( }, """ """ - Gemini tool call invokes: - https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/function-calling#submit-api-output - content { - role: "model" - parts [ + Gemini tool call invokes: + { + "role": "model", + "parts": [ { - function_call { - name: "get_current_weather" - args { - fields { - key: "unit" - value { - string_value: "fahrenheit" - } - } - fields { - key: "predicted_temperature" - value { - number_value: 45 - } - } - fields { - key: "location" - value { - string_value: "Boston, MA" - } - } - } - }, - { - function_call { - name: "get_current_weather" - args { - fields { - key: "location" - value { - string_value: "San Francisco" - } - } - } + "functionCall": { + "name": "get_current_weather", + "args": { + "unit": "fahrenheit", + "predicted_temperature": 45, + "location": "Boston, MA", } + } } - ] + ] } """ """ - - json.load the arguments - - iterate through arguments -> create a FunctionCallArgs for each field + - json.load the arguments """ try: _parts_list: List[litellm.types.llms.vertex_ai.PartType] = [] @@ -1165,16 +1116,8 @@ def convert_to_gemini_tool_call_result( # We can't determine from openai message format whether it's a successful or # error call result so default to the successful result template - inferred_content_value = infer_protocol_value(value=content_str) - - _field = litellm.types.llms.vertex_ai.Field( - key="content", value={inferred_content_value: content_str} - ) 
- - _function_call_args = litellm.types.llms.vertex_ai.FunctionCallArgs(fields=_field) - _function_response = litellm.types.llms.vertex_ai.FunctionResponse( - name=name, response=_function_call_args # type: ignore + name=name, response={"content": content_str} # type: ignore ) _part = litellm.types.llms.vertex_ai.PartType(function_response=_function_response) @@ -1216,15 +1159,44 @@ def convert_to_anthropic_tool_result( ] } """ - content_str: str = "" + anthropic_content: Union[ + str, + List[Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam]], + ] = "" if isinstance(message["content"], str): - content_str = message["content"] + anthropic_content = message["content"] elif isinstance(message["content"], List): content_list = message["content"] + anthropic_content_list: List[ + Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam] + ] = [] for content in content_list: if content["type"] == "text": - content_str += content["text"] + anthropic_content_list.append( + AnthropicMessagesToolResultContent( + type="text", + text=content["text"], + ) + ) + elif content["type"] == "image_url": + if isinstance(content["image_url"], str): + image_chunk = convert_to_anthropic_image_obj(content["image_url"]) + else: + image_chunk = convert_to_anthropic_image_obj( + content["image_url"]["url"] + ) + anthropic_content_list.append( + AnthropicMessagesImageParam( + type="image", + source=AnthropicContentParamSource( + type="base64", + media_type=image_chunk["media_type"], + data=image_chunk["data"], + ), + ) + ) + anthropic_content = anthropic_content_list anthropic_tool_result: Optional[AnthropicMessagesToolResultParam] = None ## PROMPT CACHING CHECK ## cache_control = message.get("cache_control", None) @@ -1235,14 +1207,14 @@ def convert_to_anthropic_tool_result( # We can't determine from openai message format whether it's a successful or # error call result so default to the successful result template anthropic_tool_result = AnthropicMessagesToolResultParam( - type="tool_result", tool_use_id=tool_call_id, content=content_str + type="tool_result", tool_use_id=tool_call_id, content=anthropic_content ) if message["role"] == "function": function_message: ChatCompletionFunctionMessage = message tool_call_id = function_message.get("tool_call_id") or str(uuid.uuid4()) anthropic_tool_result = AnthropicMessagesToolResultParam( - type="tool_result", tool_use_id=tool_call_id, content=content_str + type="tool_result", tool_use_id=tool_call_id, content=anthropic_content ) if anthropic_tool_result is None: @@ -1330,7 +1302,10 @@ def convert_to_anthropic_tool_invoke( def add_cache_control_to_content( anthropic_content_element: Union[ - dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam + dict, + AnthropicMessagesImageParam, + AnthropicMessagesTextParam, + AnthropicMessagesDocumentParam, ], orignal_content_element: Union[dict, AllMessageValues], ): @@ -1343,6 +1318,32 @@ def add_cache_control_to_content( return anthropic_content_element +def _anthropic_content_element_factory( + image_chunk: GenericImageParsingChunk, +) -> Union[AnthropicMessagesImageParam, AnthropicMessagesDocumentParam]: + if image_chunk["media_type"] == "application/pdf": + _anthropic_content_element: Union[ + AnthropicMessagesDocumentParam, AnthropicMessagesImageParam + ] = AnthropicMessagesDocumentParam( + type="document", + source=AnthropicContentParamSource( + type="base64", + media_type=image_chunk["media_type"], + data=image_chunk["data"], + ), + ) + else: + _anthropic_content_element = 
AnthropicMessagesImageParam( + type="image", + source=AnthropicContentParamSource( + type="base64", + media_type=image_chunk["media_type"], + data=image_chunk["data"], + ), + ) + return _anthropic_content_element + + def anthropic_messages_pt( # noqa: PLR0915 messages: List[AllMessageValues], model: str, @@ -1400,15 +1401,9 @@ def anthropic_messages_pt( # noqa: PLR0915 openai_image_url=m["image_url"]["url"] ) - _anthropic_content_element = AnthropicMessagesImageParam( - type="image", - source=AnthropicImageParamSource( - type="base64", - media_type=image_chunk["media_type"], - data=image_chunk["data"], - ), + _anthropic_content_element = ( + _anthropic_content_element_factory(image_chunk) ) - _content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_content_element, orignal_content_element=dict(m), @@ -2830,7 +2825,7 @@ def prompt_factory( else: return gemini_text_image_pt(messages=messages) elif custom_llm_provider == "mistral": - return mistral_api_pt(messages=messages) + return litellm.MistralConfig._transform_messages(messages=messages) elif custom_llm_provider == "bedrock": if "amazon.titan-text" in model: return amazon_titan_pt(messages=messages) diff --git a/litellm/llms/replicate.py b/litellm/llms/replicate.py index 094110234..2e9bbb333 100644 --- a/litellm/llms/replicate.py +++ b/litellm/llms/replicate.py @@ -9,7 +9,10 @@ import httpx # type: ignore import requests # type: ignore import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + get_async_httpx_client, +) from litellm.utils import CustomStreamWrapper, ModelResponse, Usage from .prompt_templates.factory import custom_prompt, prompt_factory @@ -325,7 +328,7 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos async def async_handle_prediction_response_streaming( prediction_url, api_token, print_verbose ): - http_handler = AsyncHTTPHandler(concurrent_limit=1) + http_handler = get_async_httpx_client(llm_provider=litellm.LlmProviders.REPLICATE) previous_output = "" output_string = "" @@ -560,7 +563,9 @@ async def async_completion( logging_obj, print_verbose, ) -> Union[ModelResponse, CustomStreamWrapper]: - http_handler = AsyncHTTPHandler(concurrent_limit=1) + http_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.REPLICATE, + ) prediction_url = await async_start_prediction( version_id, input_data, diff --git a/litellm/llms/text_completion_codestral.py b/litellm/llms/text_completion_codestral.py index 21582d26c..d3c1ae3cb 100644 --- a/litellm/llms/text_completion_codestral.py +++ b/litellm/llms/text_completion_codestral.py @@ -18,7 +18,10 @@ import litellm from litellm import verbose_logger from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + get_async_httpx_client, +) from litellm.types.llms.databricks import GenericStreamingChunk from litellm.utils import ( Choices, @@ -479,8 +482,9 @@ class CodestralTextCompletion(BaseLLM): headers={}, ) -> TextCompletionResponse: - async_handler = AsyncHTTPHandler( - timeout=httpx.Timeout(timeout=timeout), concurrent_limit=1 + async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.TEXT_COMPLETION_CODESTRAL, + params={"timeout": timeout}, ) try: diff 
--git a/litellm/llms/together_ai/chat.py b/litellm/llms/together_ai/chat.py index 398bc489c..cb12d6147 100644 --- a/litellm/llms/together_ai/chat.py +++ b/litellm/llms/together_ai/chat.py @@ -6,8 +6,8 @@ Calls done in OpenAI/openai.py as TogetherAI is openai-compatible. Docs: https://docs.together.ai/reference/completions-1 """ -from ..OpenAI.openai import OpenAIConfig +from ..OpenAI.chat.gpt_transformation import OpenAIGPTConfig -class TogetherAIConfig(OpenAIConfig): +class TogetherAIConfig(OpenAIGPTConfig): pass diff --git a/litellm/llms/together_ai/rerank.py b/litellm/llms/together_ai/rerank/handler.py similarity index 84% rename from litellm/llms/together_ai/rerank.py rename to litellm/llms/together_ai/rerank/handler.py index 1be73af2d..3e6d5d667 100644 --- a/litellm/llms/together_ai/rerank.py +++ b/litellm/llms/together_ai/rerank/handler.py @@ -15,7 +15,14 @@ from litellm.llms.custom_httpx.http_handler import ( _get_httpx_client, get_async_httpx_client, ) -from litellm.types.rerank import RerankRequest, RerankResponse +from litellm.llms.together_ai.rerank.transformation import TogetherAIRerankConfig +from litellm.types.rerank import ( + RerankBilledUnits, + RerankRequest, + RerankResponse, + RerankResponseMeta, + RerankTokens, +) class TogetherAIRerank(BaseLLM): @@ -65,13 +72,7 @@ class TogetherAIRerank(BaseLLM): _json_response = response.json() - response = RerankResponse( - id=_json_response.get("id"), - results=_json_response.get("results"), - meta=_json_response.get("meta") or {}, - ) - - return response + return TogetherAIRerankConfig()._transform_response(_json_response) async def async_rerank( # New async method self, @@ -97,10 +98,4 @@ class TogetherAIRerank(BaseLLM): _json_response = response.json() - return RerankResponse( - id=_json_response.get("id"), - results=_json_response.get("results"), - meta=_json_response.get("meta") or {}, - ) # Return response - - pass + return TogetherAIRerankConfig()._transform_response(_json_response) diff --git a/litellm/llms/together_ai/rerank/transformation.py b/litellm/llms/together_ai/rerank/transformation.py new file mode 100644 index 000000000..b2024b5cd --- /dev/null +++ b/litellm/llms/together_ai/rerank/transformation.py @@ -0,0 +1,34 @@ +""" +Transformation logic from Cohere's /v1/rerank format to Together AI's `/v1/rerank` format. + +Why separate file? 
Make it easy to see how transformation works +""" + +import uuid +from typing import List, Optional + +from litellm.types.rerank import ( + RerankBilledUnits, + RerankResponse, + RerankResponseMeta, + RerankTokens, +) + + +class TogetherAIRerankConfig: + def _transform_response(self, response: dict) -> RerankResponse: + + _billed_units = RerankBilledUnits(**response.get("usage", {})) + _tokens = RerankTokens(**response.get("usage", {})) + rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens) + + _results: Optional[List[dict]] = response.get("results") + + if _results is None: + raise ValueError(f"No results found in the response={response}") + + return RerankResponse( + id=response.get("id") or str(uuid.uuid4()), + results=_results, + meta=rerank_meta, + ) # Return response diff --git a/litellm/llms/triton.py b/litellm/llms/triton.py index be4179ccc..efd0d0a2d 100644 --- a/litellm/llms/triton.py +++ b/litellm/llms/triton.py @@ -8,7 +8,11 @@ import httpx # type: ignore import requests # type: ignore import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.utils import ( Choices, CustomStreamWrapper, @@ -50,8 +54,8 @@ class TritonChatCompletion(BaseLLM): logging_obj: Any, api_key: Optional[str] = None, ) -> EmbeddingResponse: - async_handler = AsyncHTTPHandler( - timeout=httpx.Timeout(timeout=600.0, connect=5.0) + async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.TRITON, params={"timeout": 600.0} ) response = await async_handler.post(url=api_base, data=json.dumps(data)) @@ -261,7 +265,9 @@ class TritonChatCompletion(BaseLLM): model_response, type_of_model, ) -> ModelResponse: - handler = AsyncHTTPHandler() + handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.TRITON, params={"timeout": 600.0} + ) if stream: return self._ahandle_stream( # type: ignore handler, api_base, data_for_triton, model, logging_obj diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/common_utils.py b/litellm/llms/vertex_ai_and_google_ai_studio/common_utils.py index 0f95b222c..74bab0b26 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/common_utils.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/common_utils.py @@ -89,6 +89,9 @@ def _get_vertex_url( elif mode == "embedding": endpoint = "predict" url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}" + if model.isdigit(): + # https://us-central1-aiplatform.googleapis.com/v1/projects/$PROJECT_ID/locations/us-central1/endpoints/$ENDPOINT_ID:predict + url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}" if not url or not endpoint: raise ValueError(f"Unable to get vertex url/endpoint for mode: {mode}") diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py b/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py index e60a17052..b9be8a3bd 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py @@ -6,7 +6,11 @@ import httpx import litellm from litellm.caching.caching import Cache, 
LiteLLMCacheType from litellm.litellm_core_utils.litellm_logging import Logging -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.llms.OpenAI.openai import AllMessageValues from litellm.types.llms.vertex_ai import ( CachedContentListAllResponseBody, @@ -331,6 +335,13 @@ class ContextCachingEndpoints(VertexBase): if cached_content is not None: return messages, cached_content + cached_messages, non_cached_messages = separate_cached_messages( + messages=messages + ) + + if len(cached_messages) == 0: + return messages, None + ## AUTHORIZATION ## token, url = self._get_token_and_url_context_caching( gemini_api_key=api_key, @@ -347,22 +358,12 @@ class ContextCachingEndpoints(VertexBase): headers.update(extra_headers) if client is None or not isinstance(client, AsyncHTTPHandler): - _params = {} - if timeout is not None: - if isinstance(timeout, float) or isinstance(timeout, int): - timeout = httpx.Timeout(timeout) - _params["timeout"] = timeout - client = AsyncHTTPHandler(**_params) # type: ignore + client = get_async_httpx_client( + params={"timeout": timeout}, llm_provider=litellm.LlmProviders.VERTEX_AI + ) else: client = client - cached_messages, non_cached_messages = separate_cached_messages( - messages=messages - ) - - if len(cached_messages) == 0: - return messages, None - ## CHECK IF CACHED ALREADY generated_cache_key = local_cache_obj.get_cache_key(messages=cached_messages) google_cache_name = await self.async_check_cache( diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py index f828d93c8..c9fe6e3f4 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/transformation.py @@ -107,6 +107,10 @@ def _get_image_mime_type_from_url(url: str) -> Optional[str]: return "image/png" elif url.endswith(".webp"): return "image/webp" + elif url.endswith(".mp4"): + return "video/mp4" + elif url.endswith(".pdf"): + return "application/pdf" return None @@ -294,7 +298,12 @@ def _transform_request_body( optional_params = {k: v for k, v in optional_params.items() if k not in remove_keys} try: - content = _gemini_convert_messages_with_history(messages=messages) + if custom_llm_provider == "gemini": + content = litellm.GoogleAIStudioGeminiConfig._transform_messages( + messages=messages + ) + else: + content = litellm.VertexGeminiConfig._transform_messages(messages=messages) tools: Optional[Tools] = optional_params.pop("tools", None) tool_choice: Optional[ToolConfig] = optional_params.pop("tool_choice", None) safety_settings: Optional[List[SafetSettingsConfig]] = optional_params.pop( diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py index 39c63dbb3..4287ed1bc 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini/vertex_and_google_ai_studio_gemini.py @@ -35,7 +35,12 @@ from litellm.llms.custom_httpx.http_handler import ( HTTPHandler, get_async_httpx_client, ) +from litellm.llms.prompt_templates.factory import ( + convert_generic_image_chunk_to_openai_image_obj, + convert_to_anthropic_image_obj, +) from litellm.types.llms.openai 
import ( + AllMessageValues, ChatCompletionResponseMessage, ChatCompletionToolCallChunk, ChatCompletionToolCallFunctionChunk, @@ -78,6 +83,8 @@ from ..common_utils import ( ) from ..vertex_llm_base import VertexBase from .transformation import ( + _gemini_convert_messages_with_history, + _process_gemini_image, async_transform_request_body, set_headers, sync_transform_request_body, @@ -912,6 +919,10 @@ class VertexGeminiConfig: return model_response + @staticmethod + def _transform_messages(messages: List[AllMessageValues]) -> List[ContentType]: + return _gemini_convert_messages_with_history(messages=messages) + class GoogleAIStudioGeminiConfig( VertexGeminiConfig @@ -1015,6 +1026,32 @@ class GoogleAIStudioGeminiConfig( model, non_default_params, optional_params, drop_params ) + @staticmethod + def _transform_messages(messages: List[AllMessageValues]) -> List[ContentType]: + """ + Google AI Studio Gemini does not support image urls in messages. + """ + for message in messages: + _message_content = message.get("content") + if _message_content is not None and isinstance(_message_content, list): + _parts: List[PartType] = [] + for element in _message_content: + if element.get("type") == "image_url": + img_element = element + _image_url: Optional[str] = None + if isinstance(img_element.get("image_url"), dict): + _image_url = img_element["image_url"].get("url") # type: ignore + else: + _image_url = img_element.get("image_url") # type: ignore + if _image_url and "https://" in _image_url: + image_obj = convert_to_anthropic_image_obj(_image_url) + img_element["image_url"] = ( # type: ignore + convert_generic_image_chunk_to_openai_image_obj( + image_obj + ) + ) + return _gemini_convert_messages_with_history(messages=messages) + async def make_call( client: Optional[AsyncHTTPHandler], @@ -1026,7 +1063,9 @@ async def make_call( logging_obj, ): if client is None: - client = AsyncHTTPHandler() # Create a new client if none provided + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.VERTEX_AI, + ) try: response = await client.post(api_base, headers=headers, data=data, stream=True) diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/gemini_embeddings/batch_embed_content_handler.py b/litellm/llms/vertex_ai_and_google_ai_studio/gemini_embeddings/batch_embed_content_handler.py index 314e129c2..8e2d1f39a 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/gemini_embeddings/batch_embed_content_handler.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/gemini_embeddings/batch_embed_content_handler.py @@ -7,8 +7,13 @@ from typing import Any, List, Literal, Optional, Union import httpx +import litellm from litellm import EmbeddingResponse -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.types.llms.openai import EmbeddingInput from litellm.types.llms.vertex_ai import ( VertexAIBatchEmbeddingsRequestBody, @@ -150,7 +155,10 @@ class GoogleBatchEmbeddings(VertexLLM): else: _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0) - async_handler: AsyncHTTPHandler = AsyncHTTPHandler(**_params) # type: ignore + async_handler: AsyncHTTPHandler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.VERTEX_AI, + params={"timeout": timeout}, + ) else: async_handler = client # type: ignore diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/cost_calculator.py 
b/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/cost_calculator.py new file mode 100644 index 000000000..2d7fa37f7 --- /dev/null +++ b/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/cost_calculator.py @@ -0,0 +1,25 @@ +""" +Vertex AI Image Generation Cost Calculator +""" + +from typing import Optional + +import litellm +from litellm.types.utils import ImageResponse + + +def cost_calculator( + model: str, + image_response: ImageResponse, +) -> float: + """ + Vertex AI Image Generation Cost Calculator + """ + _model_info = litellm.get_model_info( + model=model, + custom_llm_provider="vertex_ai", + ) + + output_cost_per_image: float = _model_info.get("output_cost_per_image") or 0.0 + num_images: int = len(image_response.data) + return output_cost_per_image * num_images diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/image_generation_handler.py b/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/image_generation_handler.py index 1531464c8..6cb5771e6 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/image_generation_handler.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/image_generation/image_generation_handler.py @@ -5,7 +5,11 @@ import httpx from openai.types.image import Image import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( VertexLLM, ) @@ -156,7 +160,10 @@ class VertexImageGeneration(VertexLLM): else: _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0) - self.async_handler = AsyncHTTPHandler(**_params) # type: ignore + self.async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.VERTEX_AI, + params={"timeout": timeout}, + ) else: self.async_handler = client # type: ignore diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/multimodal_embeddings/embedding_handler.py b/litellm/llms/vertex_ai_and_google_ai_studio/multimodal_embeddings/embedding_handler.py index d8af891b0..27b77fdd9 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/multimodal_embeddings/embedding_handler.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/multimodal_embeddings/embedding_handler.py @@ -5,7 +5,11 @@ import httpx import litellm from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( VertexAIError, VertexLLM, @@ -172,7 +176,10 @@ class VertexMultimodalEmbedding(VertexLLM): if isinstance(timeout, float) or isinstance(timeout, int): timeout = httpx.Timeout(timeout) _params["timeout"] = timeout - client = AsyncHTTPHandler(**_params) # type: ignore + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.VERTEX_AI, + params={"timeout": timeout}, + ) else: client = client # type: ignore diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_non_gemini.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_non_gemini.py index 80295ec40..829bf6528 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_non_gemini.py +++ 
b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_non_gemini.py @@ -14,6 +14,7 @@ from pydantic import BaseModel import litellm from litellm._logging import verbose_logger from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.custom_httpx.http_handler import _DEFAULT_TTL_FOR_HTTPX_CLIENTS from litellm.llms.prompt_templates.factory import ( convert_to_anthropic_image_obj, convert_to_gemini_tool_call_invoke, @@ -93,11 +94,15 @@ def _get_client_cache_key( def _get_client_from_cache(client_cache_key: str): - return litellm.in_memory_llm_clients_cache.get(client_cache_key, None) + return litellm.in_memory_llm_clients_cache.get_cache(client_cache_key) def _set_client_in_cache(client_cache_key: str, vertex_llm_model: Any): - litellm.in_memory_llm_clients_cache[client_cache_key] = vertex_llm_model + litellm.in_memory_llm_clients_cache.set_cache( + key=client_cache_key, + value=vertex_llm_model, + ttl=_DEFAULT_TTL_FOR_HTTPX_CLIENTS, + ) def completion( # noqa: PLR0915 diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py index e8443e6f6..f335f53d9 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_partner_models/main.py @@ -236,4 +236,6 @@ class VertexAIPartnerModels(VertexBase): ) except Exception as e: + if hasattr(e, "status_code"): + raise e raise VertexAIError(status_code=500, message=str(e)) diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/embedding_handler.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/embedding_handler.py index 0cde5c3b5..26741ff4f 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/embedding_handler.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/embedding_handler.py @@ -96,7 +96,7 @@ class VertexEmbedding(VertexBase): headers = self.set_headers(auth_header=auth_header, extra_headers=extra_headers) vertex_request: VertexEmbeddingRequest = ( litellm.vertexAITextEmbeddingConfig.transform_openai_request_to_vertex_embedding_request( - input=input, optional_params=optional_params + input=input, optional_params=optional_params, model=model ) ) @@ -188,7 +188,7 @@ class VertexEmbedding(VertexBase): headers = self.set_headers(auth_header=auth_header, extra_headers=extra_headers) vertex_request: VertexEmbeddingRequest = ( litellm.vertexAITextEmbeddingConfig.transform_openai_request_to_vertex_embedding_request( - input=input, optional_params=optional_params + input=input, optional_params=optional_params, model=model ) ) diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/transformation.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/transformation.py index 1ca405392..6f4b25cef 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/transformation.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/transformation.py @@ -101,11 +101,16 @@ class VertexAITextEmbeddingConfig(BaseModel): return optional_params def transform_openai_request_to_vertex_embedding_request( - self, input: Union[list, str], optional_params: dict + self, input: Union[list, str], optional_params: dict, model: str ) -> VertexEmbeddingRequest: """ Transforms an openai request to a vertex embedding request. 
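The fine-tuned-model handling added below, together with the `_get_vertex_url` change earlier in this diff, routes numeric model ids to the Vertex `endpoints/{id}:predict` URL instead of the publisher-model URL. A hedged sketch of that routing rule (project, location, and api_version values are placeholders):

```python
# Illustrative URL routing: numeric model ids are treated as deployed endpoint ids.
def vertex_predict_url(model: str, project: str, location: str, api_version: str = "v1") -> str:
    base = f"https://{location}-aiplatform.googleapis.com/{api_version}/projects/{project}/locations/{location}"
    if model.isdigit():  # deployed endpoint id, e.g. a fine-tuned embedding model
        return f"{base}/endpoints/{model}:predict"
    return f"{base}/publishers/google/models/{model}:predict"


print(vertex_predict_url("1234567890", "my-project", "us-central1"))
# https://us-central1-aiplatform.googleapis.com/v1/projects/my-project/locations/us-central1/endpoints/1234567890:predict
```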
""" + if model.isdigit(): + return self._transform_openai_request_to_fine_tuned_embedding_request( + input, optional_params, model + ) + vertex_request: VertexEmbeddingRequest = VertexEmbeddingRequest() vertex_text_embedding_input_list: List[TextEmbeddingInput] = [] task_type: Optional[TaskType] = optional_params.get("task_type") @@ -125,6 +130,47 @@ class VertexAITextEmbeddingConfig(BaseModel): return vertex_request + def _transform_openai_request_to_fine_tuned_embedding_request( + self, input: Union[list, str], optional_params: dict, model: str + ) -> VertexEmbeddingRequest: + """ + Transforms an openai request to a vertex fine-tuned embedding request. + + Vertex Doc: https://console.cloud.google.com/vertex-ai/model-garden?hl=en&project=adroit-crow-413218&pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%5D,%22o%22:%5B%5D),%22s%22:%22%22)) + Sample Request: + + ```json + { + "instances" : [ + { + "inputs": "How would the Future of AI in 10 Years look?", + "parameters": { + "max_new_tokens": 128, + "temperature": 1.0, + "top_p": 0.9, + "top_k": 10 + } + } + ] + } + ``` + """ + vertex_request: VertexEmbeddingRequest = VertexEmbeddingRequest() + vertex_text_embedding_input_list: List[TextEmbeddingFineTunedInput] = [] + if isinstance(input, str): + input = [input] # Convert single string to list for uniform processing + + for text in input: + embedding_input = TextEmbeddingFineTunedInput(inputs=text) + vertex_text_embedding_input_list.append(embedding_input) + + vertex_request["instances"] = vertex_text_embedding_input_list + vertex_request["parameters"] = TextEmbeddingFineTunedParameters( + **optional_params + ) + + return vertex_request + def create_embedding_input( self, content: str, @@ -157,6 +203,11 @@ class VertexAITextEmbeddingConfig(BaseModel): """ Transforms a vertex embedding response to an openai response. """ + if model.isdigit(): + return self._transform_vertex_response_to_openai_for_fine_tuned_models( + response, model, model_response + ) + _predictions = response["predictions"] embedding_response = [] @@ -181,3 +232,35 @@ class VertexAITextEmbeddingConfig(BaseModel): ) setattr(model_response, "usage", usage) return model_response + + def _transform_vertex_response_to_openai_for_fine_tuned_models( + self, response: dict, model: str, model_response: litellm.EmbeddingResponse + ) -> litellm.EmbeddingResponse: + """ + Transforms a vertex fine-tuned model embedding response to an openai response format. 
+ """ + _predictions = response["predictions"] + + embedding_response = [] + # For fine-tuned models, we don't get token counts in the response + input_tokens = 0 + + for idx, embedding_values in enumerate(_predictions): + embedding_response.append( + { + "object": "embedding", + "index": idx, + "embedding": embedding_values[ + 0 + ], # The embedding values are nested one level deeper + } + ) + + model_response.object = "list" + model_response.data = embedding_response + model_response.model = model + usage = Usage( + prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens + ) + setattr(model_response, "usage", usage) + return model_response diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/types.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/types.py index 311809c82..433305516 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/types.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_embeddings/types.py @@ -23,14 +23,27 @@ class TextEmbeddingInput(TypedDict, total=False): title: Optional[str] +# Fine-tuned models require a different input format +# Ref: https://console.cloud.google.com/vertex-ai/model-garden?hl=en&project=adroit-crow-413218&pageState=(%22galleryStateKey%22:(%22f%22:(%22g%22:%5B%5D,%22o%22:%5B%5D),%22s%22:%22%22)) +class TextEmbeddingFineTunedInput(TypedDict, total=False): + inputs: str + + +class TextEmbeddingFineTunedParameters(TypedDict, total=False): + max_new_tokens: Optional[int] + temperature: Optional[float] + top_p: Optional[float] + top_k: Optional[int] + + class EmbeddingParameters(TypedDict, total=False): auto_truncate: Optional[bool] output_dimensionality: Optional[int] class VertexEmbeddingRequest(TypedDict, total=False): - instances: List[TextEmbeddingInput] - parameters: Optional[EmbeddingParameters] + instances: Union[List[TextEmbeddingInput], List[TextEmbeddingFineTunedInput]] + parameters: Optional[Union[EmbeddingParameters, TextEmbeddingFineTunedParameters]] # Example usage: diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_model_garden/main.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_model_garden/main.py new file mode 100644 index 000000000..4285c4dcb --- /dev/null +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_model_garden/main.py @@ -0,0 +1,156 @@ +""" +API Handler for calling Vertex AI Model Garden Models + +Most Vertex Model Garden Models are OpenAI compatible - so this handler calls `openai_like_chat_completions` + +Usage: + +response = litellm.completion( + model="vertex_ai/openai/5464397967697903616", + messages=[{"role": "user", "content": "Hello, how are you?"}], +) + +Sent to this route when `model` is in the format `vertex_ai/openai/{MODEL_ID}` + + +Vertex Documentation for using the OpenAI /chat/completions endpoint: https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama3_deployment.ipynb +""" + +import types +from enum import Enum +from typing import Callable, Literal, Optional, Union + +import httpx # type: ignore + +import litellm +from litellm.utils import ModelResponse + +from ..common_utils import VertexAIError +from ..vertex_llm_base import VertexBase + + +def create_vertex_url( + vertex_location: str, + vertex_project: str, + stream: Optional[bool], + model: str, + api_base: Optional[str] = None, +) -> str: + """Return the base url for the vertex garden models""" + # 
f"https://{self.endpoint.location}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{self.endpoint.location}" + return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}" + + +class VertexAIModelGardenModels(VertexBase): + def __init__(self) -> None: + pass + + def completion( + self, + model: str, + messages: list, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + logging_obj, + api_base: Optional[str], + optional_params: dict, + custom_prompt_dict: dict, + headers: Optional[dict], + timeout: Union[float, httpx.Timeout], + litellm_params: dict, + vertex_project=None, + vertex_location=None, + vertex_credentials=None, + logger_fn=None, + acompletion: bool = False, + client=None, + ): + """ + Handles calling Vertex AI Model Garden Models in OpenAI compatible format + + Sent to this route when `model` is in the format `vertex_ai/openai/{MODEL_ID}` + """ + try: + import vertexai + from google.cloud import aiplatform + + from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.databricks.chat import DatabricksChatCompletion + from litellm.llms.OpenAI.openai import OpenAIChatCompletion + from litellm.llms.text_completion_codestral import CodestralTextCompletion + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexLLM, + ) + except Exception: + + raise VertexAIError( + status_code=400, + message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""", + ) + + if not ( + hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") + ): + raise VertexAIError( + status_code=400, + message="""Upgrade vertex ai. 
Run `pip install "google-cloud-aiplatform>=1.38"`""", + ) + try: + model = model.replace("openai/", "") + vertex_httpx_logic = VertexLLM() + + access_token, project_id = vertex_httpx_logic._ensure_access_token( + credentials=vertex_credentials, + project_id=vertex_project, + custom_llm_provider="vertex_ai", + ) + + openai_like_chat_completions = DatabricksChatCompletion() + + ## CONSTRUCT API BASE + stream: bool = optional_params.get("stream", False) or False + optional_params["stream"] = stream + default_api_base = create_vertex_url( + vertex_location=vertex_location or "us-central1", + vertex_project=vertex_project or project_id, + stream=stream, + model=model, + ) + + if len(default_api_base.split(":")) > 1: + endpoint = default_api_base.split(":")[-1] + else: + endpoint = "" + + _, api_base = self._check_custom_proxy( + api_base=api_base, + custom_llm_provider="vertex_ai", + gemini_api_key=None, + endpoint=endpoint, + stream=stream, + auth_header=None, + url=default_api_base, + ) + model = "" + return openai_like_chat_completions.completion( + model=model, + messages=messages, + api_base=api_base, + api_key=access_token, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + logging_obj=logging_obj, + optional_params=optional_params, + acompletion=acompletion, + litellm_params=litellm_params, + logger_fn=logger_fn, + client=client, + timeout=timeout, + encoding=encoding, + custom_llm_provider="vertex_ai", + ) + + except Exception as e: + raise VertexAIError(status_code=500, message=str(e)) diff --git a/litellm/llms/watsonx/chat/handler.py b/litellm/llms/watsonx/chat/handler.py index b016bb0a7..932946d3c 100644 --- a/litellm/llms/watsonx/chat/handler.py +++ b/litellm/llms/watsonx/chat/handler.py @@ -57,6 +57,7 @@ class WatsonXChatHandler(OpenAILikeChatHandler): def completion( self, + *, model: str, messages: list, api_base: str, @@ -75,9 +76,8 @@ class WatsonXChatHandler(OpenAILikeChatHandler): timeout: Optional[Union[float, httpx.Timeout]] = None, client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, custom_endpoint: Optional[bool] = None, - streaming_decoder: Optional[ - CustomStreamingDecoder - ] = None, # if openai-compatible api needs custom stream decoder - e.g. 
sagemaker + streaming_decoder: Optional[CustomStreamingDecoder] = None, + fake_stream: bool = False, ): api_params = _get_api_params(optional_params, print_verbose=print_verbose) diff --git a/litellm/llms/watsonx/completion/handler.py b/litellm/llms/watsonx/completion/handler.py index fda25ba0f..9618f6342 100644 --- a/litellm/llms/watsonx/completion/handler.py +++ b/litellm/llms/watsonx/completion/handler.py @@ -24,7 +24,10 @@ import httpx # type: ignore import requests # type: ignore import litellm -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + get_async_httpx_client, +) from litellm.secret_managers.main import get_secret_str from litellm.types.llms.watsonx import WatsonXAIEndpoint from litellm.utils import EmbeddingResponse, ModelResponse, Usage, map_finish_reason @@ -710,10 +713,13 @@ class RequestManager: if stream: request_params["stream"] = stream try: - self.async_handler = AsyncHTTPHandler( - timeout=httpx.Timeout( - timeout=request_params.pop("timeout", 600.0), connect=5.0 - ), + self.async_handler = get_async_httpx_client( + llm_provider=litellm.LlmProviders.WATSONX, + params={ + "timeout": httpx.Timeout( + timeout=request_params.pop("timeout", 600.0), connect=5.0 + ), + }, ) if "json" in request_params: request_params["data"] = json.dumps(request_params.pop("json", {})) diff --git a/litellm/main.py b/litellm/main.py index 8334f35d7..5095ce518 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -108,9 +108,9 @@ from .llms.azure_text import AzureTextCompletion from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params from .llms.AzureOpenAI.chat.o1_handler import AzureOpenAIO1ChatCompletion -from .llms.bedrock import image_generation as bedrock_image_generation # type: ignore from .llms.bedrock.chat import BedrockConverseLLM, BedrockLLM from .llms.bedrock.embed.embedding import BedrockEmbedding +from .llms.bedrock.image.image_handler import BedrockImageGeneration from .llms.cohere import chat as cohere_chat from .llms.cohere import completion as cohere_completion # type: ignore from .llms.cohere.embed import handler as cohere_embed @@ -158,6 +158,9 @@ from .llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.main import ( from .llms.vertex_ai_and_google_ai_studio.vertex_embeddings.embedding_handler import ( VertexEmbedding, ) +from .llms.vertex_ai_and_google_ai_studio.vertex_model_garden.main import ( + VertexAIModelGardenModels, +) from .llms.watsonx.chat.handler import WatsonXChatHandler from .llms.watsonx.completion.handler import IBMWatsonXAI from .types.llms.openai import ( @@ -214,12 +217,14 @@ triton_chat_completions = TritonChatCompletion() bedrock_chat_completion = BedrockLLM() bedrock_converse_chat_completion = BedrockConverseLLM() bedrock_embedding = BedrockEmbedding() +bedrock_image_generation = BedrockImageGeneration() vertex_chat_completion = VertexLLM() vertex_embedding = VertexEmbedding() vertex_multimodal_embedding = VertexMultimodalEmbedding() vertex_image_generation = VertexImageGeneration() google_batch_embeddings = GoogleBatchEmbeddings() vertex_partner_models_chat_completion = VertexAIPartnerModels() +vertex_model_garden_chat_completion = VertexAIModelGardenModels() vertex_text_to_speech = VertexTextToSpeechAPI() watsonxai = IBMWatsonXAI() sagemaker_llm = SagemakerLLM() @@ -549,7 +554,6 @@ def mock_completion( Raises: Exception: If an error occurs 
during the generation of the mock completion response. - Note: - This function is intended for testing or debugging purposes to generate mock completion responses. - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion. @@ -1065,6 +1069,7 @@ def completion( # type: ignore # noqa: PLR0915 azure_ad_token_provider=kwargs.get("azure_ad_token_provider"), user_continue_message=kwargs.get("user_continue_message"), base_model=base_model, + litellm_trace_id=kwargs.get("litellm_trace_id"), ) logging.update_environment_variables( model=model, @@ -1490,8 +1495,8 @@ def completion( # type: ignore # noqa: PLR0915 timeout=timeout, # type: ignore custom_prompt_dict=custom_prompt_dict, client=client, # pass AsyncOpenAI, OpenAI client - organization=organization, custom_llm_provider=custom_llm_provider, + encoding=encoding, ) elif ( model in litellm.open_ai_chat_completion_models @@ -2353,6 +2358,28 @@ def completion( # type: ignore # noqa: PLR0915 api_base=api_base, extra_headers=extra_headers, ) + elif "openai" in model: + # Vertex Model Garden - OpenAI compatible models + model_response = vertex_model_garden_chat_completion.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=new_params, + litellm_params=litellm_params, # type: ignore + logger_fn=logger_fn, + encoding=encoding, + api_base=api_base, + vertex_location=vertex_ai_location, + vertex_project=vertex_ai_project, + vertex_credentials=vertex_credentials, + logging_obj=logging, + acompletion=acompletion, + headers=headers, + custom_prompt_dict=custom_prompt_dict, + timeout=timeout, + client=client, + ) else: model_response = vertex_ai_non_gemini.completion( model=model, @@ -3155,6 +3182,7 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse: or custom_llm_provider == "azure_ai" or custom_llm_provider == "together_ai" or custom_llm_provider == "openai_like" + or custom_llm_provider == "jina_ai" ): # currently implemented aiohttp calls for just azure and openai, soon all. 
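# Illustrative sketch (not part of the diff above): with the new
# `elif "openai" in model:` branch, an OpenAI-compatible Model Garden endpoint is
# reachable through the normal completion call. The endpoint id is the example value
# from the new module's docstring; this assumes Vertex credentials, project and
# location are already configured in the environment (e.g. application default
# credentials), so no extra kwargs are shown here.
import litellm

response = litellm.completion(
    model="vertex_ai/openai/5464397967697903616",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response.choices[0].message.content)

# Under the hood, create_vertex_url() above resolves this to:
# https://{LOCATION}-aiplatform.googleapis.com/v1beta1/projects/{PROJECT_ID}/locations/{LOCATION}/endpoints/5464397967697903616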
# Await normally init_response = await loop.run_in_executor(None, func_with_context) @@ -3412,6 +3440,10 @@ def embedding( # noqa: PLR0915 or litellm.openai_key or get_secret_str("OPENAI_API_KEY") ) + + if extra_headers is not None: + optional_params["extra_headers"] = extra_headers + api_type = "openai" api_version = None @@ -3454,7 +3486,7 @@ def embedding( # noqa: PLR0915 client=client, aembedding=aembedding, ) - elif custom_llm_provider == "openai_like": + elif custom_llm_provider == "openai_like" or custom_llm_provider == "jina_ai": api_base = ( api_base or litellm.api_base or get_secret_str("OPENAI_LIKE_API_BASE") ) @@ -4447,6 +4479,7 @@ def image_generation( # noqa: PLR0915 k: v for k, v in kwargs.items() if k not in default_params } # model-specific params - pass them straight to the model/provider optional_params = get_optional_params_image_gen( + model=model, n=n, quality=quality, response_format=response_format, @@ -4539,7 +4572,7 @@ def image_generation( # noqa: PLR0915 elif custom_llm_provider == "bedrock": if model is None: raise Exception("Model needs to be set for bedrock") - model_response = bedrock_image_generation.image_generation( + model_response = bedrock_image_generation.image_generation( # type: ignore model=model, prompt=prompt, timeout=timeout, @@ -4700,6 +4733,7 @@ def transcription( response_format: Optional[ Literal["json", "text", "srt", "verbose_json", "vtt"] ] = None, + timestamp_granularities: Optional[List[Literal["word", "segment"]]] = None, temperature: Optional[int] = None, # openai defaults this to 0 ## LITELLM PARAMS ## user: Optional[str] = None, @@ -4749,6 +4783,7 @@ def transcription( language=language, prompt=prompt, response_format=response_format, + timestamp_granularities=timestamp_granularities, temperature=temperature, custom_llm_provider=custom_llm_provider, drop_params=drop_params, diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index a37a431dc..ac22871bc 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -26,16 +26,17 @@ "supports_prompt_caching": true }, "gpt-4o": { - "max_tokens": 4096, + "max_tokens": 16384, "max_input_tokens": 128000, - "max_output_tokens": 4096, - "input_cost_per_token": 0.000005, - "output_cost_per_token": 0.000015, + "max_output_tokens": 16384, + "input_cost_per_token": 0.0000025, + "output_cost_per_token": 0.000010, "cache_read_input_token_cost": 0.00000125, "litellm_provider": "openai", "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -80,6 +81,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -94,6 +96,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -108,7 +111,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "o1-mini-2024-09-12": { @@ -122,7 +125,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": 
false, "supports_prompt_caching": true }, "o1-preview": { @@ -136,7 +139,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "o1-preview-2024-09-12": { @@ -150,7 +153,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "chatgpt-4o-latest": { @@ -190,6 +193,22 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, + "supports_vision": true, + "supports_prompt_caching": true + }, + "gpt-4o-2024-11-20": { + "max_tokens": 16384, + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_token": 0.0000025, + "output_cost_per_token": 0.000010, + "cache_read_input_token_cost": 0.00000125, + "litellm_provider": "openai", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -461,6 +480,20 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, + "supports_vision": true + }, + "ft:gpt-4o-2024-11-20": { + "max_tokens": 16384, + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_token": 0.00000375, + "output_cost_per_token": 0.000015, + "litellm_provider": "openai", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true }, "ft:gpt-4o-mini-2024-07-18": { @@ -473,6 +506,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true }, "ft:davinci-002": { @@ -652,7 +686,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "azure/o1-mini-2024-09-12": { @@ -666,7 +700,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "azure/o1-preview": { @@ -680,7 +714,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "azure/o1-preview-2024-09-12": { @@ -694,7 +728,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true, + "supports_vision": false, "supports_prompt_caching": true }, "azure/gpt-4o": { @@ -721,6 +755,20 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, + "supports_vision": true + }, + "azure/gpt-4o-2024-11-20": { + "max_tokens": 16384, + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_token": 0.00000275, + "output_cost_per_token": 0.000011, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true }, "azure/gpt-4o-2024-05-13": { @@ -746,6 +794,20 @@ "mode": "chat", 
"supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, + "supports_vision": true + }, + "azure/global-standard/gpt-4o-2024-11-20": { + "max_tokens": 16384, + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_token": 0.0000025, + "output_cost_per_token": 0.000010, + "litellm_provider": "azure", + "mode": "chat", + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true }, "azure/global-standard/gpt-4o-mini": { @@ -758,6 +820,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true }, "azure/gpt-4o-mini": { @@ -771,6 +834,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -785,6 +849,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, + "supports_response_schema": true, "supports_vision": true, "supports_prompt_caching": true }, @@ -1109,6 +1174,52 @@ "supports_function_calling": true, "mode": "chat" }, + "azure_ai/mistral-large-2407": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.000002, + "output_cost_per_token": 0.000006, + "litellm_provider": "azure_ai", + "supports_function_calling": true, + "mode": "chat", + "source": "https://azuremarketplace.microsoft.com/en/marketplace/apps/000-000.mistral-ai-large-2407-offer?tab=Overview" + }, + "azure_ai/ministral-3b": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000004, + "output_cost_per_token": 0.00000004, + "litellm_provider": "azure_ai", + "supports_function_calling": true, + "mode": "chat", + "source": "https://azuremarketplace.microsoft.com/en/marketplace/apps/000-000.ministral-3b-2410-offer?tab=Overview" + }, + "azure_ai/Llama-3.2-11B-Vision-Instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 2048, + "input_cost_per_token": 0.00000037, + "output_cost_per_token": 0.00000037, + "litellm_provider": "azure_ai", + "supports_function_calling": true, + "supports_vision": true, + "mode": "chat", + "source": "https://azuremarketplace.microsoft.com/en/marketplace/apps/metagenai.meta-llama-3-2-11b-vision-instruct-offer?tab=Overview" + }, + "azure_ai/Llama-3.2-90B-Vision-Instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 2048, + "input_cost_per_token": 0.00000204, + "output_cost_per_token": 0.00000204, + "litellm_provider": "azure_ai", + "supports_function_calling": true, + "supports_vision": true, + "mode": "chat", + "source": "https://azuremarketplace.microsoft.com/en/marketplace/apps/metagenai.meta-llama-3-2-90b-vision-instruct-offer?tab=Overview" + }, "azure_ai/Meta-Llama-3-70B-Instruct": { "max_tokens": 8192, "max_input_tokens": 8192, @@ -1148,6 +1259,105 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice" }, + "azure_ai/Phi-3.5-mini-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.00000052, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": 
false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3.5-vision-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.00000052, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": true, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3.5-MoE-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000016, + "output_cost_per_token": 0.00000064, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-mini-4k-instruct": { + "max_tokens": 4096, + "max_input_tokens": 4096, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.00000052, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-mini-128k-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000013, + "output_cost_per_token": 0.00000052, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-small-8k-instruct": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000015, + "output_cost_per_token": 0.0000006, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-small-128k-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000015, + "output_cost_per_token": 0.0000006, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-medium-4k-instruct": { + "max_tokens": 4096, + "max_input_tokens": 4096, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000017, + "output_cost_per_token": 0.00000068, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, + "azure_ai/Phi-3-medium-128k-instruct": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000017, + "output_cost_per_token": 0.00000068, + "litellm_provider": "azure_ai", + "mode": "chat", + "supports_vision": false, + "source": "https://azure.microsoft.com/en-us/pricing/details/phi-3/" + }, "azure_ai/cohere-rerank-v3-multilingual": { "max_tokens": 4096, "max_input_tokens": 4096, @@ -1535,7 +1745,8 @@ "output_cost_per_token": 0.00000080, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama3-8b-8192": { "max_tokens": 8192, @@ -1545,7 +1756,74 @@ "output_cost_per_token": 0.00000008, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-1b-preview": { + "max_tokens": 8192, + 
"max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000004, + "output_cost_per_token": 0.00000004, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-3b-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000006, + "output_cost_per_token": 0.00000006, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-11b-text-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000018, + "output_cost_per_token": 0.00000018, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-11b-vision-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.00000018, + "output_cost_per_token": 0.00000018, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-90b-text-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.0000009, + "output_cost_per_token": 0.0000009, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true + }, + "groq/llama-3.2-90b-vision-preview": { + "max_tokens": 8192, + "max_input_tokens": 8192, + "max_output_tokens": 8192, + "input_cost_per_token": 0.0000009, + "output_cost_per_token": 0.0000009, + "litellm_provider": "groq", + "mode": "chat", + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama3-70b-8192": { "max_tokens": 8192, @@ -1555,7 +1833,8 @@ "output_cost_per_token": 0.00000079, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama-3.1-8b-instant": { "max_tokens": 8192, @@ -1565,7 +1844,8 @@ "output_cost_per_token": 0.00000008, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama-3.1-70b-versatile": { "max_tokens": 8192, @@ -1575,7 +1855,8 @@ "output_cost_per_token": 0.00000079, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama-3.1-405b-reasoning": { "max_tokens": 8192, @@ -1585,7 +1866,8 @@ "output_cost_per_token": 0.00000079, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/mixtral-8x7b-32768": { "max_tokens": 32768, @@ -1595,7 +1877,8 @@ "output_cost_per_token": 0.00000024, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/gemma-7b-it": { "max_tokens": 8192, @@ -1605,7 +1888,8 @@ "output_cost_per_token": 0.00000007, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/gemma2-9b-it": { "max_tokens": 8192, @@ -1615,7 +1899,8 @@ "output_cost_per_token": 0.00000020, "litellm_provider": 
"groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama3-groq-70b-8192-tool-use-preview": { "max_tokens": 8192, @@ -1625,7 +1910,8 @@ "output_cost_per_token": 0.00000089, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "groq/llama3-groq-8b-8192-tool-use-preview": { "max_tokens": 8192, @@ -1635,7 +1921,8 @@ "output_cost_per_token": 0.00000019, "litellm_provider": "groq", "mode": "chat", - "supports_function_calling": true + "supports_function_calling": true, + "supports_response_schema": true }, "cerebras/llama3.1-8b": { "max_tokens": 128000, @@ -1728,20 +2015,24 @@ "supports_vision": true, "tool_use_system_prompt_tokens": 264, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_prompt_caching": true, + "supports_response_schema": true }, "claude-3-5-haiku-20241022": { - "max_tokens": 4096, + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000001, "output_cost_per_token": 0.000005, + "cache_creation_input_token_cost": 0.00000125, + "cache_read_input_token_cost": 0.0000001, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, "tool_use_system_prompt_tokens": 264, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_prompt_caching": true, + "supports_response_schema": true }, "claude-3-opus-20240229": { "max_tokens": 4096, @@ -1757,7 +2048,8 @@ "supports_vision": true, "tool_use_system_prompt_tokens": 395, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_prompt_caching": true, + "supports_response_schema": true }, "claude-3-sonnet-20240229": { "max_tokens": 4096, @@ -1771,7 +2063,8 @@ "supports_vision": true, "tool_use_system_prompt_tokens": 159, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_prompt_caching": true, + "supports_response_schema": true }, "claude-3-5-sonnet-20240620": { "max_tokens": 8192, @@ -1787,7 +2080,8 @@ "supports_vision": true, "tool_use_system_prompt_tokens": 159, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_prompt_caching": true, + "supports_response_schema": true }, "claude-3-5-sonnet-20241022": { "max_tokens": 8192, @@ -1803,7 +2097,9 @@ "supports_vision": true, "tool_use_system_prompt_tokens": 159, "supports_assistant_prefill": true, - "supports_prompt_caching": true + "supports_pdf_input": true, + "supports_prompt_caching": true, + "supports_response_schema": true }, "text-bison": { "max_tokens": 2048, @@ -2208,16 +2504,16 @@ "input_cost_per_image": 0.00032875, "input_cost_per_audio_per_second": 0.00003125, "input_cost_per_video_per_second": 0.00032875, - "input_cost_per_token": 0.000000078125, - "input_cost_per_character": 0.0000003125, + "input_cost_per_token": 0.00000125, + "input_cost_per_character": 0.0000003125, "input_cost_per_image_above_128k_tokens": 0.0006575, "input_cost_per_video_per_second_above_128k_tokens": 0.0006575, "input_cost_per_audio_per_second_above_128k_tokens": 0.0000625, - "input_cost_per_token_above_128k_tokens": 0.00000015625, - "input_cost_per_character_above_128k_tokens": 0.000000625, - "output_cost_per_token": 0.0000003125, + "input_cost_per_token_above_128k_tokens": 0.0000025, + "input_cost_per_character_above_128k_tokens": 0.000000625, + "output_cost_per_token": 
0.000005, "output_cost_per_character": 0.00000125, - "output_cost_per_token_above_128k_tokens": 0.000000625, + "output_cost_per_token_above_128k_tokens": 0.00001, "output_cost_per_character_above_128k_tokens": 0.0000025, "litellm_provider": "vertex_ai-language-models", "mode": "chat", @@ -2234,16 +2530,16 @@ "input_cost_per_image": 0.00032875, "input_cost_per_audio_per_second": 0.00003125, "input_cost_per_video_per_second": 0.00032875, - "input_cost_per_token": 0.000000078125, - "input_cost_per_character": 0.0000003125, + "input_cost_per_token": 0.00000125, + "input_cost_per_character": 0.0000003125, "input_cost_per_image_above_128k_tokens": 0.0006575, "input_cost_per_video_per_second_above_128k_tokens": 0.0006575, "input_cost_per_audio_per_second_above_128k_tokens": 0.0000625, - "input_cost_per_token_above_128k_tokens": 0.00000015625, - "input_cost_per_character_above_128k_tokens": 0.000000625, - "output_cost_per_token": 0.0000003125, + "input_cost_per_token_above_128k_tokens": 0.0000025, + "input_cost_per_character_above_128k_tokens": 0.000000625, + "output_cost_per_token": 0.000005, "output_cost_per_character": 0.00000125, - "output_cost_per_token_above_128k_tokens": 0.000000625, + "output_cost_per_token_above_128k_tokens": 0.00001, "output_cost_per_character_above_128k_tokens": 0.0000025, "litellm_provider": "vertex_ai-language-models", "mode": "chat", @@ -2260,16 +2556,16 @@ "input_cost_per_image": 0.00032875, "input_cost_per_audio_per_second": 0.00003125, "input_cost_per_video_per_second": 0.00032875, - "input_cost_per_token": 0.000000078125, - "input_cost_per_character": 0.0000003125, + "input_cost_per_token": 0.00000125, + "input_cost_per_character": 0.0000003125, "input_cost_per_image_above_128k_tokens": 0.0006575, "input_cost_per_video_per_second_above_128k_tokens": 0.0006575, "input_cost_per_audio_per_second_above_128k_tokens": 0.0000625, - "input_cost_per_token_above_128k_tokens": 0.00000015625, - "input_cost_per_character_above_128k_tokens": 0.000000625, - "output_cost_per_token": 0.0000003125, + "input_cost_per_token_above_128k_tokens": 0.0000025, + "input_cost_per_character_above_128k_tokens": 0.000000625, + "output_cost_per_token": 0.000005, "output_cost_per_character": 0.00000125, - "output_cost_per_token_above_128k_tokens": 0.000000625, + "output_cost_per_token_above_128k_tokens": 0.00001, "output_cost_per_character_above_128k_tokens": 0.0000025, "litellm_provider": "vertex_ai-language-models", "mode": "chat", @@ -2369,17 +2665,17 @@ "input_cost_per_image": 0.00002, "input_cost_per_video_per_second": 0.00002, "input_cost_per_audio_per_second": 0.000002, - "input_cost_per_token": 0.000000004688, + "input_cost_per_token": 0.000000075, "input_cost_per_character": 0.00000001875, "input_cost_per_token_above_128k_tokens": 0.000001, "input_cost_per_character_above_128k_tokens": 0.00000025, "input_cost_per_image_above_128k_tokens": 0.00004, "input_cost_per_video_per_second_above_128k_tokens": 0.00004, "input_cost_per_audio_per_second_above_128k_tokens": 0.000004, - "output_cost_per_token": 0.0000000046875, - "output_cost_per_character": 0.00000001875, - "output_cost_per_token_above_128k_tokens": 0.000000009375, - "output_cost_per_character_above_128k_tokens": 0.0000000375, + "output_cost_per_token": 0.0000003, + "output_cost_per_character": 0.000000075, + "output_cost_per_token_above_128k_tokens": 0.0000006, + "output_cost_per_character_above_128k_tokens": 0.00000015, "litellm_provider": "vertex_ai-language-models", "mode": "chat", "supports_system_messages": true, @@ -2433,17 
+2729,17 @@ "input_cost_per_image": 0.00002, "input_cost_per_video_per_second": 0.00002, "input_cost_per_audio_per_second": 0.000002, - "input_cost_per_token": 0.000000004688, + "input_cost_per_token": 0.000000075, "input_cost_per_character": 0.00000001875, "input_cost_per_token_above_128k_tokens": 0.000001, "input_cost_per_character_above_128k_tokens": 0.00000025, "input_cost_per_image_above_128k_tokens": 0.00004, "input_cost_per_video_per_second_above_128k_tokens": 0.00004, "input_cost_per_audio_per_second_above_128k_tokens": 0.000004, - "output_cost_per_token": 0.0000000046875, - "output_cost_per_character": 0.00000001875, - "output_cost_per_token_above_128k_tokens": 0.000000009375, - "output_cost_per_character_above_128k_tokens": 0.0000000375, + "output_cost_per_token": 0.0000003, + "output_cost_per_character": 0.000000075, + "output_cost_per_token_above_128k_tokens": 0.0000006, + "output_cost_per_character_above_128k_tokens": 0.00000015, "litellm_provider": "vertex_ai-language-models", "mode": "chat", "supports_system_messages": true, @@ -2465,17 +2761,17 @@ "input_cost_per_image": 0.00002, "input_cost_per_video_per_second": 0.00002, "input_cost_per_audio_per_second": 0.000002, - "input_cost_per_token": 0.000000004688, + "input_cost_per_token": 0.000000075, "input_cost_per_character": 0.00000001875, "input_cost_per_token_above_128k_tokens": 0.000001, "input_cost_per_character_above_128k_tokens": 0.00000025, "input_cost_per_image_above_128k_tokens": 0.00004, "input_cost_per_video_per_second_above_128k_tokens": 0.00004, "input_cost_per_audio_per_second_above_128k_tokens": 0.000004, - "output_cost_per_token": 0.0000000046875, - "output_cost_per_character": 0.00000001875, - "output_cost_per_token_above_128k_tokens": 0.000000009375, - "output_cost_per_character_above_128k_tokens": 0.0000000375, + "output_cost_per_token": 0.0000003, + "output_cost_per_character": 0.000000075, + "output_cost_per_token_above_128k_tokens": 0.0000006, + "output_cost_per_character_above_128k_tokens": 0.00000015, "litellm_provider": "vertex_ai-language-models", "mode": "chat", "supports_system_messages": true, @@ -2497,7 +2793,7 @@ "input_cost_per_image": 0.00002, "input_cost_per_video_per_second": 0.00002, "input_cost_per_audio_per_second": 0.000002, - "input_cost_per_token": 0.000000004688, + "input_cost_per_token": 0.000000075, "input_cost_per_character": 0.00000001875, "input_cost_per_token_above_128k_tokens": 0.000001, "input_cost_per_character_above_128k_tokens": 0.00000025, @@ -2608,6 +2904,18 @@ "mode": "chat", "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" }, + "vertex_ai/claude-3-sonnet": { + "max_tokens": 4096, + "max_input_tokens": 200000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000015, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_vision": true, + "supports_assistant_prefill": true + }, "vertex_ai/claude-3-sonnet@20240229": { "max_tokens": 4096, "max_input_tokens": 200000, @@ -2620,6 +2928,18 @@ "supports_vision": true, "supports_assistant_prefill": true }, + "vertex_ai/claude-3-5-sonnet": { + "max_tokens": 8192, + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000015, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_vision": true, + "supports_assistant_prefill": true + }, 
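# Illustrative sketch (not part of the diff above): the *_cost_per_token fields in
# this file are USD per token, so spend is simply tokens * rate. Using the
# vertex_ai/claude-3-5-sonnet entry added above (input 0.000003, output 0.000015).
# This standalone helper is hypothetical, not litellm's own completion_cost().
def completion_cost_usd(
    prompt_tokens: int,
    completion_tokens: int,
    input_cost_per_token: float,
    output_cost_per_token: float,
) -> float:
    return (
        prompt_tokens * input_cost_per_token
        + completion_tokens * output_cost_per_token
    )


# completion_cost_usd(1_000_000, 1_000_000, 0.000003, 0.000015) -> 18.0
# i.e. $3 per 1M input tokens plus $15 per 1M output tokens.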
"vertex_ai/claude-3-5-sonnet@20240620": { "max_tokens": 8192, "max_input_tokens": 200000, @@ -2632,6 +2952,18 @@ "supports_vision": true, "supports_assistant_prefill": true }, + "vertex_ai/claude-3-5-sonnet-v2": { + "max_tokens": 8192, + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "input_cost_per_token": 0.000003, + "output_cost_per_token": 0.000015, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_vision": true, + "supports_assistant_prefill": true + }, "vertex_ai/claude-3-5-sonnet-v2@20241022": { "max_tokens": 8192, "max_input_tokens": 200000, @@ -2644,6 +2976,18 @@ "supports_vision": true, "supports_assistant_prefill": true }, + "vertex_ai/claude-3-haiku": { + "max_tokens": 4096, + "max_input_tokens": 200000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000025, + "output_cost_per_token": 0.00000125, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_vision": true, + "supports_assistant_prefill": true + }, "vertex_ai/claude-3-haiku@20240307": { "max_tokens": 4096, "max_input_tokens": 200000, @@ -2656,10 +3000,10 @@ "supports_vision": true, "supports_assistant_prefill": true }, - "vertex_ai/claude-3-5-haiku@20241022": { - "max_tokens": 4096, + "vertex_ai/claude-3-5-haiku": { + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000001, "output_cost_per_token": 0.000005, "litellm_provider": "vertex_ai-anthropic_models", @@ -2667,6 +3011,29 @@ "supports_function_calling": true, "supports_assistant_prefill": true }, + "vertex_ai/claude-3-5-haiku@20241022": { + "max_tokens": 8192, + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "input_cost_per_token": 0.000001, + "output_cost_per_token": 0.000005, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_assistant_prefill": true + }, + "vertex_ai/claude-3-opus": { + "max_tokens": 4096, + "max_input_tokens": 200000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.000015, + "output_cost_per_token": 0.000075, + "litellm_provider": "vertex_ai-anthropic_models", + "mode": "chat", + "supports_function_calling": true, + "supports_vision": true, + "supports_assistant_prefill": true + }, "vertex_ai/claude-3-opus@20240229": { "max_tokens": 4096, "max_input_tokens": 200000, @@ -2710,14 +3077,15 @@ "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models" }, "vertex_ai/meta/llama-3.2-90b-vision-instruct-maas": { - "max_tokens": 8192, + "max_tokens": 128000, "max_input_tokens": 128000, - "max_output_tokens": 8192, + "max_output_tokens": 2048, "input_cost_per_token": 0.0, "output_cost_per_token": 0.0, "litellm_provider": "vertex_ai-llama_models", "mode": "chat", "supports_system_messages": true, + "supports_vision": true, "source": "https://console.cloud.google.com/vertex-ai/publishers/meta/model-garden/llama-3.2-90b-vision-instruct-maas" }, "vertex_ai/mistral-large@latest": { @@ -2826,19 +3194,19 @@ "supports_function_calling": true }, "vertex_ai/imagegeneration@006": { - "cost_per_image": 0.020, + "output_cost_per_image": 0.020, "litellm_provider": "vertex_ai-image-models", "mode": "image_generation", "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing" }, "vertex_ai/imagen-3.0-generate-001": { - "cost_per_image": 0.04, + "output_cost_per_image": 0.04, "litellm_provider": 
"vertex_ai-image-models", "mode": "image_generation", "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing" }, "vertex_ai/imagen-3.0-fast-generate-001": { - "cost_per_image": 0.02, + "output_cost_per_image": 0.02, "litellm_provider": "vertex_ai-image-models", "mode": "image_generation", "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing" @@ -3015,6 +3383,8 @@ "supports_vision": true, "supports_response_schema": true, "supports_prompt_caching": true, + "tpm": 4000000, + "rpm": 2000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-001": { @@ -3038,6 +3408,8 @@ "supports_vision": true, "supports_response_schema": true, "supports_prompt_caching": true, + "tpm": 4000000, + "rpm": 2000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash": { @@ -3060,6 +3432,8 @@ "supports_function_calling": true, "supports_vision": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 2000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-latest": { @@ -3082,6 +3456,32 @@ "supports_function_calling": true, "supports_vision": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 2000, + "source": "https://ai.google.dev/pricing" + }, + "gemini/gemini-1.5-flash-8b": { + "max_tokens": 8192, + "max_input_tokens": 1048576, + "max_output_tokens": 8192, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_token": 0, + "input_cost_per_token_above_128k_tokens": 0, + "output_cost_per_token": 0, + "output_cost_per_token_above_128k_tokens": 0, + "litellm_provider": "gemini", + "mode": "chat", + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_response_schema": true, + "tpm": 4000000, + "rpm": 4000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-8b-exp-0924": { @@ -3104,8 +3504,37 @@ "supports_function_calling": true, "supports_vision": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 4000, "source": "https://ai.google.dev/pricing" }, + "gemini/gemini-exp-1114": { + "max_tokens": 8192, + "max_input_tokens": 1048576, + "max_output_tokens": 8192, + "max_images_per_prompt": 3000, + "max_videos_per_prompt": 10, + "max_video_length": 1, + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_pdf_size_mb": 30, + "input_cost_per_token": 0, + "input_cost_per_token_above_128k_tokens": 0, + "output_cost_per_token": 0, + "output_cost_per_token_above_128k_tokens": 0, + "litellm_provider": "gemini", + "mode": "chat", + "supports_system_messages": true, + "supports_function_calling": true, + "supports_vision": true, + "supports_response_schema": true, + "tpm": 4000000, + "rpm": 1000, + "source": "https://ai.google.dev/pricing", + "metadata": { + "notes": "Rate limits not documented for gemini-exp-1114. Assuming same as gemini-1.5-pro." 
+ } + }, "gemini/gemini-1.5-flash-exp-0827": { "max_tokens": 8192, "max_input_tokens": 1048576, @@ -3126,6 +3555,8 @@ "supports_function_calling": true, "supports_vision": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 2000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-flash-8b-exp-0827": { @@ -3147,6 +3578,9 @@ "supports_system_messages": true, "supports_function_calling": true, "supports_vision": true, + "supports_response_schema": true, + "tpm": 4000000, + "rpm": 4000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro": { @@ -3160,7 +3594,10 @@ "litellm_provider": "gemini", "mode": "chat", "supports_function_calling": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "rpd": 30000, + "tpm": 120000, + "rpm": 360, + "source": "https://ai.google.dev/gemini-api/docs/models/gemini" }, "gemini/gemini-1.5-pro": { "max_tokens": 8192, @@ -3177,6 +3614,8 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-002": { @@ -3195,6 +3634,8 @@ "supports_tool_choice": true, "supports_response_schema": true, "supports_prompt_caching": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-001": { @@ -3213,6 +3654,8 @@ "supports_tool_choice": true, "supports_response_schema": true, "supports_prompt_caching": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-exp-0801": { @@ -3230,6 +3673,8 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-exp-0827": { @@ -3247,6 +3692,8 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-latest": { @@ -3264,6 +3711,8 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, + "tpm": 4000000, + "rpm": 1000, "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro-vision": { @@ -3278,6 +3727,9 @@ "mode": "chat", "supports_function_calling": true, "supports_vision": true, + "rpd": 30000, + "tpm": 120000, + "rpm": 360, "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" }, "gemini/gemini-gemma-2-27b-it": { @@ -3660,9 +4112,9 @@ "tool_use_system_prompt_tokens": 264 }, "openrouter/anthropic/claude-3-5-haiku-20241022": { - "max_tokens": 4096, + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000001, "output_cost_per_token": 0.000005, "litellm_provider": "openrouter", @@ -3670,22 +4122,6 @@ "supports_function_calling": true, "tool_use_system_prompt_tokens": 264 }, - "anthropic/claude-3-5-sonnet-20241022": { - "max_tokens": 8192, - "max_input_tokens": 200000, - "max_output_tokens": 8192, - "input_cost_per_token": 0.000003, - "output_cost_per_token": 0.000015, - "cache_creation_input_token_cost": 0.00000375, - "cache_read_input_token_cost": 0.0000003, - "litellm_provider": "anthropic", - "mode": "chat", - "supports_function_calling": true, - "supports_vision": true, - "tool_use_system_prompt_tokens": 159, - "supports_assistant_prefill": true, - "supports_prompt_caching": true - }, 
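# Illustrative sketch (not part of the diff above): many entries in this file gain
# "supports_response_schema", and several o1 entries flip "supports_vision" to
# false. A minimal way to read such a flag straight from the backup pricing map
# edited here; purely illustrative, not litellm's own capability helpers.
import json


def model_supports(
    model: str,
    flag: str,
    path: str = "litellm/model_prices_and_context_window_backup.json",
) -> bool:
    with open(path) as f:
        prices = json.load(f)
    return bool(prices.get(model, {}).get(flag, False))


# model_supports("groq/llama3-70b-8192", "supports_response_schema")  # True after this change
# model_supports("o1-mini", "supports_vision")                        # False after this change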
"openrouter/anthropic/claude-3.5-sonnet": { "max_tokens": 8192, "max_input_tokens": 200000, @@ -3790,7 +4226,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": false }, "openrouter/openai/o1-mini-2024-09-12": { "max_tokens": 65536, @@ -3802,7 +4238,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": false }, "openrouter/openai/o1-preview": { "max_tokens": 32768, @@ -3814,7 +4250,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": false }, "openrouter/openai/o1-preview-2024-09-12": { "max_tokens": 32768, @@ -3826,7 +4262,7 @@ "mode": "chat", "supports_function_calling": true, "supports_parallel_function_calling": true, - "supports_vision": true + "supports_vision": false }, "openrouter/openai/gpt-4o": { "max_tokens": 4096, @@ -4002,6 +4438,15 @@ "litellm_provider": "openrouter", "mode": "chat" }, + "openrouter/qwen/qwen-2.5-coder-32b-instruct": { + "max_tokens": 33792, + "max_input_tokens": 33792, + "max_output_tokens": 33792, + "input_cost_per_token": 0.00000018, + "output_cost_per_token": 0.00000018, + "litellm_provider": "openrouter", + "mode": "chat" + }, "j2-ultra": { "max_tokens": 8192, "max_input_tokens": 8192, @@ -4373,9 +4818,9 @@ "supports_vision": true }, "anthropic.claude-3-5-sonnet-20241022-v2:0": { - "max_tokens": 4096, + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", @@ -4403,6 +4848,7 @@ "output_cost_per_token": 0.000005, "litellm_provider": "bedrock", "mode": "chat", + "supports_assistant_prefill": true, "supports_function_calling": true }, "anthropic.claude-3-opus-20240229-v1:0": { @@ -4439,9 +4885,9 @@ "supports_vision": true }, "us.anthropic.claude-3-5-sonnet-20241022-v2:0": { - "max_tokens": 4096, + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", @@ -4469,6 +4915,7 @@ "output_cost_per_token": 0.000005, "litellm_provider": "bedrock", "mode": "chat", + "supports_assistant_prefill": true, "supports_function_calling": true }, "us.anthropic.claude-3-opus-20240229-v1:0": { @@ -4505,9 +4952,9 @@ "supports_vision": true }, "eu.anthropic.claude-3-5-sonnet-20241022-v2:0": { - "max_tokens": 4096, + "max_tokens": 8192, "max_input_tokens": 200000, - "max_output_tokens": 4096, + "max_output_tokens": 8192, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, "litellm_provider": "bedrock", @@ -5277,6 +5724,17 @@ "supports_function_calling": true, "supports_tool_choice": false }, + "us.meta.llama3-1-8b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 2048, + "input_cost_per_token": 0.00000022, + "output_cost_per_token": 0.00000022, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false + }, "meta.llama3-1-70b-instruct-v1:0": { "max_tokens": 128000, "max_input_tokens": 128000, @@ -5288,6 +5746,17 @@ "supports_function_calling": true, "supports_tool_choice": false }, + "us.meta.llama3-1-70b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + 
"max_output_tokens": 2048, + "input_cost_per_token": 0.00000099, + "output_cost_per_token": 0.00000099, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false + }, "meta.llama3-1-405b-instruct-v1:0": { "max_tokens": 128000, "max_input_tokens": 128000, @@ -5299,6 +5768,17 @@ "supports_function_calling": true, "supports_tool_choice": false }, + "us.meta.llama3-1-405b-instruct-v1:0": { + "max_tokens": 128000, + "max_input_tokens": 128000, + "max_output_tokens": 4096, + "input_cost_per_token": 0.00000532, + "output_cost_per_token": 0.000016, + "litellm_provider": "bedrock", + "mode": "chat", + "supports_function_calling": true, + "supports_tool_choice": false + }, "meta.llama3-2-1b-instruct-v1:0": { "max_tokens": 128000, "max_input_tokens": 128000, @@ -5451,6 +5931,20 @@ "litellm_provider": "bedrock", "mode": "image_generation" }, + "stability.sd3-large-v1:0": { + "max_tokens": 77, + "max_input_tokens": 77, + "output_cost_per_image": 0.08, + "litellm_provider": "bedrock", + "mode": "image_generation" + }, + "stability.stable-image-ultra-v1:0": { + "max_tokens": 77, + "max_input_tokens": 77, + "output_cost_per_image": 0.14, + "litellm_provider": "bedrock", + "mode": "image_generation" + }, "sagemaker/meta-textgeneration-llama-2-7b": { "max_tokens": 4096, "max_input_tokens": 4096, @@ -6249,6 +6743,17 @@ "supports_function_calling": true, "source": "https://fireworks.ai/pricing" }, + "fireworks_ai/accounts/fireworks/models/qwen2p5-coder-32b-instruct": { + "max_tokens": 4096, + "max_input_tokens": 4096, + "max_output_tokens": 4096, + "input_cost_per_token": 0.0000009, + "output_cost_per_token": 0.0000009, + "litellm_provider": "fireworks_ai", + "mode": "chat", + "supports_function_calling": true, + "source": "https://fireworks.ai/pricing" + }, "fireworks_ai/accounts/fireworks/models/yi-large": { "max_tokens": 32768, "max_input_tokens": 32768, diff --git a/litellm/proxy/_experimental/out/_next/static/chunks/131-3d2257b0ff5aadb2.js b/litellm/proxy/_experimental/out/_next/static/chunks/131-3d2257b0ff5aadb2.js deleted file mode 100644 index 51181e75a..000000000 --- a/litellm/proxy/_experimental/out/_next/static/chunks/131-3d2257b0ff5aadb2.js +++ /dev/null @@ -1,8 +0,0 @@ -"use strict";(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[131],{84174:function(e,t,n){n.d(t,{Z:function(){return s}});var a=n(14749),r=n(2265),i={icon:{tag:"svg",attrs:{viewBox:"64 64 896 896",focusable:"false"},children:[{tag:"path",attrs:{d:"M832 64H296c-4.4 0-8 3.6-8 8v56c0 4.4 3.6 8 8 8h496v688c0 4.4 3.6 8 8 8h56c4.4 0 8-3.6 8-8V96c0-17.7-14.3-32-32-32zM704 192H192c-17.7 0-32 14.3-32 32v530.7c0 8.5 3.4 16.6 9.4 22.6l173.3 173.3c2.2 2.2 4.7 4 7.4 5.5v1.9h4.2c3.5 1.3 7.2 2 11 2H704c17.7 0 32-14.3 32-32V224c0-17.7-14.3-32-32-32zM350 856.2L263.9 770H350v86.2zM664 888H414V746c0-22.1-17.9-40-40-40H232V264h432v624z"}}]},name:"copy",theme:"outlined"},o=n(60688),s=r.forwardRef(function(e,t){return r.createElement(o.Z,(0,a.Z)({},e,{ref:t,icon:i}))})},50459:function(e,t,n){n.d(t,{Z:function(){return s}});var a=n(14749),r=n(2265),i={icon:{tag:"svg",attrs:{viewBox:"64 64 896 896",focusable:"false"},children:[{tag:"path",attrs:{d:"M765.7 486.8L314.9 134.7A7.97 7.97 0 00302 141v77.3c0 4.9 2.3 9.6 6.1 12.6l360 281.1-360 281.1c-3.9 3-6.1 7.7-6.1 12.6V883c0 6.7 7.7 10.4 12.9 6.3l450.8-352.1a31.96 31.96 0 000-50.4z"}}]},name:"right",theme:"outlined"},o=n(60688),s=r.forwardRef(function(e,t){return 
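For reviewers who want to sanity-check the pricing entries added in the JSON hunks above, here is a minimal sketch that prices a hypothetical request against one of the new keys. It assumes those hunks edit litellm's model-cost map (model_prices_and_context_window.json at the repo root); the key and per-token prices come from this diff, while the token counts are made-up illustration values.

```python
# Minimal sketch, assuming the hunks above edit model_prices_and_context_window.json.
# Token counts are illustrative only.
import json

with open("model_prices_and_context_window.json") as f:
    model_map = json.load(f)

# One of the entries added in this diff.
entry = model_map["openrouter/qwen/qwen-2.5-coder-32b-instruct"]

prompt_tokens, completion_tokens = 10_000, 2_000
cost = (
    prompt_tokens * entry["input_cost_per_token"]
    + completion_tokens * entry["output_cost_per_token"]
)
print(entry["max_input_tokens"], entry["mode"], f"~${cost:.6f} per request")
```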