diff --git a/.circleci/config.yml b/.circleci/config.yml index 0a12aa73b8..feb425a38e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,6 +3,18 @@ orbs: codecov: codecov/codecov@4.0.1 node: circleci/node@5.1.0 # Add this line to declare the node orb +commands: + setup_google_dns: + steps: + - run: + name: "Configure Google DNS" + command: | + # Backup original resolv.conf + sudo cp /etc/resolv.conf /etc/resolv.conf.backup + # Add both local and Google DNS servers + echo "nameserver 127.0.0.11" | sudo tee /etc/resolv.conf + echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf + echo "nameserver 8.8.4.4" | sudo tee -a /etc/resolv.conf jobs: local_testing: @@ -15,7 +27,7 @@ jobs: steps: - checkout - + - setup_google_dns - run: name: Show git commit hash command: | @@ -49,7 +61,7 @@ jobs: pip install opentelemetry-api==1.25.0 pip install opentelemetry-sdk==1.25.0 pip install opentelemetry-exporter-otlp==1.25.0 - pip install openai==1.66.1 + pip install openai==1.68.2 pip install prisma==0.11.0 pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" @@ -66,7 +78,7 @@ jobs: pip install python-multipart pip install google-cloud-aiplatform pip install prometheus-client==0.20.0 - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "diskcache==5.6.1" pip install "Pillow==10.3.0" pip install "jsonschema==4.22.0" @@ -134,7 +146,7 @@ jobs: steps: - checkout - + - setup_google_dns - run: name: Show git commit hash command: | @@ -168,7 +180,7 @@ jobs: pip install opentelemetry-api==1.25.0 pip install opentelemetry-sdk==1.25.0 pip install opentelemetry-exporter-otlp==1.25.0 - pip install openai==1.66.1 + pip install openai==1.68.2 pip install prisma==0.11.0 pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" @@ -185,7 +197,7 @@ jobs: pip install python-multipart pip install google-cloud-aiplatform pip install prometheus-client==0.20.0 - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "diskcache==5.6.1" pip install "Pillow==10.3.0" pip install "jsonschema==4.22.0" @@ -234,7 +246,13 @@ jobs: steps: - checkout - + - setup_google_dns + - run: + name: DNS lookup for Redis host + command: | + sudo apt-get update + sudo apt-get install -y dnsutils + dig redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com +short - run: name: Show git commit hash command: | @@ -268,7 +286,7 @@ jobs: pip install opentelemetry-api==1.25.0 pip install opentelemetry-sdk==1.25.0 pip install opentelemetry-exporter-otlp==1.25.0 - pip install openai==1.66.1 + pip install openai==1.68.2 pip install prisma==0.11.0 pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" @@ -285,7 +303,7 @@ jobs: pip install python-multipart pip install google-cloud-aiplatform pip install prometheus-client==0.20.0 - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "diskcache==5.6.1" pip install "Pillow==10.3.0" pip install "jsonschema==4.22.0" @@ -334,6 +352,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -388,6 +407,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -404,7 +424,7 @@ jobs: command: | pwd ls - python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest tests/local_testing tests/router_unit_tests --cov=litellm --cov-report=xml -vv -k "router" -x -v --junitxml=test-results/junit.xml --durations=5 
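For local debugging of the same DNS issue that the new `setup_google_dns` command and the `dig ... +short` step above are working around, a minimal Python sketch (the hostname is copied from the "DNS lookup for Redis host" step above; the helper name is illustrative and not part of the repo):

import socket

def resolve(hostname):
    # Roughly mirror `dig <host> +short`: return the IPv4 addresses the resolver sees.
    infos = socket.getaddrinfo(hostname, None, family=socket.AF_INET)
    return sorted({info[4][0] for info in infos})

if __name__ == "__main__":
    # Hostname taken from the Redis lookup step in the config above.
    print(resolve("redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com"))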
no_output_timeout: 120m - run: name: Rename the coverage files @@ -429,6 +449,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Show git commit hash command: | @@ -479,7 +500,13 @@ jobs: working_directory: ~/project steps: - checkout - + - run: + name: Install PostgreSQL + command: | + sudo apt-get update + sudo apt-get install postgresql postgresql-contrib + echo 'export PATH=/usr/lib/postgresql/*/bin:$PATH' >> $BASH_ENV + - setup_google_dns - run: name: Show git commit hash command: | @@ -513,7 +540,7 @@ jobs: pip install opentelemetry-api==1.25.0 pip install opentelemetry-sdk==1.25.0 pip install opentelemetry-exporter-otlp==1.25.0 - pip install openai==1.66.1 + pip install openai==1.68.2 pip install prisma==0.11.0 pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" @@ -530,10 +557,11 @@ jobs: pip install python-multipart pip install google-cloud-aiplatform pip install prometheus-client==0.20.0 - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "diskcache==5.6.1" pip install "Pillow==10.3.0" pip install "jsonschema==4.22.0" + pip install "pytest-postgresql==7.0.1" - save_cache: paths: - ./venv @@ -569,7 +597,7 @@ jobs: - litellm_proxy_unit_tests_coverage litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword docker: - - image: cimg/python:3.11 + - image: cimg/python:3.13.1 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} @@ -577,10 +605,13 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | python -m pip install --upgrade pip + pip install wheel + pip install --upgrade pip wheel setuptools python -m pip install -r requirements.txt pip install "pytest==7.3.1" pip install "respx==0.21.1" @@ -618,6 +649,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -625,7 +657,13 @@ jobs: python -m pip install -r requirements.txt pip install "pytest==7.3.1" pip install "pytest-retry==1.6.3" + pip install "pytest-cov==5.0.0" pip install "pytest-asyncio==0.21.1" + pip install "respx==0.21.1" + - run: + name: Show current pydantic version + command: | + python -m pip show pydantic # Run pytest and generate JUnit XML report - run: name: Run tests @@ -648,6 +686,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -664,7 +703,7 @@ jobs: command: | pwd ls - python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -x -v --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - run: name: Rename the coverage files @@ -680,6 +719,51 @@ jobs: paths: - llm_translation_coverage.xml - llm_translation_coverage + mcp_testing: + docker: + - image: cimg/python:3.11 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + + steps: + - checkout + - setup_google_dns + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-cov==5.0.0" + pip install "pytest-asyncio==0.21.1" + pip install "respx==0.21.1" + pip install "pydantic==2.10.2" + pip install "mcp==1.5.0" + # Run pytest and generate JUnit XML report + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/mcp_tests 
--cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml mcp_coverage.xml + mv .coverage mcp_coverage + + # Store test results + - store_test_results: + path: test-results + - persist_to_workspace: + root: . + paths: + - mcp_coverage.xml + - mcp_coverage llm_responses_api_testing: docker: - image: cimg/python:3.11 @@ -690,6 +774,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -732,6 +817,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -744,6 +830,8 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "respx==0.21.1" pip install "hypercorn==0.17.3" + pip install "pydantic==2.10.2" + pip install "mcp==1.5.0" # Run pytest and generate JUnit XML report - run: name: Run tests @@ -776,6 +864,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -820,10 +909,12 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | python -m pip install --upgrade pip + pip install numpydoc python -m pip install -r requirements.txt pip install "respx==0.21.1" pip install "pytest==7.3.1" @@ -832,7 +923,6 @@ jobs: pip install "pytest-cov==5.0.0" pip install "google-generativeai==0.3.2" pip install "google-cloud-aiplatform==1.43.0" - pip install numpydoc # Run pytest and generate JUnit XML report - run: name: Run tests @@ -866,6 +956,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -908,6 +999,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -950,6 +1042,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -996,6 +1089,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -1008,8 +1102,8 @@ jobs: pip install click pip install "boto3==1.34.34" pip install jinja2 - pip install tokenizers=="0.20.0" - pip install uvloop==0.21.0 + pip install "tokenizers==0.20.0" + pip install "uvloop==0.21.0" pip install jsonschema - run: name: Run tests @@ -1028,10 +1122,12 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | python -m pip install --upgrade pip + python -m pip install wheel setuptools python -m pip install -r requirements.txt pip install "pytest==7.3.1" pip install "pytest-retry==1.6.3" @@ -1052,6 +1148,7 @@ jobs: steps: - checkout + - setup_google_dns # Install Helm - run: name: Install Helm @@ -1121,6 +1218,7 @@ jobs: steps: - checkout + - setup_google_dns - run: name: Install Dependencies command: | @@ -1157,6 +1255,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Python 3.9 command: | @@ -1231,6 +1330,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1278,7 +1378,7 @@ jobs: pip install "aiodynamo==23.10.1" pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" - pip install "openai==1.66.1" + pip install "openai==1.68.2" - run: name: Install Grype command: | @@ -1353,7 +1453,7 @@ jobs: command: | pwd ls - python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests 
--ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests + python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests no_output_timeout: 120m # Store test results @@ -1366,6 +1466,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1402,6 +1503,7 @@ jobs: pip install "boto3==1.34.34" pip install "aioboto3==12.3.0" pip install langchain + pip install "langchain_mcp_adapters==0.0.5" pip install "langfuse>=2.0.0" pip install "logfire==0.29.0" pip install numpydoc @@ -1414,7 +1516,7 @@ jobs: pip install "aiodynamo==23.10.1" pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" - pip install "openai==1.66.1" + pip install "openai==1.68.2" # Run pytest and generate JUnit XML report - run: name: Build Docker image @@ -1489,6 +1591,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1536,7 +1639,7 @@ jobs: pip install "aiodynamo==23.10.1" pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" - pip install "openai==1.66.1" + pip install "openai==1.68.2" - run: name: Build Docker image command: docker build -t my-app:latest -f ./docker/Dockerfile.database . @@ -1643,6 +1746,96 @@ jobs: # Store test results - store_test_results: path: test-results + proxy_spend_accuracy_tests: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + - setup_google_dns + - run: + name: Install Docker CLI (In case it's not already installed) + command: | + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io + - run: + name: Install Python 3.9 + command: | + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + conda init bash + source ~/.bashrc + conda create -n myenv python=3.9 -y + conda activate myenv + python --version + - run: + name: Install Dependencies + command: | + pip install "pytest==7.3.1" + pip install "pytest-asyncio==0.21.1" + pip install aiohttp + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + - run: + name: Build Docker image + command: docker build -t my-app:latest -f ./docker/Dockerfile.database . 
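Later in this job the proxy container is health-checked with `dockerize -wait http://localhost:4000 -timeout 5m` before the spend-tracking tests run; a rough local equivalent is sketched below (URL and timeout mirror the CI step; this helper is an assumption for illustration, not part of the repo):

import time
import urllib.error
import urllib.request

def wait_for(url="http://localhost:4000", timeout_s=300, interval_s=5):
    """Poll `url` until it responds or `timeout_s` elapses, like `dockerize -wait`."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status < 500:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # proxy not up yet; retry after a short sleep
        time.sleep(interval_s)
    raise TimeoutError(f"{url} not ready after {timeout_s}s")

if __name__ == "__main__":
    wait_for()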
+ - run: + name: Run Docker container + # intentionally give bad redis credentials here + # the OTEL test - should get this as a trace + command: | + docker run -d \ + -p 4000:4000 \ + -e DATABASE_URL=$PROXY_DATABASE_URL \ + -e REDIS_HOST=$REDIS_HOST \ + -e REDIS_PASSWORD=$REDIS_PASSWORD \ + -e REDIS_PORT=$REDIS_PORT \ + -e LITELLM_MASTER_KEY="sk-1234" \ + -e OPENAI_API_KEY=$OPENAI_API_KEY \ + -e LITELLM_LICENSE=$LITELLM_LICENSE \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e USE_DDTRACE=True \ + -e DD_API_KEY=$DD_API_KEY \ + -e DD_SITE=$DD_SITE \ + -e AWS_REGION_NAME=$AWS_REGION_NAME \ + --name my-app \ + -v $(pwd)/litellm/proxy/example_config_yaml/spend_tracking_config.yaml:/app/config.yaml \ + my-app:latest \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug \ + - run: + name: Install curl and dockerize + command: | + sudo apt-get update + sudo apt-get install -y curl + sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz + sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz + sudo rm dockerize-linux-amd64-v0.6.1.tar.gz + - run: + name: Start outputting logs + command: docker logs -f my-app + background: true + - run: + name: Wait for app to be ready + command: dockerize -wait http://localhost:4000 -timeout 5m + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/spend_tracking_tests -x --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: + 120m + # Clean up first container + - run: + name: Stop and remove first container + command: | + docker stop my-app + docker rm my-app proxy_multi_instance_tests: machine: @@ -1651,6 +1844,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1762,6 +1956,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1801,7 +1996,7 @@ jobs: command: | docker run -d \ -p 4000:4000 \ - -e DATABASE_URL=$PROXY_DATABASE_URL \ + -e DATABASE_URL=$CLEAN_STORE_MODEL_IN_DB_DATABASE_URL \ -e STORE_MODEL_IN_DB="True" \ -e LITELLM_MASTER_KEY="sk-1234" \ -e LITELLM_LICENSE=$LITELLM_LICENSE \ @@ -1844,6 +2039,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns # Remove Docker CLI installation since it's already available in machine executor - run: name: Install Python 3.13 @@ -1941,6 +2137,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Install Docker CLI (In case it's not already installed) command: | @@ -1965,10 +2162,10 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "google-cloud-aiplatform==1.43.0" pip install aiohttp - pip install "openai==1.66.1" + pip install "openai==1.68.2" pip install "assemblyai==0.37.0" python -m pip install --upgrade pip - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "pytest==7.3.1" pip install "pytest-mock==3.12.0" pip install "pytest-asyncio==0.21.1" @@ -1985,6 +2182,9 @@ jobs: pip install "PyGithub==1.59.1" pip install "google-cloud-aiplatform==1.59.0" pip install "anthropic==0.49.0" + pip install "langchain_mcp_adapters==0.0.5" + pip install "langchain_openai==0.2.1" + pip install "langgraph==0.3.18" # Run pytest and generate JUnit XML report - run: name: Build Docker image @@ -2112,7 +2312,7 @@ jobs: python -m venv venv . 
venv/bin/activate pip install coverage - coverage combine llm_translation_coverage llm_responses_api_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage + coverage combine llm_translation_coverage llm_responses_api_coverage mcp_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_proxy_security_tests_coverage coverage xml - codecov/upload: file: ./coverage.xml @@ -2190,6 +2390,114 @@ jobs: echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}" curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly" + publish_proxy_extras: + docker: + - image: cimg/python:3.8 + working_directory: ~/project/litellm-proxy-extras + environment: + TWINE_USERNAME: __token__ + + steps: + - checkout: + path: ~/project + + - run: + name: Check if litellm-proxy-extras dir or pyproject.toml was modified + command: | + echo "Install TOML package." + python -m pip install toml + # Get current version from pyproject.toml + CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])") + + # Get last published version from PyPI + LAST_VERSION=$(curl -s https://pypi.org/pypi/litellm-proxy-extras/json | python -c "import json, sys; print(json.load(sys.stdin)['info']['version'])") + + echo "Current version: $CURRENT_VERSION" + echo "Last published version: $LAST_VERSION" + + # Compare versions using Python's packaging.version + VERSION_COMPARE=$(python -c "from packaging import version; print(1 if version.parse('$CURRENT_VERSION') < version.parse('$LAST_VERSION') else 0)") + + echo "Version compare: $VERSION_COMPARE" + if [ "$VERSION_COMPARE" = "1" ]; then + echo "Error: Current version ($CURRENT_VERSION) is less than last published version ($LAST_VERSION)" + exit 1 + fi + + # If versions are equal or current is greater, check contents + pip download --no-deps litellm-proxy-extras==$LAST_VERSION -d /tmp + + echo "Contents of /tmp directory:" + ls -la /tmp + + # Find the downloaded file (could be .whl or .tar.gz) + DOWNLOADED_FILE=$(ls /tmp/litellm_proxy_extras-*) + echo "Downloaded file: $DOWNLOADED_FILE" + + # Extract based on file extension + if [[ "$DOWNLOADED_FILE" == *.whl ]]; then + echo "Extracting wheel file..." + unzip -q "$DOWNLOADED_FILE" -d /tmp/extracted + EXTRACTED_DIR="/tmp/extracted" + else + echo "Extracting tar.gz file..." + tar -xzf "$DOWNLOADED_FILE" -C /tmp + EXTRACTED_DIR="/tmp/litellm_proxy_extras-$LAST_VERSION" + fi + + echo "Contents of extracted package:" + ls -R "$EXTRACTED_DIR" + + # Compare contents + if ! 
diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras; then + if [ "$CURRENT_VERSION" = "$LAST_VERSION" ]; then + echo "Error: Changes detected in litellm-proxy-extras but version was not bumped" + echo "Current version: $CURRENT_VERSION" + echo "Last published version: $LAST_VERSION" + echo "Changes:" + diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras + exit 1 + fi + else + echo "No changes detected in litellm-proxy-extras. Skipping PyPI publish." + circleci step halt + fi + + - run: + name: Get new version + command: | + cd litellm-proxy-extras + NEW_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])") + echo "export NEW_VERSION=$NEW_VERSION" >> $BASH_ENV + + - run: + name: Check if versions match + command: | + cd ~/project + # Check pyproject.toml + CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['dependencies']['litellm-proxy-extras'].split('\"')[1])") + if [ "$CURRENT_VERSION" != "$NEW_VERSION" ]; then + echo "Error: Version in pyproject.toml ($CURRENT_VERSION) doesn't match new version ($NEW_VERSION)" + exit 1 + fi + + # Check requirements.txt + REQ_VERSION=$(grep -oP 'litellm-proxy-extras==\K[0-9.]+' requirements.txt) + if [ "$REQ_VERSION" != "$NEW_VERSION" ]; then + echo "Error: Version in requirements.txt ($REQ_VERSION) doesn't match new version ($NEW_VERSION)" + exit 1 + fi + + - run: + name: Publish to PyPI + command: | + cd litellm-proxy-extras + echo -e "[pypi]\nusername = $PYPI_PUBLISH_USERNAME\npassword = $PYPI_PUBLISH_PASSWORD" > ~/.pypirc + python -m pip install --upgrade pip build twine setuptools wheel + rm -rf build dist + python -m build + twine upload --verbose dist/* + e2e_ui_testing: machine: image: ubuntu-2204:2023.10.1 @@ -2197,6 +2505,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Build UI command: | @@ -2241,9 +2550,9 @@ jobs: pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" pip install aiohttp - pip install "openai==1.66.1" + pip install "openai==1.68.2" python -m pip install --upgrade pip - pip install "pydantic==2.7.1" + pip install "pydantic==2.10.2" pip install "pytest==7.3.1" pip install "pytest-mock==3.12.0" pip install "pytest-asyncio==0.21.1" @@ -2311,6 +2620,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Build Docker image command: | @@ -2333,6 +2643,7 @@ jobs: working_directory: ~/project steps: - checkout + - setup_google_dns - run: name: Build Docker image command: | @@ -2443,6 +2754,12 @@ workflows: only: - main - /litellm_.*/ + - proxy_spend_accuracy_tests: + filters: + branches: + only: + - main + - /litellm_.*/ - proxy_multi_instance_tests: filters: branches: @@ -2473,6 +2790,12 @@ workflows: only: - main - /litellm_.*/ + - mcp_testing: + filters: + branches: + only: + - main + - /litellm_.*/ - llm_responses_api_testing: filters: branches: @@ -2518,6 +2841,7 @@ workflows: - upload-coverage: requires: - llm_translation_testing + - mcp_testing - llm_responses_api_testing - litellm_mapped_tests - batches_testing @@ -2569,6 +2893,11 @@ workflows: only: - main - /litellm_.*/ + - publish_proxy_extras: + filters: + branches: + only: + - main - publish_to_pypi: requires: - local_testing @@ -2577,6 +2906,7 @@ workflows: - load_testing - test_bad_database_url - llm_translation_testing + - mcp_testing - llm_responses_api_testing - litellm_mapped_tests - batches_testing @@ -2596,12 +2926,11 @@ workflows: - 
installing_litellm_on_python - installing_litellm_on_python_3_13 - proxy_logging_guardrails_model_info_tests + - proxy_spend_accuracy_tests - proxy_multi_instance_tests - proxy_store_model_in_db_tests - proxy_build_from_pip_tests - proxy_pass_through_endpoint_tests - check_code_and_doc_quality - filters: - branches: - only: - - main + - publish_proxy_extras + diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index e63fb9dd9a..88c0aa4dda 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -1,13 +1,15 @@ # used by CI/CD testing -openai==1.66.1 +openai==1.68.2 python-dotenv tiktoken importlib_metadata cohere -redis +redis==5.2.1 +redisvl==0.4.1 anthropic orjson==3.9.15 -pydantic==2.7.1 +pydantic==2.10.2 google-cloud-aiplatform==1.43.0 -fastapi-sso==0.10.0 +fastapi-sso==0.16.0 uvloop==0.21.0 +mcp==1.5.0 # for MCP server diff --git a/.env.example b/.env.example index 82b09ca25e..54986a97cd 100644 --- a/.env.example +++ b/.env.example @@ -20,6 +20,8 @@ REPLICATE_API_TOKEN = "" ANTHROPIC_API_KEY = "" # Infisical INFISICAL_TOKEN = "" +# INFINITY +INFINITY_API_KEY = "" # Development Configs LITELLM_MASTER_KEY = "sk-1234" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d50aefa8bb..6c887178d5 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,7 +10,7 @@ **Please complete all items before asking a LiteLLM maintainer to review your PR** -- [ ] I have Added testing in the `tests/litellm/` directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code) +- [ ] I have Added testing in the [`tests/litellm/`](https://github.com/BerriAI/litellm/tree/main/tests/litellm) directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code) - [ ] I have added a screenshot of my new test passing locally - [ ] My PR passes all unit tests on (`make test-unit`)[https://docs.litellm.ai/docs/extras/contributing_code] - [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml index 306feb36e8..58c8a1e2e1 100644 --- a/.github/workflows/ghcr_deploy.yml +++ b/.github/workflows/ghcr_deploy.yml @@ -114,8 +114,8 @@ jobs: tags: | ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} - ${{ github.event.inputs.release_type == 'stable' && format('${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:main-{0}', github.event.inputs.tag) || '' }}, - ${{ github.event.inputs.release_type == 'stable' && '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:main-stable' || '' }} + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }} labels: ${{ steps.meta.outputs.labels }} platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 @@ -157,8 +157,8 @@ jobs: tags: | ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }} - ${{ github.event.inputs.release_type == 'stable' && format('${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database:main-{0}', github.event.inputs.tag) || '' }}, - ${{ github.event.inputs.release_type 
== 'stable' && '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database:main-stable' || '' }} + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }} labels: ${{ steps.meta-database.outputs.labels }} platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 @@ -200,8 +200,8 @@ jobs: tags: | ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }} - ${{ github.event.inputs.release_type == 'stable' && format('${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root:main-{0}', github.event.inputs.tag) || '' }}, - ${{ github.event.inputs.release_type == 'stable' && '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root:main-stable' || '' }} + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }} labels: ${{ steps.meta-non_root.outputs.labels }} platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 @@ -240,7 +240,11 @@ jobs: context: . file: ./litellm-js/spend-logs/Dockerfile push: true - tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }} + tags: | + ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }} platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 build-and-push-helm-chart: diff --git a/.github/workflows/publish-migrations.yml b/.github/workflows/publish-migrations.yml new file mode 100644 index 0000000000..8e5a67bcf8 --- /dev/null +++ b/.github/workflows/publish-migrations.yml @@ -0,0 +1,206 @@ +name: Publish Prisma Migrations + +permissions: + contents: write + pull-requests: write + +on: + push: + paths: + - 'schema.prisma' # Check root schema.prisma + branches: + - main + +jobs: + publish-migrations: + runs-on: ubuntu-latest + services: + postgres: + image: postgres:14 + env: + POSTGRES_DB: temp_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: + - 5432:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + # Add shadow database service + postgres_shadow: + image: postgres:14 + env: + POSTGRES_DB: shadow_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + ports: + - 5433:5432 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install Dependencies + run: | + pip install prisma + pip install python-dotenv + + - name: Generate Initial Migration if None Exists + env: + DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db" + DIRECT_URL: 
"postgresql://postgres:postgres@localhost:5432/temp_db" + SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db" + run: | + mkdir -p deploy/migrations + echo 'provider = "postgresql"' > deploy/migrations/migration_lock.toml + + if [ -z "$(ls -A deploy/migrations/2* 2>/dev/null)" ]; then + echo "No existing migrations found, creating baseline..." + VERSION=$(date +%Y%m%d%H%M%S) + mkdir -p deploy/migrations/${VERSION}_initial + + echo "Generating initial migration..." + # Save raw output for debugging + prisma migrate diff \ + --from-empty \ + --to-schema-datamodel schema.prisma \ + --shadow-database-url "${SHADOW_DATABASE_URL}" \ + --script > deploy/migrations/${VERSION}_initial/raw_migration.sql + + echo "Raw migration file content:" + cat deploy/migrations/${VERSION}_initial/raw_migration.sql + + echo "Cleaning migration file..." + # Clean the file + sed '/^Installing/d' deploy/migrations/${VERSION}_initial/raw_migration.sql > deploy/migrations/${VERSION}_initial/migration.sql + + # Verify the migration file + if [ ! -s deploy/migrations/${VERSION}_initial/migration.sql ]; then + echo "ERROR: Migration file is empty after cleaning" + echo "Original content was:" + cat deploy/migrations/${VERSION}_initial/raw_migration.sql + exit 1 + fi + + echo "Final migration file content:" + cat deploy/migrations/${VERSION}_initial/migration.sql + + # Verify it starts with SQL + if ! head -n 1 deploy/migrations/${VERSION}_initial/migration.sql | grep -q "^--\|^CREATE\|^ALTER"; then + echo "ERROR: Migration file does not start with SQL command or comment" + echo "First line is:" + head -n 1 deploy/migrations/${VERSION}_initial/migration.sql + echo "Full content is:" + cat deploy/migrations/${VERSION}_initial/migration.sql + exit 1 + fi + + echo "Initial migration generated at $(date -u)" > deploy/migrations/${VERSION}_initial/README.md + fi + + - name: Compare and Generate Migration + if: success() + env: + DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db" + DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db" + SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db" + run: | + # Create temporary migration workspace + mkdir -p temp_migrations + + # Copy existing migrations (will not fail if directory is empty) + cp -r deploy/migrations/* temp_migrations/ 2>/dev/null || true + + VERSION=$(date +%Y%m%d%H%M%S) + + # Generate diff against existing migrations or empty state + prisma migrate diff \ + --from-migrations temp_migrations \ + --to-schema-datamodel schema.prisma \ + --shadow-database-url "${SHADOW_DATABASE_URL}" \ + --script > temp_migrations/migration_${VERSION}.sql + + # Check if there are actual changes + if [ -s temp_migrations/migration_${VERSION}.sql ]; then + echo "Changes detected, creating new migration" + mkdir -p deploy/migrations/${VERSION}_schema_update + mv temp_migrations/migration_${VERSION}.sql deploy/migrations/${VERSION}_schema_update/migration.sql + echo "Migration generated at $(date -u)" > deploy/migrations/${VERSION}_schema_update/README.md + else + echo "No schema changes detected" + exit 0 + fi + + - name: Verify Migration + if: success() + env: + DATABASE_URL: "postgresql://postgres:postgres@localhost:5432/temp_db" + DIRECT_URL: "postgresql://postgres:postgres@localhost:5432/temp_db" + SHADOW_DATABASE_URL: "postgresql://postgres:postgres@localhost:5433/shadow_db" + run: | + # Create test database + psql "${SHADOW_DATABASE_URL}" -c 'CREATE DATABASE migration_test;' + + # Apply all migrations 
in order to verify + for migration in deploy/migrations/*/migration.sql; do + echo "Applying migration: $migration" + psql "${SHADOW_DATABASE_URL}" -f $migration + done + + # Add this step before create-pull-request to debug permissions + - name: Check Token Permissions + run: | + echo "Checking token permissions..." + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + https://api.github.com/repos/BerriAI/litellm/collaborators + + echo "\nChecking if token can create PRs..." + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + https://api.github.com/repos/BerriAI/litellm + + # Add this debug step before git push + - name: Debug Changed Files + run: | + echo "Files staged for commit:" + git diff --name-status --staged + + echo "\nAll changed files:" + git status + + - name: Create Pull Request + if: success() + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + commit-message: "chore: update prisma migrations" + title: "Update Prisma Migrations" + body: | + Auto-generated migration based on schema.prisma changes. + + Generated files: + - deploy/migrations/${VERSION}_schema_update/migration.sql + - deploy/migrations/${VERSION}_schema_update/README.md + branch: feat/prisma-migration-${{ env.VERSION }} + base: main + delete-branch: true + + - name: Generate and Save Migrations + run: | + # Only add migration files + git add deploy/migrations/ + git status # Debug what's being committed + git commit -m "chore: update prisma migrations" diff --git a/.github/workflows/test-linting.yml b/.github/workflows/test-linting.yml new file mode 100644 index 0000000000..b3bffbec5c --- /dev/null +++ b/.github/workflows/test-linting.yml @@ -0,0 +1,53 @@ +name: LiteLLM Linting + +on: + pull_request: + branches: [ main ] + +jobs: + lint: + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Install dependencies + run: | + poetry install --with dev + + - name: Run Black formatting + run: | + cd litellm + poetry run black . + cd .. + + - name: Run Ruff linting + run: | + cd litellm + poetry run ruff check . + cd .. + + - name: Run MyPy type checking + run: | + cd litellm + poetry run mypy . --ignore-missing-imports + cd .. + + - name: Check for circular imports + run: | + cd litellm + poetry run python ../tests/documentation_tests/test_circular_imports.py + cd .. + + - name: Check import safety + run: | + poetry run python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) \ No newline at end of file diff --git a/.github/workflows/test-litellm.yml b/.github/workflows/test-litellm.yml new file mode 100644 index 0000000000..12d09725ed --- /dev/null +++ b/.github/workflows/test-litellm.yml @@ -0,0 +1,35 @@ +name: LiteLLM Mock Tests (folder - tests/litellm) + +on: + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 5 + + steps: + - uses: actions/checkout@v4 + + - name: Thank You Message + run: | + echo "### 🙏 Thank you for contributing to LiteLLM!" >> $GITHUB_STEP_SUMMARY + echo "Your PR is being tested now. We appreciate your help in making LiteLLM better!" 
>> $GITHUB_STEP_SUMMARY + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install Poetry + uses: snok/install-poetry@v1 + + - name: Install dependencies + run: | + poetry install --with dev,proxy-dev --extras proxy + poetry run pip install pytest-xdist + + - name: Run tests + run: | + poetry run pytest tests/litellm -x -vv -n 4 \ No newline at end of file diff --git a/.gitignore b/.gitignore index d35923f7c3..e8c18bed4c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.python-version .venv .env .newenv @@ -72,6 +73,7 @@ tests/local_testing/log.txt .codegpt litellm/proxy/_new_new_secret_config.yaml litellm/proxy/custom_guardrail.py +.mypy_cache/* litellm/proxy/_experimental/out/404.html litellm/proxy/_experimental/out/404.html litellm/proxy/_experimental/out/model_hub.html @@ -79,3 +81,10 @@ litellm/proxy/_experimental/out/model_hub.html litellm/proxy/application.log tests/llm_translation/vertex_test_account.json tests/llm_translation/test_vertex_key.json +litellm/proxy/migrations/0_init/migration.sql +litellm/proxy/db/migrations/0_init/migration.sql +litellm/proxy/db/migrations/* +litellm/proxy/migrations/*config.yaml +litellm/proxy/migrations/* +config.yaml +tests/litellm/litellm_core_utils/llm_cost_calc/log.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb37f32524..dedb37d6dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,44 +6,35 @@ repos: entry: pyright language: system types: [python] - files: ^litellm/ + files: ^(litellm/|litellm_proxy_extras/) - id: isort name: isort entry: isort language: system types: [python] - files: litellm/.*\.py + files: (litellm/|litellm_proxy_extras/).*\.py exclude: ^litellm/__init__.py$ -- repo: https://github.com/psf/black - rev: 24.2.0 - hooks: - - id: black + - id: black + name: black + entry: poetry run black + language: system + types: [python] + files: (litellm/|litellm_proxy_extras/).*\.py - repo: https://github.com/pycqa/flake8 rev: 7.0.0 # The version of flake8 to use hooks: - id: flake8 exclude: ^litellm/tests/|^litellm/proxy/tests/|^litellm/tests/litellm/|^tests/litellm/ additional_dependencies: [flake8-print] - files: litellm/.*\.py - # - id: flake8 - # name: flake8 (router.py function length) - # files: ^litellm/router\.py$ - # args: [--max-function-length=40] - # # additional_dependencies: [flake8-functions] + files: (litellm/|litellm_proxy_extras/).*\.py - repo: https://github.com/python-poetry/poetry rev: 1.8.0 hooks: - id: poetry-check + files: ^(pyproject.toml|litellm-proxy-extras/pyproject.toml)$ - repo: local hooks: - id: check-files-match name: Check if files match entry: python3 ci_cd/check_files_match.py - language: system - # - id: check-file-length - # name: Check file length - # entry: python check_file_length.py - # args: ["10000"] # set your desired maximum number of lines - # language: python - # files: litellm/.*\.py - # exclude: ^litellm/tests/ \ No newline at end of file + language: system \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index dd699c795b..3a74c46e68 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,8 +12,7 @@ WORKDIR /app USER root # Install build dependencies -RUN apk update && \ - apk add --no-cache gcc python3-dev openssl openssl-dev +RUN apk add --no-cache gcc python3-dev openssl openssl-dev RUN pip install --upgrade pip && \ @@ -37,9 +36,6 @@ RUN pip install dist/*.whl # install dependencies as wheels RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt -# 
install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0 -RUN pip install redisvl==0.0.7 --no-deps - # ensure pyjwt is used, not jwt RUN pip uninstall jwt -y RUN pip uninstall PyJWT -y @@ -55,8 +51,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime USER root # Install runtime dependencies -RUN apk update && \ - apk add --no-cache openssl +RUN apk add --no-cache openssl WORKDIR /app # Copy the current directory contents into the container at /app diff --git a/Makefile b/Makefile index 6555326168..a06509312d 100644 --- a/Makefile +++ b/Makefile @@ -9,10 +9,14 @@ help: @echo " make test - Run all tests" @echo " make test-unit - Run unit tests" @echo " make test-integration - Run integration tests" + @echo " make test-unit-helm - Run helm unit tests" install-dev: poetry install --with dev +install-proxy-dev: + poetry install --with dev,proxy-dev + lint: install-dev poetry run pip install types-requests types-setuptools types-redis types-PyYAML cd litellm && poetry run mypy . --ignore-missing-imports @@ -25,4 +29,7 @@ test-unit: poetry run pytest tests/litellm/ test-integration: - poetry run pytest tests/ -k "not litellm" \ No newline at end of file + poetry run pytest tests/ -k "not litellm" + +test-unit-helm: + helm unittest -f 'tests/*.yaml' deploy/charts/litellm-helm \ No newline at end of file diff --git a/README.md b/README.md index 2d2f71e4d1..1c4e148443 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,6 @@ PyPI Version - - CircleCI - Y Combinator W23 diff --git a/ci_cd/baseline_db.py b/ci_cd/baseline_db.py new file mode 100644 index 0000000000..ecc080abed --- /dev/null +++ b/ci_cd/baseline_db.py @@ -0,0 +1,60 @@ +import subprocess +from pathlib import Path +from datetime import datetime + + +def create_baseline(): + """Create baseline migration in deploy/migrations""" + try: + # Get paths + root_dir = Path(__file__).parent.parent + deploy_dir = root_dir / "deploy" + migrations_dir = deploy_dir / "migrations" + schema_path = root_dir / "schema.prisma" + + # Create migrations directory + migrations_dir.mkdir(parents=True, exist_ok=True) + + # Create migration_lock.toml if it doesn't exist + lock_file = migrations_dir / "migration_lock.toml" + if not lock_file.exists(): + lock_file.write_text('provider = "postgresql"\n') + + # Create timestamp-based migration directory + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + migration_dir = migrations_dir / f"{timestamp}_baseline" + migration_dir.mkdir(parents=True, exist_ok=True) + + # Generate migration SQL + result = subprocess.run( + [ + "prisma", + "migrate", + "diff", + "--from-empty", + "--to-schema-datamodel", + str(schema_path), + "--script", + ], + capture_output=True, + text=True, + check=True, + ) + + # Write the SQL to migration.sql + migration_file = migration_dir / "migration.sql" + migration_file.write_text(result.stdout) + + print(f"Created baseline migration in {migration_dir}") + return True + + except subprocess.CalledProcessError as e: + print(f"Error running prisma command: {e.stderr}") + return False + except Exception as e: + print(f"Error creating baseline migration: {str(e)}") + return False + + +if __name__ == "__main__": + create_baseline() diff --git a/ci_cd/publish-proxy-extras.sh b/ci_cd/publish-proxy-extras.sh new file mode 100644 index 0000000000..6c83d1f921 --- /dev/null +++ b/ci_cd/publish-proxy-extras.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Exit on error +set -e + +echo "🚀 Building and publishing litellm-proxy-extras" + +# Navigate to 
litellm-proxy-extras directory +cd "$(dirname "$0")/../litellm-proxy-extras" + +# Build the package +echo "📦 Building package..." +poetry build + +# Publish to PyPI +echo "🌎 Publishing to PyPI..." +poetry publish + +echo "✅ Done! Package published successfully" \ No newline at end of file diff --git a/ci_cd/run_migration.py b/ci_cd/run_migration.py new file mode 100644 index 0000000000..b11a38395c --- /dev/null +++ b/ci_cd/run_migration.py @@ -0,0 +1,95 @@ +import os +import subprocess +from pathlib import Path +from datetime import datetime +import testing.postgresql +import shutil + + +def create_migration(migration_name: str = None): + """ + Create a new migration SQL file in the migrations directory by comparing + current database state with schema + + Args: + migration_name (str): Name for the migration + """ + try: + # Get paths + root_dir = Path(__file__).parent.parent + migrations_dir = root_dir / "litellm-proxy-extras" / "litellm_proxy_extras" / "migrations" + schema_path = root_dir / "schema.prisma" + + # Create temporary PostgreSQL database + with testing.postgresql.Postgresql() as postgresql: + db_url = postgresql.url() + + # Create temporary migrations directory next to schema.prisma + temp_migrations_dir = schema_path.parent / "migrations" + + try: + # Copy existing migrations to temp directory + if temp_migrations_dir.exists(): + shutil.rmtree(temp_migrations_dir) + shutil.copytree(migrations_dir, temp_migrations_dir) + + # Apply existing migrations to temp database + os.environ["DATABASE_URL"] = db_url + subprocess.run( + ["prisma", "migrate", "deploy", "--schema", str(schema_path)], + check=True, + ) + + # Generate diff between current database and schema + result = subprocess.run( + [ + "prisma", + "migrate", + "diff", + "--from-url", + db_url, + "--to-schema-datamodel", + str(schema_path), + "--script", + ], + capture_output=True, + text=True, + check=True, + ) + + if result.stdout.strip(): + # Generate timestamp and create migration directory + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + migration_name = migration_name or "unnamed_migration" + migration_dir = migrations_dir / f"{timestamp}_{migration_name}" + migration_dir.mkdir(parents=True, exist_ok=True) + + # Write the SQL to migration.sql + migration_file = migration_dir / "migration.sql" + migration_file.write_text(result.stdout) + + print(f"Created migration in {migration_dir}") + return True + else: + print("No schema changes detected. 
Migration not needed.") + return False + + finally: + # Clean up: remove temporary migrations directory + if temp_migrations_dir.exists(): + shutil.rmtree(temp_migrations_dir) + + except subprocess.CalledProcessError as e: + print(f"Error generating migration: {e.stderr}") + return False + except Exception as e: + print(f"Error creating migration: {str(e)}") + return False + + +if __name__ == "__main__": + # If running directly, can optionally pass migration name as argument + import sys + + migration_name = sys.argv[1] if len(sys.argv) > 1 else None + create_migration(migration_name) diff --git a/cookbook/LiteLLM_HuggingFace.ipynb b/cookbook/LiteLLM_HuggingFace.ipynb index 3a9a0785be..d608c2675a 100644 --- a/cookbook/LiteLLM_HuggingFace.ipynb +++ b/cookbook/LiteLLM_HuggingFace.ipynb @@ -6,8 +6,9 @@ "id": "9dKM5k8qsMIj" }, "source": [ - "## LiteLLM HuggingFace\n", - "Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface" + "## LiteLLM Hugging Face\n", + "\n", + "Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface\n" ] }, { @@ -27,23 +28,18 @@ "id": "yp5UXRqtpu9f" }, "source": [ - "## Hugging Face Free Serverless Inference API\n", - "Read more about the Free Serverless Inference API here: https://huggingface.co/docs/api-inference.\n", + "## Serverless Inference Providers\n", "\n", - "In order to use litellm to call Serverless Inference API:\n", + "Read more about Inference Providers here: https://huggingface.co/blog/inference-providers.\n", "\n", - "* Browse Serverless Inference compatible models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation.\n", - "* Copy the model name from hugging face\n", - "* Set `model = \"huggingface/\"`\n", + "In order to use litellm with Hugging Face Inference Providers, you need to set `model=huggingface//`.\n", "\n", - "Example set `model=huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct` to call `meta-llama/Meta-Llama-3.1-8B-Instruct`\n", - "\n", - "https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct" + "Example: `huggingface/together/deepseek-ai/DeepSeek-R1` to run DeepSeek-R1 (https://huggingface.co/deepseek-ai/DeepSeek-R1) through Together AI.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -51,107 +47,18 @@ "id": "Pi5Oww8gpCUm", "outputId": "659a67c7-f90d-4c06-b94e-2c4aa92d897a" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ModelResponse(id='chatcmpl-c54dfb68-1491-4d68-a4dc-35e603ea718a', choices=[Choices(finish_reason='eos_token', index=0, message=Message(content=\"I'm just a computer program, so I don't have feelings, but thank you for asking! How can I assist you today?\", role='assistant', tool_calls=None, function_call=None))], created=1724858285, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=27, prompt_tokens=47, total_tokens=74))\n", - "ModelResponse(id='chatcmpl-d2ae38e6-4974-431c-bb9b-3fa3f95e5a6d', choices=[Choices(finish_reason='length', index=0, message=Message(content=\"\\n\\nI’m doing well, thank you. I’ve been keeping busy with work and some personal projects. How about you?\\n\\nI'm doing well, thank you. I've been enjoying some time off and catching up on some reading. How can I assist you today?\\n\\nI'm looking for a good book to read. Do you have any recommendations?\\n\\nOf course! 
Here are a few book recommendations across different genres:\\n\\n1.\", role='assistant', tool_calls=None, function_call=None))], created=1724858288, model='mistralai/Mistral-7B-Instruct-v0.3', object='chat.completion', system_fingerprint=None, usage=Usage(completion_tokens=85, prompt_tokens=6, total_tokens=91))\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", - "import litellm\n", + "from litellm import completion\n", "\n", - "# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n", - "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n", + "# You can create a HF token here: https://huggingface.co/settings/tokens\n", + "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n", "\n", - "# Call https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct\n", - "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n", - "response = litellm.completion(\n", - " model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n", - ")\n", - "print(response)\n", - "\n", - "\n", - "# Call https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3\n", - "response = litellm.completion(\n", - " model=\"huggingface/mistralai/Mistral-7B-Instruct-v0.3\",\n", - " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}]\n", - ")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-klhAhjLtclv" - }, - "source": [ - "## Hugging Face Dedicated Inference Endpoints\n", - "\n", - "Steps to use\n", - "* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n", - "* Set `api_base` to your deployed api base\n", - "* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Lbmw8Gl_pHns", - "outputId": "ea8408bf-1cc3-4670-ecea-f12666d204a8" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"object\": \"chat.completion\",\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 0,\n", - " \"message\": {\n", - " \"content\": \"\\n\\nI am doing well, thank you for asking. 
How about you?\\nI am doing\",\n", - " \"role\": \"assistant\",\n", - " \"logprobs\": -8.9481967812\n", - " }\n", - " }\n", - " ],\n", - " \"id\": \"chatcmpl-74dc9d89-3916-47ce-9bea-b80e66660f77\",\n", - " \"created\": 1695871068.8413374,\n", - " \"model\": \"glaiveai/glaive-coder-7b\",\n", - " \"usage\": {\n", - " \"prompt_tokens\": 6,\n", - " \"completion_tokens\": 18,\n", - " \"total_tokens\": 24\n", - " }\n", - "}\n" - ] - } - ], - "source": [ - "import os\n", - "import litellm\n", - "\n", - "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n", - "\n", - "# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b\n", - "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n", - "# set api base to your deployed api endpoint from hugging face\n", - "response = litellm.completion(\n", - " model=\"huggingface/glaiveai/glaive-coder-7b\",\n", - " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n", - " api_base=\"https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud\"\n", + "# Call DeepSeek-R1 model through Together AI\n", + "response = completion(\n", + " model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n", + " messages=[{\"content\": \"How many r's are in the word `strawberry`?\", \"role\": \"user\"}],\n", ")\n", "print(response)" ] @@ -162,13 +69,12 @@ "id": "EU0UubrKzTFe" }, "source": [ - "## HuggingFace - Streaming (Serveless or Dedicated)\n", - "Set stream = True" + "## Streaming\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -176,74 +82,147 @@ "id": "y-QfIvA-uJKX", "outputId": "b007bb98-00d0-44a4-8264-c8a2caed6768" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='I', role='assistant', function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'m\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' just', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' a', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' computer', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - 
"ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' program', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' so', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' don', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=\"'t\", role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' have', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' feelings', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=',', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' but', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', 
choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' thank', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' for', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' asking', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='!', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' How', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' can', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' I', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' assist', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' you', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content=' today', 
role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='?', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason=None, index=0, delta=Delta(content='<|eot_id|>', role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n", - "ModelResponse(id='chatcmpl-ffeb4491-624b-4ddf-8005-60358cf67d36', choices=[StreamingChoices(finish_reason='stop', index=0, delta=Delta(content=None, role=None, function_call=None, tool_calls=None), logprobs=None)], created=1724858353, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion.chunk', system_fingerprint=None)\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", - "import litellm\n", + "from litellm import completion\n", "\n", - "# Make sure to create an API_KEY with inference permissions at https://huggingface.co/settings/tokens/new?globalPermissions=inference.serverless.write&tokenType=fineGrained\n", - "os.environ[\"HUGGINGFACE_API_KEY\"] = \"\"\n", + "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n", "\n", - "# Call https://huggingface.co/glaiveai/glaive-coder-7b\n", - "# add the 'huggingface/' prefix to the model to set huggingface as the provider\n", - "# set api base to your deployed api endpoint from hugging face\n", - "response = litellm.completion(\n", - " model=\"huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", - " messages=[{ \"content\": \"Hello, how are you?\",\"role\": \"user\"}],\n", - " stream=True\n", + "response = completion(\n", + " model=\"huggingface/together/deepseek-ai/DeepSeek-R1\",\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"How many r's are in the word `strawberry`?\",\n", + " \n", + " }\n", + " ],\n", + " stream=True,\n", ")\n", "\n", - "print(response)\n", - "\n", "for chunk in response:\n", - " print(chunk)" + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## With images as input\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "CKXAnK55zQRl" - }, + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from litellm import completion\n", + "\n", + "# Set your Hugging Face Token\n", + "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n", + "\n", + "messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"What's in this image?\"},\n", + " {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n", + " },\n", + " },\n", + " ],\n", + " }\n", + "]\n", + "\n", + "response = completion(\n", + " model=\"huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct\",\n", + " messages=messages,\n", + ")\n", + "print(response.choices[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Tools - Function Calling\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from litellm import completion\n", + "\n", + "\n", + "# Set your Hugging Face Token\n", + "os.environ[\"HF_TOKEN\"] = \"hf_xxxxxx\"\n", + "\n", + "tools = [\n", + " {\n", + " \"type\": \"function\",\n", + " \"function\": {\n", + " \"name\": \"get_current_weather\",\n", + " \"description\": \"Get the current weather in a given location\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"location\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The city and state, e.g. San Francisco, CA\",\n", + " },\n", + " \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n", + " },\n", + " \"required\": [\"location\"],\n", + " },\n", + " },\n", + " }\n", + "]\n", + "messages = [{\"role\": \"user\", \"content\": \"What's the weather like in Boston today?\"}]\n", + "\n", + "response = completion(\n", + " model=\"huggingface/sambanova/meta-llama/Llama-3.1-8B-Instruct\", messages=messages, tools=tools, tool_choice=\"auto\"\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hugging Face Dedicated Inference Endpoints\n", + "\n", + "Steps to use\n", + "\n", + "- Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/\n", + "- Set `api_base` to your deployed api base\n", + "- set the model to `huggingface/tgi` so that litellm knows it's a huggingface Deployed Inference Endpoint.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import litellm\n", + "\n", + "\n", + "response = litellm.completion(\n", + " model=\"huggingface/tgi\",\n", + " messages=[{\"content\": \"Hello, how are you?\", \"role\": \"user\"}],\n", + " api_base=\"https://my-endpoint.endpoints.huggingface.cloud/v1/\",\n", + ")\n", + "print(response)" + ] } ], "metadata": { @@ -251,7 +230,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", + "language": "python", "name": "python3" }, "language_info": { @@ -264,7 +244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/cookbook/litellm-ollama-docker-image/requirements.txt b/cookbook/litellm-ollama-docker-image/requirements.txt index 6d24983e38..7990d251cc 100644 --- a/cookbook/litellm-ollama-docker-image/requirements.txt +++ b/cookbook/litellm-ollama-docker-image/requirements.txt @@ -1 +1 @@ -litellm==1.55.3 \ No newline at end of file +litellm==1.61.15 \ No newline at end of file diff --git a/cookbook/misc/dev_release.txt b/cookbook/misc/dev_release.txt index 717a6da546..bd40f89e6f 100644 --- a/cookbook/misc/dev_release.txt +++ b/cookbook/misc/dev_release.txt @@ -1,2 +1,11 @@ python3 -m build -twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ - \ No newline at end of file +twine upload --verbose dist/litellm-1.18.13.dev4.tar.gz -u __token__ - + + +Note: You might need to make a MANIFEST.ini file on root for build process incase it fails + +Place this in MANIFEST.ini +recursive-exclude venv * +recursive-exclude myenv * +recursive-exclude py313_env * +recursive-exclude **/.venv * diff --git a/deploy/charts/litellm-helm/Chart.yaml b/deploy/charts/litellm-helm/Chart.yaml index f1f2fd8d64..5de591fd73 100644 --- 
a/deploy/charts/litellm-helm/Chart.yaml +++ b/deploy/charts/litellm-helm/Chart.yaml @@ -18,7 +18,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.4.1 +version: 0.4.3 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/deploy/charts/litellm-helm/README.md b/deploy/charts/litellm-helm/README.md index 8b2196f577..a0ba5781df 100644 --- a/deploy/charts/litellm-helm/README.md +++ b/deploy/charts/litellm-helm/README.md @@ -22,6 +22,8 @@ If `db.useStackgresOperator` is used (not yet implemented): | Name | Description | Value | | ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` | +| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A | +| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A | | `masterkey` | The Master API Key for LiteLLM. If not specified, a random key is generated. | N/A | | `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | | `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | diff --git a/deploy/charts/litellm-helm/templates/deployment.yaml b/deploy/charts/litellm-helm/templates/deployment.yaml index 697148abf8..5b9488c19b 100644 --- a/deploy/charts/litellm-helm/templates/deployment.yaml +++ b/deploy/charts/litellm-helm/templates/deployment.yaml @@ -78,8 +78,8 @@ spec: - name: PROXY_MASTER_KEY valueFrom: secretKeyRef: - name: {{ include "litellm.fullname" . }}-masterkey - key: masterkey + name: {{ .Values.masterkeySecretName | default (printf "%s-masterkey" (include "litellm.fullname" .)) }} + key: {{ .Values.masterkeySecretKey | default "masterkey" }} {{- if .Values.redis.enabled }} - name: REDIS_HOST value: {{ include "litellm.redis.serviceName" . }} @@ -97,6 +97,9 @@ spec: value: {{ $val | quote }} {{- end }} {{- end }} + {{- with .Values.extraEnvVars }} + {{- toYaml . | nindent 12 }} + {{- end }} envFrom: {{- range .Values.environmentSecrets }} - secretRef: diff --git a/deploy/charts/litellm-helm/templates/migrations-job.yaml b/deploy/charts/litellm-helm/templates/migrations-job.yaml index e994c45548..1c4b6817fa 100644 --- a/deploy/charts/litellm-helm/templates/migrations-job.yaml +++ b/deploy/charts/litellm-helm/templates/migrations-job.yaml @@ -65,6 +65,6 @@ spec: tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} - ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }} + ttlSecondsAfterFinished: {{ .Values.migrationJob.ttlSecondsAfterFinished }} backoffLimit: {{ .Values.migrationJob.backoffLimit }} {{- end }} diff --git a/deploy/charts/litellm-helm/templates/secret-masterkey.yaml b/deploy/charts/litellm-helm/templates/secret-masterkey.yaml index 57b854cc0f..5632957dc0 100644 --- a/deploy/charts/litellm-helm/templates/secret-masterkey.yaml +++ b/deploy/charts/litellm-helm/templates/secret-masterkey.yaml @@ -1,3 +1,4 @@ +{{- if not .Values.masterkeySecretName }} {{ $masterkey := (.Values.masterkey | default (randAlphaNum 17)) }} apiVersion: v1 kind: Secret @@ -5,4 +6,5 @@ metadata: name: {{ include "litellm.fullname" . }}-masterkey data: masterkey: {{ $masterkey | b64enc }} -type: Opaque \ No newline at end of file +type: Opaque +{{- end }} diff --git a/deploy/charts/litellm-helm/templates/service.yaml b/deploy/charts/litellm-helm/templates/service.yaml index 40e7f27f16..d8d81e78c8 100644 --- a/deploy/charts/litellm-helm/templates/service.yaml +++ b/deploy/charts/litellm-helm/templates/service.yaml @@ -2,6 +2,10 @@ apiVersion: v1 kind: Service metadata: name: {{ include "litellm.fullname" . }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} labels: {{- include "litellm.labels" . | nindent 4 }} spec: diff --git a/deploy/charts/litellm-helm/tests/deployment_tests.yaml b/deploy/charts/litellm-helm/tests/deployment_tests.yaml index e7ce44b052..b71f91377f 100644 --- a/deploy/charts/litellm-helm/tests/deployment_tests.yaml +++ b/deploy/charts/litellm-helm/tests/deployment_tests.yaml @@ -52,3 +52,66 @@ tests: - equal: path: spec.template.spec.affinity.nodeAffinity.requiredDuringSchedulingIgnoredDuringExecution.nodeSelectorTerms[0].matchExpressions[0].values[0] value: antarctica-east1 + - it: should work without masterkeySecretName or masterkeySecretKey + template: deployment.yaml + set: + masterkeySecretName: "" + masterkeySecretKey: "" + asserts: + - contains: + path: spec.template.spec.containers[0].env + content: + name: PROXY_MASTER_KEY + valueFrom: + secretKeyRef: + name: RELEASE-NAME-litellm-masterkey + key: masterkey + - it: should work with masterkeySecretName and masterkeySecretKey + template: deployment.yaml + set: + masterkeySecretName: my-secret + masterkeySecretKey: my-key + asserts: + - contains: + path: spec.template.spec.containers[0].env + content: + name: PROXY_MASTER_KEY + valueFrom: + secretKeyRef: + name: my-secret + key: my-key + - it: should work with extraEnvVars + template: deployment.yaml + set: + extraEnvVars: + - name: EXTRA_ENV_VAR + valueFrom: + fieldRef: + fieldPath: metadata.labels['env'] + asserts: + - contains: + path: spec.template.spec.containers[0].env + content: + name: EXTRA_ENV_VAR + valueFrom: + fieldRef: + fieldPath: metadata.labels['env'] + - it: should work with both extraEnvVars and envVars + template: deployment.yaml + set: + envVars: + ENV_VAR: ENV_VAR_VALUE + extraEnvVars: + - name: EXTRA_ENV_VAR + value: EXTRA_ENV_VAR_VALUE + asserts: + - contains: + path: spec.template.spec.containers[0].env + content: + name: ENV_VAR + value: ENV_VAR_VALUE + - contains: + path: spec.template.spec.containers[0].env + content: + name: EXTRA_ENV_VAR + value: EXTRA_ENV_VAR_VALUE diff --git a/deploy/charts/litellm-helm/tests/masterkey-secret_tests.yaml b/deploy/charts/litellm-helm/tests/masterkey-secret_tests.yaml new file mode 100644 index 0000000000..eb1d3c3967 --- /dev/null +++ 
b/deploy/charts/litellm-helm/tests/masterkey-secret_tests.yaml @@ -0,0 +1,18 @@ +suite: test masterkey secret +templates: + - secret-masterkey.yaml +tests: + - it: should create a secret if masterkeySecretName is not set + template: secret-masterkey.yaml + set: + masterkeySecretName: "" + asserts: + - isKind: + of: Secret + - it: should not create a secret if masterkeySecretName is set + template: secret-masterkey.yaml + set: + masterkeySecretName: my-secret + asserts: + - hasDocuments: + count: 0 diff --git a/deploy/charts/litellm-helm/values.yaml b/deploy/charts/litellm-helm/values.yaml index 9f21fc40ad..0440e28eed 100644 --- a/deploy/charts/litellm-helm/values.yaml +++ b/deploy/charts/litellm-helm/values.yaml @@ -75,6 +75,12 @@ ingress: # masterkey: changeit +# if set, use this secret for the master key; otherwise, autogenerate a new one +masterkeySecretName: "" + +# if set, use this secret key for the master key; otherwise, use the default key +masterkeySecretKey: "" + # The elements within proxy_config are rendered as config.yaml for the proxy # Examples: https://github.com/BerriAI/litellm/tree/main/litellm/proxy/example_config_yaml # Reference: https://docs.litellm.ai/docs/proxy/configs @@ -189,9 +195,15 @@ migrationJob: annotations: {} ttlSecondsAfterFinished: 120 -# Additional environment variables to be added to the deployment +# Additional environment variables to be added to the deployment as a map of key-value pairs envVars: { # USE_DDTRACE: "true" } +# Additional environment variables to be added to the deployment as a list of k8s env vars +extraEnvVars: { + # - name: EXTRA_ENV_VAR + # value: EXTRA_ENV_VAR_VALUE +} + diff --git a/docker-compose.yml b/docker-compose.yml index d16ec6ed20..66f5bcaa7f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -66,5 +66,3 @@ volumes: postgres_data: name: litellm_postgres_data # Named volume for Postgres data persistence - -# ...rest of your docker-compose config if any diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine index cc0c434013..f036081549 100644 --- a/docker/Dockerfile.alpine +++ b/docker/Dockerfile.alpine @@ -35,7 +35,7 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt FROM $LITELLM_RUNTIME_IMAGE AS runtime # Update dependencies and clean up -RUN apk update && apk upgrade && rm -rf /var/cache/apk/* +RUN apk upgrade --no-cache WORKDIR /app diff --git a/docker/Dockerfile.database b/docker/Dockerfile.database index 02eb286180..da0326fd2c 100644 --- a/docker/Dockerfile.database +++ b/docker/Dockerfile.database @@ -12,8 +12,7 @@ WORKDIR /app USER root # Install build dependencies -RUN apk update && \ - apk add --no-cache gcc python3-dev openssl openssl-dev +RUN apk add --no-cache gcc python3-dev openssl openssl-dev RUN pip install --upgrade pip && \ @@ -44,8 +43,7 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime USER root # Install runtime dependencies -RUN apk update && \ - apk add --no-cache openssl +RUN apk add --no-cache openssl WORKDIR /app # Copy the current directory contents into the container at /app @@ -59,9 +57,6 @@ COPY --from=builder /wheels/ /wheels/ # Install the built wheel using pip; again using a wildcard if it's the only file RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels -# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0 -RUN pip install redisvl==0.0.7 --no-deps - # ensure pyjwt is used, not jwt RUN pip uninstall jwt -y RUN pip uninstall PyJWT -y diff 
--git a/docker/Dockerfile.non_root b/docker/Dockerfile.non_root index 3a4cdb59d5..079778cafb 100644 --- a/docker/Dockerfile.non_root +++ b/docker/Dockerfile.non_root @@ -14,7 +14,7 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] # Install build dependencies RUN apt-get clean && apt-get update && \ - apt-get install -y gcc python3-dev && \ + apt-get install -y gcc g++ python3-dev && \ rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir --upgrade pip && \ @@ -56,10 +56,8 @@ COPY --from=builder /wheels/ /wheels/ # Install the built wheel using pip; again using a wildcard if it's the only file RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels -# install semantic-cache [Experimental]- we need this here and not in requirements.txt because redisvl pins to pydantic 1.0 # ensure pyjwt is used, not jwt -RUN pip install redisvl==0.0.7 --no-deps --no-cache-dir && \ - pip uninstall jwt -y && \ +RUN pip uninstall jwt -y && \ pip uninstall PyJWT -y && \ pip install PyJWT==2.9.0 --no-cache-dir diff --git a/docs/my-website/docs/anthropic_unified.md b/docs/my-website/docs/anthropic_unified.md index cf6ba798d5..92cae9c0aa 100644 --- a/docs/my-website/docs/anthropic_unified.md +++ b/docs/my-website/docs/anthropic_unified.md @@ -3,9 +3,10 @@ import TabItem from '@theme/TabItem'; # /v1/messages [BETA] -LiteLLM provides a BETA endpoint in the spec of Anthropic's `/v1/messages` endpoint. +Use LiteLLM to call all your LLM APIs in the Anthropic `v1/messages` format. -This currently just supports the Anthropic API. + +## Overview | Feature | Supported | Notes | |-------|-------|-------| @@ -21,9 +22,61 @@ Planned improvement: - Bedrock Anthropic support ## Usage +--- + +### LiteLLM Python SDK + +#### Non-streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, +) +``` + +Example response: +```json +{ + "content": [ + { + "text": "Hi! this is a very short joke", + "type": "text" + } + ], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-7-sonnet-20250219", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": null, + "type": "message", + "usage": { + "input_tokens": 2095, + "output_tokens": 503, + "cache_creation_input_tokens": 2095, + "cache_read_input_tokens": 0 + } +} +``` + +#### Streaming example +```python showLineNumbers title="Example using LiteLLM Python SDK" +import litellm +response = await litellm.anthropic.messages.acreate( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + api_key=api_key, + model="anthropic/claude-3-haiku-20240307", + max_tokens=100, + stream=True, +) +async for chunk in response: + print(chunk) +``` + +### LiteLLM Proxy Server - - 1. Setup config.yaml @@ -42,7 +95,28 @@ litellm --config /path/to/config.yaml 3. Test it! 
-```bash + + + +```python showLineNumbers title="Example using LiteLLM Proxy Server" +import anthropic + +# point anthropic sdk to litellm proxy +client = anthropic.Anthropic( + base_url="http://0.0.0.0:4000", + api_key="sk-1234", +) + +response = client.messages.create( + messages=[{"role": "user", "content": "Hello, can you tell me a short joke?"}], + model="anthropic-claude", + max_tokens=100, +) +``` + + + +```bash showLineNumbers title="Example using LiteLLM Proxy Server" curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \ -H 'content-type: application/json' \ -H 'x-api-key: $LITELLM_API_KEY' \ @@ -52,41 +126,176 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/messages' \ "messages": [ { "role": "user", - "content": [ - { - "type": "text", - "text": "List 5 important events in the XIX century" - } - ] + "content": "Hello, can you tell me a short joke?" } ], - "max_tokens": 4096 + "max_tokens": 100 }' ``` + - + -```python -from litellm.llms.anthropic.experimental_pass_through.messages.handler import anthropic_messages -import asyncio -import os -# set env -os.environ["ANTHROPIC_API_KEY"] = "my-api-key" +## Request Format +--- -messages = [{"role": "user", "content": "Hello, can you tell me a short joke?"}] +Request body will be in the Anthropic messages API format. **litellm follows the Anthropic messages specification for this endpoint.** -# Call the handler -async def call(): - response = await anthropic_messages( - messages=messages, - api_key=api_key, - model="claude-3-haiku-20240307", - max_tokens=100, - ) +#### Example request body -asyncio.run(call()) +```json +{ + "model": "claude-3-7-sonnet-20250219", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Hello, world" + } + ] +} ``` - - \ No newline at end of file +#### Required Fields +- **model** (string): + The model identifier (e.g., `"claude-3-7-sonnet-20250219"`). +- **max_tokens** (integer): + The maximum number of tokens to generate before stopping. + _Note: The model may stop before reaching this limit; value must be greater than 1._ +- **messages** (array of objects): + An ordered list of conversational turns. + Each message object must include: + - **role** (enum: `"user"` or `"assistant"`): + Specifies the speaker of the message. + - **content** (string or array of content blocks): + The text or content blocks (e.g., an array containing objects with a `type` such as `"text"`) that form the message. + _Example equivalence:_ + ```json + {"role": "user", "content": "Hello, Claude"} + ``` + is equivalent to: + ```json + {"role": "user", "content": [{"type": "text", "text": "Hello, Claude"}]} + ``` + +#### Optional Fields +- **metadata** (object): + Contains additional metadata about the request (e.g., `user_id` as an opaque identifier). +- **stop_sequences** (array of strings): + Custom sequences that, when encountered in the generated text, cause the model to stop. +- **stream** (boolean): + Indicates whether to stream the response using server-sent events. +- **system** (string or array): + A system prompt providing context or specific instructions to the model. +- **temperature** (number): + Controls randomness in the model’s responses. Valid range: `0 < temperature < 1`. +- **thinking** (object): + Configuration for enabling extended thinking. If enabled, it includes: + - **budget_tokens** (integer): + Minimum of 1024 tokens (and less than `max_tokens`). + - **type** (enum): + E.g., `"enabled"`. +- **tool_choice** (object): + Instructs how the model should utilize any provided tools. 
+- **tools** (array of objects): + Definitions for tools available to the model. Each tool includes: + - **name** (string): + The tool’s name. + - **description** (string): + A detailed description of the tool. + - **input_schema** (object): + A JSON schema describing the expected input format for the tool. +- **top_k** (integer): + Limits sampling to the top K options. +- **top_p** (number): + Enables nucleus sampling with a cumulative probability cutoff. Valid range: `0 < top_p < 1`. + + +## Response Format +--- + +Responses will be in the Anthropic messages API format. + +#### Example Response + +```json +{ + "content": [ + { + "text": "Hi! My name is Claude.", + "type": "text" + } + ], + "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF", + "model": "claude-3-7-sonnet-20250219", + "role": "assistant", + "stop_reason": "end_turn", + "stop_sequence": null, + "type": "message", + "usage": { + "input_tokens": 2095, + "output_tokens": 503, + "cache_creation_input_tokens": 2095, + "cache_read_input_tokens": 0 + } +} +``` + +#### Response fields + +- **content** (array of objects): + Contains the generated content blocks from the model. Each block includes: + - **type** (string): + Indicates the type of content (e.g., `"text"`, `"tool_use"`, `"thinking"`, or `"redacted_thinking"`). + - **text** (string): + The generated text from the model. + _Note: Maximum length is 5,000,000 characters._ + - **citations** (array of objects or `null`): + Optional field providing citation details. Each citation includes: + - **cited_text** (string): + The excerpt being cited. + - **document_index** (integer): + An index referencing the cited document. + - **document_title** (string or `null`): + The title of the cited document. + - **start_char_index** (integer): + The starting character index for the citation. + - **end_char_index** (integer): + The ending character index for the citation. + - **type** (string): + Typically `"char_location"`. + +- **id** (string): + A unique identifier for the response message. + _Note: The format and length of IDs may change over time._ + +- **model** (string): + Specifies the model that generated the response. + +- **role** (string): + Indicates the role of the generated message. For responses, this is always `"assistant"`. + +- **stop_reason** (string): + Explains why the model stopped generating text. Possible values include: + - `"end_turn"`: The model reached a natural stopping point. + - `"max_tokens"`: The generation stopped because the maximum token limit was reached. + - `"stop_sequence"`: A custom stop sequence was encountered. + - `"tool_use"`: The model invoked one or more tools. + +- **stop_sequence** (string or `null`): + Contains the specific stop sequence that caused the generation to halt, if applicable; otherwise, it is `null`. + +- **type** (string): + Denotes the type of response object, which is always `"message"`. + +- **usage** (object): + Provides details on token usage for billing and rate limiting. This includes: + - **input_tokens** (integer): + Total number of input tokens processed. + - **output_tokens** (integer): + Total number of output tokens generated. + - **cache_creation_input_tokens** (integer or `null`): + Number of tokens used to create a cache entry. + - **cache_read_input_tokens** (integer or `null`): + Number of tokens read from the cache. 
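#### Reading the response fields

For a concrete view of these fields, the sketch below parses the example response shown earlier on this page and pulls out the pieces most callers need: the generated text blocks, `stop_reason`, and token usage. It relies only on the JSON layout documented here; the variable names are illustrative, not part of the spec.

```python
import json

# Example /v1/messages response body, copied from the documented format above
raw = """{
  "content": [{"text": "Hi! My name is Claude.", "type": "text"}],
  "id": "msg_013Zva2CMHLNnXjNJJKqJ2EF",
  "model": "claude-3-7-sonnet-20250219",
  "role": "assistant",
  "stop_reason": "end_turn",
  "stop_sequence": null,
  "type": "message",
  "usage": {"input_tokens": 2095, "output_tokens": 503,
            "cache_creation_input_tokens": 2095, "cache_read_input_tokens": 0}
}"""

response = json.loads(raw)

# Join the text blocks; other block types (e.g. "tool_use", "thinking") are skipped here
text = "".join(block["text"] for block in response["content"] if block["type"] == "text")
print(text)

# Why generation stopped: "end_turn", "max_tokens", "stop_sequence", or "tool_use"
print(response["stop_reason"])

# Token accounting used for billing / rate limits
usage = response["usage"]
print(usage["input_tokens"], "input tokens /", usage["output_tokens"], "output tokens")
```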
diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md index dc1951cc77..a14170beef 100644 --- a/docs/my-website/docs/caching/all_caches.md +++ b/docs/my-website/docs/caching/all_caches.md @@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem'; # Caching - In-Memory, Redis, s3, Redis Semantic Cache, Disk -[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm.caching.caching.py) +[**See Code**](https://github.com/BerriAI/litellm/blob/main/litellm/caching/caching.py) :::info @@ -26,7 +26,7 @@ Install redis pip install redis ``` -For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ +For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/ ```python import litellm @@ -37,11 +37,11 @@ litellm.cache = Cache(type="redis", host=, port=, password= -Install redis +Install redisvl client ```shell -pip install redisvl==0.0.7 +pip install redisvl==0.4.1 ``` -For the hosted version you can setup your own Redis DB here: https://app.redislabs.com/ +For the hosted version you can setup your own Redis DB here: https://redis.io/try-free/ ```python import litellm @@ -114,6 +114,7 @@ litellm.cache = Cache( port=os.environ["REDIS_PORT"], password=os.environ["REDIS_PASSWORD"], similarity_threshold=0.8, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity + ttl=120, redis_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here ) response1 = completion( @@ -471,11 +472,13 @@ def __init__( password: Optional[str] = None, namespace: Optional[str] = None, default_in_redis_ttl: Optional[float] = None, - similarity_threshold: Optional[float] = None, - redis_semantic_cache_use_async=False, - redis_semantic_cache_embedding_model="text-embedding-ada-002", redis_flush_size=None, + # redis semantic cache params + similarity_threshold: Optional[float] = None, + redis_semantic_cache_embedding_model: str = "text-embedding-ada-002", + redis_semantic_cache_index_name: Optional[str] = None, + # s3 Bucket, boto3 configuration s3_bucket_name: Optional[str] = None, s3_region_name: Optional[str] = None, diff --git a/docs/my-website/docs/completion/document_understanding.md b/docs/my-website/docs/completion/document_understanding.md index 6719169aef..f58b836c63 100644 --- a/docs/my-website/docs/completion/document_understanding.md +++ b/docs/my-website/docs/completion/document_understanding.md @@ -27,16 +27,18 @@ os.environ["AWS_REGION_NAME"] = "" # pdf url -image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" +file_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" # model model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" -image_content = [ +file_content = [ {"type": "text", "text": "What's this file about?"}, { - "type": "image_url", - "image_url": image_url, # OR {"url": image_url} + "type": "file", + "file": { + "file_id": file_url, + } }, ] @@ -46,7 +48,7 @@ if not supports_pdf_input(model, None): response = completion( model=model, - messages=[{"role": "user", "content": image_content}], + messages=[{"role": "user", "content": file_content}], ) assert response is not None ``` @@ -80,11 +82,15 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ -d '{ "model": "bedrock-model", "messages": [ - {"role": "user", "content": {"type": "text", "text": "What's this file about?"}}, - { - "type": 
"image_url", - "image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", - } + {"role": "user", "content": [ + {"type": "text", "text": "What's this file about?"}, + { + "type": "file", + "file": { + "file_id": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", + } + } + ]}, ] }' ``` @@ -116,11 +122,13 @@ base64_url = f"data:application/pdf;base64,{encoded_file}" # model model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" -image_content = [ +file_content = [ {"type": "text", "text": "What's this file about?"}, { - "type": "image_url", - "image_url": base64_url, # OR {"url": base64_url} + "type": "file", + "file": { + "file_data": base64_url, + } }, ] @@ -130,11 +138,53 @@ if not supports_pdf_input(model, None): response = completion( model=model, - messages=[{"role": "user", "content": image_content}], + messages=[{"role": "user", "content": file_content}], ) assert response is not None ``` + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: bedrock-model + litellm_params: + model: bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: os.environ/AWS_REGION_NAME +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "bedrock-model", + "messages": [ + {"role": "user", "content": [ + {"type": "text", "text": "What's this file about?"}, + { + "type": "file", + "file": { + "file_data": "data:application/pdf;base64...", + } + } + ]}, + ] +}' +``` + ## Checking if a model supports pdf input diff --git a/docs/my-website/docs/completion/drop_params.md b/docs/my-website/docs/completion/drop_params.md index e79a88e14b..590d9a4595 100644 --- a/docs/my-website/docs/completion/drop_params.md +++ b/docs/my-website/docs/completion/drop_params.md @@ -107,4 +107,76 @@ response = litellm.completion( -**additional_drop_params**: List or null - Is a list of openai params you want to drop when making a call to the model. \ No newline at end of file +**additional_drop_params**: List or null - Is a list of openai params you want to drop when making a call to the model. + +## Specify allowed openai params in a request + +Tell litellm to allow specific openai params in a request. Use this if you get a `litellm.UnsupportedParamsError` and want to allow a param. LiteLLM will pass the param as is to the model. + + + + + + +In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param. + +```python showLineNumbers title="Pass allowed_openai_params to LiteLLM Python SDK" +await litellm.acompletion( + model="azure/o_series/", + api_key="xxxxx", + api_base=api_base, + messages=[{"role": "user", "content": "Hello! return a json object"}], + tools=[{"type": "function", "function": {"name": "get_current_time", "description": "Get the current time in a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city name, e.g. San Francisco"}}, "required": ["location"]}}}] + allowed_openai_params=["tools"], +) +``` + + + +When using litellm proxy you can pass `allowed_openai_params` in two ways: + +1. Dynamically pass `allowed_openai_params` in a request +2. 
Set `allowed_openai_params` on the config.yaml file for a specific model + +#### Dynamically pass allowed_openai_params in a request +In this example we pass `allowed_openai_params=["tools"]` to allow the `tools` param for a request sent to the model set on the proxy. + +```python showLineNumbers title="Dynamically pass allowed_openai_params in a request" +import openai +from openai import AsyncAzureOpenAI + +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={ + "allowed_openai_params": ["tools"] + } +) +``` + +#### Set allowed_openai_params on config.yaml + +You can also set `allowed_openai_params` on the config.yaml file for a specific model. This means that all requests to this deployment are allowed to pass in the `tools` param. + +```yaml showLineNumbers title="Set allowed_openai_params on config.yaml" +model_list: + - model_name: azure-o1-preview + litellm_params: + model: azure/o_series/ + api_key: xxxxx + api_base: https://openai-prod-test.openai.azure.com/openai/deployments/o1/chat/completions?api-version=2025-01-01-preview + allowed_openai_params: ["tools"] +``` + + \ No newline at end of file diff --git a/docs/my-website/docs/completion/prompt_caching.md b/docs/my-website/docs/completion/prompt_caching.md index 6fbf89bd6d..9447a11d52 100644 --- a/docs/my-website/docs/completion/prompt_caching.md +++ b/docs/my-website/docs/completion/prompt_caching.md @@ -4,7 +4,7 @@ import TabItem from '@theme/TabItem'; # Prompt Caching Supported Providers: -- OpenAI (`deepseek/`) +- OpenAI (`openai/`) - Anthropic API (`anthropic/`) - Bedrock (`bedrock/`, `bedrock/invoke/`, `bedrock/converse`) ([All models bedrock supports prompt caching on](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html)) - Deepseek API (`deepseek/`) @@ -505,4 +505,4 @@ curl -L -X GET 'http://0.0.0.0:4000/v1/model/info' \ -This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) \ No newline at end of file +This checks our maintained [model info/cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) diff --git a/docs/my-website/docs/completion/web_search.md b/docs/my-website/docs/completion/web_search.md new file mode 100644 index 0000000000..7a67dc265e --- /dev/null +++ b/docs/my-website/docs/completion/web_search.md @@ -0,0 +1,308 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Using Web Search + +Use web search with litellm + +| Feature | Details | +|---------|---------| +| Supported Endpoints | - `/chat/completions`
- `/responses` | +| Supported Providers | `openai` | +| LiteLLM Cost Tracking | ✅ Supported | +| LiteLLM Version | `v1.63.15-nightly` or higher | + + +## `/chat/completions` (litellm.completion) + +### Quick Start + + + + +```python showLineNumbers +from litellm import completion + +response = completion( + model="openai/gpt-4o-search-preview", + messages=[ + { + "role": "user", + "content": "What was a positive news story from today?", + } + ], +) +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gpt-4o-search-preview + litellm_params: + model: openai/gpt-4o-search-preview + api_key: os.environ/OPENAI_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```python showLineNumbers +from openai import OpenAI + +# Point to your proxy server +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gpt-4o-search-preview", + messages=[ + { + "role": "user", + "content": "What was a positive news story from today?" + } + ] +) +``` + + + +### Search context size + + + + +```python showLineNumbers +from litellm import completion + +# Customize search context size +response = completion( + model="openai/gpt-4o-search-preview", + messages=[ + { + "role": "user", + "content": "What was a positive news story from today?", + } + ], + web_search_options={ + "search_context_size": "low" # Options: "low", "medium" (default), "high" + } +) +``` + + + +```python showLineNumbers +from openai import OpenAI + +# Point to your proxy server +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000" +) + +# Customize search context size +response = client.chat.completions.create( + model="gpt-4o-search-preview", + messages=[ + { + "role": "user", + "content": "What was a positive news story from today?" + } + ], + web_search_options={ + "search_context_size": "low" # Options: "low", "medium" (default), "high" + } +) +``` + + + +## `/responses` (litellm.responses) + +### Quick Start + + + + +```python showLineNumbers +from litellm import responses + +response = responses( + model="openai/gpt-4o", + input=[ + { + "role": "user", + "content": "What was a positive news story from today?" + } + ], + tools=[{ + "type": "web_search_preview" # enables web search with default medium context size + }] +) +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```python showLineNumbers +from openai import OpenAI + +# Point to your proxy server +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000" +) + +response = client.responses.create( + model="gpt-4o", + tools=[{ + "type": "web_search_preview" + }], + input="What was a positive news story from today?", +) + +print(response.output_text) +``` + + + +### Search context size + + + + +```python showLineNumbers +from litellm import responses + +# Customize search context size +response = responses( + model="openai/gpt-4o", + input=[ + { + "role": "user", + "content": "What was a positive news story from today?" 
+ } + ], + tools=[{ + "type": "web_search_preview", + "search_context_size": "low" # Options: "low", "medium" (default), "high" + }] +) +``` + + + +```python showLineNumbers +from openai import OpenAI + +# Point to your proxy server +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000" +) + +# Customize search context size +response = client.responses.create( + model="gpt-4o", + tools=[{ + "type": "web_search_preview", + "search_context_size": "low" # Options: "low", "medium" (default), "high" + }], + input="What was a positive news story from today?", +) + +print(response.output_text) +``` + + + + + + + + +## Checking if a model supports web search + + + + +Use `litellm.supports_web_search(model="openai/gpt-4o-search-preview")` -> returns `True` if model can perform web searches + +```python showLineNumbers +assert litellm.supports_web_search(model="openai/gpt-4o-search-preview") == True +``` + + + + +1. Define OpenAI models in config.yaml + +```yaml +model_list: + - model_name: gpt-4o-search-preview + litellm_params: + model: openai/gpt-4o-search-preview + api_key: os.environ/OPENAI_API_KEY + model_info: + supports_web_search: True +``` + +2. Run proxy server + +```bash +litellm --config config.yaml +``` + +3. Call `/model_group/info` to check if a model supports web search + +```shell +curl -X 'GET' \ + 'http://localhost:4000/model_group/info' \ + -H 'accept: application/json' \ + -H 'x-api-key: sk-1234' +``` + +Expected Response + +```json showLineNumbers +{ + "data": [ + { + "model_group": "gpt-4o-search-preview", + "providers": ["openai"], + "max_tokens": 128000, + "supports_web_search": true, # 👈 supports_web_search is true + } + ] +} +``` + + + diff --git a/docs/my-website/docs/enterprise.md b/docs/my-website/docs/enterprise.md index 0306a5b452..706ca33714 100644 --- a/docs/my-website/docs/enterprise.md +++ b/docs/my-website/docs/enterprise.md @@ -1,3 +1,5 @@ +import Image from '@theme/IdealImage'; + # Enterprise For companies that need SSO, user management and professional support for LiteLLM Proxy @@ -7,6 +9,8 @@ Get free 7-day trial key [here](https://www.litellm.ai/#trial) Includes all enterprise features. + + [**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs) @@ -34,9 +38,9 @@ You can use our cloud product where we setup a dedicated instance for you. Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can’t solve your own infrastructure-related issues but we will guide you to fix them. -- 1 hour for Sev0 issues -- 6 hours for Sev1 -- 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday) +- 1 hour for Sev0 issues - 100% production traffic is failing +- 6 hours for Sev1 - <100% production traffic is failing +- 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday) - setup issues e.g. Redis working on our end, but not on your infrastructure. - 72h SLA for patching vulnerabilities in the software. 
**We can offer custom SLAs** based on your needs and the severity of the issue diff --git a/docs/my-website/docs/files_endpoints.md b/docs/my-website/docs/files_endpoints.md index 7e20982ff4..31a02d41a3 100644 --- a/docs/my-website/docs/files_endpoints.md +++ b/docs/my-website/docs/files_endpoints.md @@ -2,10 +2,12 @@ import TabItem from '@theme/TabItem'; import Tabs from '@theme/Tabs'; -# /files +# Provider Files Endpoints Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API. +Use this to call the provider's `/files` endpoints directly, in the OpenAI format. + ## Quick Start - Upload a File @@ -14,48 +16,105 @@ Files are used to upload documents that can be used with features like Assistant - Delete File - Get File Content + + -```bash -$ export OPENAI_API_KEY="sk-..." +1. Setup config.yaml -$ litellm - -# RUNNING on http://0.0.0.0:4000 +``` +# for /files endpoints +files_settings: + - custom_llm_provider: azure + api_base: https://exampleopenaiendpoint-production.up.railway.app + api_key: fake-key + api_version: "2023-03-15-preview" + - custom_llm_provider: openai + api_key: os.environ/OPENAI_API_KEY ``` -**Upload a File** +2. Start LiteLLM PROXY Server + ```bash -curl http://localhost:4000/v1/files \ - -H "Authorization: Bearer sk-1234" \ - -F purpose="fine-tune" \ - -F file="@mydata.jsonl" +litellm --config /path/to/config.yaml + +## RUNNING on http://0.0.0.0:4000 ``` -**List Files** -```bash -curl http://localhost:4000/v1/files \ - -H "Authorization: Bearer sk-1234" +3. Use OpenAI's /files endpoints + +Upload a File + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-...", + base_url="http://0.0.0.0:4000/v1" +) + +client.files.create( + file=wav_data, + purpose="user_data", + extra_body={"custom_llm_provider": "openai"} +) ``` -**Retrieve File Information** -```bash -curl http://localhost:4000/v1/files/file-abc123 \ - -H "Authorization: Bearer sk-1234" +List Files + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-...", + base_url="http://0.0.0.0:4000/v1" +) + +files = client.files.list(extra_body={"custom_llm_provider": "openai"}) +print("files=", files) ``` -**Delete File** -```bash -curl http://localhost:4000/v1/files/file-abc123 \ - -X DELETE \ - -H "Authorization: Bearer sk-1234" +Retrieve File Information + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-...", + base_url="http://0.0.0.0:4000/v1" +) + +file = client.files.retrieve(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"}) +print("file=", file) ``` -**Get File Content** -```bash -curl http://localhost:4000/v1/files/file-abc123/content \ - -H "Authorization: Bearer sk-1234" +Delete File + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-...", + base_url="http://0.0.0.0:4000/v1" +) + +response = client.files.delete(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"}) +print("delete response=", response) +``` + +Get File Content + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-...", + base_url="http://0.0.0.0:4000/v1" +) + +content = client.files.content(file_id="file-abc123", extra_body={"custom_llm_provider": "openai"}) +print("content=", content) ``` @@ -120,7 +179,7 @@ print("file content=", content) ### [OpenAI](#quick-start) -## [Azure OpenAI](./providers/azure#azure-batches-api) +### [Azure OpenAI](./providers/azure#azure-batches-api) ### [Vertex AI](./providers/vertex#batch-apis) diff --git 
a/docs/my-website/docs/guides/security_settings.md b/docs/my-website/docs/guides/security_settings.md new file mode 100644 index 0000000000..4dfeda2d70 --- /dev/null +++ b/docs/my-website/docs/guides/security_settings.md @@ -0,0 +1,66 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# SSL Security Settings + +If you're in an environment using an older TTS bundle, with an older encryption, follow this guide. + + +LiteLLM uses HTTPX for network requests, unless otherwise specified. + +1. Disable SSL verification + + + + + +```python +import litellm +litellm.ssl_verify = False +``` + + + +```yaml +litellm_settings: + ssl_verify: false +``` + + + + +```bash +export SSL_VERIFY="False" +``` + + + +2. Lower security settings + + + + +```python +import litellm +litellm.ssl_security_level = 1 +litellm.ssl_certificate = "/path/to/certificate.pem" +``` + + + +```yaml +litellm_settings: + ssl_security_level: 1 + ssl_certificate: "/path/to/certificate.pem" +``` + + + +```bash +export SSL_SECURITY_LEVEL="1" +export SSL_CERTIFICATE="/path/to/certificate.pem" +``` + + + + diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index dd3be587b5..9e4d76b89c 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -111,8 +111,8 @@ from litellm import completion import os # auth: run 'gcloud auth application-default' -os.environ["VERTEX_PROJECT"] = "hardy-device-386718" -os.environ["VERTEX_LOCATION"] = "us-central1" +os.environ["VERTEXAI_PROJECT"] = "hardy-device-386718" +os.environ["VERTEXAI_LOCATION"] = "us-central1" response = completion( model="vertex_ai/gemini-1.5-pro", diff --git a/docs/my-website/docs/mcp.md b/docs/my-website/docs/mcp.md new file mode 100644 index 0000000000..0947c494c7 --- /dev/null +++ b/docs/my-website/docs/mcp.md @@ -0,0 +1,427 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import Image from '@theme/IdealImage'; + +# /mcp [BETA] - Model Context Protocol + +## Expose MCP tools on LiteLLM Proxy Server + +This allows you to define tools that can be called by any MCP compatible client. Define your `mcp_servers` with LiteLLM and all your clients can list and call available tools. + + +

+ LiteLLM MCP Architecture: Use MCP tools with all LiteLLM supported models +

+ +#### How it works + +LiteLLM exposes the following MCP endpoints: + +- `/mcp/tools/list` - List all available tools +- `/mcp/tools/call` - Call a specific tool with the provided arguments + +When MCP clients connect to LiteLLM they can follow this workflow: + +1. Connect to the LiteLLM MCP server +2. List all available tools on LiteLLM +3. Client makes LLM API request with tool call(s) +4. LLM API returns which tools to call and with what arguments +5. MCP client makes MCP tool calls to LiteLLM +6. LiteLLM makes the tool calls to the appropriate MCP server +7. LiteLLM returns the tool call results to the MCP client + +#### Usage + +#### 1. Define your tools on under `mcp_servers` in your config.yaml file. + +LiteLLM allows you to define your tools on the `mcp_servers` section in your config.yaml file. All tools listed here will be available to MCP clients (when they connect to LiteLLM and call `list_tools`). + +```yaml title="config.yaml" showLineNumbers +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: sk-xxxxxxx + +mcp_servers: + { + "zapier_mcp": { + "url": "https://actions.zapier.com/mcp/sk-akxxxxx/sse" + }, + "fetch": { + "url": "http://localhost:8000/sse" + } + } +``` + + +#### 2. Start LiteLLM Gateway + + + + +```shell title="Docker Run" showLineNumbers +docker run -d \ + -p 4000:4000 \ + -e OPENAI_API_KEY=$OPENAI_API_KEY \ + --name my-app \ + -v $(pwd)/my_config.yaml:/app/config.yaml \ + my-app:latest \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug \ +``` + + + + + +```shell title="litellm pip" showLineNumbers +litellm --config config.yaml --detailed_debug +``` + + + + + +#### 3. Make an LLM API request + +In this example we will do the following: + +1. Use MCP client to list MCP tools on LiteLLM Proxy +2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools +3. Provide the MCP tools to `gpt-4o` +4. Handle tool call from `gpt-4o` +5. Convert OpenAI tool call to MCP tool call +6. Execute tool call on MCP server + +```python title="MCP Client List Tools" showLineNumbers +import asyncio +from openai import AsyncOpenAI +from openai.types.chat import ChatCompletionUserMessageParam +from mcp import ClientSession +from mcp.client.sse import sse_client +from litellm.experimental_mcp_client.tools import ( + transform_mcp_tool_to_openai_tool, + transform_openai_tool_call_request_to_mcp_tool_call_request, +) + + +async def main(): + # Initialize clients + + # point OpenAI client to LiteLLM Proxy + client = AsyncOpenAI(api_key="sk-1234", base_url="http://localhost:4000") + + # Point MCP client to LiteLLM Proxy + async with sse_client("http://localhost:4000/mcp/") as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # 1. List MCP tools on LiteLLM Proxy + mcp_tools = await session.list_tools() + print("List of MCP tools for MCP server:", mcp_tools.tools) + + # Create message + messages = [ + ChatCompletionUserMessageParam( + content="Send an email about LiteLLM supporting MCP", role="user" + ) + ] + + # 2. Use `transform_mcp_tool_to_openai_tool` to convert MCP tools to OpenAI tools + # Since OpenAI only supports tools in the OpenAI format, we need to convert the MCP tools to the OpenAI format. + openai_tools = [ + transform_mcp_tool_to_openai_tool(tool) for tool in mcp_tools.tools + ] + + # 3. 
Provide the MCP tools to `gpt-4o` + response = await client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=openai_tools, + tool_choice="auto", + ) + + # 4. Handle tool call from `gpt-4o` + if response.choices[0].message.tool_calls: + tool_call = response.choices[0].message.tool_calls[0] + if tool_call: + + # 5. Convert OpenAI tool call to MCP tool call + # Since MCP servers expect tools in the MCP format, we need to convert the OpenAI tool call to the MCP format. + # This is done using litellm.experimental_mcp_client.tools.transform_openai_tool_call_request_to_mcp_tool_call_request + mcp_call = ( + transform_openai_tool_call_request_to_mcp_tool_call_request( + openai_tool=tool_call.model_dump() + ) + ) + + # 6. Execute tool call on MCP server + result = await session.call_tool( + name=mcp_call.name, arguments=mcp_call.arguments + ) + + print("Result:", result) + + +# Run it +asyncio.run(main()) +``` + +## LiteLLM Python SDK MCP Bridge + +LiteLLM Python SDK acts as a MCP bridge to utilize MCP tools with all LiteLLM supported models. LiteLLM offers the following features for using MCP + +- **List** Available MCP Tools: OpenAI clients can view all available MCP tools + - `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools +- **Call** MCP Tools: OpenAI clients can call MCP tools + - `litellm.experimental_mcp_client.call_openai_tool` to call an OpenAI tool on an MCP server + + +### 1. List Available MCP Tools + +In this example we'll use `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools on any MCP server. This method can be used in two ways: + +- `format="mcp"` - (default) Return MCP tools + - Returns: `mcp.types.Tool` +- `format="openai"` - Return MCP tools converted to OpenAI API compatible tools. Allows using with OpenAI endpoints. + - Returns: `openai.types.chat.ChatCompletionToolParam` + + + + +```python title="MCP Client List Tools" showLineNumbers +# Create server parameters for stdio connection +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +import os +import litellm +from litellm import experimental_mcp_client + + +server_params = StdioServerParameters( + command="python3", + # Make sure to update to the full absolute path to your mcp_server.py file + args=["./mcp_server.py"], +) + +async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + # Initialize the connection + await session.initialize() + + # Get tools + tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai") + print("MCP TOOLS: ", tools) + + messages = [{"role": "user", "content": "what's (3 + 5)"}] + llm_response = await litellm.acompletion( + model="gpt-4o", + api_key=os.getenv("OPENAI_API_KEY"), + messages=messages, + tools=tools, + ) + print("LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str)) +``` + + + + + +In this example we'll walk through how you can use the OpenAI SDK pointed to the LiteLLM proxy to call MCP tools. 
The key difference here is we use the OpenAI SDK to make the LLM API request + +```python title="MCP Client List Tools" showLineNumbers +# Create server parameters for stdio connection +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +import os +from openai import OpenAI +from litellm import experimental_mcp_client + +server_params = StdioServerParameters( + command="python3", + # Make sure to update to the full absolute path to your mcp_server.py file + args=["./mcp_server.py"], +) + +async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + # Initialize the connection + await session.initialize() + + # Get tools using litellm mcp client + tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai") + print("MCP TOOLS: ", tools) + + # Use OpenAI SDK pointed to LiteLLM proxy + client = OpenAI( + api_key="your-api-key", # Your LiteLLM proxy API key + base_url="http://localhost:4000" # Your LiteLLM proxy URL + ) + + messages = [{"role": "user", "content": "what's (3 + 5)"}] + llm_response = client.chat.completions.create( + model="gpt-4", + messages=messages, + tools=tools + ) + print("LLM RESPONSE: ", llm_response) +``` + + + + +### 2. List and Call MCP Tools + +In this example we'll use +- `litellm.experimental_mcp_client.load_mcp_tools` to list all available MCP tools on any MCP server +- `litellm.experimental_mcp_client.call_openai_tool` to call an OpenAI tool on an MCP server + +The first llm response returns a list of OpenAI tools. We take the first tool call from the LLM response and pass it to `litellm.experimental_mcp_client.call_openai_tool` to call the tool on the MCP server. + +#### How `litellm.experimental_mcp_client.call_openai_tool` works + +- Accepts an OpenAI Tool Call from the LLM response +- Converts the OpenAI Tool Call to an MCP Tool +- Calls the MCP Tool on the MCP server +- Returns the result of the MCP Tool call + + + + +```python title="MCP Client List and Call Tools" showLineNumbers +# Create server parameters for stdio connection +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +import os +import litellm +from litellm import experimental_mcp_client + + +server_params = StdioServerParameters( + command="python3", + # Make sure to update to the full absolute path to your mcp_server.py file + args=["./mcp_server.py"], +) + +async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + # Initialize the connection + await session.initialize() + + # Get tools + tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai") + print("MCP TOOLS: ", tools) + + messages = [{"role": "user", "content": "what's (3 + 5)"}] + llm_response = await litellm.acompletion( + model="gpt-4o", + api_key=os.getenv("OPENAI_API_KEY"), + messages=messages, + tools=tools, + ) + print("LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str)) + + openai_tool = llm_response["choices"][0]["message"]["tool_calls"][0] + # Call the tool using MCP client + call_result = await experimental_mcp_client.call_openai_tool( + session=session, + openai_tool=openai_tool, + ) + print("MCP TOOL CALL RESULT: ", call_result) + + # send the tool result to the LLM + messages.append(llm_response["choices"][0]["message"]) + messages.append( + { + "role": "tool", + "content": str(call_result.content[0].text), + "tool_call_id": openai_tool["id"], + } + ) + 
print("final messages with tool result: ", messages) + llm_response = await litellm.acompletion( + model="gpt-4o", + api_key=os.getenv("OPENAI_API_KEY"), + messages=messages, + tools=tools, + ) + print( + "FINAL LLM RESPONSE: ", json.dumps(llm_response, indent=4, default=str) + ) +``` + + + + +In this example we'll walk through how you can use the OpenAI SDK pointed to the LiteLLM proxy to call MCP tools. The key difference here is we use the OpenAI SDK to make the LLM API request + +```python title="MCP Client with OpenAI SDK" showLineNumbers +# Create server parameters for stdio connection +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +import os +from openai import OpenAI +from litellm import experimental_mcp_client + +server_params = StdioServerParameters( + command="python3", + # Make sure to update to the full absolute path to your mcp_server.py file + args=["./mcp_server.py"], +) + +async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + # Initialize the connection + await session.initialize() + + # Get tools using litellm mcp client + tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai") + print("MCP TOOLS: ", tools) + + # Use OpenAI SDK pointed to LiteLLM proxy + client = OpenAI( + api_key="your-api-key", # Your LiteLLM proxy API key + base_url="http://localhost:8000" # Your LiteLLM proxy URL + ) + + messages = [{"role": "user", "content": "what's (3 + 5)"}] + llm_response = client.chat.completions.create( + model="gpt-4", + messages=messages, + tools=tools + ) + print("LLM RESPONSE: ", llm_response) + + # Get the first tool call + tool_call = llm_response.choices[0].message.tool_calls[0] + + # Call the tool using MCP client + call_result = await experimental_mcp_client.call_openai_tool( + session=session, + openai_tool=tool_call.model_dump(), + ) + print("MCP TOOL CALL RESULT: ", call_result) + + # Send the tool result back to the LLM + messages.append(llm_response.choices[0].message.model_dump()) + messages.append({ + "role": "tool", + "content": str(call_result.content[0].text), + "tool_call_id": tool_call.id, + }) + + final_response = client.chat.completions.create( + model="gpt-4", + messages=messages, + tools=tools + ) + print("FINAL RESPONSE: ", final_response) +``` + + + \ No newline at end of file diff --git a/docs/my-website/docs/observability/agentops_integration.md b/docs/my-website/docs/observability/agentops_integration.md new file mode 100644 index 0000000000..e0599fab70 --- /dev/null +++ b/docs/my-website/docs/observability/agentops_integration.md @@ -0,0 +1,83 @@ +# 🖇️ AgentOps - LLM Observability Platform + +:::tip + +This is community maintained. Please make an issue if you run into a bug: +https://github.com/BerriAI/litellm + +::: + +[AgentOps](https://docs.agentops.ai) is an observability platform that enables tracing and monitoring of LLM calls, providing detailed insights into your AI operations. + +## Using AgentOps with LiteLLM + +LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily integrate AgentOps for comprehensive tracing and monitoring of your LLM operations. 
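+
+For example, a minimal setup that reports both successful and failed LLM calls to AgentOps (a sketch; adjust to the callback types your AgentOps version supports):
+
+```python
+import litellm
+
+# trace successful LLM calls with AgentOps
+litellm.success_callback = ["agentops"]
+# also capture failed / errored LLM calls
+litellm.failure_callback = ["agentops"]
+```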
+ +### Integration + +Use just a few lines of code to instantly trace your responses **across all providers** with AgentOps: +Get your AgentOps API Keys from https://app.agentops.ai/ +```python +import litellm + +# Configure LiteLLM to use AgentOps +litellm.success_callback = ["agentops"] + +# Make your LLM calls as usual +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hello, how are you?"}], +) +``` + +Complete Code: + +```python +import os +from litellm import completion + +# Set env variables +os.environ["OPENAI_API_KEY"] = "your-openai-key" +os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key" + +# Configure LiteLLM to use AgentOps +litellm.success_callback = ["agentops"] + +# OpenAI call +response = completion( + model="gpt-4", + messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}], +) + +print(response) +``` + +### Configuration Options + +The AgentOps integration can be configured through environment variables: + +- `AGENTOPS_API_KEY` (str, optional): Your AgentOps API key +- `AGENTOPS_ENVIRONMENT` (str, optional): Deployment environment (defaults to "production") +- `AGENTOPS_SERVICE_NAME` (str, optional): Service name for tracing (defaults to "agentops") + +### Advanced Usage + +You can configure additional settings through environment variables: + +```python +import os + +# Configure AgentOps settings +os.environ["AGENTOPS_API_KEY"] = "your-agentops-api-key" +os.environ["AGENTOPS_ENVIRONMENT"] = "staging" +os.environ["AGENTOPS_SERVICE_NAME"] = "my-service" + +# Enable AgentOps tracing +litellm.success_callback = ["agentops"] +``` + +### Support + +For issues or questions, please refer to: +- [AgentOps Documentation](https://docs.agentops.ai) +- [LiteLLM Documentation](https://docs.litellm.ai) \ No newline at end of file diff --git a/docs/my-website/docs/observability/arize_integration.md b/docs/my-website/docs/observability/arize_integration.md index 1cd36a1111..a654a1b4de 100644 --- a/docs/my-website/docs/observability/arize_integration.md +++ b/docs/my-website/docs/observability/arize_integration.md @@ -1,4 +1,7 @@ + import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; # Arize AI @@ -11,6 +14,8 @@ https://github.com/BerriAI/litellm ::: + + ## Pre-Requisites @@ -24,7 +29,9 @@ You can also use the instrumentor option instead of the callback, which you can ```python litellm.callbacks = ["arize"] ``` + ```python + import litellm import os @@ -48,7 +55,7 @@ response = litellm.completion( ### Using with LiteLLM Proxy - +1. Setup config.yaml ```yaml model_list: - model_name: gpt-4 @@ -60,13 +67,134 @@ model_list: litellm_settings: callbacks: ["arize"] +general_settings: + master_key: "sk-1234" # can also be set as an environment variable + environment_variables: ARIZE_SPACE_KEY: "d0*****" ARIZE_API_KEY: "141a****" ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint - ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT + ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc) ``` +2. Start the proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! 
+ +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}' +``` + +## Pass Arize Space/Key per-request + +Supported parameters: +- `arize_api_key` +- `arize_space_key` + + + + +```python +import litellm +import os + +# LLM API Keys +os.environ['OPENAI_API_KEY']="" + +# set arize as a callback, litellm will send the data to arize +litellm.callbacks = ["arize"] + +# openai call +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "Hi 👋 - i'm openai"} + ], + arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"), + arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"), +) +``` + + + + +1. Setup config.yaml +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + +litellm_settings: + callbacks: ["arize"] + +general_settings: + master_key: "sk-1234" # can also be set as an environment variable +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + + + + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-4", + "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}], + "arize_api_key": "ARIZE_SPACE_2_API_KEY", + "arize_space_key": "ARIZE_SPACE_2_KEY" +}' +``` + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create( + model="gpt-3.5-turbo", + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={ + "arize_api_key": "ARIZE_SPACE_2_API_KEY", + "arize_space_key": "ARIZE_SPACE_2_KEY" + } +) + +print(response) +``` + + + + + ## Support & Talk to Founders - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) diff --git a/docs/my-website/docs/pass_through/cohere.md b/docs/my-website/docs/pass_through/cohere.md index 87eabd462c..227ff5777a 100644 --- a/docs/my-website/docs/pass_through/cohere.md +++ b/docs/my-website/docs/pass_through/cohere.md @@ -4,7 +4,7 @@ Pass-through endpoints for Cohere - call provider-specific endpoint, in native f | Feature | Supported | Notes | |-------|-------|-------| -| Cost Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | +| Cost Tracking | ✅ | Supported for `/v1/chat`, and `/v2/chat` | | Logging | ✅ | works across all integrations | | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | | Streaming | ✅ | | diff --git a/docs/my-website/docs/pass_through/mistral.md b/docs/my-website/docs/pass_through/mistral.md new file mode 100644 index 0000000000..ee7ca800c4 --- /dev/null +++ b/docs/my-website/docs/pass_through/mistral.md @@ -0,0 +1,217 @@ +# Mistral + +Pass-through endpoints for Mistral - call provider-specific endpoint, in native format (no translation). 
+ +| Feature | Supported | Notes | +|-------|-------|-------| +| Cost Tracking | ❌ | Not supported | +| Logging | ✅ | works across all integrations | +| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | +| Streaming | ✅ | | + +Just replace `https://api.mistral.ai/v1` with `LITELLM_PROXY_BASE_URL/mistral` 🚀 + +#### **Example Usage** + +```bash +curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "mistral-ocr-latest", + "document": { + "type": "image_url", + "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png" + } + +}' +``` + +Supports **ALL** Mistral Endpoints (including streaming). + +## Quick Start + +Let's call the Mistral [`/chat/completions` endpoint](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post) + +1. Add MISTRAL_API_KEY to your environment + +```bash +export MISTRAL_API_KEY="sk-1234" +``` + +2. Start LiteLLM Proxy + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! + +Let's call the Mistral `/ocr` endpoint + +```bash +curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "mistral-ocr-latest", + "document": { + "type": "image_url", + "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png" + } + +}' +``` + + +## Examples + +Anything after `http://0.0.0.0:4000/mistral` is treated as a provider-specific route, and handled accordingly. + +Key Changes: + +| **Original Endpoint** | **Replace With** | +|------------------------------------------------------|-----------------------------------| +| `https://api.mistral.ai/v1` | `http://0.0.0.0:4000/mistral` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") | +| `bearer $MISTRAL_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) | + + +### **Example 1: OCR endpoint** + +#### LiteLLM Proxy Call + +```bash +curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/ocr' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_API_KEY' \ +-d '{ + "model": "mistral-ocr-latest", + "document": { + "type": "image_url", + "image_url": "https://raw.githubusercontent.com/mistralai/cookbook/refs/heads/main/mistral/ocr/receipt.png" + } +}' +``` + + +#### Direct Mistral API Call + +```bash +curl https://api.mistral.ai/v1/ocr \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${MISTRAL_API_KEY}" \ + -d '{ + "model": "mistral-ocr-latest", + "document": { + "type": "document_url", + "document_url": "https://arxiv.org/pdf/2201.04234" + }, + "include_image_base64": true + }' +``` + +### **Example 2: Chat API** + +#### LiteLLM Proxy Call + +```bash +curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +-d '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" + } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "mistral-large-latest", +}' +``` + +#### Direct Mistral API Call + +```bash +curl -L -X POST 'https://api.mistral.ai/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-d '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" 
+ } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "mistral-large-latest", +}' +``` + + +## Advanced - Use with Virtual Keys + +Pre-requisites +- [Setup proxy with DB](../proxy/virtual_keys.md#setup) + +Use this, to avoid giving developers the raw Mistral API key, but still letting them use Mistral endpoints. + +### Usage + +1. Setup environment + +```bash +export DATABASE_URL="" +export LITELLM_MASTER_KEY="" +export MISTRAL_API_BASE="" +``` + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +2. Generate virtual key + +```bash +curl -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{}' +``` + +Expected Response + +```bash +{ + ... + "key": "sk-1234ewknldferwedojwojw" +} +``` + +3. Test it! + + +```bash +curl -L -X POST 'http://0.0.0.0:4000/mistral/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \ + --data '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" + } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "qwen2.5-7b-instruct", +}' +``` \ No newline at end of file diff --git a/docs/my-website/docs/pass_through/vertex_ai.md b/docs/my-website/docs/pass_through/vertex_ai.md index ce366af541..f40dfa70eb 100644 --- a/docs/my-website/docs/pass_through/vertex_ai.md +++ b/docs/my-website/docs/pass_through/vertex_ai.md @@ -13,8 +13,102 @@ Pass-through endpoints for Vertex AI - call provider-specific endpoint, in nativ | End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | | Streaming | ✅ | | +## Supported Endpoints + +LiteLLM supports 2 vertex ai passthrough routes: + +1. `/vertex_ai` → routes to `https://{vertex_location}-aiplatform.googleapis.com/` +2. `/vertex_ai/discovery` → routes to [`https://discoveryengine.googleapis.com`](https://discoveryengine.googleapis.com/) + +## How to use + Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE_URL/vertex_ai` +LiteLLM supports 3 flows for calling Vertex AI endpoints via pass-through: + +1. **Specific Credentials**: Admin sets passthrough credentials for a specific project/region. + +2. **Default Credentials**: Admin sets default credentials. + +3. **Client-Side Credentials**: User can send client-side credentials through to Vertex AI (default behavior - if no default or mapped credentials are found, the request is passed through directly). 
+ + +## Example Usage + + + + +```yaml +model_list: + - model_name: gemini-1.0-pro + litellm_params: + model: vertex_ai/gemini-1.0-pro + vertex_project: adroit-crow-413218 + vertex_region: us-central1 + vertex_credentials: /path/to/credentials.json + use_in_pass_through: true # 👈 KEY CHANGE +``` + + + + + + + +```yaml +default_vertex_config: + vertex_project: adroit-crow-413218 + vertex_region: us-central1 + vertex_credentials: /path/to/credentials.json +``` + + + +```bash +export DEFAULT_VERTEXAI_PROJECT="adroit-crow-413218" +export DEFAULT_VERTEXAI_LOCATION="us-central1" +export DEFAULT_GOOGLE_APPLICATION_CREDENTIALS="/path/to/credentials.json" +``` + + + + + + +Try Gemini 2.0 Flash (curl) + +``` +MODEL_ID="gemini-2.0-flash-001" +PROJECT_ID="YOUR_PROJECT_ID" +``` + +```bash +curl \ + -X POST \ + -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \ + -H "Content-Type: application/json" \ + "${LITELLM_PROXY_BASE_URL}/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/${MODEL_ID}:streamGenerateContent" -d \ + $'{ + "contents": { + "role": "user", + "parts": [ + { + "fileData": { + "mimeType": "image/png", + "fileUri": "gs://generativeai-downloads/images/scones.jpg" + } + }, + { + "text": "Describe this picture." + } + ] + } + }' +``` + + + + #### **Example Usage** @@ -22,7 +116,7 @@ Just replace `https://REGION-aiplatform.googleapis.com` with `LITELLM_PROXY_BASE ```bash -curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \ +curl http://localhost:4000/vertex_ai/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/${MODEL_ID}:generateContent \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{ @@ -101,7 +195,7 @@ litellm Let's call the Google AI Studio token counting endpoint ```bash -curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \ +curl http://localhost:4000/vertex-ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \ -H "Content-Type: application/json" \ -H "Authorization: Bearer sk-1234" \ -d '{ @@ -140,7 +234,7 @@ LiteLLM Proxy Server supports two methods of authentication to Vertex AI: ```shell -curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:generateContent \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.5-flash-001:generateContent \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}' @@ -152,7 +246,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-0 ```shell -curl http://localhost:4000/vertex_ai/publishers/google/models/textembedding-gecko@001:predict \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/textembedding-gecko@001:predict \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"instances":[{"content": "gm"}]}' @@ -162,7 +256,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/textembedding-geck ### Imagen API ```shell -curl http://localhost:4000/vertex_ai/publishers/google/models/imagen-3.0-generate-001:predict \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/imagen-3.0-generate-001:predict \ -H "Content-Type: 
application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"instances":[{"prompt": "make an otter"}], "parameters": {"sampleCount": 1}}' @@ -172,7 +266,7 @@ curl http://localhost:4000/vertex_ai/publishers/google/models/imagen-3.0-generat ### Count Tokens API ```shell -curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.5-flash-001:countTokens \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.5-flash-001:countTokens \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{"contents":[{"role": "user", "parts":[{"text": "hi"}]}]}' @@ -183,7 +277,7 @@ Create Fine Tuning Job ```shell -curl http://localhost:4000/vertex_ai/tuningJobs \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.5-flash-001:tuningJobs \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{ @@ -243,7 +337,7 @@ Expected Response ```bash -curl http://localhost:4000/vertex_ai/publishers/google/models/gemini-1.0-pro:generateContent \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -d '{ @@ -268,7 +362,7 @@ tags: ["vertex-js-sdk", "pass-through-endpoint"] ```bash -curl http://localhost:4000/vertex-ai/publishers/google/models/gemini-1.0-pro:generateContent \ +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-central1/publishers/google/models/gemini-1.0-pro:generateContent \ -H "Content-Type: application/json" \ -H "x-litellm-api-key: Bearer sk-1234" \ -H "tags: vertex-js-sdk,pass-through-endpoint" \ diff --git a/docs/my-website/docs/pass_through/vllm.md b/docs/my-website/docs/pass_through/vllm.md new file mode 100644 index 0000000000..b267622948 --- /dev/null +++ b/docs/my-website/docs/pass_through/vllm.md @@ -0,0 +1,185 @@ +# VLLM + +Pass-through endpoints for VLLM - call provider-specific endpoint, in native format (no translation). + +| Feature | Supported | Notes | +|-------|-------|-------| +| Cost Tracking | ❌ | Not supported | +| Logging | ✅ | works across all integrations | +| End-user Tracking | ❌ | [Tell us if you need this](https://github.com/BerriAI/litellm/issues/new) | +| Streaming | ✅ | | + +Just replace `https://my-vllm-server.com` with `LITELLM_PROXY_BASE_URL/vllm` 🚀 + +#### **Example Usage** + +```bash +curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +``` + +Supports **ALL** VLLM Endpoints (including streaming). + +## Quick Start + +Let's call the VLLM [`/metrics` endpoint](https://vllm.readthedocs.io/en/latest/api_reference/api_reference.html) + +1. Add HOSTED VLLM API BASE to your environment + +```bash +export HOSTED_VLLM_API_BASE="https://my-vllm-server.com" +``` + +2. Start LiteLLM Proxy + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! + +Let's call the VLLM `/metrics` endpoint + +```bash +curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +``` + + +## Examples + +Anything after `http://0.0.0.0:4000/vllm` is treated as a provider-specific route, and handled accordingly. 
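+
+For example, a streaming chat request is forwarded as-is (a sketch; it assumes your vLLM server exposes the OpenAI-compatible `/chat/completions` route shown in Example 2 below):
+
+```bash
+curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \
+-H 'Content-Type: application/json' \
+-H 'Authorization: Bearer sk-1234' \
+-d '{
+    "model": "qwen2.5-7b-instruct",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "stream": true
+}'
+```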
+ +Key Changes: + +| **Original Endpoint** | **Replace With** | +|------------------------------------------------------|-----------------------------------| +| `https://my-vllm-server.com` | `http://0.0.0.0:4000/vllm` (LITELLM_PROXY_BASE_URL="http://0.0.0.0:4000") | +| `bearer $VLLM_API_KEY` | `bearer anything` (use `bearer LITELLM_VIRTUAL_KEY` if Virtual Keys are setup on proxy) | + + +### **Example 1: Metrics endpoint** + +#### LiteLLM Proxy Call + +```bash +curl -L -X GET 'http://0.0.0.0:4000/vllm/metrics' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +``` + + +#### Direct VLLM API Call + +```bash +curl -L -X GET 'https://my-vllm-server.com/metrics' \ +-H 'Content-Type: application/json' \ +``` + +### **Example 2: Chat API** + +#### LiteLLM Proxy Call + +```bash +curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +-d '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" + } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "qwen2.5-7b-instruct", +}' +``` + +#### Direct VLLM API Call + +```bash +curl -L -X POST 'https://my-vllm-server.com/chat/completions' \ +-H 'Content-Type: application/json' \ +-d '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" + } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "qwen2.5-7b-instruct", +}' +``` + + +## Advanced - Use with Virtual Keys + +Pre-requisites +- [Setup proxy with DB](../proxy/virtual_keys.md#setup) + +Use this, to avoid giving developers the raw Cohere API key, but still letting them use Cohere endpoints. + +### Usage + +1. Setup environment + +```bash +export DATABASE_URL="" +export LITELLM_MASTER_KEY="" +export HOSTED_VLLM_API_BASE="" +``` + +```bash +litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +2. Generate virtual key + +```bash +curl -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{}' +``` + +Expected Response + +```bash +{ + ... + "key": "sk-1234ewknldferwedojwojw" +} +``` + +3. Test it! + + +```bash +curl -L -X POST 'http://0.0.0.0:4000/vllm/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234ewknldferwedojwojw' \ + --data '{ + "messages": [ + { + "role": "user", + "content": "I am going to Paris, what should I see?" + } + ], + "max_tokens": 2048, + "temperature": 0.8, + "top_p": 0.1, + "model": "qwen2.5-7b-instruct", +}' +``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index 55e9ba10d3..9e4f6908a4 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -821,6 +821,14 @@ print(f"\nResponse: {resp}") ## Usage - Thinking / `reasoning_content` +LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. 
[Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298) + +| reasoning_effort | thinking | +| ---------------- | -------- | +| "low" | "budget_tokens": 1024 | +| "medium" | "budget_tokens": 2048 | +| "high" | "budget_tokens": 4096 | + @@ -830,7 +838,7 @@ from litellm import completion resp = completion( model="anthropic/claude-3-7-sonnet-20250219", messages=[{"role": "user", "content": "What is the capital of France?"}], - thinking={"type": "enabled", "budget_tokens": 1024}, + reasoning_effort="low", ) ``` @@ -863,7 +871,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ -d '{ "model": "claude-3-7-sonnet-20250219", "messages": [{"role": "user", "content": "What is the capital of France?"}], - "thinking": {"type": "enabled", "budget_tokens": 1024} + "reasoning_effort": "low" }' ``` @@ -927,6 +935,44 @@ ModelResponse( ) ``` +### Pass `thinking` to Anthropic models + +You can also pass the `thinking` parameter to Anthropic models. + + +You can also pass the `thinking` parameter to Anthropic models. + + + + +```python +response = litellm.completion( + model="anthropic/claude-3-7-sonnet-20250219", + messages=[{"role": "user", "content": "What is the capital of France?"}], + thinking={"type": "enabled", "budget_tokens": 1024}, +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "anthropic/claude-3-7-sonnet-20250219", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "thinking": {"type": "enabled", "budget_tokens": 1024} + }' +``` + + + + + + + ## **Passing Extra Headers to Anthropic API** Pass `extra_headers: dict` to `litellm.completion` @@ -1035,8 +1081,10 @@ response = completion( "content": [ {"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."}, { - "type": "image_url", - "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + } }, ], } @@ -1081,8 +1129,10 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "text": "You are a very professional document summarization specialist. Please summarize the given document" }, { - "type": "image_url", - "image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + } } } ] diff --git a/docs/my-website/docs/providers/azure.md b/docs/my-website/docs/providers/azure.md index 111738a449..e58d8a7b5d 100644 --- a/docs/my-website/docs/providers/azure.md +++ b/docs/my-website/docs/providers/azure.md @@ -291,14 +291,15 @@ response = completion( ) ``` -## Azure O1 Models +## O-Series Models -| Model Name | Function Call | -|---------------------|----------------------------------------------------| -| o1-mini | `response = completion(model="azure/", messages=messages)` | -| o1-preview | `response = completion(model="azure/", messages=messages)` | +Azure OpenAI O-Series models are supported on LiteLLM. -Set `litellm.enable_preview_features = True` to use Azure O1 Models with streaming support. +LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic. 
+ +To set this explicitly, set `model` to `azure/o_series/`. + +**Automatic Routing** @@ -306,60 +307,112 @@ Set `litellm.enable_preview_features = True` to use Azure O1 Models with streami ```python import litellm -litellm.enable_preview_features = True # 👈 KEY CHANGE - -response = litellm.completion( - model="azure/", - messages=[{"role": "user", "content": "What is the weather like in Boston?"}], - stream=True -) - -for chunk in response: - print(chunk) +litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name ``` - + -1. Setup config.yaml ```yaml model_list: - - model_name: o1-mini + - model_name: o3-mini litellm_params: - model: azure/o1-mini - api_base: "os.environ/AZURE_API_BASE" - api_key: "os.environ/AZURE_API_KEY" - api_version: "os.environ/AZURE_API_VERSION" - -litellm_settings: - enable_preview_features: true # 👈 KEY CHANGE + model: azure/o3-model + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY ``` -2. Start proxy + + + +**Explicit Routing** + + + + +```python +import litellm + +litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name +``` + + + +```yaml +model_list: + - model_name: o3-mini + litellm_params: + model: azure/o_series/my-random-deployment-name + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY +``` + + + + +## Azure Audio Model + + + + +```python +from litellm import completion +import os + +os.environ["AZURE_API_KEY"] = "" +os.environ["AZURE_API_BASE"] = "" +os.environ["AZURE_API_VERSION"] = "" + +response = completion( + model="azure/azure-openai-4o-audio", + messages=[ + { + "role": "user", + "content": "I want to try out speech to speech" + } + ], + modalities=["text","audio"], + audio={"voice": "alloy", "format": "wav"} +) + +print(response) +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: azure-openai-4o-audio + litellm_params: + model: azure/azure-openai-4o-audio + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: os.environ/AZURE_API_VERSION +``` + +2. Start proxy ```bash litellm --config /path/to/config.yaml ``` -3. Test it +3. Test it! 
-```python -import openai -client = openai.OpenAI( - api_key="anything", - base_url="http://0.0.0.0:4000" -) -response = client.chat.completions.create(model="o1-mini", messages = [ - { - "role": "user", - "content": "this is a test request, write a short poem" - } -], -stream=True) - -for chunk in response: - print(chunk) +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "azure-openai-4o-audio", + "messages": [{"role": "user", "content": "I want to try out speech to speech"}], + "modalities": ["text","audio"], + "audio": {"voice": "alloy", "format": "wav"} + }' ``` + + @@ -425,7 +478,7 @@ response.stream_to_file(speech_file_path) ## **Authentication** -### Entrata ID - use `azure_ad_token` +### Entra ID - use `azure_ad_token` This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls @@ -492,7 +545,7 @@ model_list: -### Entrata ID - use tenant_id, client_id, client_secret +### Entra ID - use tenant_id, client_id, client_secret Here is an example of setting up `tenant_id`, `client_id`, `client_secret` in your litellm proxy `config.yaml` ```yaml @@ -528,7 +581,7 @@ Example video of using `tenant_id`, `client_id`, `client_secret` with LiteLLM Pr -### Entrata ID - use client_id, username, password +### Entra ID - use client_id, username, password Here is an example of setting up `client_id`, `azure_username`, `azure_password` in your litellm proxy `config.yaml` ```yaml @@ -948,60 +1001,124 @@ Expected Response: {"data":[{"id":"batch_R3V...} ``` -## O-Series Models -Azure OpenAI O-Series models are supported on LiteLLM. +## **Azure Responses API** -LiteLLM routes any deployment name with `o1` or `o3` in the model name, to the O-Series [transformation](https://github.com/BerriAI/litellm/blob/91ed05df2962b8eee8492374b048d27cc144d08c/litellm/llms/azure/chat/o1_transformation.py#L4) logic. +| Property | Details | +|-------|-------| +| Description | Azure OpenAI Responses API | +| `custom_llm_provider` on LiteLLM | `azure/` | +| Supported Operations | `/v1/responses`| +| Azure OpenAI Responses API | [Azure OpenAI Responses API ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/responses?tabs=python-secure) | +| Cost Tracking, Logging Support | ✅ LiteLLM will log, track cost for Responses API Requests | +| Supported OpenAI Params | ✅ All OpenAI params are supported, [See here](https://github.com/BerriAI/litellm/blob/0717369ae6969882d149933da48eeb8ab0e691bd/litellm/llms/openai/responses/transformation.py#L23) | -To set this explicitly, set `model` to `azure/o_series/`. 
+## Usage -**Automatic Routing** +## Create a model response - + -```python +#### Non-streaming + +```python showLineNumbers title="Azure Responses API" import litellm -litellm.completion(model="azure/my-o3-deployment", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o3' in the deployment name -``` - - +# Non-streaming response +response = litellm.responses( + model="azure/o1-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + max_output_tokens=100, + api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"), + api_base="https://litellm8397336933.openai.azure.com/", + api_version="2023-03-15-preview", +) -```yaml -model_list: - - model_name: o3-mini - litellm_params: - model: azure/o3-model - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY +print(response) ``` - - - -**Explicit Routing** - - - - -```python +#### Streaming +```python showLineNumbers title="Azure Responses API" import litellm -litellm.completion(model="azure/o_series/my-random-deployment-name", messages=[{"role": "user", "content": "Hello, world!"}]) # 👈 Note: 'o_series/' in the deployment name -``` - - +# Streaming response +response = litellm.responses( + model="azure/o1-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True, + api_key=os.getenv("AZURE_RESPONSES_OPENAI_API_KEY"), + api_base="https://litellm8397336933.openai.azure.com/", + api_version="2023-03-15-preview", +) -```yaml -model_list: - - model_name: o3-mini - litellm_params: - model: azure/o_series/my-random-deployment-name - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY +for event in response: + print(event) ``` + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="Azure Responses API" +model_list: + - model_name: o1-pro + litellm_params: + model: azure/o1-pro + api_key: os.environ/AZURE_RESPONSES_OPENAI_API_KEY + api_base: https://litellm8397336933.openai.azure.com/ + api_version: 2023-03-15-preview +``` + +Start your LiteLLM proxy: +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +Then use the OpenAI SDK pointed to your proxy: + +#### Non-streaming +```python showLineNumbers +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="o1-pro", + input="Tell me a three sentence bedtime story about a unicorn." 
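+    # "o1-pro" is the model_name from the proxy config.yaml above; the proxy routes it to the Azure deployment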
+) + +print(response) +``` + +#### Streaming +```python showLineNumbers +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="o1-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + @@ -1076,32 +1193,24 @@ print(response) ``` -### Parallel Function calling +### Tool Calling / Function Calling + See a detailed walthrough of parallel function calling with litellm [here](https://docs.litellm.ai/docs/completion/function_call) + + + + + ```python # set Azure env variables import os +import litellm +import json + os.environ['AZURE_API_KEY'] = "" # litellm reads AZURE_API_KEY from .env and sends the request os.environ['AZURE_API_BASE'] = "https://openai-gpt-4-test-v-1.openai.azure.com/" os.environ['AZURE_API_VERSION'] = "2023-07-01-preview" -import litellm -import json -# Example dummy function hard coded to return the same weather -# In production, this could be your backend API or an external API -def get_current_weather(location, unit="fahrenheit"): - """Get the current weather in a given location""" - if "tokyo" in location.lower(): - return json.dumps({"location": "Tokyo", "temperature": "10", "unit": "celsius"}) - elif "san francisco" in location.lower(): - return json.dumps({"location": "San Francisco", "temperature": "72", "unit": "fahrenheit"}) - elif "paris" in location.lower(): - return json.dumps({"location": "Paris", "temperature": "22", "unit": "celsius"}) - else: - return json.dumps({"location": location, "temperature": "unknown"}) - -## Step 1: send the conversation and available functions to the model -messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}] tools = [ { "type": "function", @@ -1125,7 +1234,7 @@ tools = [ response = litellm.completion( model="azure/chatgpt-functioncalling", # model = azure/ - messages=messages, + messages=[{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}], tools=tools, tool_choice="auto", # auto is default, but we'll be explicit ) @@ -1134,8 +1243,49 @@ response_message = response.choices[0].message tool_calls = response.choices[0].message.tool_calls print("\nTool Choice:\n", tool_calls) ``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: azure-gpt-3.5 + litellm_params: + model: azure/chatgpt-functioncalling + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" +``` + +2. Start proxy + +```bash +litellm --config config.yaml +``` + +3. Test it + +```bash +curl -L -X POST 'http://localhost:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "azure-gpt-3.5", + "messages": [ + { + "role": "user", + "content": "Hey, how'\''s it going? 
Thinking long and hard before replying - what is the meaning of the world and life itself" + } + ] +}' +``` + + + + ### Spend Tracking for Azure OpenAI Models (PROXY) Set base model for cost tracking azure image-gen call diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index 19b3728882..2a9c528a65 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -79,6 +79,7 @@ aws_session_name: Optional[str], aws_profile_name: Optional[str], aws_role_name: Optional[str], aws_web_identity_token: Optional[str], +aws_bedrock_runtime_endpoint: Optional[str], ``` ### 2. Start the proxy @@ -475,7 +476,7 @@ os.environ["AWS_REGION_NAME"] = "" resp = completion( model="bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", messages=[{"role": "user", "content": "What is the capital of France?"}], - thinking={"type": "enabled", "budget_tokens": 1024}, + reasoning_effort="low", ) print(resp) @@ -490,7 +491,7 @@ model_list: - model_name: bedrock-claude-3-7 litellm_params: model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 - thinking: {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON REQUEST + reasoning_effort: "low" # 👈 EITHER HERE OR ON REQUEST ``` 2. Start proxy @@ -508,7 +509,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ -d '{ "model": "bedrock-claude-3-7", "messages": [{"role": "user", "content": "What is the capital of France?"}], - "thinking": {"type": "enabled", "budget_tokens": 1024} # 👈 EITHER HERE OR ON CONFIG.YAML + "reasoning_effort": "low" # 👈 EITHER HERE OR ON CONFIG.YAML }' ``` @@ -557,6 +558,10 @@ Same as [Anthropic API response](../providers/anthropic#usage---thinking--reason } ``` +### Pass `thinking` to Anthropic models + +Same as [Anthropic API response](../providers/anthropic#usage---thinking--reasoning_content). + ## Usage - Structured Output / JSON mode @@ -663,6 +668,58 @@ curl http://0.0.0.0:4000/v1/chat/completions \
+## Usage - Latency Optimized Inference + +Valid from v1.65.1+ + + + + +```python +from litellm import completion + +response = completion( + model="bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", + messages=[{"role": "user", "content": "What is the capital of France?"}], + performanceConfig={"latency": "optimized"}, +) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: bedrock-claude-3-7 + litellm_params: + model: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 + performanceConfig: {"latency": "optimized"} # 👈 EITHER HERE OR ON REQUEST +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "bedrock-claude-3-7", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "performanceConfig": {"latency": "optimized"} # 👈 EITHER HERE OR ON CONFIG.YAML + }' +``` + + + + ## Usage - Bedrock Guardrails Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html) @@ -1115,14 +1172,22 @@ os.environ["AWS_REGION_NAME"] = "" # pdf url image_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" +# Download the file +response = requests.get(url) +file_data = response.content + +encoded_file = base64.b64encode(file_data).decode("utf-8") + # model model = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" image_content = [ {"type": "text", "text": "What's this file about?"}, { - "type": "image_url", - "image_url": image_url, # OR {"url": image_url} + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + } }, ] @@ -1168,8 +1233,10 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ "messages": [ {"role": "user", "content": {"type": "text", "text": "What's this file about?"}}, { - "type": "image_url", - "image_url": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + } } ] }' @@ -1427,10 +1494,14 @@ response = litellm.embedding( ## Supported AWS Bedrock Models + +LiteLLM supports ALL Bedrock models. + Here's an example of using a bedrock model with LiteLLM. For a complete list, refer to the [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) | Model Name | Command | |----------------------------|------------------------------------------------------------------| +| Deepseek R1 | `completion(model='bedrock/us.deepseek.r1-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3.5 Sonnet | `completion(model='bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 sonnet | `completion(model='bedrock/anthropic.claude-3-sonnet-20240229-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | | Anthropic Claude-V3 Haiku | `completion(model='bedrock/anthropic.claude-3-haiku-20240307-v1:0', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']` | @@ -1771,6 +1842,7 @@ response = completion( ) ``` + 1. 
Setup config.yaml @@ -1815,11 +1887,13 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ ``` + ### SSO Login (AWS Profile) - Set `AWS_PROFILE` environment variable - Make bedrock completion call + ```python import os from litellm import completion @@ -1912,12 +1986,46 @@ model_list: +Text to Image : +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/images/generations' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +-d '{ + "model": "amazon.nova-canvas-v1:0", + "prompt": "A cute baby sea otter" +}' +``` +Color Guided Generation: +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/images/generations' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \ +-d '{ + "model": "amazon.nova-canvas-v1:0", + "prompt": "A cute baby sea otter", + "taskType": "COLOR_GUIDED_GENERATION", + "colorGuidedGenerationParams":{"colors":["#FFFFFF"]} +}' +``` + +| Model Name | Function Call | +|-------------------------|---------------------------------------------| +| Stable Diffusion 3 - v0 | `image_generation(model="bedrock/stability.stability.sd3-large-v1:0", prompt=prompt)` | +| Stable Diffusion - v0 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v0", prompt=prompt)` | +| Stable Diffusion - v1 | `image_generation(model="bedrock/stability.stable-diffusion-xl-v1", prompt=prompt)` | +| Amazon Nova Canvas - v0 | `image_generation(model="bedrock/amazon.nova-canvas-v1:0", prompt=prompt)` | + + ### Passing an external BedrockRuntime.Client as a parameter - Completion() + +This is a deprecated flow. Boto3 is not async. And boto3.client does not let us make the http call through httpx. Pass in your aws params through the method above 👆. [See Auth Code](https://github.com/BerriAI/litellm/blob/55a20c7cce99a93d36a82bf3ae90ba3baf9a7f89/litellm/llms/bedrock_httpx.py#L284) [Add new auth flow](https://github.com/BerriAI/litellm/issues) :::warning -This is a deprecated flow. Boto3 is not async. And boto3.client does not let us make the http call through httpx. Pass in your aws params through the method above 👆. [See Auth Code](https://github.com/BerriAI/litellm/blob/55a20c7cce99a93d36a82bf3ae90ba3baf9a7f89/litellm/llms/bedrock_httpx.py#L284) [Add new auth flow](https://github.com/BerriAI/litellm/issues) + + Experimental - 2024-Jun-23: diff --git a/docs/my-website/docs/providers/databricks.md b/docs/my-website/docs/providers/databricks.md index 395a544db4..8631cbfdad 100644 --- a/docs/my-website/docs/providers/databricks.md +++ b/docs/my-website/docs/providers/databricks.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# 🆕 Databricks +# Databricks LiteLLM supports all models on Databricks @@ -154,7 +154,205 @@ response = completion( temperature: 0.5 ``` -## Passings Databricks specific params - 'instruction' + +## Usage - Thinking / `reasoning_content` + +LiteLLM translates OpenAI's `reasoning_effort` to Anthropic's `thinking` parameter. 
[Code](https://github.com/BerriAI/litellm/blob/23051d89dd3611a81617d84277059cd88b2df511/litellm/llms/anthropic/chat/transformation.py#L298) + +| reasoning_effort | thinking | +| ---------------- | -------- | +| "low" | "budget_tokens": 1024 | +| "medium" | "budget_tokens": 2048 | +| "high" | "budget_tokens": 4096 | + + +Known Limitations: +- Support for passing thinking blocks back to Claude [Issue](https://github.com/BerriAI/litellm/issues/9790) + + + + + +```python +from litellm import completion +import os + +# set ENV variables (can also be passed in to .completion() - e.g. `api_base`, `api_key`) +os.environ["DATABRICKS_API_KEY"] = "databricks key" +os.environ["DATABRICKS_API_BASE"] = "databricks base url" + +resp = completion( + model="databricks/databricks-claude-3-7-sonnet", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort="low", +) + +``` + + + + + +1. Setup config.yaml + +```yaml +- model_name: claude-3-7-sonnet + litellm_params: + model: databricks/databricks-claude-3-7-sonnet + api_key: os.environ/DATABRICKS_API_KEY + api_base: os.environ/DATABRICKS_API_BASE +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "claude-3-7-sonnet", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "reasoning_effort": "low" + }' +``` + + + + + +**Expected Response** + +```python +ModelResponse( + id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e', + created=1740470510, + model='claude-3-7-sonnet-20250219', + object='chat.completion', + system_fingerprint=None, + choices=[ + Choices( + finish_reason='stop', + index=0, + message=Message( + content="The capital of France is Paris.", + role='assistant', + tool_calls=None, + function_call=None, + provider_specific_fields={ + 'citations': None, + 'thinking_blocks': [ + { + 'type': 'thinking', + 'thinking': 'The capital of France is Paris. This is a very straightforward factual question.', + 'signature': 'EuYBCkQYAiJAy6...' + } + ] + } + ), + thinking_blocks=[ + { + 'type': 'thinking', + 'thinking': 'The capital of France is Paris. This is a very straightforward factual question.', + 'signature': 'EuYBCkQYAiJAy6AGB...' + } + ], + reasoning_content='The capital of France is Paris. This is a very straightforward factual question.' + ) + ], + usage=Usage( + completion_tokens=68, + prompt_tokens=42, + total_tokens=110, + completion_tokens_details=None, + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=None, + cached_tokens=0, + text_tokens=None, + image_tokens=None + ), + cache_creation_input_tokens=0, + cache_read_input_tokens=0 + ) +) +``` + +### Pass `thinking` to Anthropic models + +You can also pass the `thinking` parameter to Anthropic models. + + +You can also pass the `thinking` parameter to Anthropic models. + + + + +```python +from litellm import completion +import os + +# set ENV variables (can also be passed in to .completion() - e.g. 
`api_base`, `api_key`) +os.environ["DATABRICKS_API_KEY"] = "databricks key" +os.environ["DATABRICKS_API_BASE"] = "databricks base url" + +response = litellm.completion( + model="databricks/databricks-claude-3-7-sonnet", + messages=[{"role": "user", "content": "What is the capital of France?"}], + thinking={"type": "enabled", "budget_tokens": 1024}, +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "databricks/databricks-claude-3-7-sonnet", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "thinking": {"type": "enabled", "budget_tokens": 1024} + }' +``` + + + + + + + + +## Supported Databricks Chat Completion Models + +:::tip + +**We support ALL Databricks models, just set `model=databricks/` as a prefix when sending litellm requests** + +::: + + +| Model Name | Command | +|----------------------------|------------------------------------------------------------------| +| databricks/databricks-claude-3-7-sonnet | `completion(model='databricks/databricks/databricks-claude-3-7-sonnet', messages=messages)` | +| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` | +| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` | +| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` | +| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` | +| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` | +| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` | +| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` | +| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` | + + +## Embedding Models + +### Passing Databricks specific params - 'instruction' For embedding models, databricks lets you pass in an additional param 'instruction'. 
[Full Spec](https://github.com/BerriAI/litellm/blob/43353c28b341df0d9992b45c6ce464222ebd7984/litellm/llms/databricks.py#L164) @@ -187,27 +385,6 @@ response = litellm.embedding( instruction: "Represent this sentence for searching relevant passages:" ``` - -## Supported Databricks Chat Completion Models - -:::tip - -**We support ALL Databricks models, just set `model=databricks/` as a prefix when sending litellm requests** - -::: - - -| Model Name | Command | -|----------------------------|------------------------------------------------------------------| -| databricks-meta-llama-3-1-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-70b-instruct', messages=messages)` | -| databricks-meta-llama-3-1-405b-instruct | `completion(model='databricks/databricks-meta-llama-3-1-405b-instruct', messages=messages)` | -| databricks-dbrx-instruct | `completion(model='databricks/databricks-dbrx-instruct', messages=messages)` | -| databricks-meta-llama-3-70b-instruct | `completion(model='databricks/databricks-meta-llama-3-70b-instruct', messages=messages)` | -| databricks-llama-2-70b-chat | `completion(model='databricks/databricks-llama-2-70b-chat', messages=messages)` | -| databricks-mixtral-8x7b-instruct | `completion(model='databricks/databricks-mixtral-8x7b-instruct', messages=messages)` | -| databricks-mpt-30b-instruct | `completion(model='databricks/databricks-mpt-30b-instruct', messages=messages)` | -| databricks-mpt-7b-instruct | `completion(model='databricks/databricks-mpt-7b-instruct', messages=messages)` | - ## Supported Databricks Embedding Models :::tip diff --git a/docs/my-website/docs/providers/gemini.md b/docs/my-website/docs/providers/gemini.md index 4a6cfdf1a3..434df6a7c9 100644 --- a/docs/my-website/docs/providers/gemini.md +++ b/docs/my-website/docs/providers/gemini.md @@ -39,14 +39,164 @@ response = completion( - temperature - top_p - max_tokens +- max_completion_tokens - stream - tools - tool_choice +- functions - response_format - n - stop +- logprobs +- frequency_penalty +- modalities +- reasoning_content + +**Anthropic Params** +- thinking (used to set max budget tokens across anthropic/gemini models) + +[**See Updated List**](https://github.com/BerriAI/litellm/blob/main/litellm/llms/gemini/chat/transformation.py#L70) + + + +## Usage - Thinking / `reasoning_content` + +LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362) + +**Mapping** + +| reasoning_effort | thinking | +| ---------------- | -------- | +| "low" | "budget_tokens": 1024 | +| "medium" | "budget_tokens": 2048 | +| "high" | "budget_tokens": 4096 | + + + + +```python +from litellm import completion + +resp = completion( + model="gemini/gemini-2.5-flash-preview-04-17", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort="low", +) + +``` + + + + + +1. Setup config.yaml + +```yaml +- model_name: gemini-2.5-flash + litellm_params: + model: gemini/gemini-2.5-flash-preview-04-17 + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "gemini-2.5-flash", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "reasoning_effort": "low" + }' +``` + + + + + +**Expected Response** + +```python +ModelResponse( + id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e', + created=1740470510, + model='claude-3-7-sonnet-20250219', + object='chat.completion', + system_fingerprint=None, + choices=[ + Choices( + finish_reason='stop', + index=0, + message=Message( + content="The capital of France is Paris.", + role='assistant', + tool_calls=None, + function_call=None, + reasoning_content='The capital of France is Paris. This is a very straightforward factual question.' + ), + ) + ], + usage=Usage( + completion_tokens=68, + prompt_tokens=42, + total_tokens=110, + completion_tokens_details=None, + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=None, + cached_tokens=0, + text_tokens=None, + image_tokens=None + ), + cache_creation_input_tokens=0, + cache_read_input_tokens=0 + ) +) +``` + +### Pass `thinking` to Gemini models + +You can also pass the `thinking` parameter to Gemini models. + +This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget). + + + + +```python +response = litellm.completion( + model="gemini/gemini-2.5-flash-preview-04-17", + messages=[{"role": "user", "content": "What is the capital of France?"}], + thinking={"type": "enabled", "budget_tokens": 1024}, +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "gemini/gemini-2.5-flash-preview-04-17", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "thinking": {"type": "enabled", "budget_tokens": 1024} + }' +``` + + + + + + -[**See Updated List**](https://github.com/BerriAI/litellm/blob/1c747f3ad372399c5b95cc5696b06a5fbe53186b/litellm/llms/vertex_httpx.py#L122) ## Passing Gemini Specific Params ### Response schema @@ -365,7 +515,7 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ ## Specifying Safety Settings -In certain use-cases you may need to make calls to the models and pass [safety settigns](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example: +In certain use-cases you may need to make calls to the models and pass [safety settings](https://ai.google.dev/docs/safety_setting_gemini) different from the defaults. To do so, simple pass the `safety_settings` argument to `completion` or `acompletion`. For example: ```python response = completion( @@ -438,6 +588,179 @@ assert isinstance( ``` +### Google Search Tool + + + + +```python +from litellm import completion +import os + +os.environ["GEMINI_API_KEY"] = ".." + +tools = [{"googleSearch": {}}] # 👈 ADD GOOGLE SEARCH + +response = completion( + model="gemini/gemini-2.0-flash", + messages=[{"role": "user", "content": "What is the weather in San Francisco?"}], + tools=tools, +) + +print(response) +``` + + + + +1. Setup config.yaml +```yaml +model_list: + - model_name: gemini-2.0-flash + litellm_params: + model: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start Proxy +```bash +$ litellm --config /path/to/config.yaml +``` + +3. Make Request! 
+```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gemini-2.0-flash", + "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}], + "tools": [{"googleSearch": {}}] +} +' +``` + + + + +### Google Search Retrieval + + + + + +```python +from litellm import completion +import os + +os.environ["GEMINI_API_KEY"] = ".." + +tools = [{"googleSearchRetrieval": {}}] # 👈 ADD GOOGLE SEARCH + +response = completion( + model="gemini/gemini-2.0-flash", + messages=[{"role": "user", "content": "What is the weather in San Francisco?"}], + tools=tools, +) + +print(response) +``` + + + + +1. Setup config.yaml +```yaml +model_list: + - model_name: gemini-2.0-flash + litellm_params: + model: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start Proxy +```bash +$ litellm --config /path/to/config.yaml +``` + +3. Make Request! +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gemini-2.0-flash", + "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}], + "tools": [{"googleSearchRetrieval": {}}] +} +' +``` + + + + + +### Code Execution Tool + + + + + +```python +from litellm import completion +import os + +os.environ["GEMINI_API_KEY"] = ".." + +tools = [{"codeExecution": {}}] # 👈 ADD GOOGLE SEARCH + +response = completion( + model="gemini/gemini-2.0-flash", + messages=[{"role": "user", "content": "What is the weather in San Francisco?"}], + tools=tools, +) + +print(response) +``` + + + + +1. Setup config.yaml +```yaml +model_list: + - model_name: gemini-2.0-flash + litellm_params: + model: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start Proxy +```bash +$ litellm --config /path/to/config.yaml +``` + +3. Make Request! +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gemini-2.0-flash", + "messages": [{"role": "user", "content": "What is the weather in San Francisco?"}], + "tools": [{"codeExecution": {}}] +} +' +``` + + + + + + + + + ## JSON Mode @@ -589,8 +912,10 @@ response = litellm.completion( "content": [ {"type": "text", "text": "Please summarize the audio."}, { - "type": "image_url", - "image_url": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA + "type": "file", + "file": { + "file_data": "data:audio/mp3;base64,{}".format(encoded_data), # 👈 SET MIME_TYPE + DATA + } }, ], } @@ -640,8 +965,11 @@ response = litellm.completion( "content": [ {"type": "text", "text": "Please summarize the file."}, { - "type": "image_url", - "image_url": "https://storage..." # 👈 SET THE IMG URL + "type": "file", + "file": { + "file_id": "https://storage...", # 👈 SET THE IMG URL + "format": "application/pdf" # OPTIONAL + } }, ], } @@ -668,8 +996,11 @@ response = litellm.completion( "content": [ {"type": "text", "text": "Please summarize the file."}, { - "type": "image_url", - "image_url": "gs://..." 
# 👈 SET THE cloud storage bucket url + "type": "file", + "file": { + "file_id": "gs://storage...", # 👈 SET THE IMG URL + "format": "application/pdf" # OPTIONAL + } }, ], } @@ -879,3 +1210,54 @@ response = await client.chat.completions.create( + +## Image Generation + + + + +```python +from litellm import completion + +response = completion( + model="gemini/gemini-2.0-flash-exp-image-generation", + messages=[{"role": "user", "content": "Generate an image of a cat"}], + modalities=["image", "text"], +) +assert response.choices[0].message.content is not None # "data:image/png;base64,e4rr.." +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gemini-2.0-flash-exp-image-generation + litellm_params: + model: gemini/gemini-2.0-flash-exp-image-generation + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -L -X POST 'http://localhost:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gemini-2.0-flash-exp-image-generation", + "messages": [{"role": "user", "content": "Generate an image of a cat"}], + "modalities": ["image", "text"] +}' +``` + + + + diff --git a/docs/my-website/docs/providers/google_ai_studio/files.md b/docs/my-website/docs/providers/google_ai_studio/files.md new file mode 100644 index 0000000000..500f1d5718 --- /dev/null +++ b/docs/my-website/docs/providers/google_ai_studio/files.md @@ -0,0 +1,161 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# [BETA] Google AI Studio (Gemini) Files API + +Use this to upload files to Google AI Studio (Gemini). + +Useful to pass in large media files to Gemini's `/generateContent` endpoint. + +| Action | Supported | +|----------|-----------| +| `create` | Yes | +| `delete` | No | +| `retrieve` | No | +| `list` | No | + +## Usage + + + + +```python +import base64 +import requests +from litellm import completion, create_file +import os + + +### UPLOAD FILE ### + +# Fetch the audio file and convert it to a base64 encoded string +url = "https://cdn.openai.com/API/docs/audio/alloy.wav" +response = requests.get(url) +response.raise_for_status() +wav_data = response.content +encoded_string = base64.b64encode(wav_data).decode('utf-8') + + +file = create_file( + file=wav_data, + purpose="user_data", + extra_body={"custom_llm_provider": "gemini"}, + api_key=os.getenv("GEMINI_API_KEY"), +) + +print(f"file: {file}") + +assert file is not None + + +### GENERATE CONTENT ### +completion = completion( + model="gemini-2.0-flash", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this recording?" + }, + { + "type": "file", + "file": { + "file_id": file.id, + "filename": "my-test-name", + "format": "audio/wav" + } + } + ] + }, + ] +) + +print(completion.choices[0].message) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: "gemini-2.0-flash" + litellm_params: + model: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config config.yaml +``` + +3. 
Test it + +```python +import base64 +import requests +from openai import OpenAI + +client = OpenAI( + base_url="http://0.0.0.0:4000", + api_key="sk-1234" +) + +# Fetch the audio file and convert it to a base64 encoded string +url = "https://cdn.openai.com/API/docs/audio/alloy.wav" +response = requests.get(url) +response.raise_for_status() +wav_data = response.content +encoded_string = base64.b64encode(wav_data).decode('utf-8') + + +file = client.files.create( + file=wav_data, + purpose="user_data", + extra_body={"target_model_names": "gemini-2.0-flash"} +) + +print(f"file: {file}") + +assert file is not None + +completion = client.chat.completions.create( + model="gemini-2.0-flash", + modalities=["text", "audio"], + audio={"voice": "alloy", "format": "wav"}, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this recording?" + }, + { + "type": "file", + "file": { + "file_id": file.id, + "filename": "my-test-name", + "format": "audio/wav" + } + } + ] + }, + ], + extra_body={"drop_params": True} +) + +print(completion.choices[0].message) +``` + + + + + + + diff --git a/docs/my-website/docs/providers/huggingface.md b/docs/my-website/docs/providers/huggingface.md index 5297a688ba..399d49b5f4 100644 --- a/docs/my-website/docs/providers/huggingface.md +++ b/docs/my-website/docs/providers/huggingface.md @@ -2,466 +2,392 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Huggingface +# Hugging Face +LiteLLM supports running inference across multiple services for models hosted on the Hugging Face Hub. -LiteLLM supports the following types of Hugging Face models: +- **Serverless Inference Providers** - Hugging Face offers an easy and unified access to serverless AI inference through multiple inference providers, like [Together AI](https://together.ai) and [Sambanova](https://sambanova.ai). This is the fastest way to integrate AI in your products with a maintenance-free and scalable solution. More details in the [Inference Providers documentation](https://huggingface.co/docs/inference-providers/index). +- **Dedicated Inference Endpoints** - which is a product to easily deploy models to production. Inference is run by Hugging Face in a dedicated, fully managed infrastructure on a cloud provider of your choice. You can deploy your model on Hugging Face Inference Endpoints by following [these steps](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint). -- Serverless Inference API (free) - loaded and ready to use: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation -- Dedicated Inference Endpoints (paid) - manual deployment: https://ui.endpoints.huggingface.co/ -- All LLMs served via Hugging Face's Inference use [Text-generation-inference](https://huggingface.co/docs/text-generation-inference). + +## Supported Models + +### Serverless Inference Providers +You can check available models for an inference provider by going to [huggingface.co/models](https://huggingface.co/models), clicking the "Other" filter tab, and selecting your desired provider: + +![Filter models by Inference Provider](../../img/hf_filter_inference_providers.png) + +For example, you can find all Fireworks supported models [here](https://huggingface.co/models?inference_provider=fireworks-ai&sort=trending). + + +### Dedicated Inference Endpoints +Refer to the [Inference Endpoints catalog](https://endpoints.huggingface.co/catalog) for a list of available models. 
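Both routes are covered in detail in the Usage section below; as a quick orientation, here is how the model string differs between them (a sketch — the provider, model IDs, and endpoint URL are placeholders):

```python
from litellm import completion

# Serverless Inference Providers route: "huggingface/<provider>/<hub-org>/<hub-model>"
# (the provider and model below are examples - pick any provider/model pairing from the Hub)
response = completion(
    model="huggingface/together/deepseek-ai/DeepSeek-R1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)

# Dedicated Inference Endpoints route: model is "huggingface/tgi" and `api_base`
# points at your own endpoint (the URL below is a placeholder)
response = completion(
    model="huggingface/tgi",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/",
)
```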
## Usage + + + +### Authentication +With a single Hugging Face token, you can access inference through multiple providers. Your calls are routed through Hugging Face and the usage is billed directly to your Hugging Face account at the standard provider API rates. + +Simply set the `HF_TOKEN` environment variable with your Hugging Face token, you can create one here: https://huggingface.co/settings/tokens. + +```bash +export HF_TOKEN="hf_xxxxxx" +``` +or alternatively, you can pass your Hugging Face token as a parameter: +```python +completion(..., api_key="hf_xxxxxx") +``` + +### Getting Started + +To use a Hugging Face model, specify both the provider and model you want to use in the following format: +``` +huggingface/// +``` +Where `/` is the Hugging Face model ID and `` is the inference provider. +By default, if you don't specify a provider, LiteLLM will use the [HF Inference API](https://huggingface.co/docs/api-inference/en/index). + +Examples: + +```python +# Run DeepSeek-R1 inference through Together AI +completion(model="huggingface/together/deepseek-ai/DeepSeek-R1",...) + +# Run Qwen2.5-72B-Instruct inference through Sambanova +completion(model="huggingface/sambanova/Qwen/Qwen2.5-72B-Instruct",...) + +# Run Llama-3.3-70B-Instruct inference through HF Inference API +completion(model="huggingface/meta-llama/Llama-3.3-70B-Instruct",...) +``` + + Open In Colab -You need to tell LiteLLM when you're calling Huggingface. -This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/",...)`. - - - - -By default, LiteLLM will assume a Hugging Face call follows the [Messages API](https://huggingface.co/docs/text-generation-inference/messages_api), which is fully compatible with the OpenAI Chat Completion API. - - - +### Basic Completion +Here's an example of chat completion using the DeepSeek-R1 model through Together AI: ```python import os from litellm import completion -# [OPTIONAL] set env var -os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" +os.environ["HF_TOKEN"] = "hf_xxxxxx" -messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}] - -# e.g. Call 'https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct' from Serverless Inference API response = completion( - model="huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", - messages=[{ "content": "Hello, how are you?","role": "user"}], + model="huggingface/together/deepseek-ai/DeepSeek-R1", + messages=[ + { + "role": "user", + "content": "How many r's are in the word 'strawberry'?", + } + ], +) +print(response) +``` + +### Streaming +Now, let's see what a streaming request looks like. + +```python +import os +from litellm import completion + +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +response = completion( + model="huggingface/together/deepseek-ai/DeepSeek-R1", + messages=[ + { + "role": "user", + "content": "How many r's are in the word `strawberry`?", + + } + ], + stream=True, +) + +for chunk in response: + print(chunk) +``` + +### Image Input +You can also pass images when the model supports it. Here is an example using [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) model through Sambanova. 
+ +```python +from litellm import completion + +# Set your Hugging Face Token +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + } + }, + ], + } + ] + +response = completion( + model="huggingface/sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct", + messages=messages, +) +print(response.choices[0]) +``` + +### Function Calling +You can extend the model's capabilities by giving them access to tools. Here is an example with function calling using [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) model through Sambanova. + +```python +import os +from litellm import completion + +# Set your Hugging Face Token +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + } + } +] +messages = [ + { + "role": "user", + "content": "What's the weather like in Boston today?", + } +] + +response = completion( + model="huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct", + messages=messages, + tools=tools, + tool_choice="auto" +) +print(response) +``` + + + + + + + Open In Colab + + +### Basic Completion +After you have [deployed your Hugging Face Inference Endpoint](https://endpoints.huggingface.co/new) on dedicated infrastructure, you can run inference on it by providing the endpoint base URL in `api_base`, and indicating `huggingface/tgi` as the model name. + +```python +import os +from litellm import completion + +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +response = completion( + model="huggingface/tgi", + messages=[{"content": "Hello, how are you?", "role": "user"}], + api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/" +) +print(response) +``` + +### Streaming + +```python +import os +from litellm import completion + +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +response = completion( + model="huggingface/tgi", + messages=[{"content": "Hello, how are you?", "role": "user"}], + api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/", stream=True ) -print(response) -``` - - - - -1. Add models to your config.yaml - -```yaml -model_list: - - model_name: llama-3.1-8B-instruct - litellm_params: - model: huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct - api_key: os.environ/HUGGINGFACE_API_KEY -``` - -2. Start the proxy - -```bash -$ litellm --config /path/to/config.yaml --debug -``` - -3. Test it! - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "llama-3.1-8B-instruct", - "messages": [ - { - "role": "user", - "content": "I like you!" - } - ], -}' -``` - - - - - - -Append `text-classification` to the model name - -e.g. 
`huggingface/text-classification/` - - - - -```python -import os -from litellm import completion - -# [OPTIONAL] set env var -os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" - -messages = [{ "content": "I like you, I love you!","role": "user"}] - -# e.g. Call 'shahrukhx01/question-vs-statement-classifier' hosted on HF Inference endpoints -response = completion( - model="huggingface/text-classification/shahrukhx01/question-vs-statement-classifier", - messages=messages, - api_base="https://my-endpoint.endpoints.huggingface.cloud", -) - -print(response) -``` - - - - -1. Add models to your config.yaml - -```yaml -model_list: - - model_name: bert-classifier - litellm_params: - model: huggingface/text-classification/shahrukhx01/question-vs-statement-classifier - api_key: os.environ/HUGGINGFACE_API_KEY - api_base: "https://my-endpoint.endpoints.huggingface.cloud" -``` - -2. Start the proxy - -```bash -$ litellm --config /path/to/config.yaml --debug -``` - -3. Test it! - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "bert-classifier", - "messages": [ - { - "role": "user", - "content": "I like you!" - } - ], -}' -``` - - - - - - -Steps to use -* Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/ -* Set `api_base` to your deployed api base -* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint - - - - -```python -import os -from litellm import completion - -os.environ["HUGGINGFACE_API_KEY"] = "" - -# TGI model: Call https://huggingface.co/glaiveai/glaive-coder-7b -# add the 'huggingface/' prefix to the model to set huggingface as the provider -# set api base to your deployed api endpoint from hugging face -response = completion( - model="huggingface/glaiveai/glaive-coder-7b", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud" -) -print(response) -``` - - - - -1. Add models to your config.yaml - -```yaml -model_list: - - model_name: glaive-coder - litellm_params: - model: huggingface/glaiveai/glaive-coder-7b - api_key: os.environ/HUGGINGFACE_API_KEY - api_base: "https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud" -``` - -2. Start the proxy - -```bash -$ litellm --config /path/to/config.yaml --debug -``` - -3. Test it! - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "glaive-coder", - "messages": [ - { - "role": "user", - "content": "I like you!" - } - ], -}' -``` - - - - - - - -## Streaming - - - Open In Colab - - -You need to tell LiteLLM when you're calling Huggingface. -This is done by adding the "huggingface/" prefix to `model`, example `completion(model="huggingface/",...)`. - -```python -import os -from litellm import completion - -# [OPTIONAL] set env var -os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key" - -messages = [{ "content": "There's a llama in my garden 😱 What should I do?","role": "user"}] - -# e.g. 
Call 'facebook/blenderbot-400M-distill' hosted on HF Inference endpoints -response = completion( - model="huggingface/facebook/blenderbot-400M-distill", - messages=messages, - api_base="https://my-endpoint.huggingface.cloud", - stream=True -) - -print(response) for chunk in response: - print(chunk) + print(chunk) ``` +### Image Input + +```python +import os +from litellm import completion + +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + } + }, + ], + } + ] +response = completion( + model="huggingface/tgi", + messages=messages, + api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/"" +) +print(response.choices[0]) +``` + +### Function Calling + +```python +import os +from litellm import completion + +os.environ["HF_TOKEN"] = "hf_xxxxxx" + +functions = [{ + "name": "get_weather", + "description": "Get the weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The location to get weather for" + } + }, + "required": ["location"] + } +}] + +response = completion( + model="huggingface/tgi", + messages=[{"content": "What's the weather like in San Francisco?", "role": "user"}], + api_base="https://my-endpoint.endpoints.huggingface.cloud/v1/", + functions=functions +) +print(response) +``` + + + + +## LiteLLM Proxy Server with Hugging Face models +You can set up a [LiteLLM Proxy Server](https://docs.litellm.ai/#litellm-proxy-server-llm-gateway) to serve Hugging Face models through any of the supported Inference Providers. Here's how to do it: + +### Step 1. Setup the config file + +In this case, we are configuring a proxy to serve `DeepSeek R1` from Hugging Face, using Together AI as the backend Inference Provider. + +```yaml +model_list: + - model_name: my-r1-model + litellm_params: + model: huggingface/together/deepseek-ai/DeepSeek-R1 + api_key: os.environ/HF_TOKEN # ensure you have `HF_TOKEN` in your .env +``` + +### Step 2. Start the server +```bash +litellm --config /path/to/config.yaml +``` + +### Step 3. Make a request to the server + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-r1-model", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ] +}' +``` + + + + +```python +# pip install openai +from openai import OpenAI + +client = OpenAI( + base_url="http://0.0.0.0:4000", + api_key="anything", +) + +response = client.chat.completions.create( + model="my-r1-model", + messages=[ + {"role": "user", "content": "Hello, how are you?"} + ] +) +print(response) +``` + + + + + ## Embedding -LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) format. +LiteLLM supports Hugging Face's [text-embedding-inference](https://github.com/huggingface/text-embeddings-inference) models as well. 
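If you have deployed a text-embeddings-inference model on a dedicated Inference Endpoint, you should be able to point LiteLLM at it via `api_base` — a sketch, assuming a placeholder endpoint URL and model:

```python
from litellm import embedding
import os

os.environ["HF_TOKEN"] = "hf_xxxxxx"

# assumption: the URL below is a placeholder for your own TEI deployment
response = embedding(
    model="huggingface/BAAI/bge-large-en-v1.5",
    input=["good morning from litellm"],
    api_base="https://my-endpoint.endpoints.huggingface.cloud",
)

print(response.data[0]["embedding"])
```

For the default (serverless) route, basic usage looks like this: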
```python from litellm import embedding import os -os.environ['HUGGINGFACE_API_KEY'] = "" +os.environ['HF_TOKEN'] = "hf_xxxxxx" response = embedding( model='huggingface/microsoft/codebert-base', input=["good morning from litellm"] ) ``` -## Advanced - -### Setting API KEYS + API BASE - -If required, you can set the api key + api base, set it in your os environment. [Code for how it's sent](https://github.com/BerriAI/litellm/blob/0100ab2382a0e720c7978fbf662cc6e6920e7e03/litellm/llms/huggingface_restapi.py#L25) - -```python -import os -os.environ["HUGGINGFACE_API_KEY"] = "" -os.environ["HUGGINGFACE_API_BASE"] = "" -``` - -### Viewing Log probs - -#### Using `decoder_input_details` - OpenAI `echo` - -The `echo` param is supported by OpenAI Completions - Use `litellm.text_completion()` for this - -```python -from litellm import text_completion -response = text_completion( - model="huggingface/bigcode/starcoder", - prompt="good morning", - max_tokens=10, logprobs=10, - echo=True -) -``` - -#### Output - -```json -{ - "id": "chatcmpl-3fc71792-c442-4ba1-a611-19dd0ac371ad", - "object": "text_completion", - "created": 1698801125.936519, - "model": "bigcode/starcoder", - "choices": [ - { - "text": ", I'm going to make you a sand", - "index": 0, - "logprobs": { - "tokens": [ - "good", - " morning", - ",", - " I", - "'m", - " going", - " to", - " make", - " you", - " a", - " s", - "and" - ], - "token_logprobs": [ - "None", - -14.96875, - -2.2285156, - -2.734375, - -2.0957031, - -2.0917969, - -0.09429932, - -3.1132812, - -1.3203125, - -1.2304688, - -1.6201172, - -0.010292053 - ] - }, - "finish_reason": "length" - } - ], - "usage": { - "completion_tokens": 9, - "prompt_tokens": 2, - "total_tokens": 11 - } -} -``` - -### Models with Prompt Formatting - -For models with special prompt templates (e.g. Llama2), we format the prompt to fit their template. 
- -#### Models with natively Supported Prompt Templates - -| Model Name | Works for Models | Function Call | Required OS Variables | -| ------------------------------------ | ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | -| mistralai/Mistral-7B-Instruct-v0.1 | mistralai/Mistral-7B-Instruct-v0.1 | `completion(model='huggingface/mistralai/Mistral-7B-Instruct-v0.1', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| meta-llama/Llama-2-7b-chat | All meta-llama llama2 chat models | `completion(model='huggingface/meta-llama/Llama-2-7b', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| tiiuae/falcon-7b-instruct | All falcon instruct models | `completion(model='huggingface/tiiuae/falcon-7b-instruct', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| mosaicml/mpt-7b-chat | All mpt chat models | `completion(model='huggingface/mosaicml/mpt-7b-chat', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| codellama/CodeLlama-34b-Instruct-hf | All codellama instruct models | `completion(model='huggingface/codellama/CodeLlama-34b-Instruct-hf', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| WizardLM/WizardCoder-Python-34B-V1.0 | All wizardcoder models | `completion(model='huggingface/WizardLM/WizardCoder-Python-34B-V1.0', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | -| Phind/Phind-CodeLlama-34B-v2 | All phind-codellama models | `completion(model='huggingface/Phind/Phind-CodeLlama-34B-v2', messages=messages, api_base="your_api_endpoint")` | `os.environ['HUGGINGFACE_API_KEY']` | - -**What if we don't support a model you need?** -You can also specify you're own custom prompt formatting, in case we don't have your model covered yet. - -**Does this mean you have to specify a prompt for all models?** -No. By default we'll concatenate your message content to make a prompt. 
- -**Default Prompt Template** - -```python -def default_pt(messages): - return " ".join(message["content"] for message in messages) -``` - -[Code for how prompt formats work in LiteLLM](https://github.com/BerriAI/litellm/blob/main/litellm/llms/prompt_templates/factory.py) - -#### Custom prompt templates - -```python -import litellm - -# Create your own custom prompt template works -litellm.register_prompt_template( - model="togethercomputer/LLaMA-2-7B-32K", - roles={ - "system": { - "pre_message": "[INST] <>\n", - "post_message": "\n<>\n [/INST]\n" - }, - "user": { - "pre_message": "[INST] ", - "post_message": " [/INST]\n" - }, - "assistant": { - "post_message": "\n" - } - } - ) - -def test_huggingface_custom_model(): - model = "huggingface/togethercomputer/LLaMA-2-7B-32K" - response = completion(model=model, messages=messages, api_base="https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud") - print(response['choices'][0]['message']['content']) - return response - -test_huggingface_custom_model() -``` - -[Implementation Code](https://github.com/BerriAI/litellm/blob/c0b3da2c14c791a0b755f0b1e5a9ef065951ecbf/litellm/llms/huggingface_restapi.py#L52) - -### Deploying a model on huggingface - -You can use any chat/text model from Hugging Face with the following steps: - -- Copy your model id/url from Huggingface Inference Endpoints - - [ ] Go to https://ui.endpoints.huggingface.co/ - - [ ] Copy the url of the specific model you'd like to use - HF_Dashboard -- Set it as your model name -- Set your HUGGINGFACE_API_KEY as an environment variable - -Need help deploying a model on huggingface? [Check out this guide.](https://huggingface.co/docs/inference-endpoints/guides/create_endpoint) - -# output - -Same as the OpenAI format, but also includes logprobs. [See the code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/llms/huggingface_restapi.py#L115) - -```json -{ - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "message": { - "content": "\ud83d\ude31\n\nComment: @SarahSzabo I'm", - "role": "assistant", - "logprobs": -22.697942825499993 - } - } - ], - "created": 1693436637.38206, - "model": "https://ji16r2iys9a8rjk2.us-east-1.aws.endpoints.huggingface.cloud", - "usage": { - "prompt_tokens": 14, - "completion_tokens": 11, - "total_tokens": 25 - } -} -``` - # FAQ -**Does this support stop sequences?** +**How does billing work with Hugging Face Inference Providers?** -Yes, we support stop sequences - and you can pass as many as allowed by Hugging Face (or any provider!) +> Billing is centralized on your Hugging Face account, no matter which providers you are using. You are billed the standard provider API rates with no additional markup - Hugging Face simply passes through the provider costs. Note that [Hugging Face PRO](https://huggingface.co/subscribe/pro) users get $2 worth of Inference credits every month that can be used across providers. -**How do you deal with repetition penalty?** +**Do I need to create an account for each Inference Provider?** -We map the presence penalty parameter in openai to the repetition penalty parameter on Hugging Face. [See code](https://github.com/BerriAI/litellm/blob/b4b2dbf005142e0a483d46a07a88a19814899403/litellm/utils.py#L757). +> No, you don't need to create separate accounts. All requests are routed through Hugging Face, so you only need your HF token. This allows you to easily benchmark different providers and choose the one that best fits your needs. 
-We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)! +**Will more inference providers be supported by Hugging Face in the future?** + +> Yes! New inference providers (and models) are being added gradually. + +We welcome any suggestions for improving our Hugging Face integration - Create an [issue](https://github.com/BerriAI/litellm/issues/new/choose)/[Join the Discord](https://discord.com/invite/wuPM9dRgDw)! \ No newline at end of file diff --git a/docs/my-website/docs/providers/infinity.md b/docs/my-website/docs/providers/infinity.md index 091503bf18..7900d5adb4 100644 --- a/docs/my-website/docs/providers/infinity.md +++ b/docs/my-website/docs/providers/infinity.md @@ -3,18 +3,17 @@ import TabItem from '@theme/TabItem'; # Infinity -| Property | Details | -|-------|-------| -| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip| -| Provider Route on LiteLLM | `infinity/` | -| Supported Operations | `/rerank` | -| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) | - +| Property | Details | +| ------------------------- | ---------------------------------------------------------------------------------------------------------- | +| Description | Infinity is a high-throughput, low-latency REST API for serving text-embeddings, reranking models and clip | +| Provider Route on LiteLLM | `infinity/` | +| Supported Operations | `/rerank`, `/embeddings` | +| Link to Provider Doc | [Infinity ↗](https://github.com/michaelfeil/infinity) | ## **Usage - LiteLLM Python SDK** ```python -from litellm import rerank +from litellm import rerank, embedding import os os.environ["INFINITY_API_BASE"] = "http://localhost:8080" @@ -39,8 +38,8 @@ model_list: - model_name: custom-infinity-rerank litellm_params: model: infinity/rerank - api_key: os.environ/INFINITY_API_KEY api_base: https://localhost:8080 + api_key: os.environ/INFINITY_API_KEY ``` Start litellm @@ -51,7 +50,9 @@ litellm --config /path/to/config.yaml # RUNNING on http://0.0.0.0:4000 ``` -Test request +## Test request: + +### Rerank ```bash curl http://0.0.0.0:4000/rerank \ @@ -70,15 +71,14 @@ curl http://0.0.0.0:4000/rerank \ }' ``` +#### Supported Cohere Rerank API Params -## Supported Cohere Rerank API Params - -| Param | Type | Description | -|-------|-------|-------| -| `query` | `str` | The query to rerank the documents against | -| `documents` | `list[str]` | The documents to rerank | -| `top_n` | `int` | The number of documents to return | -| `return_documents` | `bool` | Whether to return the documents in the response | +| Param | Type | Description | +| ------------------ | ----------- | ----------------------------------------------- | +| `query` | `str` | The query to rerank the documents against | +| `documents` | `list[str]` | The documents to rerank | +| `top_n` | `int` | The number of documents to return | +| `return_documents` | `bool` | Whether to return the documents in the response | ### Usage - Return Documents @@ -138,6 +138,7 @@ response = rerank( raw_scores=True, # 👈 PROVIDER-SPECIFIC PARAM ) ``` + @@ -161,7 +162,7 @@ litellm --config /path/to/config.yaml # RUNNING on http://0.0.0.0:4000 ``` -3. Test it! +3. Test it! 
```bash curl http://0.0.0.0:4000/rerank \ @@ -179,6 +180,121 @@ curl http://0.0.0.0:4000/rerank \ "raw_scores": True # 👈 PROVIDER-SPECIFIC PARAM }' ``` + + +## Embeddings + +LiteLLM provides an OpenAI api compatible `/embeddings` endpoint for embedding calls. + +**Setup** + +Add this to your litellm proxy config.yaml + +```yaml +model_list: + - model_name: custom-infinity-embedding + litellm_params: + model: infinity/provider/custom-embedding-v1 + api_base: http://localhost:8080 + api_key: os.environ/INFINITY_API_KEY +``` + +### Test request: + +```bash +curl http://0.0.0.0:4000/embeddings \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "custom-infinity-embedding", + "input": ["hello"] + }' +``` + +#### Supported Embedding API Params + +| Param | Type | Description | +| ----------------- | ----------- | ----------------------------------------------------------- | +| `model` | `str` | The embedding model to use | +| `input` | `list[str]` | The text inputs to generate embeddings for | +| `encoding_format` | `str` | The format to return embeddings in (e.g. "float", "base64") | +| `modality` | `str` | The type of input (e.g. "text", "image", "audio") | + +### Usage - Basic Examples + + + + +```python +from litellm import embedding +import os + +os.environ["INFINITY_API_BASE"] = "http://localhost:8080" + +response = embedding( + model="infinity/bge-small", + input=["good morning from litellm"] +) + +print(response.data[0]['embedding']) +``` + + + + + +```bash +curl http://0.0.0.0:4000/embeddings \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "custom-infinity-embedding", + "input": ["hello"] + }' +``` + + + + +### Usage - OpenAI Client + + + + +```python +from openai import OpenAI + +client = OpenAI( + api_key="", + base_url="" +) + +response = client.embeddings.create( + model="bge-small", + input=["The food was delicious and the waiter..."], + encoding_format="float" +) + +print(response.data[0].embedding) +``` + + + + + +```bash +curl http://0.0.0.0:4000/embeddings \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "bge-small", + "input": ["The food was delicious and the waiter..."], + "encoding_format": "float" + }' +``` + + + diff --git a/docs/my-website/docs/providers/litellm_proxy.md b/docs/my-website/docs/providers/litellm_proxy.md index e204caba0a..a66423dac5 100644 --- a/docs/my-website/docs/providers/litellm_proxy.md +++ b/docs/my-website/docs/providers/litellm_proxy.md @@ -57,7 +57,7 @@ messages = [{ "content": "Hello, how are you?","role": "user"}] # litellm proxy call response = completion( model="litellm_proxy/your-model-name", - messages, + messages=messages, api_base = "your-litellm-proxy-url", api_key = "your-litellm-proxy-api-key" ) @@ -76,7 +76,7 @@ messages = [{ "content": "Hello, how are you?","role": "user"}] # openai call response = completion( model="litellm_proxy/your-model-name", - messages, + messages=messages, api_base = "your-litellm-proxy-url", stream=True ) diff --git a/docs/my-website/docs/providers/ollama.md b/docs/my-website/docs/providers/ollama.md index 848be2beb7..d59d9dd0ce 100644 --- a/docs/my-website/docs/providers/ollama.md +++ b/docs/my-website/docs/providers/ollama.md @@ -202,6 +202,67 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ + +## Using Ollama FIM on `/v1/completions` + +LiteLLM supports calling Ollama's `/api/generate` endpoint on `/v1/completions` requests. 
+ + + + +```python +import litellm +litellm._turn_on_debug() # turn on debug to see the request +from litellm import completion + +response = completion( + model="ollama/llama3.1", + prompt="Hello, world!", + api_base="http://localhost:11434" +) +print(response) +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: "llama3.1" + litellm_params: + model: "ollama/llama3.1" + api_base: "http://localhost:11434" +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml --detailed_debug + +# RUNNING ON http://0.0.0.0:4000 +``` + +3. Test it! + +```python +from openai import OpenAI + +client = OpenAI( + api_key="anything", # 👈 PROXY KEY (can be anything, if master_key not set) + base_url="http://0.0.0.0:4000" # 👈 PROXY BASE URL +) + +response = client.completions.create( + model="ollama/llama3.1", + prompt="Hello, world!", + api_base="http://localhost:11434" +) +print(response) +``` + + + ## Using ollama `api/chat` In order to send ollama requests to `POST /api/chat` on your ollama server, set the model prefix to `ollama_chat` diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md index 15661f6521..a4aee5dbf7 100644 --- a/docs/my-website/docs/providers/openai.md +++ b/docs/my-website/docs/providers/openai.md @@ -163,6 +163,12 @@ os.environ["OPENAI_API_BASE"] = "openaiai-api-base" # OPTIONAL | Model Name | Function Call | |-----------------------|-----------------------------------------------------------------| +| gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` | +| gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` | +| gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` | +| o4-mini | `response = completion(model="o4-mini", messages=messages)` | +| o3-mini | `response = completion(model="o3-mini", messages=messages)` | +| o3 | `response = completion(model="o3", messages=messages)` | | o1-mini | `response = completion(model="o1-mini", messages=messages)` | | o1-preview | `response = completion(model="o1-preview", messages=messages)` | | gpt-4o-mini | `response = completion(model="gpt-4o-mini", messages=messages)` | @@ -228,6 +234,92 @@ response = completion( ``` +## PDF File Parsing + +OpenAI has a new `file` message type that allows you to pass in a PDF file and have it parsed into a structured output. [Read more](https://platform.openai.com/docs/guides/pdf-files?api-mode=chat&lang=python) + + + + +```python +import base64 +from litellm import completion + +with open("draconomicon.pdf", "rb") as f: + data = f.read() + +base64_string = base64.b64encode(data).decode("utf-8") + +completion = completion( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + { + "type": "file", + "file": { + "filename": "draconomicon.pdf", + "file_data": f"data:application/pdf;base64,{base64_string}", + } + }, + { + "type": "text", + "text": "What is the first dragon in the book?", + } + ], + }, + ], +) + +print(completion.choices[0].message.content) +``` + + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: openai-model + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! 
+ +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "openai-model", + "messages": [ + {"role": "user", "content": [ + { + "type": "file", + "file": { + "filename": "draconomicon.pdf", + "file_data": f"data:application/pdf;base64,{base64_string}", + } + } + ]} + ] +}' +``` + + + + ## OpenAI Fine Tuned Models | Model Name | Function Call | @@ -239,6 +331,74 @@ response = completion( | fine tuned `gpt-3.5-turbo-0613` | `response = completion(model="ft:gpt-3.5-turbo-0613", messages=messages)` | +## OpenAI Audio Transcription + +LiteLLM supports OpenAI Audio Transcription endpoint. + +Supported models: + +| Model Name | Function Call | +|---------------------------|-----------------------------------------------------------------| +| `whisper-1` | `response = completion(model="whisper-1", file=audio_file)` | +| `gpt-4o-transcribe` | `response = completion(model="gpt-4o-transcribe", file=audio_file)` | +| `gpt-4o-mini-transcribe` | `response = completion(model="gpt-4o-mini-transcribe", file=audio_file)` | + + + + +```python +from litellm import transcription +import os + +# set api keys +os.environ["OPENAI_API_KEY"] = "" +audio_file = open("/path/to/audio.mp3", "rb") + +response = transcription(model="gpt-4o-transcribe", file=audio_file) + +print(f"response: {response}") +``` + + + + +1. Setup config.yaml + +```yaml +model_list: +- model_name: gpt-4o-transcribe + litellm_params: + model: gpt-4o-transcribe + api_key: os.environ/OPENAI_API_KEY + model_info: + mode: audio_transcription + +general_settings: + master_key: sk-1234 +``` + +2. Start the proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! + +```bash +curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \ +--header 'Authorization: Bearer sk-1234' \ +--form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \ +--form 'model="gpt-4o-transcribe"' +``` + + + + + + + + ## Advanced ### Getting OpenAI API Response Headers @@ -449,26 +609,6 @@ response = litellm.acompletion( ) ``` -### Using Helicone Proxy with LiteLLM -```python -import os -import litellm -from litellm import completion - -os.environ["OPENAI_API_KEY"] = "" - -# os.environ["OPENAI_API_BASE"] = "" -litellm.api_base = "https://oai.hconeai.com/v1" -litellm.headers = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", - "Helicone-Cache-Enabled": "true", -} - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -# openai call -response = completion("gpt-3.5-turbo", messages) -``` ### Using OpenAI Proxy with LiteLLM ```python diff --git a/docs/my-website/docs/providers/openrouter.md b/docs/my-website/docs/providers/openrouter.md index 09669c9f9e..58a87f6849 100644 --- a/docs/my-website/docs/providers/openrouter.md +++ b/docs/my-website/docs/providers/openrouter.md @@ -10,9 +10,11 @@ LiteLLM supports all the text / chat / vision models from [OpenRouter](https://o import os from litellm import completion os.environ["OPENROUTER_API_KEY"] = "" +os.environ["OPENROUTER_API_BASE"] = "" # [OPTIONAL] defaults to https://openrouter.ai/api/v1 -os.environ["OR_SITE_URL"] = "" # optional -os.environ["OR_APP_NAME"] = "" # optional + +os.environ["OR_SITE_URL"] = "" # [OPTIONAL] +os.environ["OR_APP_NAME"] = "" # [OPTIONAL] response = completion( model="openrouter/google/palm-2-chat-bison", diff --git a/docs/my-website/docs/providers/perplexity.md b/docs/my-website/docs/providers/perplexity.md index 620a7640ad..5ef1f8861a 100644 --- 
a/docs/my-website/docs/providers/perplexity.md +++ b/docs/my-website/docs/providers/perplexity.md @@ -17,7 +17,7 @@ import os os.environ['PERPLEXITYAI_API_KEY'] = "" response = completion( - model="perplexity/mistral-7b-instruct", + model="perplexity/sonar-pro", messages=messages ) print(response) @@ -30,7 +30,7 @@ import os os.environ['PERPLEXITYAI_API_KEY'] = "" response = completion( - model="perplexity/mistral-7b-instruct", + model="perplexity/sonar-pro", messages=messages, stream=True ) @@ -45,19 +45,12 @@ All models listed here https://docs.perplexity.ai/docs/model-cards are supported | Model Name | Function Call | |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| pplx-7b-chat | `completion(model="perplexity/pplx-7b-chat", messages)` | -| pplx-70b-chat | `completion(model="perplexity/pplx-70b-chat", messages)` | -| pplx-7b-online | `completion(model="perplexity/pplx-7b-online", messages)` | -| pplx-70b-online | `completion(model="perplexity/pplx-70b-online", messages)` | -| codellama-34b-instruct | `completion(model="perplexity/codellama-34b-instruct", messages)` | -| llama-2-13b-chat | `completion(model="perplexity/llama-2-13b-chat", messages)` | -| llama-2-70b-chat | `completion(model="perplexity/llama-2-70b-chat", messages)` | -| mistral-7b-instruct | `completion(model="perplexity/mistral-7b-instruct", messages)` | -| openhermes-2-mistral-7b | `completion(model="perplexity/openhermes-2-mistral-7b", messages)` | -| openhermes-2.5-mistral-7b | `completion(model="perplexity/openhermes-2.5-mistral-7b", messages)` | -| pplx-7b-chat-alpha | `completion(model="perplexity/pplx-7b-chat-alpha", messages)` | -| pplx-70b-chat-alpha | `completion(model="perplexity/pplx-70b-chat-alpha", messages)` | - +| sonar-deep-research | `completion(model="perplexity/sonar-deep-research", messages)` | +| sonar-reasoning-pro | `completion(model="perplexity/sonar-reasoning-pro", messages)` | +| sonar-reasoning | `completion(model="perplexity/sonar-reasoning", messages)` | +| sonar-pro | `completion(model="perplexity/sonar-pro", messages)` | +| sonar | `completion(model="perplexity/sonar", messages)` | +| r1-1776 | `completion(model="perplexity/r1-1776", messages)` | diff --git a/docs/my-website/docs/providers/predibase.md b/docs/my-website/docs/providers/predibase.md index 31713aef1e..9f25309c19 100644 --- a/docs/my-website/docs/providers/predibase.md +++ b/docs/my-website/docs/providers/predibase.md @@ -230,7 +230,7 @@ response = completion( model="predibase/llama-3-8b-instruct", messages = [{ "content": "Hello, how are you?","role": "user"}], adapter_id="my_repo/3", - adapter_soruce="pbase", + adapter_source="pbase", ) ``` diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 10ac13ecaf..762bd5f332 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -347,7 +347,7 @@ Return a `list[Recipe]` completion(model="vertex_ai/gemini-1.5-flash-preview-0514", messages=messages, response_format={ "type": "json_object" }) ``` -### **Grounding** +### **Grounding - Web Search** Add Google Search Result grounding to vertex ai calls. 
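Grounded responses also carry the search grounding metadata. A minimal sketch of inspecting it — the model name is illustrative, and it assumes Vertex credentials are already configured and that the metadata is exposed via `_hidden_params`, as noted below:

```python
from litellm import completion

# !gcloud auth application-default login - run this to add vertex credentials to your env

resp = completion(
    model="vertex_ai/gemini-1.5-flash",
    messages=[{"role": "user", "content": "Who won the world cup?"}],
    tools=[{"googleSearchRetrieval": {}}],  # 👈 ADD GOOGLE SEARCH GROUNDING
)

# grounding metadata (sources, citations) is surfaced on the response's hidden params
print(resp._hidden_params["vertex_ai_grounding_metadata"])
```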
@@ -358,7 +358,7 @@ See the grounding metadata with `response_obj._hidden_params["vertex_ai_groundin -```python +```python showLineNumbers from litellm import completion ## SETUP ENVIRONMENT @@ -377,14 +377,36 @@ print(resp) -```bash + + + +```python showLineNumbers +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy +) + +response = client.chat.completions.create( + model="gemini-pro", + messages=[{"role": "user", "content": "Who won the world cup?"}], + tools=[{"googleSearchRetrieval": {}}], +) + +print(response) +``` + + + +```bash showLineNumbers curl http://localhost:4000/v1/chat/completions \ -H "Content-Type: application/json" \ -H "Authorization: Bearer sk-1234" \ -d '{ "model": "gemini-pro", "messages": [ - {"role": "user", "content": "Hello, Claude!"} + {"role": "user", "content": "Who won the world cup?"} ], "tools": [ { @@ -394,10 +416,82 @@ curl http://localhost:4000/v1/chat/completions \ }' ``` + + +You can also use the `enterpriseWebSearch` tool for an [enterprise compliant search](https://cloud.google.com/vertex-ai/generative-ai/docs/grounding/web-grounding-enterprise). + + + + +```python showLineNumbers +from litellm import completion + +## SETUP ENVIRONMENT +# !gcloud auth application-default login - run this to add vertex credentials to your env + +tools = [{"enterpriseWebSearch": {}}] # 👈 ADD GOOGLE ENTERPRISE SEARCH + +resp = litellm.completion( + model="vertex_ai/gemini-1.0-pro-001", + messages=[{"role": "user", "content": "Who won the world cup?"}], + tools=tools, + ) + +print(resp) +``` + + + + + + +```python showLineNumbers +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000/v1/" # point to litellm proxy +) + +response = client.chat.completions.create( + model="gemini-pro", + messages=[{"role": "user", "content": "Who won the world cup?"}], + tools=[{"enterpriseWebSearch": {}}], +) + +print(response) +``` + + + +```bash showLineNumbers +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-pro", + "messages": [ + {"role": "user", "content": "Who won the world cup?"} + ], + "tools": [ + { + "enterpriseWebSearch": {} + } + ] + }' + +``` + + + + + + + #### **Moving from Vertex AI SDK to LiteLLM (GROUNDING)** @@ -448,6 +542,154 @@ print(resp) ``` +### **Thinking / `reasoning_content`** + +LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362) + +**Mapping** + +| reasoning_effort | thinking | +| ---------------- | -------- | +| "low" | "budget_tokens": 1024 | +| "medium" | "budget_tokens": 2048 | +| "high" | "budget_tokens": 4096 | + + + + +```python +from litellm import completion + +# !gcloud auth application-default login - run this to add vertex credentials to your env + +resp = completion( + model="vertex_ai/gemini-2.5-flash-preview-04-17", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort="low", + vertex_project="project-id", + vertex_location="us-central1" +) + +``` + + + + + +1. 
Setup config.yaml + +```yaml +- model_name: gemini-2.5-flash + litellm_params: + model: vertex_ai/gemini-2.5-flash-preview-04-17 + vertex_credentials: {"project_id": "project-id", "location": "us-central1", "project_key": "project-key"} + vertex_project: "project-id" + vertex_location: "us-central1" +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "gemini-2.5-flash", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "reasoning_effort": "low" + }' +``` + + + + + +**Expected Response** + +```python +ModelResponse( + id='chatcmpl-c542d76d-f675-4e87-8e5f-05855f5d0f5e', + created=1740470510, + model='claude-3-7-sonnet-20250219', + object='chat.completion', + system_fingerprint=None, + choices=[ + Choices( + finish_reason='stop', + index=0, + message=Message( + content="The capital of France is Paris.", + role='assistant', + tool_calls=None, + function_call=None, + reasoning_content='The capital of France is Paris. This is a very straightforward factual question.' + ), + ) + ], + usage=Usage( + completion_tokens=68, + prompt_tokens=42, + total_tokens=110, + completion_tokens_details=None, + prompt_tokens_details=PromptTokensDetailsWrapper( + audio_tokens=None, + cached_tokens=0, + text_tokens=None, + image_tokens=None + ), + cache_creation_input_tokens=0, + cache_read_input_tokens=0 + ) +) +``` + +#### Pass `thinking` to Gemini models + +You can also pass the `thinking` parameter to Gemini models. + +This is translated to Gemini's [`thinkingConfig` parameter](https://ai.google.dev/gemini-api/docs/thinking#set-budget). + + + + +```python +from litellm import completion + +# !gcloud auth application-default login - run this to add vertex credentials to your env + +response = litellm.completion( + model="vertex_ai/gemini-2.5-flash-preview-04-17", + messages=[{"role": "user", "content": "What is the capital of France?"}], + thinking={"type": "enabled", "budget_tokens": 1024}, + vertex_project="project-id", + vertex_location="us-central1" +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "vertex_ai/gemini-2.5-flash-preview-04-17", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "thinking": {"type": "enabled", "budget_tokens": 1024} + }' +``` + + + + + ### **Context Caching** Use Vertex AI context caching is supported by calling provider api directly. (Unified Endpoint support comin soon.). 
@@ -1369,6 +1611,103 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ +## Gemini Pro +| Model Name | Function Call | +|------------------|--------------------------------------| +| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` | + +## Fine-tuned Models + +You can call fine-tuned Vertex AI Gemini models through LiteLLM + +| Property | Details | +|----------|---------| +| Provider Route | `vertex_ai/gemini/{MODEL_ID}` | +| Vertex Documentation | [Vertex AI - Fine-tuned Gemini Models](https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-use-supervised-tuning#test_the_tuned_model_with_a_prompt)| +| Supported Operations | `/chat/completions`, `/completions`, `/embeddings`, `/images` | + +To use a model that follows the `/gemini` request/response format, simply set the model parameter as + +```python title="Model parameter for calling fine-tuned gemini models" +model="vertex_ai/gemini/" +``` + + + + +```python showLineNumbers title="Example" +import litellm +import os + +## set ENV variables +os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +response = litellm.completion( + model="vertex_ai/gemini/", # e.g. vertex_ai/gemini/4965075652664360960 + messages=[{ "content": "Hello, how are you?","role": "user"}], +) +``` + + + + +1. Add Vertex Credentials to your env + +```bash title="Authenticate to Vertex AI" +!gcloud auth application-default login +``` + +2. Setup config.yaml + +```yaml showLineNumbers title="Add to litellm config" +- model_name: finetuned-gemini + litellm_params: + model: vertex_ai/gemini/ + vertex_project: + vertex_location: +``` + +3. Test it! + + + + +```python showLineNumbers title="Example request" +from openai import OpenAI + +client = OpenAI( + api_key="your-litellm-key", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="finetuned-gemini", + messages=[ + {"role": "user", "content": "hi"} + ] +) +print(response) +``` + + + + +```bash showLineNumbers title="Example request" +curl --location 'https://0.0.0.0:4000/v1/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: ' \ +--data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}' +``` + + + + + + + + + ## Model Garden :::tip @@ -1479,67 +1818,6 @@ response = completion( -## Gemini Pro -| Model Name | Function Call | -|------------------|--------------------------------------| -| gemini-pro | `completion('gemini-pro', messages)`, `completion('vertex_ai/gemini-pro', messages)` | - -## Fine-tuned Models - -Fine tuned models on vertex have a numerical model/endpoint id. - - - - -```python -from litellm import completion -import os - -## set ENV variables -os.environ["VERTEXAI_PROJECT"] = "hardy-device-38811" -os.environ["VERTEXAI_LOCATION"] = "us-central1" - -response = completion( - model="vertex_ai/", # e.g. vertex_ai/4965075652664360960 - messages=[{ "content": "Hello, how are you?","role": "user"}], - base_model="vertex_ai/gemini-1.5-pro" # the base model - used for routing -) -``` - - - - -1. Add Vertex Credentials to your env - -```bash -!gcloud auth application-default login -``` - -2. Setup config.yaml - -```yaml -- model_name: finetuned-gemini - litellm_params: - model: vertex_ai/ - vertex_project: - vertex_location: - model_info: - base_model: vertex_ai/gemini-1.5-pro # IMPORTANT -``` - -3. Test it! 
- -```bash -curl --location 'https://0.0.0.0:4000/v1/chat/completions' \ ---header 'Content-Type: application/json' \ ---header 'Authorization: ' \ ---data '{"model": "finetuned-gemini" ,"messages":[{"role": "user", "content":[{"type": "text", "text": "hi"}]}]}' -``` - - - - - ## Gemini Pro Vision | Model Name | Function Call | @@ -1684,23 +1962,25 @@ assert isinstance( ``` -## Usage - PDF / Videos / etc. Files +## Usage - PDF / Videos / Audio etc. Files Pass any file supported by Vertex AI, through LiteLLM. -LiteLLM Supports the following image types passed in url +LiteLLM Supports the following file types passed in url. + +Using `file` message type for VertexAI is live from v1.65.1+ ``` -Images with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg -Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg +Files with Cloud Storage URIs - gs://cloud-samples-data/generative-ai/image/boats.jpeg +Files with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg Videos with Cloud Storage URIs - https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4 -Base64 Encoded Local Images +Base64 Encoded Local Files ``` -### **Using `gs://`** +### **Using `gs://` or any URL** ```python from litellm import completion @@ -1712,8 +1992,11 @@ response = completion( "content": [ {"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."}, { - "type": "image_url", - "image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", # 👈 PDF + "type": "file", + "file": { + "file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", + "format": "application/pdf" # OPTIONAL - specify mime-type + } }, ], } @@ -1747,8 +2030,16 @@ response = completion( "content": [ {"type": "text", "text": "You are a very professional document summarization specialist. Please summarize the given document."}, { - "type": "image_url", - "image_url": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + } + }, + { + "type": "audio_input", + "audio_input { + "audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too) + } }, ], } @@ -1794,8 +2085,11 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "text": "You are a very professional document summarization specialist. Please summarize the given document" }, { - "type": "image_url", - "image_url": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf" # 👈 PDF + "type": "file", + "file": { + "file_id": "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf", + "format": "application/pdf" # OPTIONAL + } } } ] @@ -1822,11 +2116,18 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "text": "You are a very professional document summarization specialist. 
Please summarize the given document" }, { - "type": "image_url", - "image_url": "data:application/pdf;base64,{encoded_file}" # 👈 PDF - } - } - ] + "type": "file", + "file": { + "file_data": f"data:application/pdf;base64,{encoded_file}", # 👈 PDF + }, + }, + { + "type": "audio_input", + "audio_input { + "audio_input": f"data:audio/mp3;base64,{encoded_file}", # 👈 AUDIO File ('file' message works as too) + } + }, + ] } ], "max_tokens": 300 @@ -1836,6 +2137,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ + ## Chat Models | Model Name | Function Call | |------------------|--------------------------------------| @@ -2044,7 +2346,12 @@ print(response) ## **Multi-Modal Embeddings** -Usage + +Known Limitations: +- Only supports 1 image / video / image per request +- Only supports GCS or base64 encoded images / videos + +### Usage @@ -2260,6 +2567,115 @@ print(f"Text Embedding: {embeddings.text_embedding}") +### Text + Image + Video Embeddings + + + + +Text + Image + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"] # will be sent as a gcs image +) +``` + +Text + Video + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image +) +``` + +Image + Video + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image +) +``` + + + + + +1. Add model to config.yaml +```yaml +model_list: + - model_name: multimodalembedding@001 + litellm_params: + model: vertex_ai/multimodalembedding@001 + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. 
Make Request use OpenAI Python SDK, Langchain Python SDK + + +Text + Image + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"], +) + +print(response) +``` + +Text + Video +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"], +) + +print(response) +``` + +Image + Video +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"], +) + +print(response) +``` + + + + + ## **Image Generation Models** Usage diff --git a/docs/my-website/docs/providers/vllm.md b/docs/my-website/docs/providers/vllm.md index b5987167ec..5c8233b056 100644 --- a/docs/my-website/docs/providers/vllm.md +++ b/docs/my-website/docs/providers/vllm.md @@ -161,6 +161,120 @@ curl -L -X POST 'http://0.0.0.0:4000/embeddings' \ Example Implementation from VLLM [here](https://github.com/vllm-project/vllm/pull/10020) + + + +Use this to send a video url to VLLM + Gemini in the same format, using OpenAI's `files` message type. + +There are two ways to send a video url to VLLM: + +1. Pass the video url directly + +``` +{"type": "file", "file": {"file_id": video_url}}, +``` + +2. Pass the video data as base64 + +``` +{"type": "file", "file": {"file_data": f"data:video/mp4;base64,{video_data_base64}"}} +``` + + + + +```python +from litellm import completion + +messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Summarize the following video" + }, + { + "type": "file", + "file": { + "file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + } + } + ] + } +] + +# call vllm +os.environ["HOSTED_VLLM_API_BASE"] = "https://hosted-vllm-api.co" +os.environ["HOSTED_VLLM_API_KEY"] = "" # [optional], if your VLLM server requires an API key +response = completion( + model="hosted_vllm/qwen", # pass the vllm model name + messages=messages, +) + +# call gemini +os.environ["GEMINI_API_KEY"] = "your-gemini-api-key" +response = completion( + model="gemini/gemini-1.5-flash", # pass the gemini model name + messages=messages, +) + +print(response) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: my-model + litellm_params: + model: hosted_vllm/qwen # add hosted_vllm/ prefix to route as OpenAI provider + api_base: https://hosted-vllm-api.co # add api base for OpenAI compatible provider + - model_name: my-gemini-model + litellm_params: + model: gemini/gemini-1.5-flash # add gemini/ prefix to route as Google AI Studio provider + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start the proxy + +```bash +$ litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +3. Test it! 
+ +```bash +curl -X POST http://0.0.0.0:4000/chat/completions \ +-H "Authorization: Bearer sk-1234" \ +-H "Content-Type: application/json" \ +-d '{ + "model": "my-model", + "messages": [ + {"role": "user", "content": + [ + {"type": "text", "text": "Summarize the following video"}, + {"type": "file", "file": {"file_id": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"}} + ] + } + ] +}' +``` + + + + + + + + +Use this to send a video url to VLLM in it's native message format (`video_url`). + There are two ways to send a video url to VLLM: 1. Pass the video url directly @@ -249,6 +363,10 @@ curl -X POST http://0.0.0.0:4000/chat/completions \ + + + + ## (Deprecated) for `vllm pip package` ### Using - `litellm.completion` diff --git a/docs/my-website/docs/providers/xai.md b/docs/my-website/docs/providers/xai.md index 3faf7d1052..49a3640991 100644 --- a/docs/my-website/docs/providers/xai.md +++ b/docs/my-website/docs/providers/xai.md @@ -18,13 +18,14 @@ os.environ['XAI_API_KEY'] ``` ## Sample Usage -```python + +```python showLineNumbers title="LiteLLM python sdk usage - Non-streaming" from litellm import completion import os os.environ['XAI_API_KEY'] = "" response = completion( - model="xai/grok-2-latest", + model="xai/grok-3-mini-beta", messages=[ { "role": "user", @@ -45,13 +46,14 @@ print(response) ``` ## Sample Usage - Streaming -```python + +```python showLineNumbers title="LiteLLM python sdk usage - Streaming" from litellm import completion import os os.environ['XAI_API_KEY'] = "" response = completion( - model="xai/grok-2-latest", + model="xai/grok-3-mini-beta", messages=[ { "role": "user", @@ -75,14 +77,15 @@ for chunk in response: ``` ## Sample Usage - Vision -```python + +```python showLineNumbers title="LiteLLM python sdk usage - Vision" import os from litellm import completion os.environ["XAI_API_KEY"] = "your-api-key" response = completion( - model="xai/grok-2-latest", + model="xai/grok-2-vision-latest", messages=[ { "role": "user", @@ -110,7 +113,7 @@ Here's how to call a XAI model with the LiteLLM Proxy Server 1. Modify the config.yaml - ```yaml + ```yaml showLineNumbers model_list: - model_name: my-model litellm_params: @@ -131,7 +134,7 @@ Here's how to call a XAI model with the LiteLLM Proxy Server - ```python + ```python showLineNumbers import openai client = openai.OpenAI( api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys @@ -173,3 +176,81 @@ Here's how to call a XAI model with the LiteLLM Proxy Server +## Reasoning Usage + +LiteLLM supports reasoning usage for xAI models. 
+ + + + + +```python showLineNumbers title="reasoning with xai/grok-3-mini-beta" +import litellm +response = litellm.completion( + model="xai/grok-3-mini-beta", + messages=[{"role": "user", "content": "What is 101*3?"}], + reasoning_effort="low", +) + +print("Reasoning Content:") +print(response.choices[0].message.reasoning_content) + +print("\nFinal Response:") +print(completion.choices[0].message.content) + +print("\nNumber of completion tokens (input):") +print(completion.usage.completion_tokens) + +print("\nNumber of reasoning tokens (input):") +print(completion.usage.completion_tokens_details.reasoning_tokens) +``` + + + + +```python showLineNumbers title="reasoning with xai/grok-3-mini-beta" +import openai +client = openai.OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000" # litellm-proxy-base url +) + +response = client.chat.completions.create( + model="xai/grok-3-mini-beta", + messages=[{"role": "user", "content": "What is 101*3?"}], + reasoning_effort="low", +) + +print("Reasoning Content:") +print(response.choices[0].message.reasoning_content) + +print("\nFinal Response:") +print(completion.choices[0].message.content) + +print("\nNumber of completion tokens (input):") +print(completion.usage.completion_tokens) + +print("\nNumber of reasoning tokens (input):") +print(completion.usage.completion_tokens_details.reasoning_tokens) +``` + + + + +**Example Response:** + +```shell +Reasoning Content: +Let me calculate 101 multiplied by 3: +101 * 3 = 303. +I can double-check that: 100 * 3 is 300, and 1 * 3 is 3, so 300 + 3 = 303. Yes, that's correct. + +Final Response: +The result of 101 multiplied by 3 is 303. + +Number of completion tokens (input): +14 + +Number of reasoning tokens (input): +310 +``` diff --git a/docs/my-website/docs/proxy/admin_ui_sso.md b/docs/my-website/docs/proxy/admin_ui_sso.md index b7f8ddd585..0bbba57fd9 100644 --- a/docs/my-website/docs/proxy/admin_ui_sso.md +++ b/docs/my-website/docs/proxy/admin_ui_sso.md @@ -147,11 +147,16 @@ Some SSO providers require a specific redirect url for login and logout. You can - Login: `/sso/key/generate` - Logout: `` +Here's the env var to set the logout url on the proxy +```bash +PROXY_LOGOUT_URL="https://www.google.com" +``` + #### Step 3. Set `PROXY_BASE_URL` in your .env Set this in your .env (so the proxy can set the correct redirect url) ```shell -PROXY_BASE_URL=https://litellm-api.up.railway.app/ +PROXY_BASE_URL=https://litellm-api.up.railway.app ``` #### Step 4. Test flow diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md index 8ea220cfa1..a7b0afcc18 100644 --- a/docs/my-website/docs/proxy/call_hooks.md +++ b/docs/my-website/docs/proxy/call_hooks.md @@ -70,6 +70,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit response: str, ): pass + + aasync def async_post_call_streaming_iterator_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + response: Any, + request_data: dict, + ) -> AsyncGenerator[ModelResponseStream, None]: + """ + Passes the entire stream to the guardrail + + This is useful for plugins that need to see the entire stream. 
+ """ + async for item in response: + yield item + proxy_handler_instance = MyCustomHandler() ``` diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index 9e24437449..1e3c800b03 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -147,6 +147,7 @@ general_settings: |------|------|-------------| | completion_model | string | The default model to use for completions when `model` is not specified in the request | | disable_spend_logs | boolean | If true, turns off writing each transaction to the database | +| disable_spend_updates | boolean | If true, turns off all spend updates to the DB. Including key/user/team spend updates. | | disable_master_key_return | boolean | If true, turns off returning master key on UI. (checked on '/user/info' endpoint) | | disable_retry_on_max_parallel_request_limit_error | boolean | If true, turns off retries when max parallel request limit is reached | | disable_reset_budget | boolean | If true, turns off reset budget scheduled task | @@ -159,7 +160,7 @@ general_settings: | database_url | string | The URL for the database connection [Set up Virtual Keys](virtual_keys) | | database_connection_pool_limit | integer | The limit for database connection pool [Setting DB Connection Pool limit](#configure-db-pool-limits--connection-timeouts) | | database_connection_timeout | integer | The timeout for database connections in seconds [Setting DB Connection Pool limit, timeout](#configure-db-pool-limits--connection-timeouts) | -| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key | +| allow_requests_on_db_unavailable | boolean | If true, allows requests to succeed even if DB is unreachable. **Only use this if running LiteLLM in your VPC** This will allow requests to work even when LiteLLM cannot connect to the DB to verify a Virtual Key [Doc on graceful db unavailability](prod#5-if-running-litellm-on-vpc-gracefully-handle-db-unavailability) | | custom_auth | string | Write your own custom authentication logic [Doc Custom Auth](virtual_keys#custom-auth) | | max_parallel_requests | integer | The max parallel requests allowed per deployment | | global_max_parallel_requests | integer | The max parallel requests allowed on the proxy overall | @@ -177,7 +178,7 @@ general_settings: | use_x_forwarded_for | str | If true, uses the X-Forwarded-For header to get the client IP address | | service_account_settings | List[Dict[str, Any]] | Set `service_account_settings` if you want to create settings that only apply to service account keys (Doc on service accounts)[./service_accounts.md] | | image_generation_model | str | The default model to use for image generation - ignores model set in request | -| store_model_in_db | boolean | If true, allows `/model/new` endpoint to store model information in db. Endpoint disabled by default. [Doc on `/model/new` endpoint](./model_management.md#create-a-new-model) | +| store_model_in_db | boolean | If true, enables storing model + credential information in the DB. | | store_prompts_in_spend_logs | boolean | If true, allows prompts and responses to be stored in the spend logs table. | | max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. 
| | max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. | @@ -298,6 +299,9 @@ router_settings: |------|-------------| | ACTIONS_ID_TOKEN_REQUEST_TOKEN | Token for requesting ID in GitHub Actions | ACTIONS_ID_TOKEN_REQUEST_URL | URL for requesting ID token in GitHub Actions +| AGENTOPS_ENVIRONMENT | Environment for AgentOps logging integration +| AGENTOPS_API_KEY | API Key for AgentOps logging integration +| AGENTOPS_SERVICE_NAME | Service Name for AgentOps logging integration | AISPEND_ACCOUNT_ID | Account ID for AI Spend | AISPEND_API_KEY | API Key for AI Spend | ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access @@ -322,6 +326,9 @@ router_settings: | AZURE_AUTHORITY_HOST | Azure authority host URL | AZURE_CLIENT_ID | Client ID for Azure services | AZURE_CLIENT_SECRET | Client secret for Azure services +| AZURE_TENANT_ID | Tenant ID for Azure Active Directory +| AZURE_USERNAME | Username for Azure services, use in conjunction with AZURE_PASSWORD for azure ad token with basic username/password workflow +| AZURE_PASSWORD | Password for Azure services, use in conjunction with AZURE_USERNAME for azure ad token with basic username/password workflow | AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token | AZURE_KEY_VAULT_URI | URI for Azure Key Vault | AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging @@ -330,7 +337,6 @@ router_settings: | AZURE_STORAGE_TENANT_ID | The Application Tenant ID to use for Authentication to Azure Blob Storage logging | AZURE_STORAGE_CLIENT_ID | The Application Client ID to use for Authentication to Azure Blob Storage logging | AZURE_STORAGE_CLIENT_SECRET | The Application Client Secret to use for Authentication to Azure Blob Storage logging -| AZURE_TENANT_ID | Tenant ID for Azure Active Directory | BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service | BRAINTRUST_API_KEY | API key for Braintrust integration | CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI @@ -405,6 +411,7 @@ router_settings: | HELICONE_API_KEY | API key for Helicone service | HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) | HUGGINGFACE_API_BASE | Base URL for Hugging Face API +| HUGGINGFACE_API_KEY | API key for Hugging Face API | IAM_TOKEN_DB_AUTH | IAM token for database authentication | JSON_LOGS | Enable JSON formatted logging | JWT_AUDIENCE | Expected audience for JWT tokens @@ -431,6 +438,7 @@ router_settings: | LITERAL_BATCH_SIZE | Batch size for Literal operations | LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI | LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests +| LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests | LITELLM_EMAIL | Email associated with LiteLLM account | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM @@ -444,9 +452,12 @@ router_settings: | LITELLM_TOKEN | Access token for LiteLLM integration | LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging | LOGFIRE_TOKEN | Token for Logfire logging service +| MISTRAL_API_BASE | Base URL for Mistral API +| MISTRAL_API_KEY | API key for Mistral API | MICROSOFT_CLIENT_ID | Client ID for Microsoft services | 
MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services | MICROSOFT_TENANT | Tenant ID for Microsoft Azure +| MICROSOFT_SERVICE_PRINCIPAL_ID | Service Principal ID for Microsoft Enterprise Application. (This is an advanced feature if you want litellm to auto-assign members to Litellm Teams based on their Microsoft Entra ID Groups) | NO_DOCS | Flag to disable documentation generation | NO_PROXY | List of addresses to bypass proxy | OAUTH_TOKEN_INFO_ENDPOINT | Endpoint for OAuth token info retrieval @@ -478,7 +489,7 @@ router_settings: | PROXY_ADMIN_ID | Admin identifier for proxy server | PROXY_BASE_URL | Base URL for proxy service | PROXY_LOGOUT_URL | URL for logging out of the proxy service -| PROXY_MASTER_KEY | Master key for proxy authentication +| LITELLM_MASTER_KEY | Master key for proxy authentication | QDRANT_API_BASE | Base URL for Qdrant API | QDRANT_API_KEY | API key for Qdrant service | QDRANT_URL | Connection URL for Qdrant database @@ -499,9 +510,11 @@ router_settings: | SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth) | SPEND_LOGS_URL | URL for retrieving spend logs | SSL_CERTIFICATE | Path to the SSL certificate file +| SSL_SECURITY_LEVEL | [BETA] Security level for SSL/TLS connections. E.g. `DEFAULT@SECLEVEL=1` | SSL_VERIFY | Flag to enable or disable SSL certificate verification | SUPABASE_KEY | API key for Supabase service | SUPABASE_URL | Base URL for Supabase instance +| STORE_MODEL_IN_DB | If true, enables storing model + credential information in the DB. | TEST_EMAIL_ADDRESS | Email address used for testing purposes | UI_LOGO_PATH | Path to the logo image used in the UI | UI_PASSWORD | Password for accessing the UI @@ -512,5 +525,5 @@ router_settings: | UPSTREAM_LANGFUSE_RELEASE | Release version identifier for upstream Langfuse | UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication | USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption +| USE_PRISMA_MIGRATE | Flag to use prisma migrate instead of prisma db push. Recommended for production environments. | WEBHOOK_URL | URL for receiving webhooks from external services - diff --git a/docs/my-website/docs/proxy/cost_tracking.md b/docs/my-website/docs/proxy/cost_tracking.md index 7f90273c39..5b17e565a5 100644 --- a/docs/my-website/docs/proxy/cost_tracking.md +++ b/docs/my-website/docs/proxy/cost_tracking.md @@ -6,6 +6,8 @@ import Image from '@theme/IdealImage'; Track spend for keys, users, and teams across 100+ LLMs. +LiteLLM automatically tracks spend for all known models. 
See our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) + ### How to Track Spend with LiteLLM **Step 1** @@ -35,10 +37,10 @@ response = client.chat.completions.create( "content": "this is a test request, write a short poem" } ], - user="palantir", - extra_body={ + user="palantir", # OPTIONAL: pass user to track spend by user + extra_body={ "metadata": { - "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] + "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags } } ) @@ -63,9 +65,9 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ "content": "what llm are you" } ], - "user": "palantir", + "user": "palantir", # OPTIONAL: pass user to track spend by user "metadata": { - "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] + "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags } }' ``` @@ -90,7 +92,7 @@ chat = ChatOpenAI( user="palantir", extra_body={ "metadata": { - "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] + "tags": ["jobID:214590dsff09fds", "taskName:run_page_classification"] # ENTERPRISE: pass tags to track spend by tags } } ) @@ -150,8 +152,112 @@ Navigate to the Usage Tab on the LiteLLM UI (found on https://your-proxy-endpoin -## ✨ (Enterprise) API Endpoints to get Spend -### Getting Spend Reports - To Charge Other Teams, Customers, Users +### Allowing Non-Proxy Admins to access `/spend` endpoints + +Use this when you want non-proxy admins to access `/spend` endpoints + +:::info + +Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) + +::: + +##### Create Key +Create Key with with `permissions={"get_spend_routes": true}` +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "permissions": {"get_spend_routes": true} + }' +``` + +##### Use generated key on `/spend` endpoints + +Access spend Routes with newly generate keys +```shell +curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \ + -H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A' +``` + + + +#### Reset Team, API Key Spend - MASTER KEY ONLY + +Use `/global/spend/reset` if you want to: +- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0` + +- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes + +##### Request +Only the `LITELLM_MASTER_KEY` you set can access this route +```shell +curl -X POST \ + 'http://localhost:4000/global/spend/reset' \ + -H 'Authorization: Bearer sk-1234' \ + -H 'Content-Type: application/json' +``` + +##### Expected Responses + +```shell +{"message":"Spend for all API Keys and Teams reset successfully","status":"success"} +``` + +## Daily Spend Breakdown API + +Retrieve granular daily usage data for a user (by model, provider, and API key) with a single endpoint. + +Example Request: + +```shell title="Daily Spend Breakdown API" showLineNumbers +curl -L -X GET 'http://localhost:4000/user/daily/activity?start_date=2025-03-20&end_date=2025-03-27' \ +-H 'Authorization: Bearer sk-...' 
+``` + +```json title="Daily Spend Breakdown API Response" showLineNumbers +{ + "results": [ + { + "date": "2025-03-27", + "metrics": { + "spend": 0.0177072, + "prompt_tokens": 111, + "completion_tokens": 1711, + "total_tokens": 1822, + "api_requests": 11 + }, + "breakdown": { + "models": { + "gpt-4o-mini": { + "spend": 1.095e-05, + "prompt_tokens": 37, + "completion_tokens": 9, + "total_tokens": 46, + "api_requests": 1 + }, + "providers": { "openai": { ... }, "azure_ai": { ... } }, + "api_keys": { "3126b6eaf1...": { ... } } + } + } + ], + "metadata": { + "total_spend": 0.7274667, + "total_prompt_tokens": 280990, + "total_completion_tokens": 376674, + "total_api_requests": 14 + } +} +``` + +### API Reference + +See our [Swagger API](https://litellm-api.up.railway.app/#/Budget%20%26%20Spend%20Tracking/get_user_daily_activity_user_daily_activity_get) for more details on the `/user/daily/activity` endpoint + +## ✨ (Enterprise) Generate Spend Reports + +Use this to charge other teams, customers, users Use the `/global/spend/report` endpoint to get spend reports @@ -470,105 +576,6 @@ curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end -### Allowing Non-Proxy Admins to access `/spend` endpoints - -Use this when you want non-proxy admins to access `/spend` endpoints - -:::info - -Schedule a [meeting with us to get your Enterprise License](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) - -::: - -##### Create Key -Create Key with with `permissions={"get_spend_routes": true}` -```shell -curl --location 'http://0.0.0.0:4000/key/generate' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "permissions": {"get_spend_routes": true} - }' -``` - -##### Use generated key on `/spend` endpoints - -Access spend Routes with newly generate keys -```shell -curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end_date=2024-06-30' \ - -H 'Authorization: Bearer sk-H16BKvrSNConSsBYLGc_7A' -``` - - - -#### Reset Team, API Key Spend - MASTER KEY ONLY - -Use `/global/spend/reset` if you want to: -- Reset the Spend for all API Keys, Teams. The `spend` for ALL Teams and Keys in `LiteLLM_TeamTable` and `LiteLLM_VerificationToken` will be set to `spend=0` - -- LiteLLM will maintain all the logs in `LiteLLMSpendLogs` for Auditing Purposes - -##### Request -Only the `LITELLM_MASTER_KEY` you set can access this route -```shell -curl -X POST \ - 'http://localhost:4000/global/spend/reset' \ - -H 'Authorization: Bearer sk-1234' \ - -H 'Content-Type: application/json' -``` - -##### Expected Responses - -```shell -{"message":"Spend for all API Keys and Teams reset successfully","status":"success"} -``` - - - - -## Spend Tracking for Azure OpenAI Models - -Set base model for cost tracking azure image-gen call - -#### Image Generation - -```yaml -model_list: - - model_name: dall-e-3 - litellm_params: - model: azure/dall-e-3-test - api_version: 2023-06-01-preview - api_base: https://openai-gpt-4-test-v-1.openai.azure.com/ - api_key: os.environ/AZURE_API_KEY - base_model: dall-e-3 # 👈 set dall-e-3 as base model - model_info: - mode: image_generation -``` - -#### Chat Completions / Embeddings - -**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. 
This leads to inaccurate cost tracking - -**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost - -Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) - -Example config with `base_model` -```yaml -model_list: - - model_name: azure-gpt-3.5 - litellm_params: - model: azure/chatgpt-v-2 - api_base: os.environ/AZURE_API_BASE - api_key: os.environ/AZURE_API_KEY - api_version: "2023-07-01-preview" - model_info: - base_model: azure/gpt-4-1106-preview -``` - -## Custom Input/Output Pricing - -👉 Head to [Custom Input/Output Pricing](https://docs.litellm.ai/docs/proxy/custom_pricing) to setup custom pricing or your models ## ✨ Custom Spend Log metadata @@ -587,4 +594,5 @@ Logging specific key,value pairs in spend logs metadata is an enterprise feature Tracking spend with Custom tags is an enterprise feature. [See here](./enterprise.md#tracking-spend-for-custom-tags) -::: \ No newline at end of file +::: + diff --git a/docs/my-website/docs/proxy/custom_pricing.md b/docs/my-website/docs/proxy/custom_pricing.md index 16d634dee4..792d5c26dd 100644 --- a/docs/my-website/docs/proxy/custom_pricing.md +++ b/docs/my-website/docs/proxy/custom_pricing.md @@ -26,10 +26,12 @@ model_list: - model_name: sagemaker-completion-model litellm_params: model: sagemaker/berri-benchmarking-Llama-2-70b-chat-hf-4 + model_info: input_cost_per_second: 0.000420 - model_name: sagemaker-embedding-model litellm_params: model: sagemaker/berri-benchmarking-gpt-j-6b-fp16 + model_info: input_cost_per_second: 0.000420 ``` @@ -55,11 +57,55 @@ model_list: api_key: os.environ/AZURE_API_KEY api_base: os.environ/AZURE_API_BASE api_version: os.envrion/AZURE_API_VERSION + model_info: input_cost_per_token: 0.000421 # 👈 ONLY to track cost per token output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token ``` -### Debugging +## Override Model Cost Map + +You can override [our model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) with your own custom pricing for a mapped model. + +Just add a `model_info` key to your model in the config, and override the desired keys. + +Example: Override Anthropic's model cost map for the `prod/claude-3-5-sonnet-20241022` model. + +```yaml +model_list: + - model_name: "prod/claude-3-5-sonnet-20241022" + litellm_params: + model: "anthropic/claude-3-5-sonnet-20241022" + api_key: os.environ/ANTHROPIC_PROD_API_KEY + model_info: + input_cost_per_token: 0.000006 + output_cost_per_token: 0.00003 + cache_creation_input_token_cost: 0.0000075 + cache_read_input_token_cost: 0.0000006 +``` + +## Set 'base_model' for Cost Tracking (e.g. Azure deployments) + +**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. 
This leads to inaccurate cost tracking + +**Solution** ✅ : Set `base_model` on your config so litellm uses the correct model for calculating azure cost + +Get the base model name from [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) + +Example config with `base_model` +```yaml +model_list: + - model_name: azure-gpt-3.5 + litellm_params: + model: azure/chatgpt-v-2 + api_base: os.environ/AZURE_API_BASE + api_key: os.environ/AZURE_API_KEY + api_version: "2023-07-01-preview" + model_info: + base_model: azure/gpt-4-1106-preview +``` + + +## Debugging If you're custom pricing is not being used or you're seeing errors, please check the following: diff --git a/docs/my-website/docs/proxy/custom_prompt_management.md b/docs/my-website/docs/proxy/custom_prompt_management.md new file mode 100644 index 0000000000..72a7333276 --- /dev/null +++ b/docs/my-website/docs/proxy/custom_prompt_management.md @@ -0,0 +1,194 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Custom Prompt Management + +Connect LiteLLM to your prompt management system with custom hooks. + +## Overview + + + + + +## How it works + +## Quick Start + +### 1. Create Your Custom Prompt Manager + +Create a class that inherits from `CustomPromptManagement` to handle prompt retrieval and formatting: + +**Example Implementation** + +Create a new file called `custom_prompt.py` and add this code. The key method here is `get_chat_completion_prompt` you can implement custom logic to retrieve and format prompts based on the `prompt_id` and `prompt_variables`. + +```python +from typing import List, Tuple, Optional +from litellm.integrations.custom_prompt_management import CustomPromptManagement +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import StandardCallbackDynamicParams + +class MyCustomPromptManagement(CustomPromptManagement): + def get_chat_completion_prompt( + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: str, + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + ) -> Tuple[str, List[AllMessageValues], dict]: + """ + Retrieve and format prompts based on prompt_id. + + Returns: + - model: The model to use + - messages: The formatted messages + - non_default_params: Optional parameters like temperature + """ + # Example matching the diagram: Add system message for prompt_id "1234" + if prompt_id == "1234": + # Prepend system message while preserving existing messages + new_messages = [ + {"role": "system", "content": "Be a good Bot!"}, + ] + messages + return model, new_messages, non_default_params + + # Default: Return original messages if no prompt_id match + return model, messages, non_default_params + +prompt_management = MyCustomPromptManagement() +``` + +### 2. Configure Your Prompt Manager in LiteLLM `config.yaml` + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + callbacks: custom_prompt.prompt_management # sets litellm.callbacks = [prompt_management] +``` + +### 3. Start LiteLLM Gateway + + + + +Mount your `custom_logger.py` on the LiteLLM Docker container. 
+ +```shell +docker run -d \ + -p 4000:4000 \ + -e OPENAI_API_KEY=$OPENAI_API_KEY \ + --name my-app \ + -v $(pwd)/my_config.yaml:/app/config.yaml \ + -v $(pwd)/custom_logger.py:/app/custom_logger.py \ + my-app:latest \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug \ +``` + + + + + +```shell +litellm --config config.yaml --detailed_debug +``` + + + + +### 4. Test Your Custom Prompt Manager + +When you pass `prompt_id="1234"`, the custom prompt manager will add a system message "Be a good Bot!" to your conversation: + + + + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gemini-1.5-pro", + messages=[{"role": "user", "content": "hi"}], + prompt_id="1234" +) + +print(response.choices[0].message.content) +``` + + + + +```python +from langchain.chat_models import ChatOpenAI +from langchain.schema import HumanMessage + +chat = ChatOpenAI( + model="gpt-4", + openai_api_key="sk-1234", + openai_api_base="http://0.0.0.0:4000", + extra_body={ + "prompt_id": "1234" + } +) + +messages = [] +response = chat(messages) + +print(response.content) +``` + + + + +```shell +curl -X POST http://0.0.0.0:4000/v1/chat/completions \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer sk-1234" \ +-d '{ + "model": "gemini-1.5-pro", + "messages": [{"role": "user", "content": "hi"}], + "prompt_id": "1234" +}' +``` + + + +The request will be transformed from: +```json +{ + "model": "gemini-1.5-pro", + "messages": [{"role": "user", "content": "hi"}], + "prompt_id": "1234" +} +``` + +To: +```json +{ + "model": "gemini-1.5-pro", + "messages": [ + {"role": "system", "content": "Be a good Bot!"}, + {"role": "user", "content": "hi"} + ] +} +``` + + diff --git a/docs/my-website/docs/proxy/db_deadlocks.md b/docs/my-website/docs/proxy/db_deadlocks.md new file mode 100644 index 0000000000..332374995d --- /dev/null +++ b/docs/my-website/docs/proxy/db_deadlocks.md @@ -0,0 +1,86 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# High Availability Setup (Resolve DB Deadlocks) + +Resolve any Database Deadlocks you see in high traffic by using this setup + +## What causes the problem? + +LiteLLM writes `UPDATE` and `UPSERT` queries to the DB. When using 10+ instances of LiteLLM, these queries can cause deadlocks since each instance could simultaneously attempt to update the same `user_id`, `team_id`, `key` etc. + +## How the high availability setup fixes the problem +- All instances will write to a Redis queue instead of the DB. +- A single instance will acquire a lock on the DB and flush the redis queue to the DB. + + +## How it works + +### Stage 1. Each instance writes updates to redis + +Each instance will accumlate the spend updates for a key, user, team, etc and write the updates to a redis queue. + + +
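As a rough sketch of what this enqueue step can look like, each instance can append its local spend delta to a shared Redis list instead of writing to Postgres directly. This uses `redis-py`; the queue name and payload shape are assumptions for illustration only, not LiteLLM's internal schema.

```python showLineNumbers title="Illustrative sketch - enqueue spend updates (not LiteLLM internals)"
import json
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

# Illustrative only: append this instance's spend delta to a shared Redis list
# instead of issuing an UPDATE against the DB directly.
spend_update = {
    "api_key": "hashed-key",   # placeholder values for the sketch
    "user_id": "user-1",
    "team_id": "team-1",
    "spend": 0.0021,
}
r.rpush("litellm:spend_update_queue", json.dumps(spend_update))  # queue name is assumed
```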

+Each instance writes updates to redis +

+ + +### Stage 2. A single instance flushes the redis queue to the DB + +A single instance will acquire a lock on the DB and flush all elements in the redis queue to the DB. + +- 1 instance will attempt to acquire the lock for the DB update job +- The status of the lock is stored in redis +- If the instance acquires the lock to write to DB + - It will read all updates from redis + - Aggregate all updates into 1 transaction + - Write updates to DB + - Release the lock +- Note: Only 1 instance can acquire the lock at a time, this limits the number of instances that can write to the DB at once + + + +
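The locking pattern described above can be sketched roughly as follows with `redis-py`: a `SET NX` lock ensures only one instance flushes at a time, and the queued updates are drained and written in a single DB transaction. The key names and the `write_to_db` callback are placeholders for illustration; this is not LiteLLM's actual implementation.

```python showLineNumbers title="Illustrative sketch - lock and flush (not LiteLLM internals)"
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

LOCK_KEY = "litellm:spend_update_lock"    # assumed name, illustration only
QUEUE_KEY = "litellm:spend_update_queue"  # assumed name, illustration only


def flush_spend_updates(write_to_db) -> None:
    # Only one instance wins the lock (NX); EX guards against a crashed lock holder.
    if not r.set(LOCK_KEY, "locked", nx=True, ex=60):
        return  # another instance currently holds the lock

    try:
        updates = []
        # Drain all queued updates from Redis
        while (item := r.lpop(QUEUE_KEY)) is not None:
            updates.append(item)
        if updates:
            write_to_db(updates)  # aggregate and write in one DB transaction
    finally:
        r.delete(LOCK_KEY)  # release the lock for the next flush cycle
```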

+A single instance flushes the redis queue to the DB +

+ + +## Usage + +### Required components + +- Redis +- Postgres + +### Setup on LiteLLM config + +You can enable using the redis buffer by setting `use_redis_transaction_buffer: true` in the `general_settings` section of your `proxy_config.yaml` file. + +Note: This setup requires litellm to be connected to a redis instance. + +```yaml showLineNumbers title="litellm proxy_config.yaml" +general_settings: + use_redis_transaction_buffer: true + +litellm_settings: + cache: True + cache_params: + type: redis + supported_call_types: [] # Optional: Set cache for proxy, but not on the actual llm api call +``` + +## Monitoring + +LiteLLM emits the following prometheus metrics to monitor the health/status of the in memory buffer and redis buffer. + + +| Metric Name | Description | Storage Type | +|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------| +| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis | +| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. These are the aggregate spend logs for each user. | In-Memory | +| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis | +| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory | +| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis | + diff --git a/docs/my-website/docs/proxy/guardrails/aim_security.md b/docs/my-website/docs/proxy/guardrails/aim_security.md index 8f612b9dbe..d76c4e0c1c 100644 --- a/docs/my-website/docs/proxy/guardrails/aim_security.md +++ b/docs/my-website/docs/proxy/guardrails/aim_security.md @@ -23,6 +23,12 @@ In the newly created guard's page, you can find a reference to the prompt policy You can decide which detections will be enabled, and set the threshold for each detection. +:::info +When using LiteLLM with virtual keys, key-specific policies can be set directly in Aim's guards page by specifying the virtual key alias when creating the guard. + +Only the aliases of your virtual keys (and not the actual key secrets) will be sent to Aim. +::: + ### 3. Add Aim Guardrail on your LiteLLM config.yaml Define your guardrails under the `guardrails` section @@ -134,7 +140,7 @@ The above request should not be blocked, and you should receive a regular LLM re -# Advanced +## Advanced Aim Guard provides user-specific Guardrail policies, enabling you to apply tailored policies to individual users. To utilize this feature, include the end-user's email in the request payload by setting the `x-aim-user-email` header of your request. diff --git a/docs/my-website/docs/proxy/guardrails/custom_guardrail.md b/docs/my-website/docs/proxy/guardrails/custom_guardrail.md index 50deac511f..657ccab68e 100644 --- a/docs/my-website/docs/proxy/guardrails/custom_guardrail.md +++ b/docs/my-website/docs/proxy/guardrails/custom_guardrail.md @@ -10,10 +10,12 @@ Use this is you want to write code to run a custom guardrail ### 1. 
Write a `CustomGuardrail` Class -A CustomGuardrail has 3 methods to enforce guardrails +A CustomGuardrail has 4 methods to enforce guardrails - `async_pre_call_hook` - (Optional) modify input or reject request before making LLM API call - `async_moderation_hook` - (Optional) reject request, runs while making LLM API call (help to lower latency) - `async_post_call_success_hook`- (Optional) apply guardrail on input/output, runs after making LLM API call +- `async_post_call_streaming_iterator_hook` - (Optional) pass the entire stream to the guardrail + **[See detailed spec of methods here](#customguardrail-methods)** @@ -128,6 +130,23 @@ class myCustomGuardrail(CustomGuardrail): ): raise ValueError("Guardrail failed Coffee Detected") + async def async_post_call_streaming_iterator_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + response: Any, + request_data: dict, + ) -> AsyncGenerator[ModelResponseStream, None]: + """ + Passes the entire stream to the guardrail + + This is useful for guardrails that need to see the entire response, such as PII masking. + + See Aim guardrail implementation for an example - https://github.com/BerriAI/litellm/blob/d0e022cfacb8e9ebc5409bb652059b6fd97b45c0/litellm/proxy/guardrails/guardrail_hooks/aim.py#L168 + + Triggered by mode: 'post_call' + """ + async for item in response: + yield item ``` diff --git a/docs/my-website/docs/prompt_injection.md b/docs/my-website/docs/proxy/guardrails/prompt_injection.md similarity index 100% rename from docs/my-website/docs/prompt_injection.md rename to docs/my-website/docs/proxy/guardrails/prompt_injection.md diff --git a/docs/my-website/docs/proxy/guardrails/quick_start.md b/docs/my-website/docs/proxy/guardrails/quick_start.md index 6744dc6578..aeac507e0a 100644 --- a/docs/my-website/docs/proxy/guardrails/quick_start.md +++ b/docs/my-website/docs/proxy/guardrails/quick_start.md @@ -17,6 +17,14 @@ model_list: api_key: os.environ/OPENAI_API_KEY guardrails: + - guardrail_name: general-guard + litellm_params: + guardrail: aim + mode: [pre_call, post_call] + api_key: os.environ/AIM_API_KEY + api_base: os.environ/AIM_API_BASE + default_on: true # Optional + - guardrail_name: "aporia-pre-guard" litellm_params: guardrail: aporia # supported values: "aporia", "lakera" @@ -45,6 +53,7 @@ guardrails: - `pre_call` Run **before** LLM call, on **input** - `post_call` Run **after** LLM call, on **input & output** - `during_call` Run **during** LLM call, on **input** Same as `pre_call` but runs in parallel as LLM call. Response not returned until guardrail check completes +- A list of the above values to run multiple modes, e.g. `mode: [pre_call, post_call]` ## 2. Start LiteLLM Gateway @@ -569,4 +578,4 @@ guardrails: Union[ class DynamicGuardrailParams: extra_body: Dict[str, Any] # Additional parameters for the guardrail -``` \ No newline at end of file +``` diff --git a/docs/my-website/docs/proxy/image_handling.md b/docs/my-website/docs/proxy/image_handling.md new file mode 100644 index 0000000000..300ab0bc38 --- /dev/null +++ b/docs/my-website/docs/proxy/image_handling.md @@ -0,0 +1,21 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Image URL Handling + + + +Some LLM API's don't support url's for images, but do support base-64 strings. + +For those, LiteLLM will: + +1. Detect a URL being passed +2. Check if the LLM API supports a URL +3. Else, will download the base64 +4. Send the provider a base64 string. 
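The fallback in steps 3-4 amounts to downloading the image and re-sending it as a base64 data URI. Below is a minimal sketch of that conversion; the helper name and placeholder URL are made up for this example, and LiteLLM performs the equivalent internally.

```python showLineNumbers title="Illustrative sketch - URL to base64 data URI"
import base64
import requests


def url_to_base64_data_uri(image_url: str) -> str:
    """Download an image URL and return it as a base64 data URI (illustration only)."""
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()
    content_type = response.headers.get("content-type", "image/jpeg")
    encoded = base64.b64encode(response.content).decode("utf-8")
    return f"data:{content_type};base64,{encoded}"


# A provider that rejects raw URLs would receive this string in the image content block instead.
data_uri = url_to_base64_data_uri("https://example.com/image.jpg")  # placeholder URL
print(data_uri[:80], "...")
```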
+ + +LiteLLM also caches this result, in-memory to reduce latency for subsequent calls. + +The limit for an in-memory cache is 1MB. \ No newline at end of file diff --git a/docs/my-website/docs/proxy/litellm_managed_files.md b/docs/my-website/docs/proxy/litellm_managed_files.md new file mode 100644 index 0000000000..6e40c6dd44 --- /dev/null +++ b/docs/my-website/docs/proxy/litellm_managed_files.md @@ -0,0 +1,279 @@ +import TabItem from '@theme/TabItem'; +import Tabs from '@theme/Tabs'; +import Image from '@theme/IdealImage'; + +# [BETA] Unified File ID + +Reuse the same 'file id' across different providers. + +| Feature | Description | Comments | +| --- | --- | --- | +| Proxy | ✅ | | +| SDK | ❌ | Requires postgres DB for storing file ids | +| Available across all providers | ✅ | | + + + +Limitations of LiteLLM Managed Files: +- Only works for `/chat/completions` requests. +- Assumes just 1 model configured per model_name. + +Follow [here](https://github.com/BerriAI/litellm/discussions/9632) for multiple models, batches support. + +### 1. Setup config.yaml + +``` +model_list: + - model_name: "gemini-2.0-flash" + litellm_params: + model: vertex_ai/gemini-2.0-flash + vertex_project: my-project-id + vertex_location: us-central1 + - model_name: "gpt-4o-mini-openai" + litellm_params: + model: gpt-4o-mini + api_key: os.environ/OPENAI_API_KEY +``` + +### 2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +### 3. Test it! + +Specify `target_model_names` to use the same file id across different providers. This is the list of model_names set via config.yaml (or 'public_model_names' on UI). + +```python +target_model_names="gpt-4o-mini-openai, gemini-2.0-flash" # 👈 Specify model_names +``` + +Check `/v1/models` to see the list of available model names for a key. 
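For example, you can verify which model names a key can use with the standard OpenAI SDK pointed at the proxy. This is a quick sketch; it assumes the proxy from the steps above is running on `http://0.0.0.0:4000` and that `sk-1234` is a valid key.

```python showLineNumbers title="Check available model names for a key"
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

# Lists the model names this key is allowed to call on the proxy
for model in client.models.list().data:
    print(model.id)
```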
+ +#### **Store a PDF file** + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0) + + +# Download and save the PDF locally +url = ( + "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf" +) +response = requests.get(url) +response.raise_for_status() + +# Save the PDF locally +with open("2403.05530.pdf", "wb") as f: + f.write(response.content) + +file = client.files.create( + file=open("2403.05530.pdf", "rb"), + purpose="user_data", # can be any openai 'purpose' value + extra_body={"target_model_names": "gpt-4o-mini-openai, gemini-2.0-flash"}, # 👈 Specify model_names +) + +print(f"file id={file.id}") +``` + +#### **Use the same file id across different providers** + + + + +```python +completion = client.chat.completions.create( + model="gpt-4o-mini-openai", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this recording?"}, + { + "type": "file", + "file": { + "file_id": file.id, + }, + }, + ], + }, + ] +) + +print(completion.choices[0].message) +``` + + + + + +```python +completion = client.chat.completions.create( + model="gemini-2.0-flash", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this recording?"}, + { + "type": "file", + "file": { + "file_id": file.id, + }, + }, + ], + }, + ] +) + +print(completion.choices[0].message) + +``` + + + + +### Complete Example + +```python +import base64 +import requests +from openai import OpenAI + +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0) + + +# Download and save the PDF locally +url = ( + "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf" +) +response = requests.get(url) +response.raise_for_status() + +# Save the PDF locally +with open("2403.05530.pdf", "wb") as f: + f.write(response.content) + +# Read the local PDF file +file = client.files.create( + file=open("2403.05530.pdf", "rb"), + purpose="user_data", # can be any openai 'purpose' value + extra_body={"target_model_names": "gpt-4o-mini-openai, vertex_ai/gemini-2.0-flash"}, +) + +print(f"file.id: {file.id}") # 👈 Unified file id + +## GEMINI CALL ### +completion = client.chat.completions.create( + model="gemini-2.0-flash", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this recording?"}, + { + "type": "file", + "file": { + "file_id": file.id, + }, + }, + ], + }, + ] +) + +print(completion.choices[0].message) + + +### OPENAI CALL ### +completion = client.chat.completions.create( + model="gpt-4o-mini-openai", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this recording?"}, + { + "type": "file", + "file": { + "file_id": file.id, + }, + }, + ], + }, + ], +) + +print(completion.choices[0].message) + +``` + + +### Supported Endpoints + +#### Create a file - `/files` + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0) + +# Download and save the PDF locally +url = ( + "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf" +) +response = requests.get(url) +response.raise_for_status() + +# Save the PDF locally +with open("2403.05530.pdf", "wb") as f: + f.write(response.content) + +# Read the local PDF file +file = client.files.create( + file=open("2403.05530.pdf", "rb"), + purpose="user_data", # can be any openai 'purpose' value + extra_body={"target_model_names": 
"gpt-4o-mini-openai, vertex_ai/gemini-2.0-flash"}, +) +``` + +#### Retrieve a file - `/files/{file_id}` + +```python +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0) + +file = client.files.retrieve(file_id=file.id) +``` + +#### Delete a file - `/files/{file_id}/delete` + +```python +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234", max_retries=0) + +file = client.files.delete(file_id=file.id) +``` + +### FAQ + +**1. Does LiteLLM store the file?** + +No, LiteLLM does not store the file. It only stores the file id's in the postgres DB. + +**2. How does LiteLLM know which file to use for a given file id?** + +LiteLLM stores a mapping of the litellm file id to the model-specific file id in the postgres DB. When a request comes in, LiteLLM looks up the model-specific file id and uses it in the request to the provider. + +**3. How do file deletions work?** + +When a file is deleted, LiteLLM deletes the mapping from the postgres DB, and the files on each provider. + +### Architecture + + + + + + \ No newline at end of file diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index e13a403634..c8731dd270 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -862,7 +862,7 @@ Add the following to your env ```shell OTEL_EXPORTER="otlp_http" -OTEL_ENDPOINT="http:/0.0.0.0:4317" +OTEL_ENDPOINT="http://0.0.0.0:4317" OTEL_HEADERS="x-honeycomb-team=" # Optional ``` @@ -2501,4 +2501,4 @@ litellm_settings: :::info `thresholds` are not required by default, but you can tune the values to your needs. Default values is `4` for all categories -::: --> \ No newline at end of file +::: --> diff --git a/docs/my-website/docs/proxy/logging_spec.md b/docs/my-website/docs/proxy/logging_spec.md index 7da937e565..b314dd350b 100644 --- a/docs/my-website/docs/proxy/logging_spec.md +++ b/docs/my-website/docs/proxy/logging_spec.md @@ -79,6 +79,7 @@ Inherits from `StandardLoggingUserAPIKeyMetadata` and adds: | `response_cost` | `Optional[str]` | Optional response cost | | `additional_headers` | `Optional[StandardLoggingAdditionalHeaders]` | Additional headers | | `batch_models` | `Optional[List[str]]` | Only set for Batches API. Lists the models used for cost calculation | +| `litellm_model_name` | `Optional[str]` | Model name sent in request | ## StandardLoggingModelInformation diff --git a/docs/my-website/docs/proxy/model_discovery.md b/docs/my-website/docs/proxy/model_discovery.md new file mode 100644 index 0000000000..5790dfc520 --- /dev/null +++ b/docs/my-website/docs/proxy/model_discovery.md @@ -0,0 +1,108 @@ +# Model Discovery + +Use this to give users an accurate list of models available behind provider endpoint, when calling `/v1/models` for wildcard models. + +## Supported Models + +- Fireworks AI +- OpenAI +- Gemini +- LiteLLM Proxy +- Topaz +- Anthropic +- XAI +- VLLM +- Vertex AI + +### Usage + +**1. Setup config.yaml** + +```yaml +model_list: + - model_name: xai/* + litellm_params: + model: xai/* + api_key: os.environ/XAI_API_KEY + +litellm_settings: + check_provider_endpoint: true # 👈 Enable checking provider endpoint for wildcard models +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +**3. 
Call `/v1/models`** + +```bash +curl -X GET "http://localhost:4000/v1/models" -H "Authorization: Bearer $LITELLM_KEY" +``` + +Expected response + +```json +{ + "data": [ + { + "id": "xai/grok-2-1212", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-2-vision-1212", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-3-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-3-fast-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-3-mini-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-3-mini-fast-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-vision-beta", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + }, + { + "id": "xai/grok-2-image-1212", + "object": "model", + "created": 1677610602, + "owned_by": "openai" + } + ], + "object": "list" +} +``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/prod.md b/docs/my-website/docs/proxy/prod.md index d3ba2d6224..2d09502d52 100644 --- a/docs/my-website/docs/proxy/prod.md +++ b/docs/my-website/docs/proxy/prod.md @@ -94,15 +94,31 @@ This disables the load_dotenv() functionality, which will automatically load you ## 5. If running LiteLLM on VPC, gracefully handle DB unavailability -This will allow LiteLLM to continue to process requests even if the DB is unavailable. This is better handling for DB unavailability. +When running LiteLLM on a VPC (and inaccessible from the public internet), you can enable graceful degradation so that request processing continues even if the database is temporarily unavailable. + **WARNING: Only do this if you're running LiteLLM on VPC, that cannot be accessed from the public internet.** -```yaml +#### Configuration + +```yaml showLineNumbers title="litellm config.yaml" general_settings: allow_requests_on_db_unavailable: True ``` +#### Expected Behavior + +When `allow_requests_on_db_unavailable` is set to `true`, LiteLLM will handle errors as follows: + +| Type of Error | Expected Behavior | Details | +|---------------|-------------------|----------------| +| Prisma Errors | ✅ Request will be allowed | Covers issues like DB connection resets or rejections from the DB via Prisma, the ORM used by LiteLLM. | +| Httpx Errors | ✅ Request will be allowed | Occurs when the database is unreachable, allowing the request to proceed despite the DB outage. | +| Pod Startup Behavior | ✅ Pods start regardless | LiteLLM Pods will start even if the database is down or unreachable, ensuring higher uptime guarantees for deployments. | +| Health/Readiness Check | ✅ Always returns 200 OK | The /health/readiness endpoint returns a 200 OK status to ensure that pods remain operational even when the database is unavailable. +| LiteLLM Budget Errors or Model Errors | ❌ Request will be blocked | Triggered when the DB is reachable but the authentication token is invalid, lacks access, or exceeds budget limits. | + + ## 6. 
Disable spend_logs & error_logs if not using the LiteLLM UI By default, LiteLLM writes several types of logs to the database: @@ -161,6 +177,50 @@ export LITELLM_SALT_KEY="sk-1234" [**See Code**](https://github.com/BerriAI/litellm/blob/036a6821d588bd36d170713dcf5a72791a694178/litellm/proxy/common_utils/encrypt_decrypt_utils.py#L15) + +## 9. Use `prisma migrate deploy` + +Use this to handle db migrations across LiteLLM versions in production + + + + +```bash +USE_PRISMA_MIGRATE="True" +``` + + + + + +```bash +litellm --use_prisma_migrate +``` + + + + +Benefits: + +The migrate deploy command: + +- **Does not** issue a warning if an already applied migration is missing from migration history +- **Does not** detect drift (production database schema differs from migration history end state - for example, due to a hotfix) +- **Does not** reset the database or generate artifacts (such as Prisma Client) +- **Does not** rely on a shadow database + + +### How does LiteLLM handle DB migrations in production? + +1. A new migration file is written to our `litellm-proxy-extras` package. [See all](https://github.com/BerriAI/litellm/tree/main/litellm-proxy-extras/litellm_proxy_extras/migrations) + +2. The core litellm pip package is bumped to point to the new `litellm-proxy-extras` package. This ensures, older versions of LiteLLM will continue to use the old migrations. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/pyproject.toml#L58) + +3. When you upgrade to a new version of LiteLLM, the migration file is applied to the database. [See code](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/litellm-proxy-extras/litellm_proxy_extras/utils.py#L42) + + + + ## Extras ### Expected Performance in Production @@ -182,94 +242,4 @@ You should only see the following level of details in logs on the proxy server # INFO: 192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK # INFO: 192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK # INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK -``` - - -### Machine Specifications to Deploy LiteLLM - -| Service | Spec | CPUs | Memory | Architecture | Version| -| --- | --- | --- | --- | --- | --- | -| Server | `t2.small`. 
| `1vCPUs` | `8GB` | `x86` | -| Redis Cache | - | - | - | - | 7.0+ Redis Engine| - - -### Reference Kubernetes Deployment YAML - -Reference Kubernetes `deployment.yaml` that was load tested by us - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: litellm-deployment -spec: - replicas: 3 - selector: - matchLabels: - app: litellm - template: - metadata: - labels: - app: litellm - spec: - containers: - - name: litellm-container - image: ghcr.io/berriai/litellm:main-latest - imagePullPolicy: Always - env: - - name: AZURE_API_KEY - value: "d6******" - - name: AZURE_API_BASE - value: "https://ope******" - - name: LITELLM_MASTER_KEY - value: "sk-1234" - - name: DATABASE_URL - value: "po**********" - args: - - "--config" - - "/app/proxy_config.yaml" # Update the path to mount the config file - volumeMounts: # Define volume mount for proxy_config.yaml - - name: config-volume - mountPath: /app - readOnly: true - livenessProbe: - httpGet: - path: /health/liveliness - port: 4000 - initialDelaySeconds: 120 - periodSeconds: 15 - successThreshold: 1 - failureThreshold: 3 - timeoutSeconds: 10 - readinessProbe: - httpGet: - path: /health/readiness - port: 4000 - initialDelaySeconds: 120 - periodSeconds: 15 - successThreshold: 1 - failureThreshold: 3 - timeoutSeconds: 10 - volumes: # Define volume to mount proxy_config.yaml - - name: config-volume - configMap: - name: litellm-config - -``` - - -Reference Kubernetes `service.yaml` that was load tested by us -```yaml -apiVersion: v1 -kind: Service -metadata: - name: litellm-service -spec: - selector: - app: litellm - ports: - - protocol: TCP - port: 4000 - targetPort: 4000 - type: LoadBalancer -``` +``` \ No newline at end of file diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 8dff527ae5..0ce94ab962 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -95,7 +95,14 @@ Use this for for tracking per [user, key, team, etc.](virtual_keys) ### Initialize Budget Metrics on Startup -If you want to initialize the key/team budget metrics on startup, you can set the `prometheus_initialize_budget_metrics` to `true` in the `config.yaml` +If you want litellm to emit the budget metrics for all keys, teams irrespective of whether they are getting requests or not, set `prometheus_initialize_budget_metrics` to `true` in the `config.yaml` + +**How this works:** + +- If the `prometheus_initialize_budget_metrics` is set to `true` + - Every 5 minutes litellm runs a cron job to read all keys, teams from the database + - It then emits the budget metrics for each key, team + - This is used to populate the budget metrics on the `/metrics` endpoint ```yaml litellm_settings: @@ -242,6 +249,19 @@ litellm_settings: | `litellm_redis_fails` | Number of failed redis calls | | `litellm_self_latency` | Histogram latency for successful litellm api call | +#### DB Transaction Queue Health Metrics + +Use these metrics to monitor the health of the DB Transaction Queue. Eg. Monitoring the size of the in-memory and redis buffers. + +| Metric Name | Description | Storage Type | +|-----------------------------------------------------|-----------------------------------------------------------------------------|--------------| +| `litellm_pod_lock_manager_size` | Indicates which pod has the lock to write updates to the database. | Redis | +| `litellm_in_memory_daily_spend_update_queue_size` | Number of items in the in-memory daily spend update queue. 
These are the aggregate spend logs for each user. | In-Memory | +| `litellm_redis_daily_spend_update_queue_size` | Number of items in the Redis daily spend update queue. These are the aggregate spend logs for each user. | Redis | +| `litellm_in_memory_spend_update_queue_size` | In-memory aggregate spend values for keys, users, teams, team members, etc.| In-Memory | +| `litellm_redis_spend_update_queue_size` | Redis aggregate spend values for keys, users, teams, etc. | Redis | + + ## **🔥 LiteLLM Maintained Grafana Dashboards ** @@ -268,6 +288,17 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das +## Add authentication on /metrics endpoint + +**By default /metrics endpoint is unauthenticated.** + +You can opt into running litellm authentication on the /metrics endpoint by setting the following on the config + +```yaml +litellm_settings: + require_auth_for_metrics_endpoint: true +``` + ## FAQ ### What are `_created` vs. `_total` metrics? diff --git a/docs/my-website/docs/proxy/prompt_management.md b/docs/my-website/docs/proxy/prompt_management.md index 980043f455..c09231dd59 100644 --- a/docs/my-website/docs/proxy/prompt_management.md +++ b/docs/my-website/docs/proxy/prompt_management.md @@ -2,7 +2,7 @@ import Image from '@theme/IdealImage'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# [BETA] Prompt Management +# Prompt Management :::info @@ -12,9 +12,10 @@ This feature is currently in beta, and might change unexpectedly. We expect this Run experiments or change the specific model (e.g. from gpt-4o to gpt4o-mini finetune) from your prompt management tool (e.g. Langfuse) instead of making changes in the application. -Supported Integrations: -- [Langfuse](https://langfuse.com/docs/prompts/get-started) -- [Humanloop](../observability/humanloop) +| Supported Integrations | Link | +|------------------------|------| +| Langfuse | [Get Started](https://langfuse.com/docs/prompts/get-started) | +| Humanloop | [Get Started](../observability/humanloop) | ## Quick Start diff --git a/docs/my-website/docs/proxy/release_cycle.md b/docs/my-website/docs/proxy/release_cycle.md index 947a4ae6b3..c5782087f2 100644 --- a/docs/my-website/docs/proxy/release_cycle.md +++ b/docs/my-website/docs/proxy/release_cycle.md @@ -4,9 +4,17 @@ Litellm Proxy has the following release cycle: - `v1.x.x-nightly`: These are releases which pass ci/cd. - `v1.x.x.rc`: These are releases which pass ci/cd + [manual review](https://github.com/BerriAI/litellm/discussions/8495#discussioncomment-12180711). -- `v1.x.x` OR `v1.x.x-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing. +- `v1.x.x:main-stable`: These are releases which pass ci/cd + manual review + 3 days of production testing. -In production, we recommend using the latest `v1.x.x` release. +In production, we recommend using the latest `v1.x.x:main-stable` release. -Follow our release notes [here](https://github.com/BerriAI/litellm/releases). \ No newline at end of file +Follow our release notes [here](https://github.com/BerriAI/litellm/releases). + + +## FAQ + +### Is there a release schedule for LiteLLM stable release? 
+ +Stable releases come out every week (typically Sunday) + diff --git a/docs/my-website/docs/proxy/response_headers.md b/docs/my-website/docs/proxy/response_headers.md index b07f82d780..32f09fab42 100644 --- a/docs/my-website/docs/proxy/response_headers.md +++ b/docs/my-website/docs/proxy/response_headers.md @@ -43,19 +43,19 @@ These headers are useful for clients to understand the current rate limit status | `x-litellm-max-fallbacks` | int | Maximum number of fallback attempts allowed | ## Cost Tracking Headers -| Header | Type | Description | -|--------|------|-------------| -| `x-litellm-response-cost` | float | Cost of the API call | -| `x-litellm-key-spend` | float | Total spend for the API key | +| Header | Type | Description | Available on Pass-Through Endpoints | +|--------|------|-------------|-------------| +| `x-litellm-response-cost` | float | Cost of the API call | | +| `x-litellm-key-spend` | float | Total spend for the API key | ✅ | ## LiteLLM Specific Headers -| Header | Type | Description | -|--------|------|-------------| -| `x-litellm-call-id` | string | Unique identifier for the API call | -| `x-litellm-model-id` | string | Unique identifier for the model used | -| `x-litellm-model-api-base` | string | Base URL of the API endpoint | -| `x-litellm-version` | string | Version of LiteLLM being used | -| `x-litellm-model-group` | string | Model group identifier | +| Header | Type | Description | Available on Pass-Through Endpoints | +|--------|------|-------------|-------------| +| `x-litellm-call-id` | string | Unique identifier for the API call | ✅ | +| `x-litellm-model-id` | string | Unique identifier for the model used | | +| `x-litellm-model-api-base` | string | Base URL of the API endpoint | ✅ | +| `x-litellm-version` | string | Version of LiteLLM being used | | +| `x-litellm-model-group` | string | Model group identifier | | ## Response headers from LLM providers diff --git a/docs/my-website/docs/proxy/self_serve.md b/docs/my-website/docs/proxy/self_serve.md index 604ceee3e5..a1e7c64cd9 100644 --- a/docs/my-website/docs/proxy/self_serve.md +++ b/docs/my-website/docs/proxy/self_serve.md @@ -161,6 +161,83 @@ Here's the available UI roles for a LiteLLM Internal User: - `internal_user`: can login, view/create/delete their own keys, view their spend. **Cannot** add new users. - `internal_user_viewer`: can login, view their own keys, view their own spend. **Cannot** create/delete keys, add new users. +## Auto-add SSO users to teams + +This walks through setting up sso auto-add for **Okta, Google SSO** + +### Okta, Google SSO + +1. Specify the JWT field that contains the team ids, that the user belongs to. + +```yaml +general_settings: + master_key: sk-1234 + litellm_jwtauth: + team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD +``` + +This is assuming your SSO token looks like this. **If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions [here](#debugging-sso-jwt-fields)** + +``` +{ + ..., + "groups": ["team_id_1", "team_id_2"] +} +``` + +2. Create the teams on LiteLLM + +```bash +curl -X POST '/team/new' \ +-H 'Authorization: Bearer ' \ +-H 'Content-Type: application/json' \ +-D '{ + "team_alias": "team_1", + "team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID +}' +``` + +3. 
Test the SSO flow + +Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e) + +### Microsoft Entra ID SSO group assignment + +Follow this [tutorial for auto-adding sso users to teams with Microsoft Entra ID](https://docs.litellm.ai/docs/tutorials/msft_sso) + +### Debugging SSO JWT fields + +If you need to inspect the JWT fields received from your SSO provider by LiteLLM, follow these instructions. This guide walks you through setting up a debug callback to view the JWT data during the SSO process. + + + +
+ +1. Add `/sso/debug/callback` as a redirect URL in your SSO provider + + In your SSO provider's settings, add the following URL as a new redirect (callback) URL: + + ```bash showLineNumbers title="Redirect URL" + http:///sso/debug/callback + ``` + + +2. Navigate to the debug login page on your browser + + Navigate to the following URL on your browser: + + ```bash showLineNumbers title="URL to navigate to" + https:///sso/debug/login + ``` + + This will initiate the standard SSO flow. You will be redirected to your SSO provider's login screen, and after successful authentication, you will be redirected back to LiteLLM's debug callback route. + + +3. View the JWT fields + +Once redirected, you should see a page called "SSO Debug Information". This page displays the JWT fields received from your SSO provider (as shown in the image above) + + ## Advanced ### Setting custom logout URLs @@ -196,40 +273,26 @@ This budget does not apply to keys created under non-default teams. [**Go Here**](./team_budgets.md) -### Auto-add SSO users to teams +### Set default params for new teams -1. Specify the JWT field that contains the team ids, that the user belongs to. +When you connect litellm to your SSO provider, litellm can auto-create teams. Use this to set the default `models`, `max_budget`, `budget_duration` for these auto-created teams. -```yaml -general_settings: - master_key: sk-1234 - litellm_jwtauth: - team_ids_jwt_field: "groups" # 👈 CAN BE ANY FIELD +**How it works** + +1. When litellm fetches `groups` from your SSO provider, it will check if the corresponding group_id exists as a `team_id` in litellm. +2. If the team_id does not exist, litellm will auto-create a team with the default params you've set. +3. If the team_id already exist, litellm will not apply any settings on the team. + +**Usage** + +```yaml showLineNumbers title="Default Params for new teams" +litellm_settings: + default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider + max_budget: 100 # Optional[float], optional): $100 budget for the team + budget_duration: 30d # Optional[str], optional): 30 days budget_duration for the team + models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by the team ``` -This is assuming your SSO token looks like this: -``` -{ - ..., - "groups": ["team_id_1", "team_id_2"] -} -``` - -2. Create the teams on LiteLLM - -```bash -curl -X POST '/team/new' \ --H 'Authorization: Bearer ' \ --H 'Content-Type: application/json' \ --D '{ - "team_alias": "team_1", - "team_id": "team_id_1" # 👈 MUST BE THE SAME AS THE SSO GROUP ID -}' -``` - -3. 
Test the SSO flow - -Here's a walkthrough of [how it works](https://www.loom.com/share/8959be458edf41fd85937452c29a33f3?sid=7ebd6d37-569a-4023-866e-e0cde67cb23e) ### Restrict Users from creating personal keys @@ -241,7 +304,7 @@ This will also prevent users from using their session tokens on the test keys ch ## **All Settings for Self Serve / SSO Flow** -```yaml +```yaml showLineNumbers title="All Settings for Self Serve / SSO Flow" litellm_settings: max_internal_user_budget: 10 # max budget for internal users internal_user_budget_duration: "1mo" # reset every month @@ -251,6 +314,11 @@ litellm_settings: max_budget: 100 # Optional[float], optional): $100 budget for a new SSO sign in user budget_duration: 30d # Optional[str], optional): 30 days budget_duration for a new SSO sign in user models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by a new SSO sign in user + + default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider + max_budget: 100 # Optional[float], optional): $100 budget for the team + budget_duration: 30d # Optional[str], optional): 30 days budget_duration for the team + models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by the team upperbound_key_generate_params: # Upperbound for /key/generate requests when self-serve flow is on diff --git a/docs/my-website/docs/reasoning_content.md b/docs/my-website/docs/reasoning_content.md index 1cce3f0570..12a0f17ba0 100644 --- a/docs/my-website/docs/reasoning_content.md +++ b/docs/my-website/docs/reasoning_content.md @@ -15,14 +15,17 @@ Supported Providers: - Bedrock (Anthropic + Deepseek) (`bedrock/`) - Vertex AI (Anthropic) (`vertexai/`) - OpenRouter (`openrouter/`) +- XAI (`xai/`) +- Google AI Studio (`google/`) +- Vertex AI (`vertex_ai/`) LiteLLM will standardize the `reasoning_content` in the response and `thinking_blocks` in the assistant message. -```python +```python title="Example response from litellm" "message": { ... "reasoning_content": "The capital of France is Paris.", - "thinking_blocks": [ + "thinking_blocks": [ # only returned for Anthropic models { "type": "thinking", "thinking": "The capital of France is Paris.", @@ -37,7 +40,7 @@ LiteLLM will standardize the `reasoning_content` in the response and `thinking_b -```python +```python showLineNumbers from litellm import completion import os @@ -48,7 +51,7 @@ response = completion( messages=[ {"role": "user", "content": "What is the capital of France?"}, ], - thinking={"type": "enabled", "budget_tokens": 1024} # 👈 REQUIRED FOR ANTHROPIC models (on `anthropic/`, `bedrock/`, `vertexai/`) + reasoning_effort="low", ) print(response.choices[0].message.content) ``` @@ -68,7 +71,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "content": "What is the capital of France?" } ], - "thinking": {"type": "enabled", "budget_tokens": 1024} + "reasoning_effort": "low" }' ``` @@ -111,7 +114,7 @@ Here's how to use `thinking` blocks by Anthropic with tool calling. 
-```python +```python showLineNumbers litellm._turn_on_debug() litellm.modify_params = True model = "anthropic/claude-3-7-sonnet-20250219" # works across Anthropic, Bedrock, Vertex AI @@ -150,7 +153,7 @@ response = litellm.completion( messages=messages, tools=tools, tool_choice="auto", # auto is default, but we'll be explicit - thinking={"type": "enabled", "budget_tokens": 1024}, + reasoning_effort="low", ) print("Response\n", response) response_message = response.choices[0].message @@ -198,9 +201,9 @@ if tool_calls: model=model, messages=messages, seed=22, + reasoning_effort="low", # tools=tools, drop_params=True, - thinking={"type": "enabled", "budget_tokens": 1024}, ) # get a new response from the model where it can see the function response print("second response\n", second_response) ``` @@ -210,7 +213,7 @@ if tool_calls: 1. Setup config.yaml -```yaml +```yaml showLineNumbers model_list: - model_name: claude-3-7-sonnet-thinking litellm_params: @@ -224,7 +227,7 @@ model_list: 2. Run proxy -```bash +```bash showLineNumbers litellm --config config.yaml # RUNNING on http://0.0.0.0:4000 @@ -332,7 +335,7 @@ curl http://0.0.0.0:4000/v1/chat/completions \ Set `drop_params=True` to drop the 'thinking' blocks when swapping from Anthropic to Deepseek models. Suggest improvements to this approach [here](https://github.com/BerriAI/litellm/discussions/8927). -```python +```python showLineNumbers litellm.drop_params = True # 👈 EITHER GLOBALLY or per request # or per request @@ -340,7 +343,7 @@ litellm.drop_params = True # 👈 EITHER GLOBALLY or per request response = litellm.completion( model="anthropic/claude-3-7-sonnet-20250219", messages=[{"role": "user", "content": "What is the capital of France?"}], - thinking={"type": "enabled", "budget_tokens": 1024}, + reasoning_effort="low", drop_params=True, ) @@ -348,7 +351,7 @@ response = litellm.completion( response = litellm.completion( model="deepseek/deepseek-chat", messages=[{"role": "user", "content": "What is the capital of France?"}], - thinking={"type": "enabled", "budget_tokens": 1024}, + reasoning_effort="low", drop_params=True, ) ``` @@ -364,3 +367,123 @@ These fields can be accessed via `response.choices[0].message.reasoning_content` - `thinking` - str: The thinking from the model. - `signature` - str: The signature delta from the model. + + +## Pass `thinking` to Anthropic models + +You can also pass the `thinking` parameter to Anthropic models. + + + + +```python showLineNumbers +response = litellm.completion( + model="anthropic/claude-3-7-sonnet-20250219", + messages=[{"role": "user", "content": "What is the capital of France?"}], + thinking={"type": "enabled", "budget_tokens": 1024}, +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "anthropic/claude-3-7-sonnet-20250219", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "thinking": {"type": "enabled", "budget_tokens": 1024} + }' +``` + + + + +## Checking if a model supports reasoning + + + + +Use `litellm.supports_reasoning(model="")` -> returns `True` if model supports reasoning and `False` if not. 
+ +```python showLineNumbers title="litellm.supports_reasoning() usage" +import litellm + +# Example models that support reasoning +assert litellm.supports_reasoning(model="anthropic/claude-3-7-sonnet-20250219") == True +assert litellm.supports_reasoning(model="deepseek/deepseek-chat") == True + +# Example models that do not support reasoning +assert litellm.supports_reasoning(model="openai/gpt-3.5-turbo") == False +``` + + + + +1. Define models that support reasoning in your `config.yaml`. You can optionally add `supports_reasoning: True` to the `model_info` if LiteLLM does not automatically detect it for your custom model. + +```yaml showLineNumbers title="litellm proxy config.yaml" +model_list: + - model_name: claude-3-sonnet-reasoning + litellm_params: + model: anthropic/claude-3-7-sonnet-20250219 + api_key: os.environ/ANTHROPIC_API_KEY + - model_name: deepseek-reasoning + litellm_params: + model: deepseek/deepseek-chat + api_key: os.environ/DEEPSEEK_API_KEY + # Example for a custom model where detection might be needed + - model_name: my-custom-reasoning-model + litellm_params: + model: openai/my-custom-model # Assuming it's OpenAI compatible + api_base: http://localhost:8000 + api_key: fake-key + model_info: + supports_reasoning: True # Explicitly mark as supporting reasoning +``` + +2. Run the proxy server: + +```bash showLineNumbers title="litellm --config config.yaml" +litellm --config config.yaml +``` + +3. Call `/model_group/info` to check if your model supports `reasoning` + +```shell showLineNumbers title="curl /model_group/info" +curl -X 'GET' \ + 'http://localhost:4000/model_group/info' \ + -H 'accept: application/json' \ + -H 'x-api-key: sk-1234' +``` + +Expected Response + +```json showLineNumbers title="response from /model_group/info" +{ + "data": [ + { + "model_group": "claude-3-sonnet-reasoning", + "providers": ["anthropic"], + "mode": "chat", + "supports_reasoning": true, + }, + { + "model_group": "deepseek-reasoning", + "providers": ["deepseek"], + "supports_reasoning": true, + }, + { + "model_group": "my-custom-reasoning-model", + "providers": ["openai"], + "supports_reasoning": true, + } + ] +} +```` + + + + diff --git a/docs/my-website/docs/response_api.md b/docs/my-website/docs/response_api.md index 0604a42586..532f20bc05 100644 --- a/docs/my-website/docs/response_api.md +++ b/docs/my-website/docs/response_api.md @@ -14,22 +14,22 @@ LiteLLM provides a BETA endpoint in the spec of [OpenAI's `/responses` API](http | Fallbacks | ✅ | Works between supported models | | Loadbalancing | ✅ | Works between supported models | | Supported LiteLLM Versions | 1.63.8+ | | -| Supported LLM providers | `openai` | | +| Supported LLM providers | **All LiteLLM supported providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. 
| ## Usage -## Create a model response +### LiteLLM Python SDK - + #### Non-streaming -```python +```python showLineNumbers title="OpenAI Non-streaming Response" import litellm # Non-streaming response response = litellm.responses( - model="gpt-4o", + model="openai/o1-pro", input="Tell me a three sentence bedtime story about a unicorn.", max_output_tokens=100 ) @@ -38,12 +38,12 @@ print(response) ``` #### Streaming -```python +```python showLineNumbers title="OpenAI Streaming Response" import litellm # Streaming response response = litellm.responses( - model="gpt-4o", + model="openai/o1-pro", input="Tell me a three sentence bedtime story about a unicorn.", stream=True ) @@ -53,58 +53,169 @@ for event in response: ``` - -First, add this to your litellm proxy config.yaml: -```yaml -model_list: - - model_name: gpt-4o - litellm_params: - model: openai/gpt-4o - api_key: os.environ/OPENAI_API_KEY -``` - -Start your LiteLLM proxy: -```bash -litellm --config /path/to/config.yaml - -# RUNNING on http://0.0.0.0:4000 -``` - -Then use the OpenAI SDK pointed to your proxy: + #### Non-streaming -```python -from openai import OpenAI +```python showLineNumbers title="Anthropic Non-streaming Response" +import litellm +import os -# Initialize client with your proxy URL -client = OpenAI( - base_url="http://localhost:4000", # Your proxy URL - api_key="your-api-key" # Your proxy API key -) +# Set API key +os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key" # Non-streaming response -response = client.responses.create( - model="gpt-4o", - input="Tell me a three sentence bedtime story about a unicorn." +response = litellm.responses( + model="anthropic/claude-3-5-sonnet-20240620", + input="Tell me a three sentence bedtime story about a unicorn.", + max_output_tokens=100 ) print(response) ``` #### Streaming -```python -from openai import OpenAI +```python showLineNumbers title="Anthropic Streaming Response" +import litellm +import os -# Initialize client with your proxy URL -client = OpenAI( - base_url="http://localhost:4000", # Your proxy URL - api_key="your-api-key" # Your proxy API key -) +# Set API key +os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-api-key" # Streaming response -response = client.responses.create( - model="gpt-4o", +response = litellm.responses( + model="anthropic/claude-3-5-sonnet-20240620", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +#### Non-streaming +```python showLineNumbers title="Vertex AI Non-streaming Response" +import litellm +import os + +# Set credentials - Vertex AI uses application default credentials +# Run 'gcloud auth application-default login' to authenticate +os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +# Non-streaming response +response = litellm.responses( + model="vertex_ai/gemini-1.5-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + max_output_tokens=100 +) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="Vertex AI Streaming Response" +import litellm +import os + +# Set credentials - Vertex AI uses application default credentials +# Run 'gcloud auth application-default login' to authenticate +os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +# Streaming response +response = litellm.responses( + model="vertex_ai/gemini-1.5-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + 
stream=True +) + +for event in response: + print(event) +``` + + + + + +#### Non-streaming +```python showLineNumbers title="AWS Bedrock Non-streaming Response" +import litellm +import os + +# Set AWS credentials +os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key" +os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region + +# Non-streaming response +response = litellm.responses( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + input="Tell me a three sentence bedtime story about a unicorn.", + max_output_tokens=100 +) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="AWS Bedrock Streaming Response" +import litellm +import os + +# Set AWS credentials +os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key-id" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-access-key" +os.environ["AWS_REGION_NAME"] = "us-west-2" # or your AWS region + +# Streaming response +response = litellm.responses( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +#### Non-streaming +```python showLineNumbers title="Google AI Studio Non-streaming Response" +import litellm +import os + +# Set API key for Google AI Studio +os.environ["GEMINI_API_KEY"] = "your-gemini-api-key" + +# Non-streaming response +response = litellm.responses( + model="gemini/gemini-1.5-flash", + input="Tell me a three sentence bedtime story about a unicorn.", + max_output_tokens=100 +) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="Google AI Studio Streaming Response" +import litellm +import os + +# Set API key for Google AI Studio +os.environ["GEMINI_API_KEY"] = "your-gemini-api-key" + +# Streaming response +response = litellm.responses( + model="gemini/gemini-1.5-flash", input="Tell me a three sentence bedtime story about a unicorn.", stream=True ) @@ -115,3 +226,408 @@ for event in response: + +### LiteLLM Proxy with OpenAI SDK + +First, set up and start your LiteLLM proxy server. + +```bash title="Start LiteLLM Proxy Server" +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="OpenAI Proxy Configuration" +model_list: + - model_name: openai/o1-pro + litellm_params: + model: openai/o1-pro + api_key: os.environ/OPENAI_API_KEY +``` + +#### Non-streaming +```python showLineNumbers title="OpenAI Proxy Non-streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="openai/o1-pro", + input="Tell me a three sentence bedtime story about a unicorn." 
+) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="OpenAI Proxy Streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="openai/o1-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="Anthropic Proxy Configuration" +model_list: + - model_name: anthropic/claude-3-5-sonnet-20240620 + litellm_params: + model: anthropic/claude-3-5-sonnet-20240620 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +#### Non-streaming +```python showLineNumbers title="Anthropic Proxy Non-streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="anthropic/claude-3-5-sonnet-20240620", + input="Tell me a three sentence bedtime story about a unicorn." +) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="Anthropic Proxy Streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="anthropic/claude-3-5-sonnet-20240620", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="Vertex AI Proxy Configuration" +model_list: + - model_name: vertex_ai/gemini-1.5-pro + litellm_params: + model: vertex_ai/gemini-1.5-pro + vertex_project: your-gcp-project-id + vertex_location: us-central1 +``` + +#### Non-streaming +```python showLineNumbers title="Vertex AI Proxy Non-streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="vertex_ai/gemini-1.5-pro", + input="Tell me a three sentence bedtime story about a unicorn." 
+) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="Vertex AI Proxy Streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="vertex_ai/gemini-1.5-pro", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="AWS Bedrock Proxy Configuration" +model_list: + - model_name: bedrock/anthropic.claude-3-sonnet-20240229-v1:0 + litellm_params: + model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 +``` + +#### Non-streaming +```python showLineNumbers title="AWS Bedrock Proxy Non-streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + input="Tell me a three sentence bedtime story about a unicorn." +) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="AWS Bedrock Proxy Streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + + +First, add this to your litellm proxy config.yaml: +```yaml showLineNumbers title="Google AI Studio Proxy Configuration" +model_list: + - model_name: gemini/gemini-1.5-flash + litellm_params: + model: gemini/gemini-1.5-flash + api_key: os.environ/GEMINI_API_KEY +``` + +#### Non-streaming +```python showLineNumbers title="Google AI Studio Proxy Non-streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.responses.create( + model="gemini/gemini-1.5-flash", + input="Tell me a three sentence bedtime story about a unicorn." 
+) + +print(response) +``` + +#### Streaming +```python showLineNumbers title="Google AI Studio Proxy Streaming Response" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-api-key" # Your proxy API key +) + +# Streaming response +response = client.responses.create( + model="gemini/gemini-1.5-flash", + input="Tell me a three sentence bedtime story about a unicorn.", + stream=True +) + +for event in response: + print(event) +``` + + + + +## Supported Responses API Parameters + +| Provider | Supported Parameters | +|----------|---------------------| +| `openai` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) | +| `azure` | [All Responses API parameters are supported](https://github.com/BerriAI/litellm/blob/7c3df984da8e4dff9201e4c5353fdc7a2b441831/litellm/llms/openai/responses/transformation.py#L23) | +| `anthropic` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | +| `bedrock` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | +| `gemini` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | +| `vertex_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | +| `azure_ai` | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | +| All other llm api providers | [See supported parameters here](https://github.com/BerriAI/litellm/blob/f39d9178868662746f159d5ef642c7f34f9bfe5f/litellm/responses/litellm_completion_transformation/transformation.py#L57) | + +## Load Balancing with Routing Affinity + +When using the Responses API with multiple deployments of the same model (e.g., multiple Azure OpenAI endpoints), LiteLLM provides routing affinity for conversations. This ensures that follow-up requests using a `previous_response_id` are routed to the same deployment that generated the original response. 
+ + +#### Example Usage + + + + +```python showLineNumbers title="Python SDK with Routing Affinity" +import litellm + +# Set up router with multiple deployments of the same model +router = litellm.Router( + model_list=[ + { + "model_name": "azure-gpt4-turbo", + "litellm_params": { + "model": "azure/gpt-4-turbo", + "api_key": "your-api-key-1", + "api_version": "2024-06-01", + "api_base": "https://endpoint1.openai.azure.com", + }, + }, + { + "model_name": "azure-gpt4-turbo", + "litellm_params": { + "model": "azure/gpt-4-turbo", + "api_key": "your-api-key-2", + "api_version": "2024-06-01", + "api_base": "https://endpoint2.openai.azure.com", + }, + }, + ], + optional_pre_call_checks=["responses_api_deployment_check"], +) + +# Initial request +response = await router.aresponses( + model="azure-gpt4-turbo", + input="Hello, who are you?", + truncation="auto", +) + +# Store the response ID +response_id = response.id + +# Follow-up request - will be automatically routed to the same deployment +follow_up = await router.aresponses( + model="azure-gpt4-turbo", + input="Tell me more about yourself", + truncation="auto", + previous_response_id=response_id # This ensures routing to the same deployment +) +``` + + + + +#### 1. Setup routing affinity on proxy config.yaml + +To enable routing affinity for Responses API in your LiteLLM proxy, set `optional_pre_call_checks: ["responses_api_deployment_check"]` in your proxy config.yaml. + +```yaml showLineNumbers title="config.yaml with Responses API Routing Affinity" +model_list: + - model_name: azure-gpt4-turbo + litellm_params: + model: azure/gpt-4-turbo + api_key: your-api-key-1 + api_version: 2024-06-01 + api_base: https://endpoint1.openai.azure.com + - model_name: azure-gpt4-turbo + litellm_params: + model: azure/gpt-4-turbo + api_key: your-api-key-2 + api_version: 2024-06-01 + api_base: https://endpoint2.openai.azure.com + +router_settings: + optional_pre_call_checks: ["responses_api_deployment_check"] +``` + +#### 2. Use the OpenAI Python SDK to make requests to LiteLLM Proxy + +```python showLineNumbers title="OpenAI Client with Proxy Server" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-api-key" +) + +# Initial request +response = client.responses.create( + model="azure-gpt4-turbo", + input="Hello, who are you?" 
+) + +response_id = response.id + +# Follow-up request - will be automatically routed to the same deployment +follow_up = client.responses.create( + model="azure-gpt4-turbo", + input="Tell me more about yourself", + previous_response_id=response_id # This ensures routing to the same deployment +) +``` + + + diff --git a/docs/my-website/docs/set_keys.md b/docs/my-website/docs/set_keys.md index 3a5ff08d63..693cf5f7f4 100644 --- a/docs/my-website/docs/set_keys.md +++ b/docs/my-website/docs/set_keys.md @@ -188,7 +188,13 @@ Currently implemented for: - OpenAI (if OPENAI_API_KEY is set) - Fireworks AI (if FIREWORKS_AI_API_KEY is set) - LiteLLM Proxy (if LITELLM_PROXY_API_KEY is set) +- Gemini (if GEMINI_API_KEY is set) +- XAI (if XAI_API_KEY is set) +- Anthropic (if ANTHROPIC_API_KEY is set) +You can also specify a custom provider to check: + +**All providers**: ```python from litellm import get_valid_models @@ -196,6 +202,14 @@ valid_models = get_valid_models(check_provider_endpoint=True) print(valid_models) ``` +**Specific provider**: +```python +from litellm import get_valid_models + +valid_models = get_valid_models(check_provider_endpoint=True, custom_llm_provider="openai") +print(valid_models) +``` + ### `validate_environment(model: str)` This helper tells you if you have all the required environment variables for a model, and if not - what's missing. diff --git a/docs/my-website/docs/tutorials/msft_sso.md b/docs/my-website/docs/tutorials/msft_sso.md new file mode 100644 index 0000000000..f7ad6440f2 --- /dev/null +++ b/docs/my-website/docs/tutorials/msft_sso.md @@ -0,0 +1,162 @@ +import Image from '@theme/IdealImage'; + +# Microsoft SSO: Sync Groups, Members with LiteLLM + +Sync Microsoft SSO Groups, Members with LiteLLM Teams. + + + +
+
+ + +## Prerequisites + +- An Azure Entra ID account with administrative access +- A LiteLLM Enterprise App set up in your Azure Portal +- Access to Microsoft Entra ID (Azure AD) + + +## Overview of this tutorial + +1. Auto-Create Entra ID Groups on LiteLLM Teams +2. Sync Entra ID Team Memberships +3. Set default params for new teams and users auto-created on LiteLLM + +## 1. Auto-Create Entra ID Groups on LiteLLM Teams + +In this step, our goal is to have LiteLLM automatically create a new team on the LiteLLM DB when a new group is added to the LiteLLM Enterprise App on Azure Entra ID. + +### 1.1 Create a new group in Entra ID + + +Navigate to [your Azure Portal](https://portal.azure.com/) > Groups > New Group. Create a new group. + + + +### 1.2 Assign the group to your LiteLLM Enterprise App + +On your Azure Portal, navigate to `Enterprise Applications` > Select your litellm app + + 
+
+ +Once you've selected your litellm app, click on `Users and Groups` > `Add user/group` + + + +
+ +Now select the group you created in step 1.1 and add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in. + + + + +### 1.3 Sign in to LiteLLM UI via SSO + +Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign-in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID. + + + +### 1.4 Check the new team on LiteLLM UI + +On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM. + + + +#### How this works + +When an SSO user signs in to LiteLLM: +- LiteLLM automatically fetches the Groups under the LiteLLM Enterprise App +- It finds the Production LLM Evals Group assigned to the LiteLLM Enterprise App +- LiteLLM checks if this group's ID exists in the LiteLLM Teams Table +- Since the ID doesn't exist, LiteLLM automatically creates a new team with: + - Name: Production LLM Evals Group + - ID: Same as the Entra ID group's ID + +## 2. Sync Entra ID Team Memberships + +In this step, we will have LiteLLM automatically add a user to the `Production LLM Evals` Team on the LiteLLM DB when a new user is added to the `Production LLM Evals` Group in Entra ID. + +### 2.1 Navigate to the `Production LLM Evals` Group in Entra ID + +Navigate to the `Production LLM Evals` Group in Entra ID. + + + + +### 2.2 Add a member to the group in Entra ID + +Select `Members` > `Add members`. + +In this step, add the user you want to add to the `Production LLM Evals` Team. + + + + + +### 2.3 Sign in as the new user on LiteLLM UI + +Sign in as the new user on the LiteLLM UI. You should be redirected to the Entra ID SSO page. This SSO sign-in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID. During this step, LiteLLM syncs its teams and team members with what is available from Entra ID. + + + + + +### 2.4 Check the team membership on LiteLLM UI + +On the LiteLLM UI, navigate to `Teams`. Since you are now a member of the `Production LLM Evals Group` in Entra ID, you should see the `Production LLM Evals Group` team on the LiteLLM UI. + + + +## 3. Set default params for new teams auto-created on LiteLLM + +Since litellm auto-creates a new team on the LiteLLM DB when a new group is added to the LiteLLM Enterprise App on Azure Entra ID, we can set default params for these auto-created teams. + +This allows you to set a default budget, models, etc. for newly created teams. + +### 3.1 Set `default_team_params` on litellm + +Navigate to your litellm config file and set the following params: + +```yaml showLineNumbers title="litellm config with default_team_params" +litellm_settings: + default_team_params: # Default Params to apply when litellm auto creates a team from SSO IDP provider + max_budget: 100 # Optional[float], optional): $100 budget for the team + budget_duration: 30d # Optional[str], optional): 30 days budget_duration for the team + models: ["gpt-3.5-turbo"] # Optional[List[str]], optional): models to be used by the team +``` + +### 3.2 Auto-create a new team on LiteLLM + +- In this step, add a new group to the LiteLLM Enterprise App on Azure Entra ID (like we did in step 1.1). We will call this group `Default LiteLLM Prod Team` on Azure Entra ID. 
+- Start litellm proxy server with your config +- Sign into LiteLLM UI via SSO +- Navigate to `Teams` and you should see the new team `Default LiteLLM Prod Team` auto-created on LiteLLM +- Note LiteLLM will set the default params for this new team. + + + + +## Video Walkthrough + +This walks through setting up sso auto-add for **Microsoft Entra ID** + +Follow along this video for a walkthrough of how to set this up with Microsoft Entra ID + + + + + + + + + + + + + + + diff --git a/docs/my-website/docs/tutorials/openai_codex.md b/docs/my-website/docs/tutorials/openai_codex.md new file mode 100644 index 0000000000..bb5af956b0 --- /dev/null +++ b/docs/my-website/docs/tutorials/openai_codex.md @@ -0,0 +1,146 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Using LiteLLM with OpenAI Codex + +This guide walks you through connecting OpenAI Codex to LiteLLM. Using LiteLLM with Codex allows teams to: +- Access 100+ LLMs through the Codex interface +- Use powerful models like Gemini through a familiar interface +- Track spend and usage with LiteLLM's built-in analytics +- Control model access with virtual keys + + + +## Quickstart + +:::info + +Requires LiteLLM v1.66.3.dev5 and higher + +::: + + +Make sure to set up LiteLLM with the [LiteLLM Getting Started Guide](../proxy/docker_quick_start.md). + +## 1. Install OpenAI Codex + +Install the OpenAI Codex CLI tool globally using npm: + + + + +```bash showLineNumbers +npm i -g @openai/codex +``` + + + + +```bash showLineNumbers +yarn global add @openai/codex +``` + + + + +## 2. Start LiteLLM Proxy + + + + +```bash showLineNumbers +docker run \ + -v $(pwd)/litellm_config.yaml:/app/config.yaml \ + -p 4000:4000 \ + ghcr.io/berriai/litellm:main-latest \ + --config /app/config.yaml +``` + + + + +```bash showLineNumbers +litellm --config /path/to/config.yaml +``` + + + + +LiteLLM should now be running on [http://localhost:4000](http://localhost:4000) + +## 3. Configure LiteLLM for Model Routing + +Ensure your LiteLLM Proxy is properly configured to route to your desired models. Create a `litellm_config.yaml` file with the following content: + +```yaml showLineNumbers +model_list: + - model_name: o3-mini + litellm_params: + model: openai/o3-mini + api_key: os.environ/OPENAI_API_KEY + - model_name: claude-3-7-sonnet-latest + litellm_params: + model: anthropic/claude-3-7-sonnet-latest + api_key: os.environ/ANTHROPIC_API_KEY + - model_name: gemini-2.0-flash + litellm_params: + model: gemini/gemini-2.0-flash + api_key: os.environ/GEMINI_API_KEY + +litellm_settings: + drop_params: true +``` + +This configuration enables routing to specific OpenAI, Anthropic, and Gemini models with explicit names. + +## 4. Configure Codex to Use LiteLLM Proxy + +Set the required environment variables to point Codex to your LiteLLM Proxy: + +```bash +# Point to your LiteLLM Proxy server +export OPENAI_BASE_URL=http://0.0.0.0:4000 + +# Use your LiteLLM API key (if you've set up authentication) +export OPENAI_API_KEY="sk-1234" +``` + +## 5. Run Codex with Gemini + +With everything configured, you can now run Codex with Gemini: + +```bash showLineNumbers +codex --model gemini-2.0-flash --full-auto +``` + + + +The `--full-auto` flag allows Codex to automatically generate code without additional prompting. + +## 6. 
Advanced Options + +### Using Different Models + +You can use any model configured in your LiteLLM proxy: + +```bash +# Use Claude models +codex --model claude-3-7-sonnet-latest + +# Use Google AI Studio Gemini models +codex --model gemini/gemini-2.0-flash +``` + +## Troubleshooting + +- If you encounter connection issues, ensure your LiteLLM Proxy is running and accessible at the specified URL +- Verify your LiteLLM API key is valid if you're using authentication +- Check that your model routing configuration is correct +- For model-specific errors, ensure the model is properly configured in your LiteLLM setup + +## Additional Resources + +- [LiteLLM Docker Quick Start Guide](../proxy/docker_quick_start.md) +- [OpenAI Codex GitHub Repository](https://github.com/openai/codex) +- [LiteLLM Virtual Keys and Authentication](../proxy/virtual_keys.md) diff --git a/docs/my-website/docs/tutorials/openweb_ui.md b/docs/my-website/docs/tutorials/openweb_ui.md index ab1e2e121e..b2c1204069 100644 --- a/docs/my-website/docs/tutorials/openweb_ui.md +++ b/docs/my-website/docs/tutorials/openweb_ui.md @@ -98,6 +98,5 @@ On the models dropdown select `thinking-anthropic-claude-3-7-sonnet` - - - +## Additional Resources +- Running LiteLLM and OpenWebUI on Windows Localhost: A Comprehensive Guide [https://www.tanyongsheng.com/note/running-litellm-and-openwebui-on-windows-localhost-a-comprehensive-guide/](https://www.tanyongsheng.com/note/running-litellm-and-openwebui-on-windows-localhost-a-comprehensive-guide/) \ No newline at end of file diff --git a/docs/my-website/docs/tutorials/prompt_caching.md b/docs/my-website/docs/tutorials/prompt_caching.md new file mode 100644 index 0000000000..bf3d5a8dda --- /dev/null +++ b/docs/my-website/docs/tutorials/prompt_caching.md @@ -0,0 +1,128 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Auto-Inject Prompt Caching Checkpoints + +Reduce costs by up to 90% by using LiteLLM to auto-inject prompt caching checkpoints. + + + + +## How it works + +LiteLLM can automatically inject prompt caching checkpoints into your requests to LLM providers. This allows: + +- **Cost Reduction**: Long, static parts of your prompts can be cached to avoid repeated processing +- **No need to modify your application code**: You can configure the auto-caching behavior in the LiteLLM UI or in the `litellm config.yaml` file. + +## Configuration + +You need to specify `cache_control_injection_points` in your model configuration. This tells LiteLLM: +1. Where to add the caching directive (`location`) +2. Which message to target (`role`) + +LiteLLM will then automatically add a `cache_control` directive to the specified messages in your requests: + +```json +"cache_control": { + "type": "ephemeral" +} +``` + +## Usage Example + +In this example, we'll configure caching for system messages by adding the directive to all messages with `role: system`. + + + + +```yaml showLineNumbers title="litellm config.yaml" +model_list: + - model_name: anthropic-auto-inject-cache-system-message + litellm_params: + model: anthropic/claude-3-5-sonnet-20240620 + api_key: os.environ/ANTHROPIC_API_KEY + cache_control_injection_points: + - location: message + role: system +``` + + + + +On the LiteLLM UI, you can specify the `cache_control_injection_points` in the `Advanced Settings` tab when adding a model. + + + + + + +## Detailed Example + +### 1. 
Original Request to LiteLLM + +In this example, we have a very long, static system message and a varying user message. It's efficient to cache the system message since it rarely changes. + +```json +{ + "messages": [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question." + } + ] + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is the main topic of this legal document?" + } + ] + } + ] +} +``` + +### 2. LiteLLM's Modified Request + +LiteLLM auto-injects the caching directive into the system message based on our configuration: + +```json +{ + "messages": [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are a helpful assistant. This is a set of very long instructions that you will follow. Here is a legal document that you will use to answer the user's question.", + "cache_control": {"type": "ephemeral"} + } + ] + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is the main topic of this legal document?" + } + ] + } + ] +} +``` + +When the model provider processes this request, it will recognize the caching directive and only process the system message once, caching it for subsequent requests. + + + + + + diff --git a/docs/my-website/docs/tutorials/scim_litellm.md b/docs/my-website/docs/tutorials/scim_litellm.md new file mode 100644 index 0000000000..c744abe4b4 --- /dev/null +++ b/docs/my-website/docs/tutorials/scim_litellm.md @@ -0,0 +1,74 @@ + +import Image from '@theme/IdealImage'; + +# SCIM with LiteLLM + +Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning on LiteLLM. + + +This tutorial will walk you through the steps to connect your IDP to LiteLLM SCIM Endpoints. + +### Supported SSO Providers for SCIM +Below is a list of supported SSO providers for connecting to LiteLLM SCIM Endpoints. +- Microsoft Entra ID (Azure AD) +- Okta +- Google Workspace +- OneLogin +- Keycloak +- Auth0 + + +## 1. Get your SCIM Tenant URL and Bearer Token + +On LiteLLM, navigate to the Settings > Admin Settings > SCIM. On this page you will create a SCIM Token, this allows your IDP to authenticate to litellm `/scim` endpoints. + + + +## 2. Connect your IDP to LiteLLM SCIM Endpoints + +On your IDP provider, navigate to your SSO application and select `Provisioning` > `New provisioning configuration`. + +On this page, paste in your litellm scim tenant url and bearer token. + +Once this is pasted in, click on `Test Connection` to ensure your IDP can authenticate to the LiteLLM SCIM endpoints. + + + + +## 3. Test SCIM Connection + +### 3.1 Assign the group to your LiteLLM Enterprise App + +On your IDP Portal, navigate to `Enterprise Applications` > Select your litellm app + + + +
+
+ +Once you've selected your litellm app, click on `Users and Groups` > `Add user/group` + + + +
+
+Now select the group you created in step 1.1 and add it to the LiteLLM Enterprise App. At this point we have added `Production LLM Evals Group` to the LiteLLM Enterprise App. The next step is having LiteLLM automatically create the `Production LLM Evals Group` on the LiteLLM DB when a new user signs in.
+
+### 3.2 Sign in to LiteLLM UI via SSO
+
+Sign into the LiteLLM UI via SSO. You should be redirected to the Entra ID SSO page. This SSO sign-in flow will trigger LiteLLM to fetch the latest Groups and Members from Azure Entra ID.
+
+### 3.3 Check the new team on LiteLLM UI
+
+On the LiteLLM UI, navigate to `Teams`. You should see the new team `Production LLM Evals Group` auto-created on LiteLLM.
+
diff --git a/docs/my-website/docs/tutorials/tag_management.md b/docs/my-website/docs/tutorials/tag_management.md
new file mode 100644
index 0000000000..9b00db47d1
--- /dev/null
+++ b/docs/my-website/docs/tutorials/tag_management.md
@@ -0,0 +1,145 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# [Beta] Routing based on request metadata
+
+Create routing rules based on request metadata.
+
+## Setup
+
+Add the following to your litellm proxy config yaml file.
+
+```yaml showLineNumbers title="litellm proxy config.yaml"
+router_settings:
+  enable_tag_filtering: True # 👈 Key Change
+```
+
+## 1. Create a tag
+
+On the LiteLLM UI, navigate to Experimental > Tag Management > Create Tag.
+
+Create a tag called `private-data` and only select the allowed models for requests with this tag. Once created, you will see the tag in the Tag Management page.
+
+## 2. Test Tag Routing
+
+Now we will test the tag-based routing rules.
+
+### 2.1 Invalid model
+
+This request will fail since we send `tags=private-data` but the model `gpt-4o` is not in the allowed models for the `private-data` tag.
+
+ +Here is an example sending the same request using the OpenAI Python SDK. + + + +```python showLineNumbers +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000/v1/" +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "user", "content": "Hello, how are you?"} + ], + extra_body={ + "tags": "private-data" + } +) +``` + + + + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ], + "tags": "private-data" +}' +``` + + + + +
+ +### 2.2 Valid model + +This request will succeed since we send `tags=private-data` and the model `us.anthropic.claude-3-7-sonnet-20250219-v1:0` is in the allowed models for the `private-data` tag. + + + +Here is an example sending the same request using the OpenAI Python SDK. + + + + +```python showLineNumbers +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:4000/v1/" +) + +response = client.chat.completions.create( + model="us.anthropic.claude-3-7-sonnet-20250219-v1:0", + messages=[ + {"role": "user", "content": "Hello, how are you?"} + ], + extra_body={ + "tags": "private-data" + } +) +``` + + + + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "us.anthropic.claude-3-7-sonnet-20250219-v1:0", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ], + "tags": "private-data" +}' +``` + + + + + + +## Additional Tag Features +- [Sending tags in request headers](https://docs.litellm.ai/docs/proxy/tag_routing#calling-via-request-header) +- [Tag based routing](https://docs.litellm.ai/docs/proxy/tag_routing) +- [Track spend per tag](cost_tracking#-custom-tags) +- [Setup Budgets per Virtual Key, Team](users) + diff --git a/docs/my-website/img/arize.png b/docs/my-website/img/arize.png new file mode 100644 index 0000000000..45d6dacda9 Binary files /dev/null and b/docs/my-website/img/arize.png differ diff --git a/docs/my-website/img/auto_prompt_caching.png b/docs/my-website/img/auto_prompt_caching.png new file mode 100644 index 0000000000..6cd3785512 Binary files /dev/null and b/docs/my-website/img/auto_prompt_caching.png differ diff --git a/docs/my-website/img/custom_prompt_management.png b/docs/my-website/img/custom_prompt_management.png new file mode 100644 index 0000000000..2c96e0d116 Binary files /dev/null and b/docs/my-website/img/custom_prompt_management.png differ diff --git a/docs/my-website/img/deadlock_fix_1.png b/docs/my-website/img/deadlock_fix_1.png new file mode 100644 index 0000000000..df651f440c Binary files /dev/null and b/docs/my-website/img/deadlock_fix_1.png differ diff --git a/docs/my-website/img/deadlock_fix_2.png b/docs/my-website/img/deadlock_fix_2.png new file mode 100644 index 0000000000..0f139d84e5 Binary files /dev/null and b/docs/my-website/img/deadlock_fix_2.png differ diff --git a/docs/my-website/img/debug_sso.png b/docs/my-website/img/debug_sso.png new file mode 100644 index 0000000000..d7dde36892 Binary files /dev/null and b/docs/my-website/img/debug_sso.png differ diff --git a/docs/my-website/img/enterprise_vs_oss.png b/docs/my-website/img/enterprise_vs_oss.png new file mode 100644 index 0000000000..f2b58fbc14 Binary files /dev/null and b/docs/my-website/img/enterprise_vs_oss.png differ diff --git a/docs/my-website/img/entra_create_team.png b/docs/my-website/img/entra_create_team.png new file mode 100644 index 0000000000..223a897d87 Binary files /dev/null and b/docs/my-website/img/entra_create_team.png differ diff --git a/docs/my-website/img/hf_filter_inference_providers.png b/docs/my-website/img/hf_filter_inference_providers.png new file mode 100644 index 0000000000..d4c7188919 Binary files /dev/null and b/docs/my-website/img/hf_filter_inference_providers.png differ diff --git a/docs/my-website/img/image_handling.png b/docs/my-website/img/image_handling.png new file mode 100644 index 0000000000..bd56206911 Binary files /dev/null and b/docs/my-website/img/image_handling.png 
differ diff --git a/docs/my-website/img/litellm_codex.gif b/docs/my-website/img/litellm_codex.gif new file mode 100644 index 0000000000..04332b5053 Binary files /dev/null and b/docs/my-website/img/litellm_codex.gif differ diff --git a/docs/my-website/img/litellm_entra_id.png b/docs/my-website/img/litellm_entra_id.png new file mode 100644 index 0000000000..4cfbd0747f Binary files /dev/null and b/docs/my-website/img/litellm_entra_id.png differ diff --git a/docs/my-website/img/litellm_mcp.png b/docs/my-website/img/litellm_mcp.png new file mode 100644 index 0000000000..cef822eeb2 Binary files /dev/null and b/docs/my-website/img/litellm_mcp.png differ diff --git a/docs/my-website/img/managed_files_arch.png b/docs/my-website/img/managed_files_arch.png new file mode 100644 index 0000000000..e49c47334d Binary files /dev/null and b/docs/my-website/img/managed_files_arch.png differ diff --git a/docs/my-website/img/mcp_2.png b/docs/my-website/img/mcp_2.png new file mode 100644 index 0000000000..98e063efc5 Binary files /dev/null and b/docs/my-website/img/mcp_2.png differ diff --git a/docs/my-website/img/mcp_ui.png b/docs/my-website/img/mcp_ui.png new file mode 100644 index 0000000000..6731fba71b Binary files /dev/null and b/docs/my-website/img/mcp_ui.png differ diff --git a/docs/my-website/img/msft_auto_team.png b/docs/my-website/img/msft_auto_team.png new file mode 100644 index 0000000000..a50c5bbfbd Binary files /dev/null and b/docs/my-website/img/msft_auto_team.png differ diff --git a/docs/my-website/img/msft_default_settings.png b/docs/my-website/img/msft_default_settings.png new file mode 100644 index 0000000000..0caa60b1f5 Binary files /dev/null and b/docs/my-website/img/msft_default_settings.png differ diff --git a/docs/my-website/img/msft_enterprise_app.png b/docs/my-website/img/msft_enterprise_app.png new file mode 100644 index 0000000000..0a8c849a5c Binary files /dev/null and b/docs/my-website/img/msft_enterprise_app.png differ diff --git a/docs/my-website/img/msft_enterprise_assign_group.png b/docs/my-website/img/msft_enterprise_assign_group.png new file mode 100644 index 0000000000..d43e1c6684 Binary files /dev/null and b/docs/my-website/img/msft_enterprise_assign_group.png differ diff --git a/docs/my-website/img/msft_enterprise_select_group.png b/docs/my-website/img/msft_enterprise_select_group.png new file mode 100644 index 0000000000..e49032db9f Binary files /dev/null and b/docs/my-website/img/msft_enterprise_select_group.png differ diff --git a/docs/my-website/img/msft_member_1.png b/docs/my-website/img/msft_member_1.png new file mode 100644 index 0000000000..2fe627f773 Binary files /dev/null and b/docs/my-website/img/msft_member_1.png differ diff --git a/docs/my-website/img/msft_member_2.png b/docs/my-website/img/msft_member_2.png new file mode 100644 index 0000000000..9757aa9cea Binary files /dev/null and b/docs/my-website/img/msft_member_2.png differ diff --git a/docs/my-website/img/msft_member_3.png b/docs/my-website/img/msft_member_3.png new file mode 100644 index 0000000000..783a4a1dd8 Binary files /dev/null and b/docs/my-website/img/msft_member_3.png differ diff --git a/docs/my-website/img/msft_sso_sign_in.png b/docs/my-website/img/msft_sso_sign_in.png new file mode 100644 index 0000000000..43c5173295 Binary files /dev/null and b/docs/my-website/img/msft_sso_sign_in.png differ diff --git a/docs/my-website/img/prevent_deadlocks.jpg b/docs/my-website/img/prevent_deadlocks.jpg new file mode 100644 index 0000000000..2807f327d1 Binary files /dev/null and 
b/docs/my-website/img/prevent_deadlocks.jpg differ diff --git a/docs/my-website/img/realtime_api.png b/docs/my-website/img/realtime_api.png new file mode 100644 index 0000000000..798525278c Binary files /dev/null and b/docs/my-website/img/realtime_api.png differ diff --git a/docs/my-website/img/release_notes/chat_metrics.png b/docs/my-website/img/release_notes/chat_metrics.png new file mode 100644 index 0000000000..2e45392cd6 Binary files /dev/null and b/docs/my-website/img/release_notes/chat_metrics.png differ diff --git a/docs/my-website/img/release_notes/mcp_ui.png b/docs/my-website/img/release_notes/mcp_ui.png new file mode 100644 index 0000000000..8f4cd4ea19 Binary files /dev/null and b/docs/my-website/img/release_notes/mcp_ui.png differ diff --git a/docs/my-website/img/release_notes/new_activity_tab.png b/docs/my-website/img/release_notes/new_activity_tab.png new file mode 100644 index 0000000000..e8cea22a90 Binary files /dev/null and b/docs/my-website/img/release_notes/new_activity_tab.png differ diff --git a/docs/my-website/img/release_notes/new_tag_usage.png b/docs/my-website/img/release_notes/new_tag_usage.png new file mode 100644 index 0000000000..4188cbc245 Binary files /dev/null and b/docs/my-website/img/release_notes/new_tag_usage.png differ diff --git a/docs/my-website/img/release_notes/new_team_usage.png b/docs/my-website/img/release_notes/new_team_usage.png new file mode 100644 index 0000000000..5fea2506d9 Binary files /dev/null and b/docs/my-website/img/release_notes/new_team_usage.png differ diff --git a/docs/my-website/img/release_notes/new_team_usage_highlight.jpg b/docs/my-website/img/release_notes/new_team_usage_highlight.jpg new file mode 100644 index 0000000000..05dbf4b918 Binary files /dev/null and b/docs/my-website/img/release_notes/new_team_usage_highlight.jpg differ diff --git a/docs/my-website/img/release_notes/spend_by_model.jpg b/docs/my-website/img/release_notes/spend_by_model.jpg new file mode 100644 index 0000000000..2584949eff Binary files /dev/null and b/docs/my-website/img/release_notes/spend_by_model.jpg differ diff --git a/docs/my-website/img/release_notes/sso_sync.png b/docs/my-website/img/release_notes/sso_sync.png new file mode 100644 index 0000000000..a7bf6b838b Binary files /dev/null and b/docs/my-website/img/release_notes/sso_sync.png differ diff --git a/docs/my-website/img/release_notes/tag_management.png b/docs/my-website/img/release_notes/tag_management.png new file mode 100644 index 0000000000..eca7b8cbb1 Binary files /dev/null and b/docs/my-website/img/release_notes/tag_management.png differ diff --git a/docs/my-website/img/release_notes/team_model_add.png b/docs/my-website/img/release_notes/team_model_add.png new file mode 100644 index 0000000000..f548469846 Binary files /dev/null and b/docs/my-website/img/release_notes/team_model_add.png differ diff --git a/docs/my-website/img/release_notes/ui_usage.png b/docs/my-website/img/release_notes/ui_usage.png new file mode 100644 index 0000000000..ac39ffb918 Binary files /dev/null and b/docs/my-website/img/release_notes/ui_usage.png differ diff --git a/docs/my-website/img/release_notes/unified_responses_api_rn.png b/docs/my-website/img/release_notes/unified_responses_api_rn.png new file mode 100644 index 0000000000..60ede0e211 Binary files /dev/null and b/docs/my-website/img/release_notes/unified_responses_api_rn.png differ diff --git a/docs/my-website/img/scim_0.png b/docs/my-website/img/scim_0.png new file mode 100644 index 0000000000..265271b78c Binary files /dev/null and 
b/docs/my-website/img/scim_0.png differ diff --git a/docs/my-website/img/scim_1.png b/docs/my-website/img/scim_1.png new file mode 100644 index 0000000000..c6d64b5d11 Binary files /dev/null and b/docs/my-website/img/scim_1.png differ diff --git a/docs/my-website/img/scim_2.png b/docs/my-website/img/scim_2.png new file mode 100644 index 0000000000..c96cf9f0b5 Binary files /dev/null and b/docs/my-website/img/scim_2.png differ diff --git a/docs/my-website/img/scim_3.png b/docs/my-website/img/scim_3.png new file mode 100644 index 0000000000..5ecd3906bd Binary files /dev/null and b/docs/my-website/img/scim_3.png differ diff --git a/docs/my-website/img/scim_4.png b/docs/my-website/img/scim_4.png new file mode 100644 index 0000000000..b4b484418c Binary files /dev/null and b/docs/my-website/img/scim_4.png differ diff --git a/docs/my-website/img/scim_integration.png b/docs/my-website/img/scim_integration.png new file mode 100644 index 0000000000..2cfeb872bf Binary files /dev/null and b/docs/my-website/img/scim_integration.png differ diff --git a/docs/my-website/img/tag_create.png b/docs/my-website/img/tag_create.png new file mode 100644 index 0000000000..d515b3a9f4 Binary files /dev/null and b/docs/my-website/img/tag_create.png differ diff --git a/docs/my-website/img/tag_invalid.png b/docs/my-website/img/tag_invalid.png new file mode 100644 index 0000000000..e12f7197b1 Binary files /dev/null and b/docs/my-website/img/tag_invalid.png differ diff --git a/docs/my-website/img/tag_valid.png b/docs/my-website/img/tag_valid.png new file mode 100644 index 0000000000..3b6e121d12 Binary files /dev/null and b/docs/my-website/img/tag_valid.png differ diff --git a/docs/my-website/img/ui_auto_prompt_caching.png b/docs/my-website/img/ui_auto_prompt_caching.png new file mode 100644 index 0000000000..e6f48e48d0 Binary files /dev/null and b/docs/my-website/img/ui_auto_prompt_caching.png differ diff --git a/docs/my-website/package-lock.json b/docs/my-website/package-lock.json index 6c07e67d91..e6f20d567b 100644 --- a/docs/my-website/package-lock.json +++ b/docs/my-website/package-lock.json @@ -2148,9 +2148,10 @@ } }, "node_modules/@babel/runtime": { - "version": "7.26.0", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.26.0.tgz", - "integrity": "sha512-FDSOghenHTiToteC/QRlv2q3DhPZ/oOXTBoirfWNx1Cx3TMVcGWQtMMmQcSvb/JjpNeGzx8Pq/b4fKEJuWm1sw==", + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.0.tgz", + "integrity": "sha512-VtPOkrdPHZsKc/clNqyi9WUA8TINkZ4cGk63UUE3u4pmB2k+ZMQRDuIOagv8UVd6j7k0T3+RRIb7beKTebNbcw==", + "license": "MIT", "dependencies": { "regenerator-runtime": "^0.14.0" }, @@ -12454,9 +12455,10 @@ } }, "node_modules/http-proxy-middleware": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.7.tgz", - "integrity": "sha512-fgVY8AV7qU7z/MmXJ/rxwbrtQH4jBQ9m7kp3llF0liB7glmFeVZFBepQb32T3y8n8k2+AEYuMPCpinYW+/CuRA==", + "version": "2.0.9", + "resolved": "https://registry.npmjs.org/http-proxy-middleware/-/http-proxy-middleware-2.0.9.tgz", + "integrity": "sha512-c1IyJYLYppU574+YI7R4QyX2ystMtVXZwIdzazUIPIJsHuWNd+mho2j+bKoHftndicGj9yh+xjd+l0yj7VeT1Q==", + "license": "MIT", "dependencies": { "@types/http-proxy": "^1.17.8", "http-proxy": "^1.18.1", @@ -12559,9 +12561,10 @@ } }, "node_modules/image-size": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.1.1.tgz", - "integrity": 
"sha512-541xKlUw6jr/6gGuk92F+mYM5zaFAc5ahphvkqvNe2bQ6gVBkd6bfrmVJ2t4KDAfikAYZyIqTnktX3i6/aQDrQ==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/image-size/-/image-size-1.2.1.tgz", + "integrity": "sha512-rH+46sQJ2dlwfjfhCyNx5thzrv+dtmBIhPHk0zgRUukHzZ/kRueTJXoYYsclBaKcSMBWuGbOFXtioLpzTb5euw==", + "license": "MIT", "dependencies": { "queue": "6.0.2" }, diff --git a/docs/my-website/release_notes/v1.55.10/index.md b/docs/my-website/release_notes/v1.55.10/index.md index 7f9839c2b5..2b5ce75cf0 100644 --- a/docs/my-website/release_notes/v1.55.10/index.md +++ b/docs/my-website/release_notes/v1.55.10/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.55.8-stable/index.md b/docs/my-website/release_notes/v1.55.8-stable/index.md index 7e82e94747..38c78eb537 100644 --- a/docs/my-website/release_notes/v1.55.8-stable/index.md +++ b/docs/my-website/release_notes/v1.55.8-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.56.1/index.md b/docs/my-website/release_notes/v1.56.1/index.md index 7c4ccc74ea..74f3606b90 100644 --- a/docs/my-website/release_notes/v1.56.1/index.md +++ b/docs/my-website/release_notes/v1.56.1/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.56.3/index.md b/docs/my-website/release_notes/v1.56.3/index.md index 95205633ea..3d996ba5b8 100644 --- a/docs/my-website/release_notes/v1.56.3/index.md +++ b/docs/my-website/release_notes/v1.56.3/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + 
image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.56.4/index.md b/docs/my-website/release_notes/v1.56.4/index.md index 93f8725632..bf9cc2d94e 100644 --- a/docs/my-website/release_notes/v1.56.4/index.md +++ b/docs/my-website/release_notes/v1.56.4/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.57.3/index.md b/docs/my-website/release_notes/v1.57.3/index.md index 3bee71a8e1..ab1154a0a8 100644 --- a/docs/my-website/release_notes/v1.57.3/index.md +++ b/docs/my-website/release_notes/v1.57.3/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.57.7/index.md b/docs/my-website/release_notes/v1.57.7/index.md index ce987baf77..4da2402efa 100644 --- a/docs/my-website/release_notes/v1.57.7/index.md +++ b/docs/my-website/release_notes/v1.57.7/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.57.8-stable/index.md b/docs/my-website/release_notes/v1.57.8-stable/index.md index d37a7b9ff8..78fe13f2ed 100644 --- a/docs/my-website/release_notes/v1.57.8-stable/index.md +++ b/docs/my-website/release_notes/v1.57.8-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: 
https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ @@ -38,7 +38,7 @@ hide_table_of_contents: false 2. OpenAI Moderations - `omni-moderation-latest` support. [Start Here](https://docs.litellm.ai/docs/moderation) 3. Azure O1 - fake streaming support. This ensures if a `stream=true` is passed, the response is streamed. [Start Here](https://docs.litellm.ai/docs/providers/azure) 4. Anthropic - non-whitespace char stop sequence handling - [PR](https://github.com/BerriAI/litellm/pull/7484) -5. Azure OpenAI - support entrata id username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entrata-id---use-tenant_id-client_id-client_secret) +5. Azure OpenAI - support Entra ID username + password based auth. [Start Here](https://docs.litellm.ai/docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret) 6. LM Studio - embedding route support. [Start Here](https://docs.litellm.ai/docs/providers/lm-studio) 7. WatsonX - ZenAPIKeyAuth support. [Start Here](https://docs.litellm.ai/docs/providers/watsonx) diff --git a/docs/my-website/release_notes/v1.59.0/index.md b/docs/my-website/release_notes/v1.59.0/index.md index 5343ba49ad..2699e42020 100644 --- a/docs/my-website/release_notes/v1.59.0/index.md +++ b/docs/my-website/release_notes/v1.59.0/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.59.8-stable/index.md b/docs/my-website/release_notes/v1.59.8-stable/index.md index fa9825fb66..023f284ad5 100644 --- a/docs/my-website/release_notes/v1.59.8-stable/index.md +++ b/docs/my-website/release_notes/v1.59.8-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.61.20-stable/index.md b/docs/my-website/release_notes/v1.61.20-stable/index.md index 132c1aa318..5012e2aa90 100644 --- a/docs/my-website/release_notes/v1.61.20-stable/index.md +++ b/docs/my-website/release_notes/v1.61.20-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: 
https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.63.0/index.md b/docs/my-website/release_notes/v1.63.0/index.md index e74a2f9b86..ab74b11b4d 100644 --- a/docs/my-website/release_notes/v1.63.0/index.md +++ b/docs/my-website/release_notes/v1.63.0/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git a/docs/my-website/release_notes/v1.63.11-stable/index.md b/docs/my-website/release_notes/v1.63.11-stable/index.md index 55fefb737a..882747a07b 100644 --- a/docs/my-website/release_notes/v1.63.11-stable/index.md +++ b/docs/my-website/release_notes/v1.63.11-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ @@ -26,13 +26,18 @@ This release is primarily focused on: - UI - Credential Management, re-use credentials when adding new models - UI - Test Connection to LLM Provider before adding a model -:::info +## Known Issues +- 🚨 Known issue on Azure OpenAI - We don't recommend upgrading if you use Azure OpenAI. 
This version failed our Azure OpenAI load test -This release will be live on 03/16/2025 -::: +## Docker Run LiteLLM Proxy - +``` +docker run +-e STORE_MODEL_IN_DB=True +-p 4000:4000 +ghcr.io/berriai/litellm:main-v1.63.11-stable +``` ## Demo Instance diff --git a/docs/my-website/release_notes/v1.63.14/index.md b/docs/my-website/release_notes/v1.63.14/index.md new file mode 100644 index 0000000000..ff2630468c --- /dev/null +++ b/docs/my-website/release_notes/v1.63.14/index.md @@ -0,0 +1,131 @@ +--- +title: v1.63.14-stable +slug: v1.63.14-stable +date: 2025-03-22T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg + +tags: [credential management, thinking content, responses api, snowflake] +hide_table_of_contents: false +--- + +import Image from '@theme/IdealImage'; + +These are the changes since `v1.63.11-stable`. + +This release brings: +- LLM Translation Improvements (MCP Support and Bedrock Application Profiles) +- Perf improvements for Usage-based Routing +- Streaming guardrail support via websockets +- Azure OpenAI client perf fix (from previous release) + +## Docker Run LiteLLM Proxy + +``` +docker run +-e STORE_MODEL_IN_DB=True +-p 4000:4000 +ghcr.io/berriai/litellm:main-v1.63.14-stable.patch1 +``` + +## Demo Instance + +Here's a Demo Instance to test changes: +- Instance: https://demo.litellm.ai/ +- Login Credentials: + - Username: admin + - Password: sk-1234 + + + +## New Models / Updated Models + +- Azure gpt-4o - fixed pricing to latest global pricing - [PR](https://github.com/BerriAI/litellm/pull/9361) +- O1-Pro - add pricing + model information - [PR](https://github.com/BerriAI/litellm/pull/9397) +- Azure AI - mistral 3.1 small pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453) +- Azure - gpt-4.5-preview pricing added - [PR](https://github.com/BerriAI/litellm/pull/9453) + + + +## LLM Translation + +1. **New LLM Features** + +- Bedrock: Support bedrock application inference profiles [Docs](https://docs.litellm.ai/docs/providers/bedrock#bedrock-application-inference-profile) + - Infer aws region from bedrock application profile id - (`arn:aws:bedrock:us-east-1:...`) +- Ollama - support calling via `/v1/completions` [Get Started](../../docs/providers/ollama#using-ollama-fim-on-v1completions) +- Bedrock - support `us.deepseek.r1-v1:0` model name [Docs](../../docs/providers/bedrock#supported-aws-bedrock-models) +- OpenRouter - `OPENROUTER_API_BASE` env var support [Docs](../../docs/providers/openrouter.md) +- Azure - add audio model parameter support - [Docs](../../docs/providers/azure#azure-audio-model) +- OpenAI - PDF File support [Docs](../../docs/completion/document_understanding#openai-file-message-type) +- OpenAI - o1-pro Responses API streaming support [Docs](../../docs/response_api.md#streaming) +- [BETA] MCP - Use MCP Tools with LiteLLM SDK [Docs](../../docs/mcp) + +2. 
**Bug Fixes** + +- Voyage: prompt token on embedding tracking fix - [PR](https://github.com/BerriAI/litellm/commit/56d3e75b330c3c3862dc6e1c51c1210e48f1068e) +- Sagemaker - Fix ‘Too little data for declared Content-Length’ error - [PR](https://github.com/BerriAI/litellm/pull/9326) +- OpenAI-compatible models - fix issue when calling openai-compatible models w/ custom_llm_provider set - [PR](https://github.com/BerriAI/litellm/pull/9355) +- VertexAI - Embedding ‘outputDimensionality’ support - [PR](https://github.com/BerriAI/litellm/commit/437dbe724620675295f298164a076cbd8019d304) +- Anthropic - return consistent json response format on streaming/non-streaming - [PR](https://github.com/BerriAI/litellm/pull/9437) + +## Spend Tracking Improvements + +- `litellm_proxy/` - support reading litellm response cost header from proxy, when using client sdk +- Reset Budget Job - fix budget reset error on keys/teams/users [PR](https://github.com/BerriAI/litellm/pull/9329) +- Streaming - Prevents final chunk w/ usage from being ignored (impacted bedrock streaming + cost tracking) [PR](https://github.com/BerriAI/litellm/pull/9314) + + +## UI + +1. Users Page + - Feature: Control default internal user settings [PR](https://github.com/BerriAI/litellm/pull/9328) +2. Icons: + - Feature: Replace external "artificialanalysis.ai" icons by local svg [PR](https://github.com/BerriAI/litellm/pull/9374) +3. Sign In/Sign Out + - Fix: Default login when `default_user_id` user does not exist in DB [PR](https://github.com/BerriAI/litellm/pull/9395) + + +## Logging Integrations + +- Support post-call guardrails for streaming responses [Get Started](../../docs/proxy/guardrails/custom_guardrail#1-write-a-customguardrail-class) +- Arize [Get Started](../../docs/observability/arize_integration) + - fix invalid package import [PR](https://github.com/BerriAI/litellm/pull/9338) + - migrate to using standardloggingpayload for metadata, ensures spans land successfully [PR](https://github.com/BerriAI/litellm/pull/9338) + - fix logging to just log the LLM I/O [PR](https://github.com/BerriAI/litellm/pull/9353) + - Dynamic API Key/Space param support [Get Started](../../docs/observability/arize_integration#pass-arize-spacekey-per-request) +- StandardLoggingPayload - Log litellm_model_name in payload. 
Allows knowing what the model sent to API provider was [Get Started](../../docs/proxy/logging_spec#standardlogginghiddenparams) +- Prompt Management - Allow building custom prompt management integration [Get Started](../../docs/proxy/custom_prompt_management.md) + +## Performance / Reliability improvements + +- Redis Caching - add 5s default timeout, prevents hanging redis connection from impacting llm calls [PR](https://github.com/BerriAI/litellm/commit/db92956ae33ed4c4e3233d7e1b0c7229817159bf) +- Allow disabling all spend updates / writes to DB - patch to allow disabling all spend updates to DB with a flag [PR](https://github.com/BerriAI/litellm/pull/9331) +- Azure OpenAI - correctly re-use azure openai client, fixes perf issue from previous Stable release [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413) +- Azure OpenAI - uses litellm.ssl_verify on Azure/OpenAI clients [PR](https://github.com/BerriAI/litellm/commit/f2026ef907c06d94440930917add71314b901413) +- Usage-based routing - Wildcard model support [Get Started](../../docs/proxy/usage_based_routing#wildcard-model-support) +- Usage-based routing - Support batch writing increments to redis - reduces latency to same as ‘simple-shuffle’ [PR](https://github.com/BerriAI/litellm/pull/9357) +- Router - show reason for model cooldown on ‘no healthy deployments available error’ [PR](https://github.com/BerriAI/litellm/pull/9438) +- Caching - add max value limit to an item in in-memory cache (1MB) - prevents OOM errors on large image url’s being sent through proxy [PR](https://github.com/BerriAI/litellm/pull/9448) + + +## General Improvements + +- Passthrough Endpoints - support returning api-base on pass-through endpoints Response Headers [Docs](../../docs/proxy/response_headers#litellm-specific-headers) +- SSL - support reading ssl security level from env var - Allows user to specify lower security settings [Get Started](../../docs/guides/security_settings) +- Credentials - only poll Credentials table when `STORE_MODEL_IN_DB` is True [PR](https://github.com/BerriAI/litellm/pull/9376) +- Image URL Handling - new architecture doc on image url handling [Docs](../../docs/proxy/image_handling) +- OpenAI - bump to pip install "openai==1.68.2" [PR](https://github.com/BerriAI/litellm/commit/e85e3bc52a9de86ad85c3dbb12d87664ee567a5a) +- Gunicorn - security fix - bump gunicorn==23.0.0 [PR](https://github.com/BerriAI/litellm/commit/7e9fc92f5c7fea1e7294171cd3859d55384166eb) + + +## Complete Git Diff + +[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.11-stable...v1.63.14.rc) \ No newline at end of file diff --git a/docs/my-website/release_notes/v1.63.2-stable/index.md b/docs/my-website/release_notes/v1.63.2-stable/index.md index 0c359452dc..3d47e02ac1 100644 --- a/docs/my-website/release_notes/v1.63.2-stable/index.md +++ b/docs/my-website/release_notes/v1.63.2-stable/index.md @@ -6,7 +6,7 @@ authors: - name: Krrish Dholakia title: CEO, LiteLLM url: https://www.linkedin.com/in/krish-d/ - image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1743638400&v=beta&t=39KOXMUFedvukiWWVPHf3qI45fuQD7lNglICwN31DrI + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 - name: Ishaan Jaffer title: CTO, LiteLLM url: https://www.linkedin.com/in/reffajnaahsi/ diff --git 
a/docs/my-website/release_notes/v1.65.0-stable/index.md b/docs/my-website/release_notes/v1.65.0-stable/index.md new file mode 100644 index 0000000000..3696f5023c --- /dev/null +++ b/docs/my-website/release_notes/v1.65.0-stable/index.md @@ -0,0 +1,160 @@ +--- +title: v1.65.0-stable - Model Context Protocol +slug: v1.65.0-stable +date: 2025-03-30T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg +tags: [mcp, custom_prompt_management] +hide_table_of_contents: false +--- +import Image from '@theme/IdealImage'; + +v1.65.0-stable is live now. Here are the key highlights of this release: +- **MCP Support**: Support for adding and using MCP servers on the LiteLLM proxy. +- **UI view total usage after 1M+ logs**: You can now view usage analytics after crossing 1M+ logs in DB. + +## Model Context Protocol (MCP) + +This release introduces support for centrally adding MCP servers on LiteLLM. This allows you to add MCP server endpoints and your developers can `list` and `call` MCP tools through LiteLLM. + +Read more about MCP [here](https://docs.litellm.ai/docs/mcp). + + +
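+
+As a rough sketch of what `list` and `call` look like from a developer's machine, here is the standard `mcp` Python client pointed at the proxy. The endpoint path (`/mcp/`), port, virtual key, and tool name below are placeholder assumptions — check the MCP docs linked above for the exact URL and auth headers.
+
+```python showLineNumbers title="List and call MCP tools through LiteLLM (sketch)"
+import asyncio
+
+from mcp import ClientSession
+from mcp.client.sse import sse_client
+
+
+async def main():
+    # Assumed values: the proxy's SSE MCP endpoint and a LiteLLM virtual key
+    async with sse_client(
+        url="http://localhost:4000/mcp/",
+        headers={"Authorization": "Bearer sk-1234"},
+    ) as (read_stream, write_stream):
+        async with ClientSession(read_stream, write_stream) as session:
+            await session.initialize()
+
+            # `list` the MCP tools exposed through the proxy
+            tools = await session.list_tools()
+            print([tool.name for tool in tools.tools])
+
+            # `call` one of the listed tools (name and arguments are illustrative)
+            # result = await session.call_tool("get_current_time", {"timezone": "UTC"})
+
+
+asyncio.run(main())
+```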

+ Expose and use MCP servers through LiteLLM +

+ +## UI view total usage after 1M+ logs + +This release brings the ability to view total usage analytics even after exceeding 1M+ logs in your database. We've implemented a scalable architecture that stores only aggregate usage data, resulting in significantly more efficient queries and reduced database CPU utilization. + + + +

+ View total usage after 1M+ logs +

+ + +- How this works: + - We now aggregate usage data into a dedicated DailyUserSpend table, significantly reducing query load and CPU usage even beyond 1M+ logs. + +- Daily Spend Breakdown API: + + - Retrieve granular daily usage data (by model, provider, and API key) with a single endpoint. + Example Request: + + ```shell title="Daily Spend Breakdown API" showLineNumbers + curl -L -X GET 'http://localhost:4000/user/daily/activity?start_date=2025-03-20&end_date=2025-03-27' \ + -H 'Authorization: Bearer sk-...' + ``` + + ```json title="Daily Spend Breakdown API Response" showLineNumbers + { + "results": [ + { + "date": "2025-03-27", + "metrics": { + "spend": 0.0177072, + "prompt_tokens": 111, + "completion_tokens": 1711, + "total_tokens": 1822, + "api_requests": 11 + }, + "breakdown": { + "models": { + "gpt-4o-mini": { + "spend": 1.095e-05, + "prompt_tokens": 37, + "completion_tokens": 9, + "total_tokens": 46, + "api_requests": 1 + }, + "providers": { "openai": { ... }, "azure_ai": { ... } }, + "api_keys": { "3126b6eaf1...": { ... } } + } + } + ], + "metadata": { + "total_spend": 0.7274667, + "total_prompt_tokens": 280990, + "total_completion_tokens": 376674, + "total_api_requests": 14 + } + } + ``` + + + + +## New Models / Updated Models +- Support for Vertex AI gemini-2.0-flash-lite & Google AI Studio gemini-2.0-flash-lite [PR](https://github.com/BerriAI/litellm/pull/9523) +- Support for Vertex AI Fine-Tuned LLMs [PR](https://github.com/BerriAI/litellm/pull/9542) +- Nova Canvas image generation support [PR](https://github.com/BerriAI/litellm/pull/9525) +- OpenAI gpt-4o-transcribe support [PR](https://github.com/BerriAI/litellm/pull/9517) +- Added new Vertex AI text embedding model [PR](https://github.com/BerriAI/litellm/pull/9476) + +## LLM Translation +- OpenAI Web Search Tool Call Support [PR](https://github.com/BerriAI/litellm/pull/9465) +- Vertex AI topLogprobs support [PR](https://github.com/BerriAI/litellm/pull/9518) +- Support for sending images and video to Vertex AI multimodal embedding [Doc](https://docs.litellm.ai/docs/providers/vertex#multi-modal-embeddings) +- Support litellm.api_base for Vertex AI + Gemini across completion, embedding, image_generation [PR](https://github.com/BerriAI/litellm/pull/9516) +- Bug fix for returning `response_cost` when using litellm python SDK with LiteLLM Proxy [PR](https://github.com/BerriAI/litellm/commit/6fd18651d129d606182ff4b980e95768fc43ca3d) +- Support for `max_completion_tokens` on Mistral API [PR](https://github.com/BerriAI/litellm/pull/9606) +- Refactored Vertex AI passthrough routes - fixes unpredictable behaviour with auto-setting default_vertex_region on router model add [PR](https://github.com/BerriAI/litellm/pull/9467) + +## Spend Tracking Improvements +- Log 'api_base' on spend logs [PR](https://github.com/BerriAI/litellm/pull/9509) +- Support for Gemini audio token cost tracking [PR](https://github.com/BerriAI/litellm/pull/9535) +- Fixed OpenAI audio input token cost tracking [PR](https://github.com/BerriAI/litellm/pull/9535) + +## UI + +### Model Management +- Allowed team admins to add/update/delete models on UI [PR](https://github.com/BerriAI/litellm/pull/9572) +- Added render supports_web_search on model hub [PR](https://github.com/BerriAI/litellm/pull/9469) + +### Request Logs +- Show API base and model ID on request logs [PR](https://github.com/BerriAI/litellm/pull/9572) +- Allow viewing keyinfo on request logs [PR](https://github.com/BerriAI/litellm/pull/9568) + +### Usage Tab +- Added Daily User Spend Aggregate view - 
allows UI Usage tab to work > 1m rows [PR](https://github.com/BerriAI/litellm/pull/9538) +- Connected UI to "LiteLLM_DailyUserSpend" spend table [PR](https://github.com/BerriAI/litellm/pull/9603) + +## Logging Integrations +- Fixed StandardLoggingPayload for GCS Pub Sub Logging Integration [PR](https://github.com/BerriAI/litellm/pull/9508) +- Track `litellm_model_name` on `StandardLoggingPayload` [Docs](https://docs.litellm.ai/docs/proxy/logging_spec#standardlogginghiddenparams) + +## Performance / Reliability Improvements +- LiteLLM Redis semantic caching implementation [PR](https://github.com/BerriAI/litellm/pull/9356) +- Gracefully handle exceptions when DB is having an outage [PR](https://github.com/BerriAI/litellm/pull/9533) +- Allow Pods to startup + passing /health/readiness when allow_requests_on_db_unavailable: True and DB is down [PR](https://github.com/BerriAI/litellm/pull/9569) + + +## General Improvements +- Support for exposing MCP tools on litellm proxy [PR](https://github.com/BerriAI/litellm/pull/9426) +- Support discovering Gemini, Anthropic, xAI models by calling their /v1/model endpoint [PR](https://github.com/BerriAI/litellm/pull/9530) +- Fixed route check for non-proxy admins on JWT auth [PR](https://github.com/BerriAI/litellm/pull/9454) +- Added baseline Prisma database migrations [PR](https://github.com/BerriAI/litellm/pull/9565) +- View all wildcard models on /model/info [PR](https://github.com/BerriAI/litellm/pull/9572) + + +## Security +- Bumped next from 14.2.21 to 14.2.25 in UI dashboard [PR](https://github.com/BerriAI/litellm/pull/9458) + +## Complete Git Diff + +[Here's the complete git diff](https://github.com/BerriAI/litellm/compare/v1.63.14-stable.patch1...v1.65.0-stable) diff --git a/docs/my-website/release_notes/v1.65.0/index.md b/docs/my-website/release_notes/v1.65.0/index.md new file mode 100644 index 0000000000..84276c997d --- /dev/null +++ b/docs/my-website/release_notes/v1.65.0/index.md @@ -0,0 +1,34 @@ +--- +title: v1.65.0 - Team Model Add - update +slug: v1.65.0 +date: 2025-03-28T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg +tags: [management endpoints, team models, ui] +hide_table_of_contents: false +--- + +import Image from '@theme/IdealImage'; + +v1.65.0 updates the `/model/new` endpoint to prevent non-team admins from creating team models. + +This means that only proxy admins or team admins can create team models. + +## Additional Changes + +- Allows team admins to call `/model/update` to update team models. +- Allows team admins to call `/model/delete` to delete team models. +- Introduces new `user_models_only` param to `/v2/model/info` - only return models added by this user. + + +These changes enable team admins to add and manage models for their team on the LiteLLM UI + API. 
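+
+For example, here is a minimal sketch of listing only the models you added yourself, assuming `user_models_only` is passed as a query parameter and `sk-1234` is your key — verify both against the `/v2/model/info` API reference:
+
+```bash showLineNumbers title="List only the models added by the calling user (sketch)"
+curl -L -X GET 'http://0.0.0.0:4000/v2/model/info?user_models_only=true' \
+  -H 'Authorization: Bearer sk-1234'
+```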
+ + + \ No newline at end of file diff --git a/docs/my-website/release_notes/v1.65.4-stable/index.md b/docs/my-website/release_notes/v1.65.4-stable/index.md new file mode 100644 index 0000000000..872024a47a --- /dev/null +++ b/docs/my-website/release_notes/v1.65.4-stable/index.md @@ -0,0 +1,176 @@ +--- +title: v1.65.4-stable +slug: v1.65.4-stable +date: 2025-04-05T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg + +tags: [] +hide_table_of_contents: false +--- + +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Deploy this version + + + + +``` showLineNumbers title="docker run litellm" +docker run +-e STORE_MODEL_IN_DB=True +-p 4000:4000 +ghcr.io/berriai/litellm:main-v1.65.4-stable +``` + + + + +``` showLineNumbers title="pip install litellm" +pip install litellm==1.65.4.post1 +``` + + + +v1.65.4-stable is live. Here are the improvements since v1.65.0-stable. + +## Key Highlights +- **Preventing DB Deadlocks**: Fixes a high-traffic issue when multiple instances were writing to the DB at the same time. +- **New Usage Tab**: Enables viewing spend by model and customizing date range + +Let's dive in. + +### Preventing DB Deadlocks + + + +This release fixes the DB deadlocking issue that users faced in high traffic (10K+ RPS). This is great because it enables user/key/team spend tracking works at that scale. + +Read more about the new architecture [here](https://docs.litellm.ai/docs/proxy/db_deadlocks) + + +### New Usage Tab + + + +The new Usage tab now brings the ability to track daily spend by model. This makes it easier to catch any spend tracking or token counting errors, when combined with the ability to view successful requests, and token usage. + +To test this out, just go to Experimental > New Usage > Activity. + + +## New Models / Updated Models + +1. Databricks - claude-3-7-sonnet cost tracking [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L10350) +2. VertexAI - `gemini-2.5-pro-exp-03-25` cost tracking [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L4492) +3. VertexAI - `gemini-2.0-flash` cost tracking [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L4689) +4. Groq - add whisper ASR models to model cost map [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L3324) +5. IBM - Add watsonx/ibm/granite-3-8b-instruct to model cost map [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L91) +6. Google AI Studio - add gemini/gemini-2.5-pro-preview-03-25 to model cost map [PR](https://github.com/BerriAI/litellm/blob/52b35cd8093b9ad833987b24f494586a1e923209/model_prices_and_context_window.json#L4850) + +## LLM Translation +1. 
Vertex AI - Support anyOf param for OpenAI json schema translation [Get Started](https://docs.litellm.ai/docs/providers/vertex#json-schema) +2. Anthropic- response_format + thinking param support (works across Anthropic API, Bedrock, Vertex) [Get Started](https://docs.litellm.ai/docs/reasoning_content) +3. Anthropic - if thinking token is specified and max tokens is not - ensure max token to anthropic is higher than thinking tokens (works across Anthropic API, Bedrock, Vertex) [PR](https://github.com/BerriAI/litellm/pull/9594) +4. Bedrock - latency optimized inference support [Get Started](https://docs.litellm.ai/docs/providers/bedrock#usage---latency-optimized-inference) +5. Sagemaker - handle special tokens + multibyte character code in response [Get Started](https://docs.litellm.ai/docs/providers/aws_sagemaker) +6. MCP - add support for using SSE MCP servers [Get Started](https://docs.litellm.ai/docs/mcp#usage) +8. Anthropic - new `litellm.messages.create` interface for calling Anthropic `/v1/messages` via passthrough [Get Started](https://docs.litellm.ai/docs/anthropic_unified#usage) +11. Anthropic - support ‘file’ content type in message param (works across Anthropic API, Bedrock, Vertex) [Get Started](https://docs.litellm.ai/docs/providers/anthropic#usage---pdf) +12. Anthropic - map openai 'reasoning_effort' to anthropic 'thinking' param (works across Anthropic API, Bedrock, Vertex) [Get Started](https://docs.litellm.ai/docs/providers/anthropic#usage---thinking--reasoning_content) +13. Google AI Studio (Gemini) - [BETA] `/v1/files` upload support [Get Started](../../docs/providers/google_ai_studio/files) +14. Azure - fix o-series tool calling [Get Started](../../docs/providers/azure#tool-calling--function-calling) +15. Unified file id - [ALPHA] allow calling multiple providers with same file id [PR](https://github.com/BerriAI/litellm/pull/9718) + - This is experimental, and not recommended for production use. + - We plan to have a production-ready implementation by next week. +16. Google AI Studio (Gemini) - return logprobs [PR](https://github.com/BerriAI/litellm/pull/9713) +17. Anthropic - Support prompt caching for Anthropic tool calls [Get Started](https://docs.litellm.ai/docs/completion/prompt_caching) +18. OpenRouter - unwrap extra body on open router calls [PR](https://github.com/BerriAI/litellm/pull/9747) +19. VertexAI - fix credential caching issue [PR](https://github.com/BerriAI/litellm/pull/9756) +20. XAI - filter out 'name' param for XAI [PR](https://github.com/BerriAI/litellm/pull/9761) +21. Gemini - image generation output support [Get Started](../../docs/providers/gemini#image-generation) +22. Databricks - support claude-3-7-sonnet w/ thinking + response_format [Get Started](../../docs/providers/databricks#usage---thinking--reasoning_content) + +## Spend Tracking Improvements +1. Reliability fix - Check sent and received model for cost calculation [PR](https://github.com/BerriAI/litellm/pull/9669) +2. Vertex AI - Multimodal embedding cost tracking [Get Started](https://docs.litellm.ai/docs/providers/vertex#multi-modal-embeddings), [PR](https://github.com/BerriAI/litellm/pull/9623) + +## Management Endpoints / UI + + + +1. New Usage Tab + - Report 'total_tokens' + report success/failure calls + - Remove double bars on scroll + - Ensure ‘daily spend’ chart ordered from earliest to latest date + - showing spend per model per day + - show key alias on usage tab + - Allow non-admins to view their activity + - Add date picker to new usage tab +2. 
Virtual Keys Tab + - remove 'default key' on user signup + - fix showing user models available for personal key creation +3. Test Key Tab + - Allow testing image generation models +4. Models Tab + - Fix bulk adding models + - support reusable credentials for passthrough endpoints + - Allow team members to see team models +5. Teams Tab + - Fix json serialization error on update team metadata +6. Request Logs Tab + - Add reasoning_content token tracking across all providers on streaming +7. API + - return key alias on /user/daily/activity [Get Started](../../docs/proxy/cost_tracking#daily-spend-breakdown-api) +8. SSO + - Allow assigning SSO users to teams on MSFT SSO [PR](https://github.com/BerriAI/litellm/pull/9745) + +## Logging / Guardrail Integrations + +1. Console Logs - Add json formatting for uncaught exceptions [PR](https://github.com/BerriAI/litellm/pull/9619) +2. Guardrails - AIM Guardrails support for virtual key based policies [Get Started](../../docs/proxy/guardrails/aim_security) +3. Logging - fix completion start time tracking [PR](https://github.com/BerriAI/litellm/pull/9688) +4. Prometheus + - Allow adding authentication on Prometheus /metrics endpoints [PR](https://github.com/BerriAI/litellm/pull/9766) + - Distinguish LLM Provider Exception vs. LiteLLM Exception in metric naming [PR](https://github.com/BerriAI/litellm/pull/9760) + - Emit operational metrics for new DB Transaction architecture [PR](https://github.com/BerriAI/litellm/pull/9719) + +## Performance / Loadbalancing / Reliability improvements +1. Preventing Deadlocks + - Reduce DB Deadlocks by storing spend updates in Redis and then committing to DB [PR](https://github.com/BerriAI/litellm/pull/9608) + - Ensure no deadlocks occur when updating DailyUserSpendTransaction [PR](https://github.com/BerriAI/litellm/pull/9690) + - High Traffic fix - ensure new DB + Redis architecture accurately tracks spend [PR](https://github.com/BerriAI/litellm/pull/9673) + - Use Redis for PodLock Manager instead of PG (ensures no deadlocks occur) [PR](https://github.com/BerriAI/litellm/pull/9715) + - v2 DB Deadlock Reduction Architecture – Add Max Size for In-Memory Queue + Backpressure Mechanism [PR](https://github.com/BerriAI/litellm/pull/9759) + +2. Prisma Migrations [Get Started](../../docs/proxy/prod#9-use-prisma-migrate-deploy) + - connects litellm proxy to litellm's prisma migration files + - Handle db schema updates from new `litellm-proxy-extras` sdk +3. Redis - support password for sync sentinel clients [PR](https://github.com/BerriAI/litellm/pull/9622) +4. Fix "Circular reference detected" error when max_parallel_requests = 0 [PR](https://github.com/BerriAI/litellm/pull/9671) +5. Code QA - Ban hardcoded numbers [PR](https://github.com/BerriAI/litellm/pull/9709) + +## Helm +1. fix: wrong indentation of ttlSecondsAfterFinished in chart [PR](https://github.com/BerriAI/litellm/pull/9611) + +## General Proxy Improvements +1. Fix - only apply service_account_settings.enforced_params on service accounts [PR](https://github.com/BerriAI/litellm/pull/9683) +2. Fix - handle metadata null on `/chat/completion` [PR](https://github.com/BerriAI/litellm/issues/9717) +3. 
Fix - Move daily user transaction logging outside of 'disable_spend_logs' flag, as they’re unrelated [PR](https://github.com/BerriAI/litellm/pull/9772) + +## Demo + +Try this on the demo instance [today](https://docs.litellm.ai/docs/proxy/demo) + +## Complete Git Diff + +See the complete git diff since v1.65.0-stable, [here](https://github.com/BerriAI/litellm/releases/tag/v1.65.4-stable) + diff --git a/docs/my-website/release_notes/v1.66.0-stable/index.md b/docs/my-website/release_notes/v1.66.0-stable/index.md new file mode 100644 index 0000000000..939322e031 --- /dev/null +++ b/docs/my-website/release_notes/v1.66.0-stable/index.md @@ -0,0 +1,197 @@ +--- +title: v1.66.0-stable - Realtime API Cost Tracking +slug: v1.66.0-stable +date: 2025-04-12T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg + +tags: ["sso", "unified_file_id", "cost_tracking", "security"] +hide_table_of_contents: false +--- + +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Deploy this version + + + + +``` showLineNumbers title="docker run litellm" +docker run +-e STORE_MODEL_IN_DB=True +-p 4000:4000 +ghcr.io/berriai/litellm:main-v1.66.0-stable +``` + + + + +``` showLineNumbers title="pip install litellm" +pip install litellm==1.66.0.post1 +``` + + + +v1.66.0-stable is live now, here are the key highlights of this release + +## Key Highlights +- **Realtime API Cost Tracking**: Track cost of realtime API calls +- **Microsoft SSO Auto-sync**: Auto-sync groups and group members from Azure Entra ID to LiteLLM +- **xAI grok-3**: Added support for `xai/grok-3` models +- **Security Fixes**: Fixed [CVE-2025-0330](https://www.cve.org/CVERecord?id=CVE-2025-0330) and [CVE-2024-6825](https://www.cve.org/CVERecord?id=CVE-2024-6825) vulnerabilities + +Let's dive in. + +## Realtime API Cost Tracking + + + + +This release adds Realtime API logging + cost tracking. +- **Logging**: LiteLLM now logs the complete response from realtime calls to all logging integrations (DB, S3, Langfuse, etc.) +- **Cost Tracking**: You can now set 'base_model' and custom pricing for realtime models. [Custom Pricing](../../docs/proxy/custom_pricing) +- **Budgets**: Your key/user/team budgets now work for realtime models as well. + +Start [here](https://docs.litellm.ai/docs/realtime) + + + +## Microsoft SSO Auto-sync + + +

+ Auto-sync groups and members from Azure Entra ID to LiteLLM +

+ +This release adds support for auto-syncing groups and members on Microsoft Entra ID with LiteLLM. This means that LiteLLM proxy administrators can spend less time managing teams and members and LiteLLM handles the following: + +- Auto-create teams that exist on Microsoft Entra ID +- Sync team members on Microsoft Entra ID with LiteLLM teams + +Get started with this [here](https://docs.litellm.ai/docs/tutorials/msft_sso) + + +## New Models / Updated Models + +- **xAI** + 1. Added reasoning_effort support for `xai/grok-3-mini-beta` [Get Started](https://docs.litellm.ai/docs/providers/xai#reasoning-usage) + 2. Added cost tracking for `xai/grok-3` models [PR](https://github.com/BerriAI/litellm/pull/9920) + +- **Hugging Face** + 1. Added inference providers support [Get Started](https://docs.litellm.ai/docs/providers/huggingface#serverless-inference-providers) + +- **Azure** + 1. Added azure/gpt-4o-realtime-audio cost tracking [PR](https://github.com/BerriAI/litellm/pull/9893) + +- **VertexAI** + 1. Added enterpriseWebSearch tool support [Get Started](https://docs.litellm.ai/docs/providers/vertex#grounding---web-search) + 2. Moved to only passing keys accepted by the Vertex AI response schema [PR](https://github.com/BerriAI/litellm/pull/8992) + +- **Google AI Studio** + 1. Added cost tracking for `gemini-2.5-pro` [PR](https://github.com/BerriAI/litellm/pull/9837) + 2. Fixed pricing for 'gemini/gemini-2.5-pro-preview-03-25' [PR](https://github.com/BerriAI/litellm/pull/9896) + 3. Fixed handling file_data being passed in [PR](https://github.com/BerriAI/litellm/pull/9786) + +- **Azure** + 1. Updated Azure Phi-4 pricing [PR](https://github.com/BerriAI/litellm/pull/9862) + 2. Added azure/gpt-4o-realtime-audio cost tracking [PR](https://github.com/BerriAI/litellm/pull/9893) + +- **Databricks** + 1. Removed reasoning_effort from parameters [PR](https://github.com/BerriAI/litellm/pull/9811) + 2. Fixed custom endpoint check for Databricks [PR](https://github.com/BerriAI/litellm/pull/9925) + +- **General** + 1. Added litellm.supports_reasoning() util to track if an llm supports reasoning [Get Started](https://docs.litellm.ai/docs/providers/anthropic#reasoning) + 2. Function Calling - Handle pydantic base model in message tool calls, handle tools = [], and support fake streaming on tool calls for meta.llama3-3-70b-instruct-v1:0 [PR](https://github.com/BerriAI/litellm/pull/9774) + 3. LiteLLM Proxy - Allow passing `thinking` param to litellm proxy via client sdk [PR](https://github.com/BerriAI/litellm/pull/9386) + 4. Fixed correctly translating 'thinking' param for litellm [PR](https://github.com/BerriAI/litellm/pull/9904) + + +## Spend Tracking Improvements +- **OpenAI, Azure** + 1. Realtime API Cost tracking with token usage metrics in spend logs [Get Started](https://docs.litellm.ai/docs/realtime) +- **Anthropic** + 1. Fixed Claude Haiku cache read pricing per token [PR](https://github.com/BerriAI/litellm/pull/9834) + 2. Added cost tracking for Claude responses with base_model [PR](https://github.com/BerriAI/litellm/pull/9897) + 3. Fixed Anthropic prompt caching cost calculation and trimmed logged message in db [PR](https://github.com/BerriAI/litellm/pull/9838) +- **General** + 1. Added token tracking and log usage object in spend logs [PR](https://github.com/BerriAI/litellm/pull/9843) + 2. Handle custom pricing at deployment level [PR](https://github.com/BerriAI/litellm/pull/9855) + + +## Management Endpoints / UI + +- **Test Key Tab** + 1. 
Added rendering of reasoning content, TTFT, and usage metrics on the Test Key page [PR](https://github.com/BerriAI/litellm/pull/9931) + + +

+ View input, output, and reasoning tokens, plus TTFT metrics.

+- **Tag / Policy Management** + 1. Added Tag/Policy Management. Create routing rules based on request metadata. This allows you to enforce that requests with `tags="private"` only go to specific models. [Get Started](https://docs.litellm.ai/docs/tutorials/tag_management) + +
+ + +

+ Create and manage tags. +
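As a rough client-side sketch of the routing rule above (assuming tag based routing is enabled on the proxy and the target deployments are tagged `private`; the base URL, virtual key, and model name below are placeholders):

```python showLineNumbers title="send a tagged request through the proxy (sketch)"
from openai import OpenAI

# Point the standard OpenAI client at the LiteLLM proxy (placeholder URL and virtual key)
client = OpenAI(base_url="http://localhost:4000", api_key="sk-my-virtual-key")

response = client.chat.completions.create(
    model="gpt-4o",  # placeholder model name configured on the proxy
    messages=[{"role": "user", "content": "Summarize this contract."}],
    # extra_body fields are forwarded to LiteLLM; the tags drive tag based routing
    extra_body={"metadata": {"tags": ["private"]}},
)
print(response.choices[0].message.content)
```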

+- **Redesigned Login Screen** + 1. Polished login screen [PR](https://github.com/BerriAI/litellm/pull/9778) +- **Microsoft SSO Auto-Sync** + 1. Added debug route to allow admins to debug SSO JWT fields [PR](https://github.com/BerriAI/litellm/pull/9835) + 2. Added ability to use MSFT Graph API to assign users to teams [PR](https://github.com/BerriAI/litellm/pull/9865) + 3. Connected litellm to Azure Entra ID Enterprise Application [PR](https://github.com/BerriAI/litellm/pull/9872) + 4. Added ability for admins to set `default_team_params` for when litellm SSO creates default teams [PR](https://github.com/BerriAI/litellm/pull/9895) + 5. Fixed MSFT SSO to use correct field for user email [PR](https://github.com/BerriAI/litellm/pull/9886) + 6. Added UI support for setting Default Team setting when litellm SSO auto creates teams [PR](https://github.com/BerriAI/litellm/pull/9918) +- **UI Bug Fixes** + 1. Prevented team, key, org, model numerical values changing on scrolling [PR](https://github.com/BerriAI/litellm/pull/9776) + 2. Instantly reflect key and team updates in UI [PR](https://github.com/BerriAI/litellm/pull/9825) + +## Logging / Guardrail Improvements + +- **Prometheus** + 1. Emit Key and Team Budget metrics on a cron job schedule [Get Started](https://docs.litellm.ai/docs/proxy/prometheus#initialize-budget-metrics-on-startup) + +## Security Fixes + +- Fixed [CVE-2025-0330](https://www.cve.org/CVERecord?id=CVE-2025-0330) - Leakage of Langfuse API keys in team exception handling [PR](https://github.com/BerriAI/litellm/pull/9830) +- Fixed [CVE-2024-6825](https://www.cve.org/CVERecord?id=CVE-2024-6825) - Remote code execution in post call rules [PR](https://github.com/BerriAI/litellm/pull/9826) + +## Helm + +- Added service annotations to litellm-helm chart [PR](https://github.com/BerriAI/litellm/pull/9840) +- Added extraEnvVars to the helm deployment [PR](https://github.com/BerriAI/litellm/pull/9292) + +## Demo + +Try this on the demo instance [today](https://docs.litellm.ai/docs/proxy/demo) + +## Complete Git Diff + +See the complete git diff since v1.65.4-stable, [here](https://github.com/BerriAI/litellm/releases/tag/v1.66.0-stable) + + diff --git a/docs/my-website/release_notes/v1.67.0-stable/index.md b/docs/my-website/release_notes/v1.67.0-stable/index.md new file mode 100644 index 0000000000..cb7938fce5 --- /dev/null +++ b/docs/my-website/release_notes/v1.67.0-stable/index.md @@ -0,0 +1,153 @@ +--- +title: v1.67.0-stable - SCIM Integration +slug: v1.67.0-stable +date: 2025-04-19T10:00:00 +authors: + - name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://media.licdn.com/dms/image/v2/D4D03AQGrlsJ3aqpHmQ/profile-displayphoto-shrink_400_400/B4DZSAzgP7HYAg-/0/1737327772964?e=1749686400&v=beta&t=Hkl3U8Ps0VtvNxX0BNNq24b4dtX5wQaPFp6oiKCIHD8 + - name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg + +tags: ["sso", "unified_file_id", "cost_tracking", "security"] +hide_table_of_contents: false +--- +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Key Highlights + +- **SCIM Integration**: Enables identity providers (Okta, Azure AD, OneLogin, etc.) to automate user and team (group) provisioning, updates, and deprovisioning +- **Team and Tag based usage tracking**: You can now see usage and spend by team and tag at 1M+ spend logs. 
+- **Unified Responses API**: Support for calling Anthropic, Gemini, Groq, etc. via OpenAI's new Responses API. + +Let's dive in. + +## SCIM Integration + + + +This release adds SCIM support to LiteLLM. This allows your SSO provider (Okta, Azure AD, etc) to automatically create/delete users, teams, and memberships on LiteLLM. This means that when you remove a team on your SSO provider, your SSO provider will automatically delete the corresponding team on LiteLLM. + +[Read more](../../docs/tutorials/scim_litellm) +## Team and Tag based usage tracking + + + + +This release improves team and tag based usage tracking at 1m+ spend logs, making it easy to monitor your LLM API Spend in production. This covers: + +- View **daily spend** by teams + tags +- View **usage / spend by key**, within teams +- View **spend by multiple tags** +- Allow **internal users** to view spend of teams they're a member of + +[Read more](#management-endpoints--ui) + +## Unified Responses API + +This release allows you to call Azure OpenAI, Anthropic, AWS Bedrock, and Google Vertex AI models via the POST /v1/responses endpoint on LiteLLM. This means you can now use popular tools like [OpenAI Codex](https://docs.litellm.ai/docs/tutorials/openai_codex) with your own models. + + + + +[Read more](https://docs.litellm.ai/docs/response_api) + + +## New Models / Updated Models + +- **OpenAI** + 1. gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing - [Get Started](../../docs/providers/openai#usage), [PR](https://github.com/BerriAI/litellm/pull/9990) + 2. o4 - correctly map o4 to openai o_series model +- **Azure AI** + 1. Phi-4 output cost per token fix - [PR](https://github.com/BerriAI/litellm/pull/9880) + 2. Responses API support [Get Started](../../docs/providers/azure#azure-responses-api),[PR](https://github.com/BerriAI/litellm/pull/10116) +- **Anthropic** + 1. redacted message thinking support - [Get Started](../../docs/providers/anthropic#usage---thinking--reasoning_content),[PR](https://github.com/BerriAI/litellm/pull/10129) +- **Cohere** + 1. `/v2/chat` Passthrough endpoint support w/ cost tracking - [Get Started](../../docs/pass_through/cohere), [PR](https://github.com/BerriAI/litellm/pull/9997) +- **Azure** + 1. Support azure tenant_id/client_id env vars - [Get Started](../../docs/providers/azure#entra-id---use-tenant_id-client_id-client_secret), [PR](https://github.com/BerriAI/litellm/pull/9993) + 2. Fix response_format check for 2025+ api versions - [PR](https://github.com/BerriAI/litellm/pull/9993) + 3. Add gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, o3, o3-mini, o4-mini pricing +- **VLLM** + 1. Files - Support 'file' message type for VLLM video url's - [Get Started](../../docs/providers/vllm#send-video-url-to-vllm), [PR](https://github.com/BerriAI/litellm/pull/10129) + 2. Passthrough - new `/vllm/` passthrough endpoint support [Get Started](../../docs/pass_through/vllm), [PR](https://github.com/BerriAI/litellm/pull/10002) +- **Mistral** + 1. new `/mistral` passthrough endpoint support [Get Started](../../docs/pass_through/mistral), [PR](https://github.com/BerriAI/litellm/pull/10002) +- **AWS** + 1. New mapped bedrock regions - [PR](https://github.com/BerriAI/litellm/pull/9430) +- **VertexAI / Google AI Studio** + 1. Gemini - Response format - Retain schema field ordering for google gemini and vertex by specifying propertyOrdering - [Get Started](../../docs/providers/vertex#json-schema), [PR](https://github.com/BerriAI/litellm/pull/9828) + 2. 
Gemini-2.5-flash - return reasoning content [Google AI Studio](../../docs/providers/gemini#usage---thinking--reasoning_content), [Vertex AI](../../docs/providers/vertex#thinking--reasoning_content) + 3. Gemini-2.5-flash - pricing + model information [PR](https://github.com/BerriAI/litellm/pull/10125) + 4. Passthrough - new `/vertex_ai/discovery` route - enables calling AgentBuilder API routes [Get Started](../../docs/pass_through/vertex_ai#supported-api-endpoints), [PR](https://github.com/BerriAI/litellm/pull/10084) +- **Fireworks AI** + 1. return tool calling responses in `tool_calls` field (fireworks incorrectly returns this as a json str in content) [PR](https://github.com/BerriAI/litellm/pull/10130) +- **Triton** + 1. Remove fixed remove bad_words / stop words from `/generate` call - [Get Started](../../docs/providers/triton-inference-server#triton-generate---chat-completion), [PR](https://github.com/BerriAI/litellm/pull/10163) +- **Other** + 1. Support for all litellm providers on Responses API (works with Codex) - [Get Started](../../docs/tutorials/openai_codex), [PR](https://github.com/BerriAI/litellm/pull/10132) + 2. Fix combining multiple tool calls in streaming response - [Get Started](../../docs/completion/stream#helper-function), [PR](https://github.com/BerriAI/litellm/pull/10040) + + +## Spend Tracking Improvements + +- **Cost Control** - inject cache control points in prompt for cost reduction [Get Started](../../docs/tutorials/prompt_caching), [PR](https://github.com/BerriAI/litellm/pull/10000) +- **Spend Tags** - spend tags in headers - support x-litellm-tags even if tag based routing not enabled [Get Started](../../docs/proxy/request_headers#litellm-headers), [PR](https://github.com/BerriAI/litellm/pull/10000) +- **Gemini-2.5-flash** - support cost calculation for reasoning tokens [PR](https://github.com/BerriAI/litellm/pull/10141) + +## Management Endpoints / UI +- **Users** + 1. Show created_at and updated_at on users page - [PR](https://github.com/BerriAI/litellm/pull/10033) +- **Virtual Keys** + 1. Filter by key alias - https://github.com/BerriAI/litellm/pull/10085 +- **Usage Tab** + + 1. Team based usage + + - New `LiteLLM_DailyTeamSpend` Table for aggregate team based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10039) + + - New Team based usage dashboard + new `/team/daily/activity` API - [PR](https://github.com/BerriAI/litellm/pull/10081) + - Return team alias on /team/daily/activity API - [PR](https://github.com/BerriAI/litellm/pull/10157) + - allow internal user view spend for teams they belong to - [PR](https://github.com/BerriAI/litellm/pull/10157) + - allow viewing top keys by team - [PR](https://github.com/BerriAI/litellm/pull/10157) + + + + 2. Tag Based Usage + - New `LiteLLM_DailyTagSpend` Table for aggregate tag based usage logging - [PR](https://github.com/BerriAI/litellm/pull/10071) + - Restrict to only Proxy Admins - [PR](https://github.com/BerriAI/litellm/pull/10157) + - allow viewing top keys by tag + - Return tags passed in request (i.e. dynamic tags) on `/tag/list` API - [PR](https://github.com/BerriAI/litellm/pull/10157) + + 3. Track prompt caching metrics in daily user, team, tag tables - [PR](https://github.com/BerriAI/litellm/pull/10029) + 4. Show usage by key (on all up, team, and tag usage dashboards) - [PR](https://github.com/BerriAI/litellm/pull/10157) + 5. swap old usage with new usage tab +- **Models** + 1. Make columns resizable/hideable - [PR](https://github.com/BerriAI/litellm/pull/10119) +- **API Playground** + 1. 
Allow internal user to call api playground - [PR](https://github.com/BerriAI/litellm/pull/10157) +- **SCIM** + 1. Add LiteLLM SCIM Integration for Team and User management - [Get Started](../../docs/tutorials/scim_litellm), [PR](https://github.com/BerriAI/litellm/pull/10072) + + +## Logging / Guardrail Integrations +- **GCS** + 1. Fix gcs pub sub logging with env var GCS_PROJECT_ID - [Get Started](../../docs/observability/gcs_bucket_integration#usage), [PR](https://github.com/BerriAI/litellm/pull/10042) +- **AIM** + 1. Add litellm call id passing to Aim guardrails on pre and post-hooks calls - [Get Started](../../docs/proxy/guardrails/aim_security), [PR](https://github.com/BerriAI/litellm/pull/10021) +- **Azure blob storage** + 1. Ensure logging works in high throughput scenarios - [Get Started](../../docs/proxy/logging#azure-blob-storage), [PR](https://github.com/BerriAI/litellm/pull/9962) + +## General Proxy Improvements + +- **Support setting `litellm.modify_params` via env var** [PR](https://github.com/BerriAI/litellm/pull/9964) +- **Model Discovery** - Check provider’s `/models` endpoints when calling proxy’s `/v1/models` endpoint - [Get Started](../../docs/proxy/model_discovery), [PR](https://github.com/BerriAI/litellm/pull/9958) +- **`/utils/token_counter`** - fix retrieving custom tokenizer for db models - [Get Started](../../docs/proxy/configs#set-custom-tokenizer), [PR](https://github.com/BerriAI/litellm/pull/10047) +- **Prisma migrate** - handle existing columns in db table - [PR](https://github.com/BerriAI/litellm/pull/10138) + diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index e8029560b0..c8096a21bb 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -53,7 +53,7 @@ const sidebars = { { type: "category", label: "Architecture", - items: ["proxy/architecture", "proxy/db_info", "router_architecture", "proxy/user_management_heirarchy", "proxy/jwt_auth_arch"], + items: ["proxy/architecture", "proxy/db_info", "proxy/db_deadlocks", "router_architecture", "proxy/user_management_heirarchy", "proxy/jwt_auth_arch", "proxy/image_handling"], }, { type: "link", @@ -69,6 +69,7 @@ const sidebars = { "proxy/clientside_auth", "proxy/request_headers", "proxy/response_headers", + "proxy/model_discovery", ], }, { @@ -101,6 +102,7 @@ const sidebars = { "proxy/admin_ui_sso", "proxy/self_serve", "proxy/public_teams", + "tutorials/scim_litellm", "proxy/custom_sso", "proxy/ui_credentials", "proxy/ui_logs" @@ -137,15 +139,17 @@ const sidebars = { label: "[Beta] Guardrails", items: [ "proxy/guardrails/quick_start", - "proxy/guardrails/aim_security", - "proxy/guardrails/aporia_api", - "proxy/guardrails/bedrock", - "proxy/guardrails/guardrails_ai", - "proxy/guardrails/lakera_ai", - "proxy/guardrails/pii_masking_v2", - "proxy/guardrails/secret_detection", - "proxy/guardrails/custom_guardrail", - "prompt_injection" + ...[ + "proxy/guardrails/aim_security", + "proxy/guardrails/aporia_api", + "proxy/guardrails/bedrock", + "proxy/guardrails/guardrails_ai", + "proxy/guardrails/lakera_ai", + "proxy/guardrails/pii_masking_v2", + "proxy/guardrails/secret_detection", + "proxy/guardrails/custom_guardrail", + "proxy/guardrails/prompt_injection", + ].sort(), ], }, { @@ -186,7 +190,15 @@ const sidebars = { "providers/azure_ai", "providers/aiml", "providers/vertex", - "providers/gemini", + + { + type: "category", + label: "Google AI Studio", + items: [ + "providers/gemini", + "providers/google_ai_studio/files", + ] + }, "providers/anthropic", "providers/aws_sagemaker", 
"providers/bedrock", @@ -244,7 +256,9 @@ const sidebars = { "exception_mapping", "completion/provider_specific_params", "guides/finetuned_models", + "guides/security_settings", "completion/audio", + "completion/web_search", "completion/document_understanding", "completion/vision", "completion/json_mode", @@ -294,6 +308,7 @@ const sidebars = { "text_completion", "embedding/supported_embedding", "anthropic_unified", + "mcp", { type: "category", label: "/images", @@ -318,6 +333,8 @@ const sidebars = { "pass_through/vertex_ai", "pass_through/google_ai_studio", "pass_through/cohere", + "pass_through/vllm", + "pass_through/mistral", "pass_through/openai_passthrough", "pass_through/anthropic_completion", "pass_through/bedrock", @@ -328,7 +345,15 @@ const sidebars = { }, "rerank", "assistants", - "files_endpoints", + + { + type: "category", + label: "/files", + items: [ + "files_endpoints", + "proxy/litellm_managed_files", + ], + }, "batches", "realtime", "fine_tuning", @@ -366,8 +391,12 @@ const sidebars = { ], }, { - type: "doc", - id: "proxy/prompt_management" + type: "category", + label: "[Beta] Prompt Management", + items: [ + "proxy/prompt_management", + "proxy/custom_prompt_management" + ], }, { type: "category", @@ -383,9 +412,10 @@ const sidebars = { type: "category", label: "Logging & Observability", items: [ + "observability/agentops_integration", + "observability/langfuse_integration", "observability/lunary_integration", "observability/mlflow", - "observability/langfuse_integration", "observability/gcs_bucket_integration", "observability/langsmith_integration", "observability/literalai_integration", @@ -393,6 +423,7 @@ const sidebars = { "observability/logfire_integration", "observability/argilla", "observability/arize_integration", + "observability/phoenix_integration", "debugging/local_debugging", "observability/raw_request_response", "observability/custom_callback", @@ -418,6 +449,10 @@ const sidebars = { label: "Tutorials", items: [ "tutorials/openweb_ui", + "tutorials/openai_codex", + "tutorials/msft_sso", + "tutorials/prompt_caching", + "tutorials/tag_management", 'tutorials/litellm_proxy_aporia', { type: "category", diff --git a/docs/my-website/src/components/TransformRequestPlayground.tsx b/docs/my-website/src/components/TransformRequestPlayground.tsx new file mode 100644 index 0000000000..8f22e5e198 --- /dev/null +++ b/docs/my-website/src/components/TransformRequestPlayground.tsx @@ -0,0 +1,161 @@ +import React, { useState } from 'react'; +import styles from './transform_request.module.css'; + +const DEFAULT_REQUEST = { + "model": "bedrock/gpt-4", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": "user", + "content": "Explain quantum computing in simple terms" + } + ], + "temperature": 0.7, + "max_tokens": 500, + "stream": true +}; + +type ViewMode = 'split' | 'request' | 'transformed'; + +const TransformRequestPlayground: React.FC = () => { + const [request, setRequest] = useState(JSON.stringify(DEFAULT_REQUEST, null, 2)); + const [transformedRequest, setTransformedRequest] = useState(''); + const [viewMode, setViewMode] = useState('split'); + + const handleTransform = async () => { + try { + // Here you would make the actual API call to transform the request + // For now, we'll just set a sample response + const sampleResponse = `curl -X POST \\ + https://api.openai.com/v1/chat/completions \\ + -H 'Authorization: Bearer sk-xxx' \\ + -H 'Content-Type: application/json' \\ + -d '{ + "model": "gpt-4", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + } + ], + "temperature": 0.7 + }'`; + setTransformedRequest(sampleResponse); + } catch (error) { + console.error('Error transforming request:', error); + } + }; + + const handleCopy = () => { + navigator.clipboard.writeText(transformedRequest); + }; + + const renderContent = () => { + switch (viewMode) { + case 'request': + return ( +
+          Original Request
+          The request you would send to LiteLLM /chat/completions endpoint.