diff --git a/.circleci/config.yml b/.circleci/config.yml index 516f2b20d3..4fad4111d3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 4.3.4 jobs: local_testing: docker: - - image: circleci/python:3.9 + - image: cimg/python:3.11 working_directory: ~/project steps: @@ -43,9 +43,13 @@ jobs: pip install "langfuse==2.27.1" pip install "logfire==0.29.0" pip install numpydoc - pip install traceloop-sdk==0.18.2 + pip install traceloop-sdk==0.21.1 + pip install opentelemetry-api==1.25.0 + pip install opentelemetry-sdk==1.25.0 + pip install opentelemetry-exporter-otlp==1.25.0 pip install openai - pip install prisma + pip install prisma + pip install "detect_secrets==1.5.0" pip install "httpx==0.24.1" pip install fastapi pip install "gunicorn==21.2.0" @@ -61,6 +65,8 @@ jobs: pip install prometheus-client==0.20.0 pip install "pydantic==2.7.1" pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" + pip install "jsonschema==4.22.0" - save_cache: paths: - ./venv @@ -96,7 +102,7 @@ jobs: command: | pwd ls - python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv litellm/tests/ -x --junitxml=test-results/junit.xml --durations=5 -k "not test_python_38.py" no_output_timeout: 120m # Store test results @@ -122,6 +128,7 @@ jobs: pip install jinja2 pip install tokenizers pip install openai + pip install jsonschema - run: name: Run tests command: | @@ -176,6 +183,7 @@ jobs: pip install numpydoc pip install prisma pip install fastapi + pip install jsonschema pip install "httpx==0.24.1" pip install "gunicorn==21.2.0" pip install "anyio==3.7.1" @@ -198,11 +206,13 @@ jobs: -e REDIS_PORT=$REDIS_PORT \ -e AZURE_FRANCE_API_KEY=$AZURE_FRANCE_API_KEY \ -e AZURE_EUROPE_API_KEY=$AZURE_EUROPE_API_KEY \ + -e MISTRAL_API_KEY=$MISTRAL_API_KEY \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -e AWS_REGION_NAME=$AWS_REGION_NAME \ -e AUTO_INFER_REGION=True \ -e OPENAI_API_KEY=$OPENAI_API_KEY \ + -e LITELLM_LICENSE=$LITELLM_LICENSE \ -e LANGFUSE_PROJECT1_PUBLIC=$LANGFUSE_PROJECT1_PUBLIC \ -e LANGFUSE_PROJECT2_PUBLIC=$LANGFUSE_PROJECT2_PUBLIC \ -e LANGFUSE_PROJECT1_SECRET=$LANGFUSE_PROJECT1_SECRET \ @@ -233,7 +243,102 @@ jobs: command: | pwd ls - python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests + no_output_timeout: 120m + + # Store test results + - store_test_results: + path: test-results + proxy_log_to_otel_tests: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + - run: + name: Install Docker CLI (In case it's not already installed) + command: | + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io + - run: + name: Install Python 3.9 + command: | + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + conda init bash + source ~/.bashrc + conda create -n myenv python=3.9 -y + conda activate myenv + python --version + - run: + name: Install Dependencies + command: | + pip install "pytest==7.3.1" + pip install "pytest-asyncio==0.21.1" + pip install aiohttp + pip install openai + python -m pip install --upgrade pip + python -m pip install -r .circleci/requirements.txt + pip install "pytest==7.3.1" + pip install 
"pytest-mock==3.12.0" + pip install "pytest-asyncio==0.21.1" + pip install mypy + pip install pyarrow + pip install numpydoc + pip install prisma + pip install fastapi + pip install jsonschema + pip install "httpx==0.24.1" + pip install "anyio==3.7.1" + pip install "asyncio==3.4.3" + pip install "PyGithub==1.59.1" + - run: + name: Build Docker image + command: docker build -t my-app:latest -f Dockerfile.database . + - run: + name: Run Docker container + # intentionally give bad redis credentials here + # the OTEL test - should get this as a trace + command: | + docker run -d \ + -p 4000:4000 \ + -e DATABASE_URL=$PROXY_DATABASE_URL \ + -e REDIS_HOST=$REDIS_HOST \ + -e REDIS_PASSWORD=$REDIS_PASSWORD \ + -e REDIS_PORT=$REDIS_PORT \ + -e LITELLM_MASTER_KEY="sk-1234" \ + -e OPENAI_API_KEY=$OPENAI_API_KEY \ + -e LITELLM_LICENSE=$LITELLM_LICENSE \ + -e OTEL_EXPORTER="in_memory" \ + --name my-app \ + -v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \ + my-app:latest \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug \ + - run: + name: Install curl and dockerize + command: | + sudo apt-get update + sudo apt-get install -y curl + sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz + sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz + sudo rm dockerize-linux-amd64-v0.6.1.tar.gz + - run: + name: Start outputting logs + command: docker logs -f my-app + background: true + - run: + name: Wait for app to be ready + command: dockerize -wait http://localhost:4000 -timeout 5m + - run: + name: Run tests + command: | + pwd + ls + python -m pytest -vv tests/otel_tests/test_otel.py -x --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m # Store test results @@ -327,6 +432,12 @@ workflows: only: - main - /litellm_.*/ + - proxy_log_to_otel_tests: + filters: + branches: + only: + - main + - /litellm_.*/ - installing_litellm_on_python: filters: branches: @@ -337,7 +448,8 @@ workflows: requires: - local_testing - build_and_test + - proxy_log_to_otel_tests filters: branches: only: - - main + - main \ No newline at end of file diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index b505536e27..c4225a9aa2 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -7,6 +7,5 @@ cohere redis anthropic orjson -pydantic==1.10.14 +pydantic==2.7.1 google-cloud-aiplatform==1.43.0 -redisvl==0.0.7 # semantic caching \ No newline at end of file diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml new file mode 100644 index 0000000000..58e7cfe10d --- /dev/null +++ b/.github/dependabot.yaml @@ -0,0 +1,10 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + groups: + github-actions: + patterns: + - "*" diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml index 58cda02c31..51e24f856b 100644 --- a/.github/workflows/ghcr_deploy.yml +++ b/.github/workflows/ghcr_deploy.yml @@ -25,6 +25,11 @@ jobs: if: github.repository == 'BerriAI/litellm' runs-on: ubuntu-latest steps: + - + name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 @@ -41,12 +46,14 @@ jobs: name: Build and push uses: docker/build-push-action@v5 with: + context: . 
push: true tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }} - name: Build and push litellm-database image uses: docker/build-push-action@v5 with: + context: . push: true file: Dockerfile.database tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }} @@ -54,6 +61,7 @@ jobs: name: Build and push litellm-spend-logs image uses: docker/build-push-action@v5 with: + context: . push: true file: ./litellm-js/spend-logs/Dockerfile tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }} @@ -68,6 +76,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. - name: Log in to the Container registry uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 @@ -92,7 +102,7 @@ jobs: - name: Build and push Docker image uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8 with: - context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}} + context: . push: true tags: ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} # if a tag is provided, use that, otherwise use the release tag, and if neither is available, use 'latest' labels: ${{ steps.meta.outputs.labels }} @@ -106,6 +116,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} - name: Log in to the Container registry uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 @@ -128,7 +140,7 @@ jobs: - name: Build and push Database Docker image uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 with: - context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}} + context: . file: Dockerfile.database push: true tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }} @@ -143,6 +155,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} - name: Log in to the Container registry uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 @@ -165,7 +179,7 @@ jobs: - name: Build and push Database Docker image uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 with: - context: https://github.com/BerriAI/litellm.git#${{ github.event.inputs.commit_hash}} + context: . 
file: ./litellm-js/spend-logs/Dockerfile push: true tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }} @@ -176,6 +190,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} - name: Log in to the Container registry uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 @@ -273,7 +289,8 @@ jobs: repo: context.repo.repo, release_id: process.env.RELEASE_ID, }); - return response.data.body; + const formattedBody = JSON.stringify(response.data.body).slice(1, -1); + return formattedBody; } catch (error) { core.setFailed(error.message); } @@ -286,14 +303,15 @@ jobs: RELEASE_NOTES: ${{ steps.release-notes.outputs.result }} run: | curl -H "Content-Type: application/json" -X POST -d '{ - "content": "New LiteLLM release ${{ env.RELEASE_TAG }}", + "content": "New LiteLLM release '"${RELEASE_TAG}"'", "username": "Release Changelog", "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png", "embeds": [ { - "title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}", - "description": "${{ env.RELEASE_NOTES }}", + "title": "Changelog for LiteLLM '"${RELEASE_TAG}"'", + "description": "'"${RELEASE_NOTES}"'", "color": 2105893 } ] }' $WEBHOOK_URL + diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000000..23e4a06da9 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,34 @@ +name: Publish Dev Release to PyPI + +on: + workflow_dispatch: + +jobs: + publish-dev-release: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 # Adjust the Python version as needed + + - name: Install dependencies + run: pip install toml twine + + - name: Read version from pyproject.toml + id: read-version + run: | + version=$(python -c 'import toml; print(toml.load("pyproject.toml")["tool"]["commitizen"]["version"])') + printf "LITELLM_VERSION=%s" "$version" >> $GITHUB_ENV + + - name: Check if version exists on PyPI + id: check-version + run: | + set -e + if twine check --repository-url https://pypi.org/simple/ "litellm==$LITELLM_VERSION" >/dev/null 2>&1; then + echo "Version $LITELLM_VERSION already exists on PyPI. Skipping publish." 
+ diff --git a/.gitignore b/.gitignore index b75a92309a..67aa57bb3e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ .venv .env +.newenv +newenv/* litellm/proxy/myenv/* litellm_uuid.txt __pycache__/ @@ -55,4 +57,10 @@ litellm/proxy/_super_secret_config.yaml litellm/proxy/_super_secret_config.yaml litellm/proxy/myenv/bin/activate litellm/proxy/myenv/bin/Activate.ps1 -myenv/* \ No newline at end of file +myenv/* +litellm/proxy/_experimental/out/404/index.html +litellm/proxy/_experimental/out/model_hub/index.html +litellm/proxy/_experimental/out/onboarding/index.html +litellm/tests/log.txt +litellm/tests/langfuse.log +litellm/tests/langfuse.log diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc41d85f14..a33473b724 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,19 @@ repos: +- repo: local + hooks: + - id: mypy + name: mypy + entry: python3 -m mypy --ignore-missing-imports + language: system + types: [python] + files: ^litellm/ + - id: isort + name: isort + entry: isort + language: system + types: [python] + files: litellm/.*\.py + exclude: ^litellm/__init__.py$ - repo: https://github.com/psf/black rev: 24.2.0 hooks: @@ -7,20 +22,23 @@ repos: rev: 7.0.0 # The version of flake8 to use hooks: - id: flake8 - exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/ + exclude: ^litellm/tests/|^litellm/proxy/tests/ additional_dependencies: [flake8-print] files: litellm/.*\.py +- repo: https://github.com/python-poetry/poetry + rev: 1.8.0 + hooks: + - id: poetry-check - repo: local hooks: - id: check-files-match name: Check if files match entry: python3 ci_cd/check_files_match.py language: system -- repo: local - hooks: - - id: mypy - name: mypy - entry: python3 -m mypy --ignore-missing-imports - language: system - types: [python] - files: ^litellm/ \ No newline at end of file + # - id: check-file-length + # name: Check file length + # entry: python check_file_length.py + # args: ["10000"] # set your desired maximum number of lines + # language: python + # files: litellm/.*\.py + # exclude: ^litellm/tests/ \ No newline at end of file diff --git a/README.md b/README.md index 415ea8480e..92328b4d5c 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,8 @@ Support for more providers. Missing a provider or LLM Platform, raise a [feature # Usage ([**Docs**](https://docs.litellm.ai/docs/)) > [!IMPORTANT] -> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration) +> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration) +> LiteLLM v1.40.14+ now requires `pydantic>=2.0.0`. No changes required. 
Open In Colab @@ -119,6 +120,7 @@ from litellm import completion ## set env variables for logging tools os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" +os.environ["HELICONE_API_KEY"] = "your-helicone-auth-key" os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["ATHINA_API_KEY"] = "your-athina-api-key" @@ -126,7 +128,7 @@ os.environ["ATHINA_API_KEY"] = "your-athina-api-key" os.environ["OPENAI_API_KEY"] # set callbacks -litellm.success_callback = ["lunary", "langfuse", "athina"] # log input/output to lunary, langfuse, supabase, athina etc +litellm.success_callback = ["lunary", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc #openai call response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) @@ -147,6 +149,7 @@ The proxy provides: ## 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/) + ## Quick Start Proxy - CLI ```shell @@ -179,6 +182,31 @@ print(response) ## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys)) +Connect the proxy with a Postgres DB to create proxy keys + +```bash +# Get the code +git clone https://github.com/BerriAI/litellm + +# Go to folder +cd litellm + +# Add the master key - you can change this after setup +echo 'LITELLM_MASTER_KEY="sk-1234"' > .env + +# Add the litellm salt key - you cannot change this after adding a model +# It is used to encrypt / decrypt your LLM API Key credentials +# We recommned - https://1password.com/password-generator/ +# password generator to get a random hash for litellm salt key +echo 'LITELLM_SALT_KEY="sk-1234"' > .env + +source .env + +# Start +docker-compose up +``` + + UI on `/ui` on your proxy server ![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033) @@ -206,37 +234,39 @@ curl 'http://0.0.0.0:4000/key/generate' \ ## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers)) | Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) | -| ----------------------------------------------------------------------------------- | ------------------------------------------------------- | ------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | ----------------------------------------------------------------------- | -| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [google - vertex_ai [Gemini]](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | -| [google - 
palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ | -| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | ✅ | ✅ | ✅ | | -| [mistral ai api](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [cloudflare AI Workers](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | ✅ | -| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ | -| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ | -| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ | -| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ | -| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ | -| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ | -| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ | -| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ | -| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ | -| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ | -| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | ✅ | -| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ | -| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ | -| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ | -| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ | -| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ | -| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ -| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ | -| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ | +|-------------------------------------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------| +| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ | | | +| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | ✅ | ✅ | ✅ | | | +| [mistral ai api](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [cloudflare AI Workers](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | ✅ | | | +| [cohere](https://docs.litellm.ai/docs/providers/cohere) | 
✅ | ✅ | ✅ | ✅ | ✅ | | +| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ | | | +| [empower](https://docs.litellm.ai/docs/providers/empower) | ✅ | ✅ | ✅ | ✅ | +| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ | | | +| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ | | | +| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ | | | +| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ | | | +| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ | | | +| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ | | | +| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ | | | +| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ | | | +| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ | | | +| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ | | | +| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ | | | +| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ | | | +| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ | | | +| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ | | | +| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ | | +| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ | | +| [FriendliAI](https://docs.litellm.ai/docs/providers/friendliai) | ✅ | ✅ | ✅ | ✅ | | | [**Read the Docs**](https://docs.litellm.ai/docs/) diff --git a/check_file_length.py b/check_file_length.py new file mode 100644 index 0000000000..f23b79add2 --- /dev/null +++ b/check_file_length.py @@ -0,0 +1,28 @@ +import sys + + +def check_file_length(max_lines, filenames): + bad_files = [] + for filename in filenames: + with open(filename, "r") as file: + lines = file.readlines() + if len(lines) > max_lines: + bad_files.append((filename, len(lines))) + return bad_files + + +if __name__ == "__main__": + max_lines = int(sys.argv[1]) + filenames = sys.argv[2:] + + bad_files = check_file_length(max_lines, filenames) + if bad_files: + bad_files.sort( + key=lambda x: x[1], reverse=True + ) # Sort files by length in descending order + for filename, length in bad_files: + print(f"{filename}: {length} lines") + + sys.exit(1) + else: + sys.exit(0) diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/grafana_dashboard.json b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/grafana_dashboard.json new file mode 100644 index 0000000000..17fef1ffda --- /dev/null +++ b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/grafana_dashboard.json @@ -0,0 +1,594 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "fiscalYearStartMonth": 0, + 
"graphTooltip": 0, + "id": 2039, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(litellm_self_latency_bucket{self=\"self\"}[1m])) by (le))", + "legendFormat": "Time to first token", + "range": true, + "refId": "A" + } + ], + "title": "Time to first token (latency)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "7e4b0627fd32efdd2313c846325575808aadcf2839f0fde90723aab9ab73c78f" + }, + "properties": [ + { + "id": "displayName", + "value": "Translata" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": "sum(increase(litellm_spend_metric_total[30d])) by (hashed_api_key)", + "legendFormat": "{{team}}", + "range": true, + "refId": "A" + } + ], + "title": "Spend by team", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": "sum by (model) (increase(litellm_requests_metric_total[5m]))", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests by model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 0, + "y": 25 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.17", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": "sum(increase(litellm_llm_api_failed_requests_metric_total[1h]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Faild Requests", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 3, + "y": 25 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": 
"sum(increase(litellm_spend_metric_total[30d])) by (model)", + "legendFormat": "{{model}}", + "range": true, + "refId": "A" + } + ], + "title": "Spend", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 25 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "rMzWaBvIk" + }, + "editorMode": "code", + "expr": "sum(increase(litellm_total_tokens_total[5m])) by (model)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Tokens", + "type": "timeseries" + } + ], + "refresh": "1m", + "revision": 1, + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "LLM Proxy", + "uid": "rgRrHxESz", + "version": 15, + "weekStart": "" + } \ No newline at end of file diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md new file mode 100644 index 0000000000..1f193aba70 --- /dev/null +++ b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md @@ -0,0 +1,6 @@ +## This folder contains the `json` for creating the following Grafana Dashboard + +### Pre-Requisites +- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus + +![1716623265684](https://github.com/BerriAI/litellm/assets/29436595/0e12c57e-4a2d-4850-bd4f-e4294f87a814) diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/readme.md b/cookbook/litellm_proxy_server/grafana_dashboard/readme.md new file mode 100644 index 0000000000..fae1d792d2 --- /dev/null +++ b/cookbook/litellm_proxy_server/grafana_dashboard/readme.md @@ -0,0 +1,6 @@ +## Contains example Grafana Dashboard made for LiteLLM Proxy Server + +This folder contains the `json` for creating Grafana Dashboards + +### Pre-Requisites +- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus \ No newline at end of file diff --git a/cookbook/proxy-server/readme.md b/cookbook/litellm_proxy_server/readme.md similarity index 100% rename from cookbook/proxy-server/readme.md rename to cookbook/litellm_proxy_server/readme.md diff --git a/cookbook/misc/add_new_models.py b/cookbook/misc/add_new_models.py new file mode 100644 index 0000000000..c9b5a91e30 --- /dev/null +++ b/cookbook/misc/add_new_models.py @@ -0,0 +1,72 @@ +import requests +import json + + 
+def get_initial_config(): + proxy_base_url = input("Enter your proxy base URL (e.g., http://localhost:4000): ") + master_key = input("Enter your LITELLM_MASTER_KEY ") + return proxy_base_url, master_key + + +def get_user_input(): + model_name = input( + "Enter model_name (this is the 'model' passed in /chat/completions requests):" + ) + model = input("litellm_params: Enter model eg. 'azure/': ") + tpm = int(input("litellm_params: Enter tpm (tokens per minute): ")) + rpm = int(input("litellm_params: Enter rpm (requests per minute): ")) + api_key = input("litellm_params: Enter api_key: ") + api_base = input("litellm_params: Enter api_base: ") + api_version = input("litellm_params: Enter api_version: ") + timeout = int(input("litellm_params: Enter timeout (0 for default): ")) + stream_timeout = int( + input("litellm_params: Enter stream_timeout (0 for default): ") + ) + max_retries = int(input("litellm_params: Enter max_retries (0 for default): ")) + + return { + "model_name": model_name, + "litellm_params": { + "model": model, + "tpm": tpm, + "rpm": rpm, + "api_key": api_key, + "api_base": api_base, + "api_version": api_version, + "timeout": timeout, + "stream_timeout": stream_timeout, + "max_retries": max_retries, + }, + } + + +def make_request(proxy_base_url, master_key, data): + url = f"{proxy_base_url}/model/new" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {master_key}", + } + + response = requests.post(url, headers=headers, json=data) + + print(f"Status Code: {response.status_code}") + print(f"Response from adding model: {response.text}") + + +def main(): + proxy_base_url, master_key = get_initial_config() + + while True: + print("Adding new Model to your proxy server...") + data = get_user_input() + make_request(proxy_base_url, master_key, data) + + add_another = input("Do you want to add another model? (yes/no): ").lower() + if add_another != "yes": + break + + print("Script finished.") + + +if __name__ == "__main__": + main() diff --git a/deploy/charts/litellm-helm/Chart.yaml b/deploy/charts/litellm-helm/Chart.yaml index 7f68acf885..fcd2e83cc2 100644 --- a/deploy/charts/litellm-helm/Chart.yaml +++ b/deploy/charts/litellm-helm/Chart.yaml @@ -18,13 +18,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.2.0 +version: 0.2.1 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: v1.35.38 +appVersion: v1.41.8 dependencies: - name: "postgresql" diff --git a/docker-compose.yml b/docker-compose.yml index 05439b1dfb..ca98ec784d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,16 +1,35 @@ -version: "3.9" +version: "3.11" services: litellm: build: context: . 
args: target: runtime - image: ghcr.io/berriai/litellm:main-latest + image: ghcr.io/berriai/litellm:main-stable + ######################################### + ## Uncomment these lines to start proxy with a config.yaml file ## + # volumes: + # - ./proxy_server_config.yaml:/app/config.yaml + # command: [ "--config", "./config.yaml", "--port", "4000"] + ############################################### ports: - "4000:4000" # Map the container port to the host, change the host port if necessary - volumes: - - ./litellm-config.yaml:/app/config.yaml # Mount the local configuration file - # You can change the port or number of workers as per your requirements or pass any new supported CLI augument. Make sure the port passed here matches with the container port defined above in `ports` value - command: [ "--config", "/app/config.yaml", "--port", "4000", "--num_workers", "8" ] + environment: + DATABASE_URL: "postgresql://postgres:example@db:5432/postgres" + STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI + env_file: + - .env # Load local .env file + + + db: + image: postgres + restart: always + environment: + POSTGRES_PASSWORD: example + healthcheck: + test: ["CMD-SHELL", "pg_isready"] + interval: 1s + timeout: 5s + retries: 10 # ...rest of your docker-compose config if any \ No newline at end of file diff --git a/docs/my-website/docs/anthropic_completion.md b/docs/my-website/docs/anthropic_completion.md new file mode 100644 index 0000000000..ca65f3f6f5 --- /dev/null +++ b/docs/my-website/docs/anthropic_completion.md @@ -0,0 +1,54 @@ +# [BETA] Anthropic `/v1/messages` + +Call 100+ LLMs in the Anthropic format. + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: my-test-model + litellm_params: + model: gpt-3.5-turbo +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/messages' \ +-H 'x-api-key: sk-1234' \ +-H 'content-type: application/json' \ +-D '{ + "model": "my-test-model", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] +}' +``` + +## Test with Anthropic SDK + +```python +import os +from anthropic import Anthropic + +client = Anthropic(api_key="sk-1234", base_url="http://0.0.0.0:4000") # 👈 CONNECT TO PROXY + +message = client.messages.create( + messages=[ + { + "role": "user", + "content": "Hello, Claude", + } + ], + model="my-test-model", # 👈 set 'model_name' +) +print(message.content) +``` \ No newline at end of file diff --git a/docs/my-website/docs/assistants.md b/docs/my-website/docs/assistants.md new file mode 100644 index 0000000000..fb30a132f7 --- /dev/null +++ b/docs/my-website/docs/assistants.md @@ -0,0 +1,312 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Assistants API + +Covers Threads, Messages, Assistants. + +LiteLLM currently covers: +- Create Assistants +- Get Assistants +- Create Thread +- Get Thread +- Add Messages +- Get Messages +- Run Thread + +## Quick Start + +Call an existing Assistant. + +- Get the Assistant + +- Create a Thread when a user starts a conversation. + +- Add Messages to the Thread as the user asks questions. + +- Run the Assistant on the Thread to generate a response by calling the model and the tools. + +### SDK + PROXY + + + +**Create an Assistant** + + +```python +import litellm +import os + +# setup env +os.environ["OPENAI_API_KEY"] = "sk-.." 
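+# create_assistants() mirrors the OpenAI Assistants API; custom_llm_provider selects the backend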
+ +assistant = litellm.create_assistants( + custom_llm_provider="openai", + model="gpt-4-turbo", + instructions="You are a personal math tutor. When asked a question, write and run Python code to answer the question.", + name="Math Tutor", + tools=[{"type": "code_interpreter"}], +) + +### ASYNC USAGE ### +# assistant = await litellm.acreate_assistants( +# custom_llm_provider="openai", +# model="gpt-4-turbo", +# instructions="You are a personal math tutor. When asked a question, write and run Python code to answer the question.", +# name="Math Tutor", +# tools=[{"type": "code_interpreter"}], +# ) +``` + +**Get the Assistant** + +```python +from litellm import get_assistants, aget_assistants +import os + +# setup env +os.environ["OPENAI_API_KEY"] = "sk-.." + +assistants = get_assistants(custom_llm_provider="openai") + +### ASYNC USAGE ### +# assistants = await aget_assistants(custom_llm_provider="openai") +``` + +**Create a Thread** + +```python +from litellm import create_thread, acreate_thread +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +new_thread = create_thread( + custom_llm_provider="openai", + messages=[{"role": "user", "content": "Hey, how's it going?"}], # type: ignore + ) + +### ASYNC USAGE ### +# new_thread = await acreate_thread(custom_llm_provider="openai",messages=[{"role": "user", "content": "Hey, how's it going?"}]) +``` + +**Add Messages to the Thread** + +```python +from litellm import create_thread, get_thread, aget_thread, add_message, a_add_message +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +## CREATE A THREAD +_new_thread = create_thread( + custom_llm_provider="openai", + messages=[{"role": "user", "content": "Hey, how's it going?"}], # type: ignore + ) + +## OR retrieve existing thread +received_thread = get_thread( + custom_llm_provider="openai", + thread_id=_new_thread.id, + ) + +### ASYNC USAGE ### +# received_thread = await aget_thread(custom_llm_provider="openai", thread_id=_new_thread.id,) + +## ADD MESSAGE TO THREAD +message = {"role": "user", "content": "Hey, how's it going?"} +added_message = add_message( + thread_id=_new_thread.id, custom_llm_provider="openai", **message + ) + +### ASYNC USAGE ### +# added_message = await a_add_message(thread_id=_new_thread.id, custom_llm_provider="openai", **message) +``` + +**Run the Assistant on the Thread** + +```python +from litellm import get_assistants, create_thread, add_message, run_thread, arun_thread +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." 
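+# end-to-end flow: fetch an existing assistant, create a thread, add the user's message, then run the thread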
+assistants = get_assistants(custom_llm_provider="openai") + +## get the first assistant ### +assistant_id = assistants.data[0].id + +## GET A THREAD +_new_thread = create_thread( + custom_llm_provider="openai", + messages=[{"role": "user", "content": "Hey, how's it going?"}], # type: ignore + ) + +## ADD MESSAGE +message = {"role": "user", "content": "Hey, how's it going?"} +added_message = add_message( + thread_id=_new_thread.id, custom_llm_provider="openai", **message + ) + +## 🚨 RUN THREAD +response = run_thread( + custom_llm_provider="openai", thread_id=thread_id, assistant_id=assistant_id + ) + +### ASYNC USAGE ### +# response = await arun_thread(custom_llm_provider="openai", thread_id=thread_id, assistant_id=assistant_id) + +print(f"run_thread: {run_thread}") +``` + + + +```yaml +assistant_settings: + custom_llm_provider: azure + litellm_params: + api_key: os.environ/AZURE_API_KEY + api_base: os.environ/AZURE_API_BASE + api_version: os.environ/AZURE_API_VERSION +``` + +```bash +$ litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + +**Create the Assistant** + +```bash +curl "http://localhost:4000/v1/assistants" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.", + "name": "Math Tutor", + "tools": [{"type": "code_interpreter"}], + "model": "gpt-4-turbo" + }' +``` + + +**Get the Assistant** + +```bash +curl "http://0.0.0.0:4000/v1/assistants?order=desc&limit=20" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" +``` + +**Create a Thread** + +```bash +curl http://0.0.0.0:4000/v1/threads \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '' +``` + +**Get a Thread** + +```bash +curl http://0.0.0.0:4000/v1/threads/{thread_id} \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" +``` + +**Add Messages to the Thread** + +```bash +curl http://0.0.0.0:4000/v1/threads/{thread_id}/messages \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "role": "user", + "content": "How does AI work? Explain it in simple terms." + }' +``` + +**Run the Assistant on the Thread** + +```bash +curl http://0.0.0.0:4000/v1/threads/thread_abc123/runs \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "assistant_id": "asst_abc123" + }' +``` + + + + +## Streaming + + + + +```python +from litellm import run_thread_stream +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +message = {"role": "user", "content": "Hey, how's it going?"} + +data = {"custom_llm_provider": "openai", "thread_id": _new_thread.id, "assistant_id": assistant_id, **message} + +run = run_thread_stream(**data) +with run as run: + assert isinstance(run, AssistantEventHandler) + for chunk in run: + print(f"chunk: {chunk}") + run.until_done() +``` + + + + +```bash +curl -X POST 'http://0.0.0.0:4000/threads/{thread_id}/runs' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-D '{ + "assistant_id": "asst_6xVZQFFy1Kw87NbnYeNebxTf", + "stream": true +}' +``` + + + + +## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/assistants) + +## OpenAI-Compatible APIs + +To call openai-compatible Assistants API's (eg. 
Astra Assistants API), just add `openai/` to the model name: + + +**config** +```yaml +assistant_settings: + custom_llm_provider: openai + litellm_params: + api_key: os.environ/ASTRA_API_KEY + api_base: os.environ/ASTRA_API_BASE +``` + +**curl** + +```bash +curl -X POST "http://localhost:4000/v1/assistants" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.", + "name": "Math Tutor", + "tools": [{"type": "code_interpreter"}], + "model": "openai/" + }' +``` \ No newline at end of file diff --git a/docs/my-website/docs/audio_transcription.md b/docs/my-website/docs/audio_transcription.md index 25eca6caa4..b4a1df01c4 100644 --- a/docs/my-website/docs/audio_transcription.md +++ b/docs/my-website/docs/audio_transcription.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Audio Transcription +# Speech to Text Use this to loadbalance across Azure + OpenAI. diff --git a/docs/my-website/docs/batches.md b/docs/my-website/docs/batches.md new file mode 100644 index 0000000000..51f3bb5cad --- /dev/null +++ b/docs/my-website/docs/batches.md @@ -0,0 +1,124 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Batches API + +Covers Batches, Files + + +## Quick Start + +Call an existing Assistant. + +- Create File for Batch Completion + +- Create Batch Request + +- Retrieve the Specific Batch and File Content + + + + + +**Create File for Batch Completion** + +```python +from litellm +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +file_name = "openai_batch_completions.jsonl" +_current_dir = os.path.dirname(os.path.abspath(__file__)) +file_path = os.path.join(_current_dir, file_name) +file_obj = await litellm.acreate_file( + file=open(file_path, "rb"), + purpose="batch", + custom_llm_provider="openai", +) +print("Response from creating file=", file_obj) +``` + +**Create Batch Request** + +```python +from litellm +import os + +create_batch_response = await litellm.acreate_batch( + completion_window="24h", + endpoint="/v1/chat/completions", + input_file_id=batch_input_file_id, + custom_llm_provider="openai", + metadata={"key1": "value1", "key2": "value2"}, +) + +print("response from litellm.create_batch=", create_batch_response) +``` + +**Retrieve the Specific Batch and File Content** + +```python + +retrieved_batch = await litellm.aretrieve_batch( + batch_id=create_batch_response.id, custom_llm_provider="openai" +) +print("retrieved batch=", retrieved_batch) +# just assert that we retrieved a non None batch + +assert retrieved_batch.id == create_batch_response.id + +# try to get file content for our original file + +file_content = await litellm.afile_content( + file_id=batch_input_file_id, custom_llm_provider="openai" +) + +print("file content = ", file_content) +``` + + + + +```bash +$ export OPENAI_API_KEY="sk-..." 
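+# start the proxy; it serves OpenAI-compatible /v1/files and /v1/batches routes on port 4000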
+ +$ litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +**Create File for Batch Completion** + +```shell +curl https://api.openai.com/v1/files \ + -H "Authorization: Bearer sk-1234" \ + -F purpose="batch" \ + -F file="@mydata.jsonl" +``` + +**Create Batch Request** + +```bash +curl http://localhost:4000/v1/batches \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-abc123", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' +``` + +**Retrieve the Specific Batch** + +```bash +curl http://localhost:4000/v1/batches/batch_abc123 \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ +``` + + + + +## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/batch) diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md index eb309f9b8b..1b8bbd8e09 100644 --- a/docs/my-website/docs/caching/all_caches.md +++ b/docs/my-website/docs/caching/all_caches.md @@ -212,6 +212,94 @@ If you run the code two times, response1 will use the cache from the first run t + + +## Switch Cache On / Off Per LiteLLM Call + +LiteLLM supports 4 cache-controls: + +- `no-cache`: *Optional(bool)* When `True`, Will not return a cached response, but instead call the actual endpoint. +- `no-store`: *Optional(bool)* When `True`, Will not cache the response. +- `ttl`: *Optional(int)* - Will cache the response for the user-defined amount of time (in seconds). +- `s-maxage`: *Optional(int)* Will only accept cached responses that are within user-defined range (in seconds). + +[Let us know if you need more](https://github.com/BerriAI/litellm/issues/1218) + + + +Example usage `no-cache` - When `True`, Will not return a cached response + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"no-cache": True}, + ) +``` + + + + + +Example usage `no-store` - When `True`, Will not cache the response. 
+ +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"no-store": True}, + ) +``` + + + + +Example usage `ttl` - cache the response for 10 seconds + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"ttl": 10}, + ) +``` + + + + +Example usage `s-maxage` - Will only accept cached responses for 60 seconds + +```python +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "user", + "content": "hello who are you" + } + ], + cache={"s-maxage": 60}, + ) +``` + + + + ## Cache Context Manager - Enable, Disable, Update Cache diff --git a/docs/my-website/docs/completion/batching.md b/docs/my-website/docs/completion/batching.md index 09f59f743d..5854f4db80 100644 --- a/docs/my-website/docs/completion/batching.md +++ b/docs/my-website/docs/completion/batching.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Batching Completion() LiteLLM allows you to: * Send many completion calls to 1 model @@ -51,6 +54,9 @@ This makes parallel calls to the specified `models` and returns the first respon Use this to reduce latency + + + ### Example Code ```python import litellm @@ -68,8 +74,93 @@ response = batch_completion_models( print(result) ``` + + + + + +[how to setup proxy config](#example-setup) + +Just pass a comma-separated string of model names and the flag `fastest_response=True`. + + + + +```bash + +curl -X POST 'http://localhost:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-D '{ + "model": "gpt-4o, groq-llama", # 👈 Comma-separated models + "messages": [ + { + "role": "user", + "content": "What's the weather like in Boston today?" + } + ], + "stream": true, + "fastest_response": true # 👈 FLAG +} + +' +``` + + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +# request sent to model set on litellm proxy, `litellm --model` +response = client.chat.completions.create( + model="gpt-4o, groq-llama", # 👈 Comma-separated models + messages = [ + { + "role": "user", + "content": "this is a test request, write a short poem" + } + ], + extra_body={"fastest_response": true} # 👈 FLAG +) + +print(response) +``` + + + + +--- + +### Example Setup: + +```yaml +model_list: +- model_name: groq-llama + litellm_params: + model: groq/llama3-8b-8192 + api_key: os.environ/GROQ_API_KEY +- model_name: gpt-4o + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + + ### Output -Returns the first response +Returns the first response in OpenAI format. Cancels other LLM API calls. ```json { "object": "chat.completion", @@ -95,6 +186,7 @@ Returns the first response } ``` + ## Send 1 completion call to many models: Return All Responses This makes parallel calls to the specified models and returns all responses diff --git a/docs/my-website/docs/completion/drop_params.md b/docs/my-website/docs/completion/drop_params.md new file mode 100644 index 0000000000..e79a88e14b --- /dev/null +++ b/docs/my-website/docs/completion/drop_params.md @@ -0,0 +1,110 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Drop Unsupported Params + +Drop unsupported OpenAI params by your LLM Provider. 
+ +## Quick Start + +```python +import litellm +import os + +# set keys +os.environ["COHERE_API_KEY"] = "co-.." + +litellm.drop_params = True # 👈 KEY CHANGE + +response = litellm.completion( + model="command-r", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + response_format={"key": "value"}, + ) +``` + + +LiteLLM maps all supported openai params by provider + model (e.g. function calling is supported by anthropic on bedrock but not titan). + +See `litellm.get_supported_openai_params("command-r")` [**Code**](https://github.com/BerriAI/litellm/blob/main/litellm/utils.py#L3584) + +If a provider/model doesn't support a particular param, you can drop it. + +## OpenAI Proxy Usage + +```yaml +litellm_settings: + drop_params: true +``` + +## Pass drop_params in `completion(..)` + +Just drop_params when calling specific models + + + + +```python +import litellm +import os + +# set keys +os.environ["COHERE_API_KEY"] = "co-.." + +response = litellm.completion( + model="command-r", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + response_format={"key": "value"}, + drop_params=True + ) +``` + + + +```yaml +- litellm_params: + api_base: my-base + model: openai/my-model + drop_params: true # 👈 KEY CHANGE + model_name: my-model +``` + + + +## Specify params to drop + +To drop specific params when calling a provider (E.g. 'logit_bias' for vllm) + +Use `additional_drop_params` + + + + +```python +import litellm +import os + +# set keys +os.environ["COHERE_API_KEY"] = "co-.." + +response = litellm.completion( + model="command-r", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + response_format={"key": "value"}, + additional_drop_params=["response_format"] + ) +``` + + + +```yaml +- litellm_params: + api_base: my-base + model: openai/my-model + additional_drop_params: ["response_format"] # 👈 KEY CHANGE + model_name: my-model +``` + + + +**additional_drop_params**: List or null - Is a list of openai params you want to drop when making a call to the model. \ No newline at end of file diff --git a/docs/my-website/docs/completion/function_call.md b/docs/my-website/docs/completion/function_call.md index 5daccf7232..514e8cda1a 100644 --- a/docs/my-website/docs/completion/function_call.md +++ b/docs/my-website/docs/completion/function_call.md @@ -502,10 +502,10 @@ response = completion(model="gpt-3.5-turbo-0613", messages=messages, functions=f print(response) ``` -## Function calling for Non-OpenAI LLMs +## Function calling for Models w/out function-calling support ### Adding Function to prompt -For Non OpenAI LLMs LiteLLM allows you to add the function to the prompt set: `litellm.add_function_to_prompt = True` +For Models/providers without function calling support, LiteLLM allows you to add the function to the prompt set: `litellm.add_function_to_prompt = True` #### Usage ```python diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index e844c541c1..c5988940d7 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -39,38 +39,38 @@ This is a list of openai params we translate across providers. 
Use `litellm.get_supported_openai_params()` for an updated list of params for each model + provider -| Provider | temperature | max_tokens | top_p | stream | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--| -|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ -|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ | -|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ | +| Provider | temperature | max_tokens | top_p | stream | stream_options | stop | n | presence_penalty | frequency_penalty | functions | function_call | logit_bias | user | response_format | seed | tools | tool_choice | logprobs | top_logprobs | extra_headers | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +|Anthropic| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | |✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | +|OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ | ✅ | +|Azure OpenAI| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | ✅ | ✅ | ✅ |✅ | ✅ | | | ✅ | |Replicate | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | -|Anyscale | ✅ | ✅ | ✅ | ✅ | +|Anyscale | ✅ | ✅ | ✅ | ✅ | ✅ | |Cohere| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | -|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | +|Huggingface| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |Openrouter| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | ✅ | | | | | |AI21| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | -|VertexAI| ✅ | ✅ | | ✅ | | | | | | | | | | | ✅ | | | +|VertexAI| ✅ | ✅ | | ✅ | ✅ | | | | | | | | | ✅ | ✅ | | | |Bedrock| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | ✅ (for anthropic) | | -|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | +|Sagemaker| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | |TogetherAI| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | -|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | -|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | -|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | -|Petals| ✅ | ✅ | | ✅ | | | | | | | +|AlephAlpha| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | +|Palm| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | +|NLP Cloud| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | +|Petals| ✅ | ✅ | | ✅ | ✅ | | | | | | |Ollama| ✅ | ✅ | ✅ | ✅ | ✅ | | | ✅ | | | | | ✅ | | | |Databricks| ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | -|ClarifAI| ✅ | ✅ | | | | | | | | | | | | | | - +|ClarifAI| ✅ | ✅ | |✅ | ✅ | | | | | | | | | | | :::note By default, LiteLLM raises an exception if the openai param being passed in isn't supported. -To drop the param instead, set `litellm.drop_params = True`. +To drop the param instead, set `litellm.drop_params = True` or `completion(..drop_params=True)`. -**For function calling:** +This **ONLY DROPS UNSUPPORTED OPENAI PARAMS**. + +LiteLLM assumes any non-openai param is provider specific and passes it in as a kwarg in the request body -Add to prompt for non-openai models, set: `litellm.add_function_to_prompt = True`. ::: ## Input Params @@ -97,6 +97,7 @@ def completion( seed: Optional[int] = None, tools: Optional[List] = None, tool_choice: Optional[str] = None, + parallel_tool_calls: Optional[bool] = None, logprobs: Optional[bool] = None, top_logprobs: Optional[int] = None, deployment_id=None, @@ -166,10 +167,12 @@ def completion( - `function`: *object* - Required. -- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. 
none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via {"type: "function", "function": {"name": "my_function"}} forces the model to call that function. +- `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via `{"type: "function", "function": {"name": "my_function"}}` forces the model to call that function. - `none` is the default when no functions are present. `auto` is the default if functions are present. +- `parallel_tool_calls`: *boolean (optional)* - Whether to enable parallel function calling during tool use.. OpenAI default is true. + - `frequency_penalty`: *number or null (optional)* - It is used to penalize new tokens based on their frequency in the text so far. - `logit_bias`: *map (optional)* - Used to modify the probability of specific tokens appearing in the completion. @@ -226,399 +229,3 @@ def completion( - `hf_model_name`: *string (optional)* - [Sagemaker Only] The corresponding huggingface name of the model, used to pull the right chat template for the model. - -## Provider-specific Params -Providers might offer params not supported by OpenAI (e.g. top_k). You can pass those in 2 ways: -- via completion(): We'll pass the non-openai param, straight to the provider as part of the request body. - - e.g. `completion(model="claude-instant-1", top_k=3)` -- via provider-specific config variable (e.g. `litellm.OpenAIConfig()`). - - - - -```python -import litellm, os - -# set env variables -os.environ["OPENAI_API_KEY"] = "your-openai-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="gpt-3.5-turbo", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.OpenAIConfig(max_tokens=10) - -response_2 = litellm.completion( - model="gpt-3.5-turbo", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - -```python -import litellm, os - -# set env variables -os.environ["OPENAI_API_KEY"] = "your-openai-key" - - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="text-davinci-003", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.OpenAITextCompletionConfig(max_tokens=10) -response_2 = litellm.completion( - model="text-davinci-003", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - -```python -import litellm, os - -# set env variables -os.environ["AZURE_API_BASE"] = "your-azure-api-base" -os.environ["AZURE_API_TYPE"] = "azure" # [OPTIONAL] -os.environ["AZURE_API_VERSION"] = "2023-07-01-preview" # [OPTIONAL] - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="azure/chatgpt-v-2", - messages=[{ "content": "Hello, how are you?","role": 
"user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.AzureOpenAIConfig(max_tokens=10) -response_2 = litellm.completion( - model="azure/chatgpt-v-2", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - -```python -import litellm, os - -# set env variables -os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="claude-instant-1", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.AnthropicConfig(max_tokens_to_sample=200) -response_2 = litellm.completion( - model="claude-instant-1", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - -```python -import litellm, os - -# set env variables -os.environ["HUGGINGFACE_API_KEY"] = "your-huggingface-key" #[OPTIONAL] - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="huggingface/mistralai/Mistral-7B-Instruct-v0.1", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://your-huggingface-api-endpoint", - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.HuggingfaceConfig(max_new_tokens=200) -response_2 = litellm.completion( - model="huggingface/mistralai/Mistral-7B-Instruct-v0.1", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://your-huggingface-api-endpoint" - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - - -```python -import litellm, os - -# set env variables -os.environ["TOGETHERAI_API_KEY"] = "your-togetherai-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="together_ai/togethercomputer/llama-2-70b-chat", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.TogetherAIConfig(max_tokens_to_sample=200) -response_2 = litellm.completion( - model="together_ai/togethercomputer/llama-2-70b-chat", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - -```python -import litellm, os - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="ollama/llama2", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.OllamConfig(num_predict=200) -response_2 = litellm.completion( - model="ollama/llama2", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - -```python -import litellm, os - -# set env variables -os.environ["REPLICATE_API_KEY"] = "your-replicate-key" - 
-## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.ReplicateConfig(max_new_tokens=200) -response_2 = litellm.completion( - model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - - -```python -import litellm - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="petals/petals-team/StableBeluga2", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://chat.petals.dev/api/v1/generate", - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.PetalsConfig(max_new_tokens=10) -response_2 = litellm.completion( - model="petals/petals-team/StableBeluga2", - messages=[{ "content": "Hello, how are you?","role": "user"}], - api_base="https://chat.petals.dev/api/v1/generate", - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - -```python -import litellm, os - -# set env variables -os.environ["PALM_API_KEY"] = "your-palm-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="palm/chat-bison", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.PalmConfig(maxOutputTokens=10) -response_2 = litellm.completion( - model="palm/chat-bison", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - -```python -import litellm, os - -# set env variables -os.environ["AI21_API_KEY"] = "your-ai21-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="j2-mid", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.AI21Config(maxOutputTokens=10) -response_2 = litellm.completion( - model="j2-mid", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > len(response_1_text) -``` - - - - - -```python -import litellm, os - -# set env variables -os.environ["COHERE_API_KEY"] = "your-cohere-key" - -## SET MAX TOKENS - via completion() -response_1 = litellm.completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - max_tokens=10 - ) - -response_1_text = response_1.choices[0].message.content - -## SET MAX TOKENS - via config -litellm.CohereConfig(max_tokens=200) -response_2 = litellm.completion( - model="command-nightly", - messages=[{ "content": "Hello, how are you?","role": "user"}], - ) - -response_2_text = response_2.choices[0].message.content - -## TEST OUTPUT -assert len(response_2_text) > 
len(response_1_text) -``` - - - - - - -[**Check out the tutorial!**](../tutorials/provider_specific_params.md) diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md new file mode 100644 index 0000000000..0e7e64a8ec --- /dev/null +++ b/docs/my-website/docs/completion/json_mode.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# JSON Mode + +## Quick Start + + + + +```python +from litellm import completion +import os + +os.environ["OPENAI_API_KEY"] = "" + +response = completion( + model="gpt-4o-mini", + response_format={ "type": "json_object" }, + messages=[ + {"role": "system", "content": "You are a helpful assistant designed to output JSON."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] +) +print(response.choices[0].message.content) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "gpt-4o-mini", + "response_format": { "type": "json_object" }, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant designed to output JSON." + }, + { + "role": "user", + "content": "Who won the world series in 2020?" + } + ] + }' +``` + + + +## Check Model Support + +Call `litellm.get_supported_openai_params` to check if a model/provider supports `response_format`. + +```python +from litellm import get_supported_openai_params + +params = get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock") + +assert "response_format" in params +``` + +## Validate JSON Schema + +For VertexAI models, LiteLLM supports passing the `response_schema` and validating the JSON output. + +This works across Gemini (`vertex_ai_beta/`) + Anthropic (`vertex_ai/`) models. 
+ + + + + +```python +# !gcloud auth application-default login - run this to add vertex credentials to your env + +from litellm import completion + +messages = [{"role": "user", "content": "List 5 cookie recipes"}] + +response_schema = { +    "type": "array", +    "items": { +        "type": "object", +        "properties": { +            "recipe_name": { +                "type": "string", +            }, +        }, +        "required": ["recipe_name"], +    }, +} + +resp = completion( +    model="vertex_ai_beta/gemini-1.5-pro", +    messages=messages, +    response_format={ +        "type": "json_object", +        "response_schema": response_schema, +        "enforce_validation": True, # client-side json schema validation +    }, +    vertex_location="us-east5", +) + +print("Received={}".format(resp)) +``` + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ +  -H "Content-Type: application/json" \ +  -H "Authorization: Bearer $LITELLM_API_KEY" \ +  -d '{ +    "model": "vertex_ai_beta/gemini-1.5-pro", +    "messages": [{"role": "user", "content": "List 5 cookie recipes"}], +    "response_format": { +      "type": "json_object", +      "enforce_validation": true, +      "response_schema": { +        "type": "array", +        "items": { +          "type": "object", +          "properties": { +            "recipe_name": { +              "type": "string" +            } +          }, +          "required": ["recipe_name"] +        } +      } +    } +  }' +``` + + + \ No newline at end of file diff --git a/docs/my-website/docs/completion/provider_specific_params.md b/docs/my-website/docs/completion/provider_specific_params.md new file mode 100644 index 0000000000..a8307fc8a2 --- /dev/null +++ b/docs/my-website/docs/completion/provider_specific_params.md @@ -0,0 +1,436 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Provider-specific Params + +Providers might offer params not supported by OpenAI (e.g. top_k). LiteLLM treats any non-openai param as a provider-specific param and passes it to the provider in the request body as a kwarg. [**See Reserved Params**](https://github.com/BerriAI/litellm/blob/aa2fd29e48245f360e771a8810a69376464b195e/litellm/main.py#L700) + +You can pass those in 2 ways: +- via completion(): We'll pass the non-openai param straight to the provider as part of the request body (see the sketch below). +  - e.g. `completion(model="claude-instant-1", top_k=3)` +- via provider-specific config variable (e.g. `litellm.OpenAIConfig()`).
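
For example, a minimal runnable sketch of the first approach; the key placeholder and model name follow the snippets below, and `top_k` is an Anthropic-specific param that LiteLLM forwards untouched:

```python
import os
from litellm import completion

os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key"

# top_k is not an OpenAI param, so LiteLLM passes it straight through
# to Anthropic in the request body.
response = completion(
    model="claude-instant-1",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
    top_k=3,
)
print(response.choices[0].message.content)
```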
+ +## SDK Usage + + + +```python +import litellm, os + +# set env variables +os.environ["OPENAI_API_KEY"] = "your-openai-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="gpt-3.5-turbo", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.OpenAIConfig(max_tokens=10) + +response_2 = litellm.completion( + model="gpt-3.5-turbo", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + +```python +import litellm, os + +# set env variables +os.environ["OPENAI_API_KEY"] = "your-openai-key" + + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="text-davinci-003", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.OpenAITextCompletionConfig(max_tokens=10) +response_2 = litellm.completion( + model="text-davinci-003", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + +```python +import litellm, os + +# set env variables +os.environ["AZURE_API_BASE"] = "your-azure-api-base" +os.environ["AZURE_API_TYPE"] = "azure" # [OPTIONAL] +os.environ["AZURE_API_VERSION"] = "2023-07-01-preview" # [OPTIONAL] + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="azure/chatgpt-v-2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.AzureOpenAIConfig(max_tokens=10) +response_2 = litellm.completion( + model="azure/chatgpt-v-2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + +```python +import litellm, os + +# set env variables +os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="claude-instant-1", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.AnthropicConfig(max_tokens_to_sample=200) +response_2 = litellm.completion( + model="claude-instant-1", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + +```python +import litellm, os + +# set env variables +os.environ["HUGGINGFACE_API_KEY"] = "your-huggingface-key" #[OPTIONAL] + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="huggingface/mistralai/Mistral-7B-Instruct-v0.1", + messages=[{ "content": "Hello, how are you?","role": "user"}], + api_base="https://your-huggingface-api-endpoint", + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.HuggingfaceConfig(max_new_tokens=200) +response_2 = litellm.completion( + 
model="huggingface/mistralai/Mistral-7B-Instruct-v0.1", + messages=[{ "content": "Hello, how are you?","role": "user"}], + api_base="https://your-huggingface-api-endpoint" + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + + +```python +import litellm, os + +# set env variables +os.environ["TOGETHERAI_API_KEY"] = "your-togetherai-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="together_ai/togethercomputer/llama-2-70b-chat", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.TogetherAIConfig(max_tokens_to_sample=200) +response_2 = litellm.completion( + model="together_ai/togethercomputer/llama-2-70b-chat", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + +```python +import litellm, os + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="ollama/llama2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.OllamConfig(num_predict=200) +response_2 = litellm.completion( + model="ollama/llama2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + +```python +import litellm, os + +# set env variables +os.environ["REPLICATE_API_KEY"] = "your-replicate-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.ReplicateConfig(max_new_tokens=200) +response_2 = litellm.completion( + model="replicate/meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + + +```python +import litellm + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="petals/petals-team/StableBeluga2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + api_base="https://chat.petals.dev/api/v1/generate", + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.PetalsConfig(max_new_tokens=10) +response_2 = litellm.completion( + model="petals/petals-team/StableBeluga2", + messages=[{ "content": "Hello, how are you?","role": "user"}], + api_base="https://chat.petals.dev/api/v1/generate", + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + +```python +import litellm, os + +# set env variables +os.environ["PALM_API_KEY"] = "your-palm-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="palm/chat-bison", + 
messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.PalmConfig(maxOutputTokens=10) +response_2 = litellm.completion( + model="palm/chat-bison", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + +```python +import litellm, os + +# set env variables +os.environ["AI21_API_KEY"] = "your-ai21-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="j2-mid", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.AI21Config(maxOutputTokens=10) +response_2 = litellm.completion( + model="j2-mid", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + +```python +import litellm, os + +# set env variables +os.environ["COHERE_API_KEY"] = "your-cohere-key" + +## SET MAX TOKENS - via completion() +response_1 = litellm.completion( + model="command-nightly", + messages=[{ "content": "Hello, how are you?","role": "user"}], + max_tokens=10 + ) + +response_1_text = response_1.choices[0].message.content + +## SET MAX TOKENS - via config +litellm.CohereConfig(max_tokens=200) +response_2 = litellm.completion( + model="command-nightly", + messages=[{ "content": "Hello, how are you?","role": "user"}], + ) + +response_2_text = response_2.choices[0].message.content + +## TEST OUTPUT +assert len(response_2_text) > len(response_1_text) +``` + + + + + + +[**Check out the tutorial!**](../tutorials/provider_specific_params.md) + + +## Proxy Usage + +**via Config** + +```yaml +model_list: + - model_name: llama-3-8b-instruct + litellm_params: + model: predibase/llama-3-8b-instruct + api_key: os.environ/PREDIBASE_API_KEY + tenant_id: os.environ/PREDIBASE_TENANT_ID + max_tokens: 256 + adapter_base: # 👈 PROVIDER-SPECIFIC PARAM +``` + +**via Request** + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-D '{ + "model": "llama-3-8b-instruct", + "messages": [ + { + "role": "user", + "content": "What'\''s the weather like in Boston today?" + } + ], + "adapater_id": "my-special-adapter-id" # 👈 PROVIDER-SPECIFIC PARAM + }' +``` \ No newline at end of file diff --git a/docs/my-website/docs/completion/reliable_completions.md b/docs/my-website/docs/completion/reliable_completions.md index 2656f9a4fb..94102e1944 100644 --- a/docs/my-website/docs/completion/reliable_completions.md +++ b/docs/my-website/docs/completion/reliable_completions.md @@ -31,9 +31,15 @@ response = completion( ) ``` -## Fallbacks +## Fallbacks (SDK) -### Context Window Fallbacks +:::info + +[See how to do on PROXY](../proxy/reliability.md) + +::: + +### Context Window Fallbacks (SDK) ```python from litellm import completion @@ -43,7 +49,7 @@ messages = [{"content": "how does a court case get to the Supreme Court?" 
* 500, completion(model="gpt-3.5-turbo", messages=messages, context_window_fallback_dict=ctx_window_fallback_dict) ``` -### Fallbacks - Switch Models/API Keys/API Bases +### Fallbacks - Switch Models/API Keys/API Bases (SDK) LLM APIs can be unstable, completion() with fallbacks ensures you'll always get a response from your calls @@ -69,7 +75,7 @@ response = completion(model="azure/gpt-4", messages=messages, api_key=api_key, [Check out this section for implementation details](#fallbacks-1) -## Implementation Details +## Implementation Details (SDK) ### Fallbacks #### Output from calls diff --git a/docs/my-website/docs/completion/token_usage.md b/docs/my-website/docs/completion/token_usage.md index 807ccfd91e..0bec6b3f90 100644 --- a/docs/my-website/docs/completion/token_usage.md +++ b/docs/my-website/docs/completion/token_usage.md @@ -1,7 +1,21 @@ # Completion Token Usage & Cost By default LiteLLM returns token usage in all completion requests ([See here](https://litellm.readthedocs.io/en/latest/output/)) -However, we also expose some helper functions + **[NEW]** an API to calculate token usage across providers: +LiteLLM returns `response_cost` in all calls. + +```python +from litellm import completion + +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hey, how's it going?"}], + mock_response="Hello world", + ) + +print(response._hidden_params["response_cost"]) +``` + +LiteLLM also exposes some helper functions: - `encode`: This encodes the text passed in, using the model-specific tokenizer. [**Jump to code**](#1-encode) @@ -23,7 +37,7 @@ However, we also expose some helper functions + **[NEW]** an API to calculate to - `api.litellm.ai`: Live token + price count across [all supported models](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). [**Jump to code**](#10-apilitellmai) -📣 This is a community maintained list. Contributions are welcome! ❤️ +📣 [This is a community maintained list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json). Contributions are welcome! ❤️ ## Example Usage diff --git a/docs/my-website/docs/completion/vision.md b/docs/my-website/docs/completion/vision.md index ea04b1e1e1..69af03c987 100644 --- a/docs/my-website/docs/completion/vision.md +++ b/docs/my-website/docs/completion/vision.md @@ -39,7 +39,7 @@ Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vis ```python assert litellm.supports_vision(model="gpt-4-vision-preview") == True -assert litellm.supports_vision(model="gemini-1.0-pro-visionn") == True +assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True assert litellm.supports_vision(model="gpt-3.5-turbo") == False ``` diff --git a/docs/my-website/docs/data_security.md b/docs/my-website/docs/data_security.md new file mode 100644 index 0000000000..9572a9597b --- /dev/null +++ b/docs/my-website/docs/data_security.md @@ -0,0 +1,42 @@ +# Data Privacy and Security + +## Security Measures + +### LiteLLM Cloud + +- We encrypt all data stored using your `LITELLM_MASTER_KEY` and in transit using TLS. +- Our database and application run on GCP, AWS infrastructure, partly managed by NeonDB. + - US data region: Northern California (AWS/GCP `us-west-1`) & Virginia (AWS `us-east-1`) + - EU data region Germany/Frankfurt (AWS/GCP `eu-central-1`) +- All users have access to SSO (Single Sign-On) through OAuth 2.0 with Google, Okta, Microsoft, KeyCloak. 
+- Audit Logs with retention policy +- Control Allowed IP Addresses that can access your Cloud LiteLLM Instance + +For security inquiries, please contact us at support@berri.ai + +## Self-hosted Instances LiteLLM + +- ** No data or telemetry is stored on LiteLLM Servers when you self host ** +- For installation and configuration, see: [Self-hosting guided](../docs/proxy/deploy.md) +- **Telemetry** We run no telemetry when you self host LiteLLM + +For security inquiries, please contact us at support@berri.ai + +### Supported data regions for LiteLLM Cloud + +LiteLLM supports the following data regions: + +- US, Northern California (AWS/GCP `us-west-1`) +- Europe, Frankfurt, Germany (AWS/GCP `eu-central-1`) + +All data, user accounts, and infrastructure are completely separated between these two regions + +### Security Vulnerability Reporting Guidelines + +We value the security community's role in protecting our systems and users. To report a security vulnerability: + +- Email support@berri.ai with details +- Include steps to reproduce the issue +- Provide any relevant additional information + +We'll review all reports promptly. Note that we don't currently offer a bug bounty program. diff --git a/docs/my-website/docs/debugging/hosted_debugging.md b/docs/my-website/docs/debugging/hosted_debugging.md index 5c98ac6f56..e69de29bb2 100644 --- a/docs/my-website/docs/debugging/hosted_debugging.md +++ b/docs/my-website/docs/debugging/hosted_debugging.md @@ -1,90 +0,0 @@ -import Image from '@theme/IdealImage'; -import QueryParamReader from '../../src/components/queryParamReader.js' - -# [Beta] Monitor Logs in Production - -:::note - -This is in beta. Expect frequent updates, as we improve based on your feedback. - -::: - -LiteLLM provides an integration to let you monitor logs in production. - -👉 Jump to our sample LiteLLM Dashboard: https://admin.litellm.ai/ - - -Dashboard - -## Debug your first logs - - Open In Colab - - - -### 1. Get your LiteLLM Token - -Go to [admin.litellm.ai](https://admin.litellm.ai/) and copy the code snippet with your unique token - -Usage - -### 2. Set up your environment - -**Add it to your .env** - -```python -import os - -os.env["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token - -``` - -**Turn on LiteLLM Client** -```python -import litellm -litellm.client = True -``` - -### 3. Make a normal `completion()` call -```python -import litellm -from litellm import completion -import os - -# set env variables -os.environ["LITELLM_TOKEN"] = "e24c4c06-d027-4c30-9e78-18bc3a50aebb" # replace with your unique token -os.environ["OPENAI_API_KEY"] = "openai key" - -litellm.use_client = True # enable logging dashboard -messages = [{ "content": "Hello, how are you?","role": "user"}] - -# openai call -response = completion(model="gpt-3.5-turbo", messages=messages) -``` - -Your `completion()` call print with a link to your session dashboard (https://admin.litellm.ai/) - -In the above case it would be: [`admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb`](https://admin.litellm.ai/e24c4c06-d027-4c30-9e78-18bc3a50aebb) - -Click on your personal dashboard link. Here's how you can find it 👇 - -Dashboard - -[👋 Tell us if you need better privacy controls](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version?month=2023-08) - -### 3. Review request log - -Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider. - - - - -Ah! 
So we can see that this request was made to a **Baseten** (see litellm_params > custom_llm_provider) for a model with ID - **7qQNLDB** (see model). The message sent was - `"Hey, how's it going?"` and the response received was - `"As an AI language model, I don't have feelings or emotions, but I can assist you with your queries. How can I assist you today?"` - -Dashboard Log Row - -:::info - -🎉 Congratulations! You've successfully debugger your first log! - -::: \ No newline at end of file diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index ebf7a29ebc..73ac477554 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -85,6 +85,17 @@ print(query_result[:5]) ## Input Params for `litellm.embedding()` + + +:::info + +Any non-openai params, will be treated as provider-specific params, and sent in the request body as kwargs to the provider. + +[**See Reserved Params**](https://github.com/BerriAI/litellm/blob/2f5f85cb52f36448d1f8bbfbd3b8af8167d0c4c8/litellm/main.py#L3130) + +[**See Example**](#example) +::: + ### Required Fields - `model`: *string* - ID of the model to use. `model='text-embedding-ada-002'` @@ -363,3 +374,66 @@ All models listed here https://docs.voyageai.com/embeddings/#models-and-specific | voyage-01 | `embedding(model="voyage/voyage-01", input)` | | voyage-lite-01 | `embedding(model="voyage/voyage-lite-01", input)` | | voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` | + +## Provider-specific Params + + +:::info + +Any non-openai params, will be treated as provider-specific params, and sent in the request body as kwargs to the provider. + +[**See Reserved Params**](https://github.com/BerriAI/litellm/blob/2f5f85cb52f36448d1f8bbfbd3b8af8167d0c4c8/litellm/main.py#L3130) +::: + +### **Example** + +Cohere v3 Models have a required parameter: `input_type`, it can be one of the following four values: + +- `input_type="search_document"`: (default) Use this for texts (documents) you want to store in your vector database +- `input_type="search_query"`: Use this for search queries to find the most relevant documents in your vector database +- `input_type="classification"`: Use this if you use the embeddings as an input for a classification system +- `input_type="clustering"`: Use this if you use the embeddings for text clustering + +https://txt.cohere.com/introducing-embed-v3/ + + + + +```python +from litellm import embedding +os.environ["COHERE_API_KEY"] = "cohere key" + +# cohere call +response = embedding( + model="embed-english-v3.0", + input=["good morning from litellm", "this is another item"], + input_type="search_document" # 👈 PROVIDER-SPECIFIC PARAM +) +``` + + + +**via config** + +```yaml +model_list: + - model_name: "cohere-embed" + litellm_params: + model: embed-english-v3.0 + input_type: search_document # 👈 PROVIDER-SPECIFIC PARAM +``` + +**via request** + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/embeddings' \ +-H 'Authorization: Bearer sk-54d77cd67b9febbb' \ +-H 'Content-Type: application/json' \ +-d '{ + "model": "cohere-embed", + "input": ["Are you authorized to work in United States of America?"], + "input_type": "search_document" # 👈 PROVIDER-SPECIFIC PARAM +}' +``` + + \ No newline at end of file diff --git a/docs/my-website/docs/enterprise.md b/docs/my-website/docs/enterprise.md index 3dc4cb0e2d..9f075ef35a 100644 --- a/docs/my-website/docs/enterprise.md +++ 
b/docs/my-website/docs/enterprise.md @@ -2,38 +2,64 @@ For companies that need SSO, user management and professional support for LiteLLM Proxy :::info - +Interested in Enterprise? Schedule a meeting with us here 👉 [Talk to founders](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) ::: -This covers: -- ✅ **Features under the [LiteLLM Commercial License (Content Mod, Custom Tags, etc.)](https://docs.litellm.ai/docs/proxy/enterprise)** -- ✅ [**Secure UI access with Single Sign-On**](../docs/proxy/ui.md#setup-ssoauth-for-ui) -- ✅ [**JWT-Auth**](../docs/proxy/token_auth.md) -- ✅ [**Prompt Injection Detection**](#prompt-injection-detection-lakeraai) -- ✅ [**Invite Team Members to access `/spend` Routes**](../docs/proxy/cost_tracking#allowing-non-proxy-admins-to-access-spend-endpoints) -- ✅ **Feature Prioritization** -- ✅ **Custom Integrations** -- ✅ **Professional Support - Dedicated discord + slack** -- ✅ **Custom SLAs** - - -## [COMING SOON] AWS Marketplace Support - Deploy managed LiteLLM Proxy within your VPC. Includes all enterprise features. +[**View AWS Marketplace Listing**](https://aws.amazon.com/marketplace/pp/prodview-gdm3gswgjhgjo?sr=0-1&ref_=beagle&applicationId=AWSMPContessa) + [**Get early access**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) + +This covers: +- **Enterprise Features** + - **Security** + - ✅ [SSO for Admin UI](./proxy/ui#✨-enterprise-features) + - ✅ [Audit Logs with retention policy](./proxy/enterprise#audit-logs) + - ✅ [JWT-Auth](../docs/proxy/token_auth.md) + - ✅ [Control available public, private routes](./proxy/enterprise#control-available-public-private-routes) + - ✅ [[BETA] AWS Key Manager v2 - Key Decryption](./proxy/enterprise#beta-aws-key-manager---key-decryption) + - ✅ IP address‑based access control lists + - ✅ Track Request IP Address + - ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints) + - ✅ Set Max Request / File Size on Requests + - ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests) + - **Spend Tracking** + - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags) + - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) + - **Advanced Metrics** + - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) + - **Guardrails, PII Masking, Content Moderation** + - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation) + - ✅ [Prompt Injection Detection (with LakeraAI API)](./proxy/enterprise#prompt-injection-detection---lakeraai) + - ✅ Reject calls from Blocked User list + - ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. 
competitors) + - **Custom Branding** + - ✅ [Custom Branding + Routes on Swagger Docs](./proxy/enterprise#swagger-docs---custom-routes--branding) + - ✅ [Public Model Hub](../docs/proxy/enterprise.md#public-model-hub) + - ✅ [Custom Email Branding](../docs/proxy/email.md#customizing-email-branding) +- ✅ **Feature Prioritization** +- ✅ **Custom Integrations** +- ✅ **Professional Support - Dedicated discord + slack** + + + ## Frequently Asked Questions ### What topics does Professional support cover and what SLAs do you offer? Professional Support can assist with LLM/Provider integrations, deployment, upgrade management, and LLM Provider troubleshooting. We can’t solve your own infrastructure-related issues but we will guide you to fix them. -We offer custom SLAs based on your needs and the severity of the issue. The standard SLA is 6 hours for Sev0-Sev1 severity and 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday). +- 1 hour for Sev0 issues +- 6 hours for Sev1 +- 24h for Sev2-Sev3 between 7am – 7pm PT (Monday through Saturday) + +**We can offer custom SLAs** based on your needs and the severity of the issue ### What’s the cost of the Self-Managed Enterprise edition? diff --git a/docs/my-website/docs/getting_started.md b/docs/my-website/docs/getting_started.md index edbdf3c00f..e9b2a0db61 100644 --- a/docs/my-website/docs/getting_started.md +++ b/docs/my-website/docs/getting_started.md @@ -87,13 +87,14 @@ from litellm import completion ## set env variables for logging tools os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" +os.environ["HELICONE_API_KEY"] = "your-helicone-key" os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["OPENAI_API_KEY"] # set callbacks -litellm.success_callback = ["lunary", "langfuse"] # log input/output to langfuse, lunary, supabase +litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to langfuse, lunary, supabase, helicone #openai call response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) diff --git a/docs/my-website/docs/hosted.md b/docs/my-website/docs/hosted.md index 92940e8585..99bfe99031 100644 --- a/docs/my-website/docs/hosted.md +++ b/docs/my-website/docs/hosted.md @@ -21,6 +21,14 @@ See our status page for [**live reliability**](https://status.litellm.ai/) - **Reliable**: Our hosted proxy is tested on 1k requests per second, making it reliable for high load. - **Secure**: LiteLLM is currently undergoing SOC-2 compliance, to make sure your data is as secure as possible. +## Data Privacy & Security + +You can find our [data privacy & security policy for cloud litellm here](../docs/data_security#litellm-cloud) + +## Supported data regions for LiteLLM Cloud + +You can find [supported data regions litellm here](../docs/data_security#supported-data-regions-for-litellm-cloud) + ### Pricing Pricing is based on usage. We can figure out a price that works for your team, on the call. 
diff --git a/docs/my-website/docs/image_generation.md b/docs/my-website/docs/image_generation.md index 7bb4d2c991..5a7ef6f4f7 100644 --- a/docs/my-website/docs/image_generation.md +++ b/docs/my-website/docs/image_generation.md @@ -14,7 +14,76 @@ response = image_generation(prompt="A cute baby sea otter", model="dall-e-3") print(f"response: {response}") ``` -### Input Params for `litellm.image_generation()` +## Proxy Usage + +### Setup config.yaml + +```yaml +model_list: +  - model_name: dall-e-2 ### RECEIVED MODEL NAME ### +    litellm_params: # all params accepted by litellm.image_generation() +      model: azure/dall-e-2 ### MODEL NAME sent to `litellm.image_generation()` ### +      api_base: https://my-endpoint-europe-berri-992.openai.azure.com/ +      api_key: "os.environ/AZURE_API_KEY_EU" # does os.getenv("AZURE_API_KEY_EU") +      rpm: 6 # [OPTIONAL] Rate limit for this deployment: in requests per minute (rpm) + +``` + +### Start proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +### Test + + + + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/images/generations' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ +    "model": "dall-e-2", +    "prompt": "A cute baby sea otter", +    "n": 1, +    "size": "1024x1024" +}' +``` + + + + +```python +import openai +client = openai.OpenAI( +    api_key="sk-1234", +    base_url="http://0.0.0.0:4000" +) + + +image = client.images.generate( +    prompt="A cute baby sea otter", +    model="dall-e-3", +) + +print(image) +``` + + + +## Input Params for `litellm.image_generation()` + +:::info + +Any non-openai params, will be treated as provider-specific params, and sent in the request body as kwargs to the provider. + +[**See Reserved Params**](https://github.com/BerriAI/litellm/blob/2f5f85cb52f36448d1f8bbfbd3b8af8167d0c4c8/litellm/main.py#L4082) +::: + ### Required Fields  - `prompt`: *string* - A text description of the desired image(s). @@ -51,7 +120,7 @@ print(f"response: {response}") - `api_base`: *string (optional)* - The api endpoint you want to call the model with -- `api_version`: *string (optional)* - (Azure-specific) the api version for the call +- `api_version`: *string (optional)* - (Azure-specific) the api version for the call; required for dall-e-3 on Azure - `api_key`: *string (optional)* - The API key to authenticate and authorize requests. If not provided, the default API key is used.
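
As a worked example of the optional params above, here is a minimal SDK sketch against an Azure DALL·E 3 deployment. The deployment name, endpoint, and `api_version` value are assumed placeholders, not fixed values - substitute whatever your Azure resource uses.

```python
import os
from litellm import image_generation

# Placeholders: set AZURE_API_BASE / AZURE_API_KEY to your own resource values.
response = image_generation(
    prompt="A cute baby sea otter",
    model="azure/dall-e-3",                 # "azure/<your-deployment-name>"
    api_base=os.environ["AZURE_API_BASE"],  # e.g. https://my-endpoint.openai.azure.com/
    api_key=os.environ["AZURE_API_KEY"],
    api_version="2024-02-01",               # an api version that supports dall-e-3 on Azure
    n=1,
    size="1024x1024",
)
print(response)
```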
@@ -166,4 +235,4 @@ response = litellm.image_generation( vertex_ai_location="us-central1", ) print(f"response: {response}") -``` \ No newline at end of file +``` diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index 762156f466..6b472ee6c6 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -310,6 +310,7 @@ LiteLLM exposes pre defined callbacks to send data to Lunary, Langfuse, Helicone from litellm import completion ## set env variables for logging tools +os.environ["HELICONE_API_KEY"] = "your-helicone-key" os.environ["LANGFUSE_PUBLIC_KEY"] = "" os.environ["LANGFUSE_SECRET_KEY"] = "" os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" @@ -317,7 +318,7 @@ os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" os.environ["OPENAI_API_KEY"] # set callbacks -litellm.success_callback = ["lunary", "langfuse"] # log input/output to lunary, langfuse, supabase +litellm.success_callback = ["lunary", "langfuse", "helicone"] # log input/output to lunary, langfuse, supabase, helicone #openai call response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) diff --git a/docs/my-website/docs/observability/athina_integration.md b/docs/my-website/docs/observability/athina_integration.md index 62c8897518..cd1442f35a 100644 --- a/docs/my-website/docs/observability/athina_integration.md +++ b/docs/my-website/docs/observability/athina_integration.md @@ -2,6 +2,15 @@ import Image from '@theme/IdealImage'; # Athina + +:::tip + +This is community maintained, Please make an issue if you run into a bug +https://github.com/BerriAI/litellm + +::: + + [Athina](https://athina.ai/) is an evaluation framework and production monitoring platform for your LLM-powered app. Athina is designed to enhance the performance and reliability of AI applications through real-time monitoring, granular analytics, and plug-and-play evaluations. diff --git a/docs/my-website/docs/observability/callbacks.md b/docs/my-website/docs/observability/callbacks.md index af745e8455..0d54a89176 100644 --- a/docs/my-website/docs/observability/callbacks.md +++ b/docs/my-website/docs/observability/callbacks.md @@ -7,15 +7,17 @@ liteLLM provides `input_callbacks`, `success_callbacks` and `failure_callbacks`, liteLLM supports: - [Custom Callback Functions](https://docs.litellm.ai/docs/observability/custom_callback) -- [Lunary](https://lunary.ai/docs) - [Langfuse](https://langfuse.com/docs) - [Helicone](https://docs.helicone.ai/introduction) - [Traceloop](https://traceloop.com/docs) +- [Lunary](https://lunary.ai/docs) - [Athina](https://docs.athina.ai/) - [Sentry](https://docs.sentry.io/platforms/python/) - [PostHog](https://posthog.com/docs/libraries/python) - [Slack](https://slack.dev/bolt-python/concepts) +This is **not** an extensive list. Please check the dropdown for all logging integrations. 
+ ### Quick Start ```python diff --git a/docs/my-website/docs/observability/custom_callback.md b/docs/my-website/docs/observability/custom_callback.md index 3168222273..373b4a96c0 100644 --- a/docs/my-website/docs/observability/custom_callback.md +++ b/docs/my-website/docs/observability/custom_callback.md @@ -38,7 +38,7 @@ class MyCustomHandler(CustomLogger): print(f"On Async Success") async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - print(f"On Async Success") + print(f"On Async Failure") customHandler = MyCustomHandler() diff --git a/docs/my-website/docs/observability/greenscale_integration.md b/docs/my-website/docs/observability/greenscale_integration.md index 0dd673226e..49eadc6453 100644 --- a/docs/my-website/docs/observability/greenscale_integration.md +++ b/docs/my-website/docs/observability/greenscale_integration.md @@ -1,5 +1,14 @@ # Greenscale - Track LLM Spend and Responsible Usage + +:::tip + +This is community maintained, Please make an issue if you run into a bug +https://github.com/BerriAI/litellm + +::: + + [Greenscale](https://greenscale.ai/) is a production monitoring platform for your LLM-powered app that provides you granular key insights into your GenAI spending and responsible usage. Greenscale only captures metadata to minimize the exposure risk of personally identifiable information (PII). ## Getting Started diff --git a/docs/my-website/docs/observability/helicone_integration.md b/docs/my-website/docs/observability/helicone_integration.md index de89ba8da8..7e7f9fcb6f 100644 --- a/docs/my-website/docs/observability/helicone_integration.md +++ b/docs/my-website/docs/observability/helicone_integration.md @@ -1,55 +1,170 @@ -# Helicone Tutorial -[Helicone](https://helicone.ai/) is an open source observability platform that proxies your OpenAI traffic and provides you key insights into your spend, latency and usage. +# 🧠 Helicone - OSS LLM Observability Platform -## Use Helicone to log requests across all LLM Providers (OpenAI, Azure, Anthropic, Cohere, Replicate, PaLM) -liteLLM provides `success_callbacks` and `failure_callbacks`, making it easy for you to send data to a particular provider depending on the status of your responses. +:::tip -In this case, we want to log requests to Helicone when a request succeeds. +This is community maintained. Please make an issue if you run into a bug: +https://github.com/BerriAI/litellm + +::: + +[Helicone](https://helicone.ai/) is an open source observability platform that proxies your LLM requests and provides key insights into your usage, spend, latency and more. + +## Using Helicone with LiteLLM + +LiteLLM provides `success_callbacks` and `failure_callbacks`, allowing you to easily log data to Helicone based on the status of your responses. + +### Supported LLM Providers + +Helicone can log requests across [various LLM providers](https://docs.helicone.ai/getting-started/quick-start), including: + +- OpenAI +- Azure +- Anthropic +- Gemini +- Groq +- Cohere +- Replicate +- And more + +### Integration Methods + +There are two main approaches to integrate Helicone with LiteLLM: + +1. Using callbacks +2. Using Helicone as a proxy + +Let's explore each method in detail. 
+
+### Approach 1: Use Callbacks
+
+Use just 1 line of code to instantly log your responses **across all providers** with Helicone:
 
-### Approach 1: Use Callbacks
-Use just 1 line of code, to instantly log your responses **across all providers** with helicone:
 ```python
-litellm.success_callback=["helicone"]
+litellm.success_callback = ["helicone"]
 ```
 
-Complete code
-```python
-from litellm import completion
-
-## set env variables
-os.environ["HELICONE_API_KEY"] = "your-helicone-key"
-os.environ["OPENAI_API_KEY"], os.environ["COHERE_API_KEY"] = "", ""
-
-# set callbacks
-litellm.success_callback=["helicone"]
-
-#openai call
-response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
-
-#cohere call
-response = completion(model="command-nightly", messages=[{"role": "user", "content": "Hi 👋 - i'm cohere"}])
-```
-
-### Approach 2: [OpenAI + Azure only] Use Helicone as a proxy
-Helicone provides advanced functionality like caching, etc. Helicone currently supports this for Azure and OpenAI.
-
-If you want to use Helicone to proxy your OpenAI/Azure requests, then you can -
-
-- Set helicone as your base url via: `litellm.api_url`
-- Pass in helicone request headers via: `litellm.headers`
-
 Complete Code
+
 ```python
-import litellm
+import os
+import litellm
 from litellm import completion
 
-litellm.api_base = "https://oai.hconeai.com/v1"
-litellm.headers = {"Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}"}
+## Set env variables
+os.environ["HELICONE_API_KEY"] = "your-helicone-key"
+os.environ["OPENAI_API_KEY"] = "your-openai-key"
 
-response = litellm.completion(
-  model="gpt-3.5-turbo",
-  messages=[{"role": "user", "content": "how does a court case get to the Supreme Court?"}]
+# Set callbacks
+litellm.success_callback = ["helicone"]
+
+# OpenAI call
+response = completion(
+    model="gpt-4o",
+    messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}],
 )
 
 print(response)
 ```
+
+### Approach 2: Use Helicone as a proxy
+
+Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/getting-started/proxy-vs-async) like caching, rate limiting, LLM security through [PromptArmor](https://promptarmor.com/) and more.
+
+To use Helicone as a proxy for your LLM requests:
+
+1. Set Helicone as your base URL via: `litellm.api_base`
+2. Pass in Helicone request headers via: `litellm.headers`
+
+Complete Code:
+
+```python
+import os
+import litellm
+from litellm import completion
+
+litellm.api_base = "https://oai.hconeai.com/v1"
+litellm.headers = {
+    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
+}
+
+response = litellm.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "How does a court case get to the Supreme Court?"}]
+)
+
+print(response)
+```
+
+### Advanced Usage
+
+You can add custom metadata and properties to your requests using Helicone headers.
Here are some examples: + +```python +litellm.metadata = { + "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API + "Helicone-User-Id": "user-abc", # Specify the user making the request + "Helicone-Property-App": "web", # Custom property to add additional information + "Helicone-Property-Custom": "any-value", # Add any custom property + "Helicone-Prompt-Id": "prompt-supreme-court", # Assign an ID to associate this prompt with future versions + "Helicone-Cache-Enabled": "true", # Enable caching of responses + "Cache-Control": "max-age=3600", # Set cache limit to 1 hour + "Helicone-RateLimit-Policy": "10;w=60;s=user", # Set rate limit policy + "Helicone-Retry-Enabled": "true", # Enable retry mechanism + "helicone-retry-num": "3", # Set number of retries + "helicone-retry-factor": "2", # Set exponential backoff factor + "Helicone-Model-Override": "gpt-3.5-turbo-0613", # Override the model used for cost calculation + "Helicone-Session-Id": "session-abc-123", # Set session ID for tracking + "Helicone-Session-Path": "parent-trace/child-trace", # Set session path for hierarchical tracking + "Helicone-Omit-Response": "false", # Include response in logging (default behavior) + "Helicone-Omit-Request": "false", # Include request in logging (default behavior) + "Helicone-LLM-Security-Enabled": "true", # Enable LLM security features + "Helicone-Moderations-Enabled": "true", # Enable content moderation + "Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models +} +``` + +### Caching and Rate Limiting + +Enable caching and set up rate limiting policies: + +```python +litellm.metadata = { + "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API + "Helicone-Cache-Enabled": "true", # Enable caching of responses + "Cache-Control": "max-age=3600", # Set cache limit to 1 hour + "Helicone-RateLimit-Policy": "100;w=3600;s=user", # Set rate limit policy +} +``` + +### Session Tracking and Tracing + +Track multi-step and agentic LLM interactions using session IDs and paths: + +```python +litellm.metadata = { + "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API + "Helicone-Session-Id": "session-abc-123", # The session ID you want to track + "Helicone-Session-Path": "parent-trace/child-trace", # The path of the session +} +``` + +- `Helicone-Session-Id`: Use this to specify the unique identifier for the session you want to track. This allows you to group related requests together. +- `Helicone-Session-Path`: This header defines the path of the session, allowing you to represent parent and child traces. For example, "parent/child" represents a child trace of a parent trace. + +By using these two headers, you can effectively group and visualize multi-step LLM interactions, gaining insights into complex AI workflows. 
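+
+Below is a minimal sketch of a parent/child trace, assuming the callback setup from Approach 1 and an `OPENAI_API_KEY` in the environment; the session values, model, and prompts are purely illustrative:
+
+```python
+import os
+import litellm
+from litellm import completion
+
+litellm.success_callback = ["helicone"]  # log both steps to Helicone
+
+session_id = "session-abc-123"  # shared across every step of the workflow
+
+# Parent step
+litellm.metadata = {
+    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
+    "Helicone-Session-Id": session_id,
+    "Helicone-Session-Path": "outline",  # parent trace
+}
+outline = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "Outline a short post on LLM observability"}],
+)
+
+# Child step, grouped under the same session
+litellm.metadata = {
+    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
+    "Helicone-Session-Id": session_id,
+    "Helicone-Session-Path": "outline/intro",  # child trace of "outline"
+}
+intro = completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": f"Write an intro for: {outline.choices[0].message.content}"}],
+)
+```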
+
+### Retry and Fallback Mechanisms
+
+Set up retry mechanisms and fallback options:
+
+```python
+litellm.metadata = {
+    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",  # Authenticate to send requests to Helicone API
+    "Helicone-Retry-Enabled": "true",  # Enable retry mechanism
+    "helicone-retry-num": "3",  # Set number of retries
+    "helicone-retry-factor": "2",  # Set exponential backoff factor
+    "Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]',  # Set fallback models
+}
+```
+
+> **Supported Headers** - For a full list of supported Helicone headers and their descriptions, please refer to the [Helicone documentation](https://docs.helicone.ai/getting-started/quick-start).
+> By utilizing these headers and metadata options, you can gain deeper insights into your LLM usage, optimize performance, and better manage your AI workflows with Helicone and LiteLLM.
diff --git a/docs/my-website/docs/observability/langfuse_integration.md b/docs/my-website/docs/observability/langfuse_integration.md
index 6dd5377ea7..9703d38a03 100644
--- a/docs/my-website/docs/observability/langfuse_integration.md
+++ b/docs/my-website/docs/observability/langfuse_integration.md
@@ -1,6 +1,6 @@
 import Image from '@theme/IdealImage';
 
-# Langfuse - Logging LLM Input/Output
+# 🔥 Langfuse - Logging LLM Input/Output
 
 LangFuse is open Source Observability & Analytics for LLM Apps
 Detailed production traces and a granular view on quality, cost and latency
@@ -122,10 +122,12 @@ response = completion(
   metadata={
       "generation_name": "ishaan-test-generation", # set langfuse Generation Name
       "generation_id": "gen-id22", # set langfuse Generation ID
+      "parent_observation_id": "obs-id9", # set langfuse Parent Observation ID
      "version":  "test-generation-version" # set langfuse Generation Version
      "trace_user_id": "user-id2", # set langfuse Trace User ID
      "session_id": "session-1", # set langfuse Session ID
      "tags": ["tag1", "tag2"], # set langfuse Tags
+      "trace_name": "new-trace-name", # set langfuse Trace Name
      "trace_id": "trace-id22", # set langfuse Trace ID
      "trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
      "trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
@@ -144,6 +146,27 @@ print(response)
 ```
 
+You can also pass `metadata` as part of the request header with a `langfuse_*` prefix:
+
+```shell
+curl --location --request POST 'http://0.0.0.0:4000/chat/completions' \
+    --header 'Content-Type: application/json' \
+    --header 'Authorization: Bearer sk-1234' \
+    --header 'langfuse_trace_id: trace-id2' \
+    --header 'langfuse_trace_user_id: user-id2' \
+    --header 'langfuse_trace_metadata: {"key":"value"}' \
+    --data '{
+    "model": "gpt-3.5-turbo",
+    "messages": [
+        {
+        "role": "user",
+        "content": "what llm are you"
+        }
+    ]
+}'
+```
+
+
 ### Trace & Generation Parameters
 
 #### Trace Specific Parameters
@@ -170,9 +193,10 @@ The following parameters can be updated on a continuation of a trace by passing
 
 #### Generation Specific Parameters
 
-* `generation_id` - Identifier for the generation, auto-generated by default
-* `generation_name` - Identifier for the generation, auto-generated by default
-* `prompt` - Langfuse prompt object used for the generation, defaults to None
+* `generation_id` - Identifier for the generation, auto-generated by default
+* `generation_name` - Identifier for the generation, auto-generated by default
+* `parent_observation_id` - Identifier for the parent observation, defaults to `None`
+* `prompt` - Langfuse prompt object used for the generation, defaults to `None`
 
 Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
diff --git a/docs/my-website/docs/observability/langsmith_integration.md b/docs/my-website/docs/observability/langsmith_integration.md
index b115866d54..79d047e33a 100644
--- a/docs/my-website/docs/observability/langsmith_integration.md
+++ b/docs/my-website/docs/observability/langsmith_integration.md
@@ -1,10 +1,20 @@
 import Image from '@theme/IdealImage';
 
 # Langsmith - Logging LLM Input/Output
+
+
+:::tip
+
+This is community maintained. Please make an issue if you run into a bug:
+https://github.com/BerriAI/litellm
+
+:::
+
+
 An all-in-one developer platform for every step of the application lifecycle
 https://smith.langchain.com/
 
-
+
 :::info
 We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
diff --git a/docs/my-website/docs/observability/logfire_integration.md b/docs/my-website/docs/observability/logfire_integration.md
index c1f425f425..a2d406f9c6 100644
--- a/docs/my-website/docs/observability/logfire_integration.md
+++ b/docs/my-website/docs/observability/logfire_integration.md
@@ -1,6 +1,6 @@
 import Image from '@theme/IdealImage';
 
-# Logfire - Logging LLM Input/Output
+# 🔥 Logfire - Logging LLM Input/Output
 
 Logfire is open Source Observability & Analytics for LLM Apps
 Detailed production traces and a granular view on quality, cost and latency
@@ -14,10 +14,14 @@ join our [discord](https://discord.gg/wuPM9dRgDw)
 
 ## Pre-Requisites
 
-Ensure you have run `pip install logfire` for this integration
+Ensure you have installed the following packages to use this integration
 
 ```shell
-pip install logfire litellm
+pip install litellm
+
+pip install opentelemetry-api==1.25.0
+pip install opentelemetry-sdk==1.25.0
+pip install opentelemetry-exporter-otlp==1.25.0
 ```
 
 ## Quick Start
@@ -25,8 +29,7 @@ pip install logfire litellm
 Get your Logfire token from [Logfire](https://logfire.pydantic.dev/)
 
 ```python
-litellm.success_callback = ["logfire"]
-litellm.failure_callback = ["logfire"] # logs errors to logfire
+litellm.callbacks = ["logfire"]
 ```
 
 ```python
diff --git a/docs/my-website/docs/observability/lunary_integration.md b/docs/my-website/docs/observability/lunary_integration.md
index 9b8e90df7b..56e74132f7 100644
--- a/docs/my-website/docs/observability/lunary_integration.md
+++ b/docs/my-website/docs/observability/lunary_integration.md
@@ -1,5 +1,13 @@
 # Lunary - Logging and tracing LLM input/output
 
+:::tip
+
+This is community maintained. Please make an issue if you run into a bug:
+https://github.com/BerriAI/litellm
+
+:::
+
+
 [Lunary](https://lunary.ai/) is an open-source AI developer platform providing observability, prompt management, and evaluation tools for AI developers.