diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..efff383d4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +/docs +/cookbook +/.circleci +/.github +/tests \ No newline at end of file diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml index d7cf4271c..f6e88bbd8 100644 --- a/.github/workflows/ghcr_deploy.yml +++ b/.github/workflows/ghcr_deploy.yml @@ -10,6 +10,7 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} + CHART_NAME: litellm # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. jobs: @@ -103,6 +104,11 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 - name: Build and push Database Docker image uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 @@ -112,6 +118,60 @@ jobs: push: true tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest labels: ${{ steps.meta-database.outputs.labels }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + build-and-push-helm-chart: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: lowercase github.repository_owner + run: | + echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV} + - name: Get LiteLLM Latest Tag + id: current_app_tag + uses: WyriHaximus/github-action-get-previous-tag@v1.3.0 + + - name: Get last published chart version + id: current_version + shell: bash + run: | + CHART_LIST=$(helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/${{ env.CHART_NAME }} 2>/dev/null || true) + if [ -z "${CHART_LIST}" ]; then + echo "current-version=0.1.0" | tee -a $GITHUB_OUTPUT + else + printf '%s' "${CHART_LIST}" | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT + fi + env: + HELM_EXPERIMENTAL_OCI: '1' + + # Automatically update the helm chart version one "patch" level + - name: Bump release version + id: bump_version + uses: christian-draeger/increment-semantic-version@1.1.0 + with: + current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }} + version-fragment: 'bug' + + - uses: ./.github/actions/helm-oci-chart-releaser + with: + name: ${{ env.CHART_NAME }} + repository: ${{ env.REPO_OWNER }} + tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }} + app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }} + path: deploy/charts/${{ env.CHART_NAME }} + registry: ${{ env.REGISTRY }} + registry_username: ${{ github.actor }} + registry_password: ${{ secrets.GITHUB_TOKEN }} + update_dependencies: true + release: name: "New LiteLLM Release" needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database] @@ -171,13 +231,13 @@ jobs: RELEASE_NOTES: ${{ steps.release-notes.outputs.result }} run: | curl -H 
"Content-Type: application/json" -X POST -d '{ - "content": "||@everyone||", + "content": "New LiteLLM release ${{ env.RELEASE_TAG }}", "username": "Release Changelog", "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png", "embeds": [ { - "title": "Changelog for ${RELEASE_TAG}", - "description": "${RELEASE_NOTES}", + "title": "Changelog for LiteLLM ${{ env.RELEASE_TAG }}", + "description": "${{ env.RELEASE_NOTES }}", "color": 2105893 } ] diff --git a/.github/workflows/interpret_load_test.py b/.github/workflows/interpret_load_test.py new file mode 100644 index 000000000..b52d4d2b3 --- /dev/null +++ b/.github/workflows/interpret_load_test.py @@ -0,0 +1,91 @@ +import csv +import os +from github import Github + + +def interpret_results(csv_file): + with open(csv_file, newline="") as csvfile: + csvreader = csv.DictReader(csvfile) + rows = list(csvreader) + """ + in this csv reader + - Create 1 new column "Status" + - if a row has a median response time < 300 and an average response time < 300, Status = "Passed ✅" + - if a row has a median response time >= 300 or an average response time >= 300, Status = "Failed ❌" + - Order the table in this order Name, Status, Median Response Time, Average Response Time, Requests/s,Failures/s, Min Response Time, Max Response Time, all other columns + """ + + # Add a new column "Status" + for row in rows: + median_response_time = float( + row["Median Response Time"].strip().rstrip("ms") + ) + average_response_time = float( + row["Average Response Time"].strip().rstrip("s") + ) + + request_count = int(row["Request Count"]) + failure_count = int(row["Failure Count"]) + + failure_percent = round((failure_count / request_count) * 100, 2) + + # Determine status based on conditions + if ( + median_response_time < 300 + and average_response_time < 300 + and failure_percent < 5 + ): + row["Status"] = "Passed ✅" + else: + row["Status"] = "Failed ❌" + + # Construct Markdown table header + markdown_table = "| Name | Status | Median Response Time (ms) | Average Response Time (ms) | Requests/s | Failures/s | Request Count | Failure Count | Min Response Time (ms) | Max Response Time (ms) |" + markdown_table += ( + "\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |" + ) + + # Construct Markdown table rows + for row in rows: + markdown_table += f"\n| {row['Name']} | {row['Status']} | {row['Median Response Time']} | {row['Average Response Time']} | {row['Requests/s']} | {row['Failures/s']} | {row['Request Count']} | {row['Failure Count']} | {row['Min Response Time']} | {row['Max Response Time']} |" + print("markdown table: ", markdown_table) + return markdown_table + + +if __name__ == "__main__": + csv_file = "load_test_stats.csv" # Change this to the path of your CSV file + markdown_table = interpret_results(csv_file) + + # Update release body with interpreted results + github_token = os.getenv("GITHUB_TOKEN") + g = Github(github_token) + repo = g.get_repo( + "BerriAI/litellm" + ) # Replace with your repository's username and name + latest_release = repo.get_latest_release() + print("got latest release: ", latest_release) + print("latest release body: ", latest_release.body) + print("markdown table: ", markdown_table) + + # check if "Load Test LiteLLM Proxy Results" exists + existing_release_body = latest_release.body + if "Load Test LiteLLM Proxy Results" in latest_release.body: + # find the "Load Test LiteLLM Proxy Results" section and delete it + start_index = latest_release.body.find("Load Test LiteLLM 
Proxy Results") + existing_release_body = latest_release.body[:start_index] + + new_release_body = ( + existing_release_body + + "\n\n" + + "## Load Test LiteLLM Proxy Results" + + "\n\n" + + markdown_table + ) + print("new release body: ", new_release_body) + try: + latest_release.update_release( + name=latest_release.tag_name, + message=new_release_body, + ) + except Exception as e: + print(e) diff --git a/.github/workflows/load_test.yml b/.github/workflows/load_test.yml index ed0c34fbd..ddf613fa6 100644 --- a/.github/workflows/load_test.yml +++ b/.github/workflows/load_test.yml @@ -1,6 +1,11 @@ name: Test Locust Load Test -on: [push] +on: + workflow_run: + workflows: ["Build, Publish LiteLLM Docker Image. New Release"] + types: + - completed + workflow_dispatch: jobs: build: @@ -8,15 +13,32 @@ jobs: steps: - name: Checkout uses: actions/checkout@v1 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install PyGithub - name: Run Load Test id: locust_run uses: BerriAI/locust-github-action@master with: LOCUSTFILE: ".github/workflows/locustfile.py" - URL: "https://litellm-api.up.railway.app/" + URL: "https://litellm-database-docker-build-production.up.railway.app/" USERS: "100" RATE: "10" - RUNTIME: "60s" + RUNTIME: "300s" + - name: Process Load Test Stats + run: | + echo "Current working directory: $PWD" + ls + python ".github/workflows/interpret_load_test.py" + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + working-directory: ${{ github.workspace }} - name: Upload CSV as Asset to Latest Release uses: xresloader/upload-to-github-release@v1 env: @@ -25,4 +47,4 @@ jobs: file: "load_test_stats.csv;load_test.html" update_latest_release: true tag_name: "load-test" - overwrite: true + overwrite: true \ No newline at end of file diff --git a/.github/workflows/locustfile.py b/.github/workflows/locustfile.py index 5efdca84d..9e5b62ff0 100644 --- a/.github/workflows/locustfile.py +++ b/.github/workflows/locustfile.py @@ -1,4 +1,6 @@ -from locust import HttpUser, task, between +from locust import HttpUser, task, between, events +import json +import time class MyUser(HttpUser): @@ -8,7 +10,7 @@ class MyUser(HttpUser): def chat_completion(self): headers = { "Content-Type": "application/json", - "Authorization": f"Bearer sk-1234", + "Authorization": f"Bearer sk-gUvTeN9g0sgHBMf9HeCaqA", # Include any additional headers you may need for authentication, etc. 
} @@ -26,3 +28,15 @@ class MyUser(HttpUser): response = self.client.post("chat/completions", json=payload, headers=headers) # Print or log the response if needed + + @task(10) + def health_readiness(self): + start_time = time.time() + response = self.client.get("health/readiness") + response_time = time.time() - start_time + + @task(10) + def health_liveliness(self): + start_time = time.time() + response = self.client.get("health/liveliness") + response_time = time.time() - start_time diff --git a/Dockerfile b/Dockerfile index dd3012109..4408afb3d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,4 +66,4 @@ ENTRYPOINT ["litellm"] # Append "--detailed_debug" to the end of CMD to view detailed debug logs # CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"] -CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "1"] \ No newline at end of file +CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"] diff --git a/deploy/charts/litellm/README.md b/deploy/charts/litellm/README.md index 817781ed0..e005280b8 100644 --- a/deploy/charts/litellm/README.md +++ b/deploy/charts/litellm/README.md @@ -2,7 +2,7 @@ ## Prerequisites -- Kubernetes 1.23+ +- Kubernetes 1.21+ - Helm 3.8.0+ If `db.deployStandalone` is used: @@ -33,6 +33,7 @@ If `db.useStackgresOperator` is used (not yet implemented): | `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | N/A | #### Example `environmentSecrets` Secret + ``` apiVersion: v1 kind: Secret diff --git a/deploy/charts/litellm/values.yaml b/deploy/charts/litellm/values.yaml index cc53fc59c..891c44f2a 100644 --- a/deploy/charts/litellm/values.yaml +++ b/deploy/charts/litellm/values.yaml @@ -6,7 +6,6 @@ replicaCount: 1 image: # Use "ghcr.io/berriai/litellm-database" for optimized image with database - # Alternatively, use "ghcr.io/berriai/litellm" for the default image repository: ghcr.io/berriai/litellm-database pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. 
@@ -85,10 +84,13 @@ proxy_config: litellm_params: model: gpt-3.5-turbo api_key: eXaMpLeOnLy + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ general_settings: master_key: os.environ/PROXY_MASTER_KEY -# litellm_settings: -# cache: true resources: {} # We usually recommend not to specify default resources and to leave this as a conscious diff --git a/docs/my-website/docs/audio_transcription.md b/docs/my-website/docs/audio_transcription.md index 09fa1a1b9..25eca6caa 100644 --- a/docs/my-website/docs/audio_transcription.md +++ b/docs/my-website/docs/audio_transcription.md @@ -77,9 +77,32 @@ litellm --config /path/to/config.yaml ### Test + + + ```bash -curl --location 'http://0.0.0.0:4000/v1/audio/transcriptions' \ +curl --location 'http://0.0.0.0:8000/v1/audio/transcriptions' \ --header 'Authorization: Bearer sk-1234' \ --form 'file=@"/Users/krrishdholakia/Downloads/gettysburg.wav"' \ --form 'model="whisper"' ``` + + + + +```python +from openai import OpenAI +client = openai.OpenAI( + api_key="sk-1234", + base_url="http://0.0.0.0:8000" +) + + +audio_file = open("speech.mp3", "rb") +transcript = client.audio.transcriptions.create( + model="whisper", + file=audio_file +) +``` + + \ No newline at end of file diff --git a/docs/my-website/docs/langchain/langchain.md b/docs/my-website/docs/langchain/langchain.md index fa5a0a96b..cc12767b8 100644 --- a/docs/my-website/docs/langchain/langchain.md +++ b/docs/my-website/docs/langchain/langchain.md @@ -133,3 +133,6 @@ chat(messages) ``` + +## Use LangChain ChatLiteLLM + Langfuse +Checkout this section [here](../observability/langfuse_integration#use-langchain-chatlitellm--langfuse) for more details on how to integrate Langfuse with ChatLiteLLM. diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md index f85ff9122..5eb6a0610 100644 --- a/docs/my-website/docs/load_test.md +++ b/docs/my-website/docs/load_test.md @@ -2,6 +2,54 @@ import Image from '@theme/IdealImage'; # 🔥 Load Test LiteLLM +## How to run a locust load test on LiteLLM Proxy + +1. Add `fake-openai-endpoint` to your proxy config.yaml and start your litellm proxy +litellm provides a free hosted `fake-openai-endpoint` you can load test against + +```yaml +model_list: + - model_name: fake-openai-endpoint + litellm_params: + model: openai/fake + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app/ +``` + +2. `pip install locust` + +3. Create a file called `locustfile.py` on your local machine. Copy the contents from the litellm load test located [here](https://github.com/BerriAI/litellm/blob/main/.github/workflows/locustfile.py) + +4. Start locust + Run `locust` in the same directory as your `locustfile.py` from step 2 + + ```shell + locust + ``` + + Output on terminal + ``` + [2024-03-15 07:19:58,893] Starting web interface at http://0.0.0.0:8089 + [2024-03-15 07:19:58,898] Starting Locust 2.24.0 + ``` + +5. Run Load test on locust + + Head to the locust UI on http://0.0.0.0:8089 + + Set Users=100, Ramp Up Users=10, Host=Base URL of your LiteLLM Proxy + + + +6. 
Expected Results + + Expect to see the following response times for `/health/readiness` + Median → /health/readiness is `150ms` + + Avg → /health/readiness is `219ms` + + + ## Load Test LiteLLM Proxy - 1500+ req/s ## 1500+ concurrent requests/s diff --git a/docs/my-website/docs/observability/langfuse_integration.md b/docs/my-website/docs/observability/langfuse_integration.md index 294f3fb38..50b016d09 100644 --- a/docs/my-website/docs/observability/langfuse_integration.md +++ b/docs/my-website/docs/observability/langfuse_integration.md @@ -132,6 +132,41 @@ print(response) ``` +### Use LangChain ChatLiteLLM + Langfuse +Pass `trace_user_id`, `session_id` in model_kwargs +```python +import os +from langchain.chat_models import ChatLiteLLM +from langchain.schema import HumanMessage +import litellm + +# from https://cloud.langfuse.com/ +os.environ["LANGFUSE_PUBLIC_KEY"] = "" +os.environ["LANGFUSE_SECRET_KEY"] = "" + +os.environ['OPENAI_API_KEY']="" + +# set langfuse as a callback, litellm will send the data to langfuse +litellm.success_callback = ["langfuse"] + +chat = ChatLiteLLM( + model="gpt-3.5-turbo" + model_kwargs={ + "metadata": { + "trace_user_id": "user-id2", # set langfuse Trace User ID + "session_id": "session-1" , # set langfuse Session ID + "tags": ["tag1", "tag2"] + } + } + ) +messages = [ + HumanMessage( + content="what model are you" + ) +] +chat(messages) +``` + ## Troubleshooting & Errors ### Data not getting logged to Langfuse ? @@ -142,4 +177,4 @@ print(response) - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) - [Community Discord 💭](https://discord.gg/wuPM9dRgDw) - Our numbers 📞 +1 (770) 8783-106 / ‭+1 (412) 618-6238‬ -- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai \ No newline at end of file +- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai diff --git a/docs/my-website/docs/providers/cohere.md b/docs/my-website/docs/providers/cohere.md index c6efb3b40..71763e30d 100644 --- a/docs/my-website/docs/providers/cohere.md +++ b/docs/my-website/docs/providers/cohere.md @@ -49,7 +49,7 @@ for chunk in response: | command-light | `completion('command-light', messages)` | | command-medium | `completion('command-medium', messages)` | | command-medium-beta | `completion('command-medium-beta', messages)` | -| command-xlarge-beta | `completion('command-xlarge-beta', messages)` | +| command-xlarge-nightly | `completion('command-xlarge-nightly', messages)` | | command-nightly | `completion('command-nightly', messages)` | diff --git a/docs/my-website/docs/providers/fireworks_ai.md b/docs/my-website/docs/providers/fireworks_ai.md new file mode 100644 index 000000000..ba50bd1f2 --- /dev/null +++ b/docs/my-website/docs/providers/fireworks_ai.md @@ -0,0 +1,53 @@ +# Fireworks AI +https://fireworks.ai/ + +**We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests** + +## API Key +```python +# env variable +os.environ['FIREWORKS_AI_API_KEY'] +``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['FIREWORKS_AI_API_KEY'] = "" +response = completion( + model="fireworks_ai/mixtral-8x7b-instruct", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['FIREWORKS_AI_API_KEY'] = "" +response = completion( + model="fireworks_ai/mixtral-8x7b-instruct", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], + 
stream=True +) + +for chunk in response: + print(chunk) +``` + + +## Supported Models - ALL Fireworks AI Models Supported! +We support ALL Fireworks AI models, just set `fireworks_ai/` as a prefix when sending completion requests + +| Model Name | Function Call | +|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| mixtral-8x7b-instruct | `completion(model="fireworks_ai/mixtral-8x7b-instruct", messages)` | +| firefunction-v1 | `completion(model="fireworks_ai/firefunction-v1", messages)` | +| llama-v2-70b-chat | `completion(model="fireworks_ai/llama-v2-70b-chat", messages)` | \ No newline at end of file diff --git a/docs/my-website/docs/providers/groq.md b/docs/my-website/docs/providers/groq.md index e09cf9f8a..d8a4fded4 100644 --- a/docs/my-website/docs/providers/groq.md +++ b/docs/my-website/docs/providers/groq.md @@ -49,4 +49,5 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion | Model Name | Function Call | |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | -| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` | +| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` | +| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` | \ No newline at end of file diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 2be1d8de1..1521f63b0 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -225,6 +225,32 @@ litellm_settings: supported_call_types: ["acompletion", "completion", "embedding", "aembedding"] # defaults to all litellm call types ``` + +### Turn on `batch_redis_requests` + +**What it does?** +When a request is made: + +- Check if a key starting with `litellm:::` exists in-memory, if no - get the last 100 cached requests for this key and store it + +- New requests are stored with this `litellm:..` as the namespace + +**Why?** +Reduce number of redis GET requests. This improved latency by 46% in prod load tests. + +**Usage** + +```yaml +litellm_settings: + cache: true + cache_params: + type: redis + ... # remaining redis args (host, port, etc.) + callbacks: ["batch_redis_requests"] # 👈 KEY CHANGE! +``` + +[**SEE CODE**](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/batch_redis_get.py) + ### Turn on / off caching per request. 
The proxy support 3 cache-controls: diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 175806d27..3f105e387 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -150,17 +150,20 @@ To avoid issues with predictability, difficulties in rollback, and inconsistent ## Deploy with Database +### Docker, Kubernetes, Helm Chart + + + + + We maintain a [seperate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database - - - -``` +```shell docker pull docker pull ghcr.io/berriai/litellm-database:main-latest ``` -``` +```shell docker run --name litellm-proxy \ -e DATABASE_URL=postgresql://:@:/ \ -p 4000:4000 \ @@ -233,6 +236,8 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`. +Use this to deploy litellm using a helm chart. Link to [the LiteLLM Helm Chart](https://github.com/BerriAI/litellm/tree/main/deploy/charts/litellm) + #### Step 1. Clone the repository ```bash @@ -241,9 +246,11 @@ git clone https://github.com/BerriAI/litellm.git #### Step 2. Deploy with Helm +Run the following command in the root of your `litellm` repo. This will set the litellm proxy master key as `sk-1234` + ```bash helm install \ - --set masterkey=SuPeRsEcReT \ + --set masterkey=sk-1234 \ mydeploy \ deploy/charts/litellm ``` @@ -259,6 +266,9 @@ kubectl \ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`. + +If you need to set your litellm proxy config.yaml, you can find this in [values.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/charts/litellm/values.yaml) + diff --git a/docs/my-website/img/litellm_load_test.png b/docs/my-website/img/litellm_load_test.png new file mode 100644 index 000000000..2dd8299d2 Binary files /dev/null and b/docs/my-website/img/litellm_load_test.png differ diff --git a/docs/my-website/img/locust_load_test.png b/docs/my-website/img/locust_load_test.png new file mode 100644 index 000000000..37de623a1 Binary files /dev/null and b/docs/my-website/img/locust_load_test.png differ diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index ae56f9d7c..21f66a778 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -138,6 +138,7 @@ const sidebars = { "providers/ollama", "providers/perplexity", "providers/groq", + "providers/fireworks_ai", "providers/vllm", "providers/xinference", "providers/cloudflare_workers", diff --git a/enterprise/__init__.py b/enterprise/__init__.py new file mode 100644 index 000000000..b6e690fd5 --- /dev/null +++ b/enterprise/__init__.py @@ -0,0 +1 @@ +from . 
import * diff --git a/litellm/__init__.py b/litellm/__init__.py index a821bde30..7eae39097 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -36,6 +36,7 @@ token: Optional[str] = ( telemetry = True max_tokens = 256 # OpenAI Defaults drop_params = False +modify_params = False retry = True api_key: Optional[str] = None openai_key: Optional[str] = None @@ -327,6 +328,7 @@ openai_compatible_providers: List = [ "perplexity", "xinference", "together_ai", + "fireworks_ai", ] @@ -478,6 +480,7 @@ provider_list: List = [ "voyage", "cloudflare", "xinference", + "fireworks_ai", "custom", # custom apis ] diff --git a/litellm/caching.py b/litellm/caching.py index 9df95f199..ed856f86f 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -129,6 +129,16 @@ class RedisCache(BaseCache): f"LiteLLM Caching: set() - Got exception from REDIS : {str(e)}" ) + async def async_scan_iter(self, pattern: str, count: int = 100) -> list: + keys = [] + _redis_client = self.init_async_client() + async with _redis_client as redis_client: + async for key in redis_client.scan_iter(match=pattern + "*", count=count): + keys.append(key) + if len(keys) >= count: + break + return keys + async def async_set_cache(self, key, value, **kwargs): _redis_client = self.init_async_client() async with _redis_client as redis_client: @@ -140,6 +150,9 @@ class RedisCache(BaseCache): await redis_client.set( name=key, value=json.dumps(value), ex=ttl, get=True ) + print_verbose( + f"Successfully Set ASYNC Redis Cache: key: {key}\nValue {value}\nttl={ttl}" + ) except Exception as e: # NON blocking - notify users Redis is throwing an exception print_verbose( @@ -172,8 +185,6 @@ class RedisCache(BaseCache): return results except Exception as e: print_verbose(f"Error occurred in pipeline write - {str(e)}") - # NON blocking - notify users Redis is throwing an exception - logging.debug("LiteLLM Caching: set() - Got exception from REDIS : ", e) def _get_cache_logic(self, cached_response: Any): """ @@ -208,7 +219,7 @@ class RedisCache(BaseCache): _redis_client = self.init_async_client() async with _redis_client as redis_client: try: - print_verbose(f"Get Redis Cache: key: {key}") + print_verbose(f"Get Async Redis Cache: key: {key}") cached_response = await redis_client.get(key) print_verbose( f"Got Async Redis Cache: key: {key}, cached_response {cached_response}" @@ -217,8 +228,39 @@ class RedisCache(BaseCache): return response except Exception as e: # NON blocking - notify users Redis is throwing an exception - traceback.print_exc() - logging.debug("LiteLLM Caching: get() - Got exception from REDIS: ", e) + print_verbose( + f"LiteLLM Caching: async get() - Got exception from REDIS: {str(e)}" + ) + + async def async_get_cache_pipeline(self, key_list) -> dict: + """ + Use Redis for bulk read operations + """ + _redis_client = await self.init_async_client() + key_value_dict = {} + try: + async with _redis_client as redis_client: + async with redis_client.pipeline(transaction=True) as pipe: + # Queue the get operations in the pipeline for all keys. + for cache_key in key_list: + pipe.get(cache_key) # Queue GET command in pipeline + + # Execute the pipeline and await the results. + results = await pipe.execute() + + # Associate the results back with their keys. + # 'results' is a list of values corresponding to the order of keys in 'key_list'. 
+ key_value_dict = dict(zip(key_list, results)) + + decoded_results = { + k.decode("utf-8"): self._get_cache_logic(v) + for k, v in key_value_dict.items() + } + + return decoded_results + except Exception as e: + print_verbose(f"Error occurred in pipeline read - {str(e)}") + return key_value_dict def flush_cache(self): self.redis_client.flushall() @@ -1001,6 +1043,10 @@ class Cache: if self.namespace is not None: hash_hex = f"{self.namespace}:{hash_hex}" print_verbose(f"Hashed Key with Namespace: {hash_hex}") + elif kwargs.get("metadata", {}).get("redis_namespace", None) is not None: + _namespace = kwargs.get("metadata", {}).get("redis_namespace", None) + hash_hex = f"{_namespace}:{hash_hex}" + print_verbose(f"Hashed Key with Namespace: {hash_hex}") return hash_hex def generate_streaming_content(self, content): diff --git a/litellm/llms/bedrock.py b/litellm/llms/bedrock.py index 4aa27b3c9..31e4905cb 100644 --- a/litellm/llms/bedrock.py +++ b/litellm/llms/bedrock.py @@ -82,12 +82,22 @@ class AmazonAnthropicClaude3Config: Supported Params for the Amazon / Anthropic Claude 3 models: - - `max_tokens` (integer) max tokens, - - `anthropic_version` (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31" + - `max_tokens` Required (integer) max tokens, + - `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31" + - `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py + - `temperature` Optional (float) The amount of randomness injected into the response + - `top_p` Optional (float) Use nucleus sampling. + - `top_k` Optional (int) Only sample from the top K options for each subsequent token + - `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating """ max_tokens: Optional[int] = litellm.max_tokens anthropic_version: Optional[str] = "bedrock-2023-05-31" + system: Optional[str] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + stop_sequences: Optional[List[str]] = None def __init__( self, @@ -128,6 +138,12 @@ class AmazonAnthropicClaude3Config: optional_params["tools"] = value if param == "stream": optional_params["stream"] = value + if param == "stop": + optional_params["stop_sequences"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "top_p": + optional_params["top_p"] = value return optional_params diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py index 960dc66d3..a09e249af 100644 --- a/litellm/llms/cohere.py +++ b/litellm/llms/cohere.py @@ -300,8 +300,7 @@ def embedding( for text in input: input_tokens += len(encoding.encode(text)) - model_response["usage"] = { - "prompt_tokens": input_tokens, - "total_tokens": input_tokens, - } + model_response["usage"] = Usage( + prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens + ) return model_response diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 4ebb48dba..0347e5fd6 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -705,6 +705,7 @@ def anthropic_messages_pt(messages: list): "text" ].rstrip() # no trailing whitespace for final assistant message + return new_messages diff --git a/litellm/main.py b/litellm/main.py index 8ccde52e6..3a9fed77e 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -12,6 +12,7 @@ from typing import Any, Literal, Union, BinaryIO from 
functools import partial import dotenv, traceback, random, asyncio, time, contextvars from copy import deepcopy + import httpx import litellm from ._logging import verbose_logger @@ -891,6 +892,7 @@ def completion( or custom_llm_provider == "mistral" or custom_llm_provider == "openai" or custom_llm_provider == "together_ai" + or custom_llm_provider in litellm.openai_compatible_providers or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo ): # allow user to make an openai call with a custom base # note: if a user sets a custom base - we should ensure this works @@ -2393,6 +2395,7 @@ async def aembedding(*args, **kwargs): or custom_llm_provider == "deepinfra" or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" + or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "ollama" or custom_llm_provider == "vertex_ai" ): # currently implemented aiohttp calls for just azure and openai, soon all. @@ -2892,6 +2895,7 @@ async def atext_completion(*args, **kwargs): or custom_llm_provider == "deepinfra" or custom_llm_provider == "perplexity" or custom_llm_provider == "groq" + or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "huggingface" or custom_llm_provider == "ollama" diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index ddacbf05c..0a90c91ca 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -631,6 +631,13 @@ "litellm_provider": "groq", "mode": "chat" }, + "groq/gemma-7b-it": { + "max_tokens": 8192, + "input_cost_per_token": 0.00000010, + "output_cost_per_token": 0.00000010, + "litellm_provider": "groq", + "mode": "chat" + }, "claude-instant-1.2": { "max_tokens": 100000, "max_output_tokens": 8191, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index aab9b3d5c..1c41d79fc 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -9,6 +9,12 @@ model_list: model: gpt-3.5-turbo-1106 api_key: os.environ/OPENAI_API_KEY +litellm_settings: + cache: true + cache_params: + type: redis + callbacks: ["batch_redis_requests"] + general_settings: master_key: sk-1234 - database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require" \ No newline at end of file + # database_url: "postgresql://krrishdholakia:9yQkKWiB8vVs@ep-icy-union-a5j4dwls.us-east-2.aws.neon.tech/neondb?sslmode=require" \ No newline at end of file diff --git a/litellm/proxy/hooks/batch_redis_get.py b/litellm/proxy/hooks/batch_redis_get.py new file mode 100644 index 000000000..71588c9d4 --- /dev/null +++ b/litellm/proxy/hooks/batch_redis_get.py @@ -0,0 +1,124 @@ +# What this does? +## Gets a key's redis cache, and store it in memory for 1 minute. +## This reduces the number of REDIS GET requests made during high-traffic by the proxy. +### [BETA] this is in Beta. And might change. 
+ +from typing import Optional, Literal +import litellm +from litellm.caching import DualCache, RedisCache, InMemoryCache +from litellm.proxy._types import UserAPIKeyAuth +from litellm.integrations.custom_logger import CustomLogger +from litellm._logging import verbose_proxy_logger +from fastapi import HTTPException +import json, traceback + + +class _PROXY_BatchRedisRequests(CustomLogger): + # Class variables or attributes + in_memory_cache: Optional[InMemoryCache] = None + + def __init__(self): + litellm.cache.async_get_cache = ( + self.async_get_cache + ) # map the litellm 'get_cache' function to our custom function + + def print_verbose( + self, print_statement, debug_level: Literal["INFO", "DEBUG"] = "DEBUG" + ): + if debug_level == "DEBUG": + verbose_proxy_logger.debug(print_statement) + elif debug_level == "INFO": + verbose_proxy_logger.debug(print_statement) + if litellm.set_verbose is True: + print(print_statement) # noqa + + async def async_pre_call_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + cache: DualCache, + data: dict, + call_type: str, + ): + try: + """ + Get the user key + + Check if a key starting with `litellm:: 0: + key_value_dict = ( + await litellm.cache.cache.async_get_cache_pipeline( + key_list=keys + ) + ) + + ## Add to cache + if len(key_value_dict.items()) > 0: + await cache.in_memory_cache.async_set_cache_pipeline( + cache_list=list(key_value_dict.items()), ttl=60 + ) + ## Set cache namespace if it's a miss + data["metadata"]["redis_namespace"] = cache_key_name + except HTTPException as e: + raise e + except Exception as e: + traceback.print_exc() + + async def async_get_cache(self, *args, **kwargs): + """ + - Check if the cache key is in-memory + + - Else return None + """ + try: # never block execution + if "cache_key" in kwargs: + cache_key = kwargs["cache_key"] + else: + cache_key = litellm.cache.get_cache_key( + *args, **kwargs + ) # returns ":" - we pass redis_namespace in async_pre_call_hook. 
Done to avoid rewriting the async_set_cache logic + if cache_key is not None and self.in_memory_cache is not None: + cache_control_args = kwargs.get("cache", {}) + max_age = cache_control_args.get( + "s-max-age", cache_control_args.get("s-maxage", float("inf")) + ) + cached_result = self.in_memory_cache.get_cache( + cache_key, *args, **kwargs + ) + return litellm.cache._get_cache_logic( + cached_result=cached_result, max_age=max_age + ) + except Exception as e: + return None diff --git a/litellm/proxy/proxy_load_test/locustfile.py b/litellm/proxy/proxy_load_test/locustfile.py index 220bd3553..f439f7274 100644 --- a/litellm/proxy/proxy_load_test/locustfile.py +++ b/litellm/proxy/proxy_load_test/locustfile.py @@ -6,7 +6,7 @@ import time class MyUser(HttpUser): wait_time = between(1, 5) - @task + @task(3) def chat_completion(self): headers = { "Content-Type": "application/json", @@ -31,62 +31,8 @@ class MyUser(HttpUser): @task(10) def health_readiness(self): - start_time = time.time() response = self.client.get("health/readiness") - response_time = time.time() - start_time - if response_time > 1: - events.request_failure.fire( - request_type="GET", - name="health/readiness", - response_time=response_time, - exception=None, - response=response, - ) @task(10) def health_liveliness(self): - start_time = time.time() response = self.client.get("health/liveliness") - response_time = time.time() - start_time - if response_time > 1: - events.request_failure.fire( - request_type="GET", - name="health/liveliness", - response_time=response_time, - exception=None, - response=response, - ) - - # @task - # def key_generate(self): - # headers = { - # "Authorization": "Bearer sk-1234", - # "Content-Type": "application/json", - # } - - # payload = { - # "models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], - # "duration": "20m", - # "metadata": {"user": "ishaan@berri.ai"}, - # "team_id": "core-infra", - # "max_budget": 10, - # "soft_budget": 5, - # } - - # response = self.client.post("key/generate", json=payload, headers=headers) - - # if response.status_code == 200: - # key_response = response.json() - # models = key_response.get("models", []) - # if models: - # # Use the first model from the key generation response to make a chat completions request - # model_to_use = models[0] - # chat_payload = { - # "model": model_to_use, - # "messages": [ - # {"role": "system", "content": "You are a chat bot."}, - # {"role": "user", "content": "Hello, how are you?"}, - # ], - # } - # chat_response = self.client.post("chat/completions", json=chat_payload, headers=headers) - # # Print or log the chat response if needed diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index c50b0e895..4c9cd876e 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -8,7 +8,6 @@ import hashlib, uuid import warnings import importlib import warnings -import backoff def showwarning(message, category, filename, lineno, file=None, line=None): @@ -35,7 +34,6 @@ try: import orjson import logging from apscheduler.schedulers.asyncio import AsyncIOScheduler - from argon2 import PasswordHasher except ImportError as e: raise ImportError(f"Missing dependency {e}. 
Run `pip install 'litellm[proxy]'`") @@ -145,9 +143,12 @@ from typing import Union try: # when using litellm cli import litellm.proxy.enterprise as enterprise -except: +except Exception as e: # when using litellm docker image - import enterprise # type: ignore + try: + import enterprise # type: ignore + except Exception as e: + pass ui_link = f"/ui/" ui_message = ( @@ -252,7 +253,6 @@ user_headers = None user_config_file_path = f"config_{int(time.time())}.yaml" local_logging = True # writes logs to a local api_log.json file for debugging experimental = False -ph = PasswordHasher() #### GLOBAL VARIABLES #### llm_router: Optional[litellm.Router] = None llm_model_list: Optional[list] = None @@ -382,7 +382,7 @@ async def user_api_key_auth( return valid_token try: - is_master_key_valid = ph.verify(litellm_master_key_hash, api_key) + is_master_key_valid = secrets.compare_digest(api_key, master_key) except Exception as e: is_master_key_valid = False @@ -887,6 +887,9 @@ async def user_api_key_auth( raise Exception( f"This key is made for LiteLLM UI, Tried to access route: {route}. Not allowed" ) + if valid_token is None: + # No token was found when looking up in the DB + raise Exception("Invalid token passed") if valid_token_dict is not None: return UserAPIKeyAuth(api_key=api_key, **valid_token_dict) else: @@ -1420,6 +1423,8 @@ async def update_cache( try: for _id in user_ids: # Fetch the existing cost for the given user + if _id is None: + continue existing_spend_obj = await user_api_key_cache.async_get_cache(key=_id) if existing_spend_obj is None: # if user does not exist in LiteLLM_UserTable, create a new user @@ -1791,6 +1796,16 @@ class ProxyConfig: _ENTERPRISE_PromptInjectionDetection() ) imported_list.append(prompt_injection_detection_obj) + elif ( + isinstance(callback, str) + and callback == "batch_redis_requests" + ): + from litellm.proxy.hooks.batch_redis_get import ( + _PROXY_BatchRedisRequests, + ) + + batch_redis_obj = _PROXY_BatchRedisRequests() + imported_list.append(batch_redis_obj) else: imported_list.append( get_instance_fn( @@ -1913,7 +1928,7 @@ class ProxyConfig: master_key = litellm.get_secret(master_key) if master_key is not None and isinstance(master_key, str): - litellm_master_key_hash = ph.hash(master_key) + litellm_master_key_hash = master_key ### CUSTOM API KEY AUTH ### ## pass filepath custom_auth = general_settings.get("custom_auth", None) diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index 07d39b086..aa0681c61 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -474,11 +474,10 @@ def test_redis_cache_completion_stream(): # test_redis_cache_completion_stream() -def test_redis_cache_acompletion_stream(): - import asyncio - +@pytest.mark.asyncio +async def test_redis_cache_acompletion_stream(): try: - litellm.set_verbose = False + litellm.set_verbose = True random_word = generate_random_word() messages = [ { @@ -496,37 +495,31 @@ def test_redis_cache_acompletion_stream(): response_1_content = "" response_2_content = "" - async def call1(): - nonlocal response_1_content - response1 = await litellm.acompletion( - model="gpt-3.5-turbo", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response1: - response_1_content += chunk.choices[0].delta.content or "" - print(response_1_content) + response1 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response1: + 
response_1_content += chunk.choices[0].delta.content or "" + print(response_1_content) - asyncio.run(call1()) time.sleep(0.5) print("\n\n Response 1 content: ", response_1_content, "\n\n") - async def call2(): - nonlocal response_2_content - response2 = await litellm.acompletion( - model="gpt-3.5-turbo", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response2: - response_2_content += chunk.choices[0].delta.content or "" - print(response_2_content) + response2 = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response2: + response_2_content += chunk.choices[0].delta.content or "" + print(response_2_content) - asyncio.run(call2()) print("\nresponse 1", response_1_content) print("\nresponse 2", response_2_content) assert ( @@ -536,14 +529,15 @@ def test_redis_cache_acompletion_stream(): litellm.success_callback = [] litellm._async_success_callback = [] except Exception as e: - print(e) + print(f"{str(e)}\n\n{traceback.format_exc()}") raise e # test_redis_cache_acompletion_stream() -def test_redis_cache_acompletion_stream_bedrock(): +@pytest.mark.asyncio +async def test_redis_cache_acompletion_stream_bedrock(): import asyncio try: @@ -565,39 +559,33 @@ def test_redis_cache_acompletion_stream_bedrock(): response_1_content = "" response_2_content = "" - async def call1(): - nonlocal response_1_content - response1 = await litellm.acompletion( - model="bedrock/anthropic.claude-v2", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response1: - print(chunk) - response_1_content += chunk.choices[0].delta.content or "" - print(response_1_content) + response1 = await litellm.acompletion( + model="bedrock/anthropic.claude-v2", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response1: + print(chunk) + response_1_content += chunk.choices[0].delta.content or "" + print(response_1_content) - asyncio.run(call1()) time.sleep(0.5) print("\n\n Response 1 content: ", response_1_content, "\n\n") - async def call2(): - nonlocal response_2_content - response2 = await litellm.acompletion( - model="bedrock/anthropic.claude-v2", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response2: - print(chunk) - response_2_content += chunk.choices[0].delta.content or "" - print(response_2_content) + response2 = await litellm.acompletion( + model="bedrock/anthropic.claude-v2", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response2: + print(chunk) + response_2_content += chunk.choices[0].delta.content or "" + print(response_2_content) - asyncio.run(call2()) print("\nresponse 1", response_1_content) print("\nresponse 2", response_2_content) assert ( @@ -612,8 +600,8 @@ def test_redis_cache_acompletion_stream_bedrock(): raise e -@pytest.mark.skip(reason="AWS Suspended Account") -def test_s3_cache_acompletion_stream_azure(): +@pytest.mark.asyncio +async def test_s3_cache_acompletion_stream_azure(): import asyncio try: @@ -637,41 +625,35 @@ def test_s3_cache_acompletion_stream_azure(): response_1_created = "" response_2_created = "" - async def call1(): - nonlocal response_1_content, response_1_created - response1 = await litellm.acompletion( - model="azure/chatgpt-v-2", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response1: - print(chunk) - 
response_1_created = chunk.created - response_1_content += chunk.choices[0].delta.content or "" - print(response_1_content) + response1 = await litellm.acompletion( + model="azure/chatgpt-v-2", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response1: + print(chunk) + response_1_created = chunk.created + response_1_content += chunk.choices[0].delta.content or "" + print(response_1_content) - asyncio.run(call1()) time.sleep(0.5) print("\n\n Response 1 content: ", response_1_content, "\n\n") - async def call2(): - nonlocal response_2_content, response_2_created - response2 = await litellm.acompletion( - model="azure/chatgpt-v-2", - messages=messages, - max_tokens=40, - temperature=1, - stream=True, - ) - async for chunk in response2: - print(chunk) - response_2_content += chunk.choices[0].delta.content or "" - response_2_created = chunk.created - print(response_2_content) + response2 = await litellm.acompletion( + model="azure/chatgpt-v-2", + messages=messages, + max_tokens=40, + temperature=1, + stream=True, + ) + async for chunk in response2: + print(chunk) + response_2_content += chunk.choices[0].delta.content or "" + response_2_created = chunk.created + print(response_2_content) - asyncio.run(call2()) print("\nresponse 1", response_1_content) print("\nresponse 2", response_2_content) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 85f1139fa..44e2f7af6 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -358,7 +358,7 @@ def test_completion_mistral_azure(): } ], ) - # Add any assertions here to check the response + # Add any assertions here to check, the response print(response) except Exception as e: @@ -575,6 +575,25 @@ def test_completion_azure_gpt4_vision(): # test_completion_azure_gpt4_vision() +def test_completion_fireworks_ai(): + try: + litellm.set_verbose = True + messages = [ + {"role": "system", "content": "You're a good bot"}, + { + "role": "user", + "content": "Hey", + }, + ] + response = completion( + model="fireworks_ai/accounts/fireworks/models/mixtral-8x7b-instruct", + messages=messages, + ) + print(response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + @pytest.mark.skip(reason="this test is flaky") def test_completion_perplexity_api(): try: diff --git a/litellm/tests/test_custom_logger.py b/litellm/tests/test_custom_logger.py index 0a8f7b941..b2e2b7d22 100644 --- a/litellm/tests/test_custom_logger.py +++ b/litellm/tests/test_custom_logger.py @@ -97,27 +97,23 @@ class TmpFunction: ) -def test_async_chat_openai_stream(): +@pytest.mark.asyncio +async def test_async_chat_openai_stream(): try: tmp_function = TmpFunction() litellm.set_verbose = True litellm.success_callback = [tmp_function.async_test_logging_fn] complete_streaming_response = "" - async def call_gpt(): - nonlocal complete_streaming_response - response = await litellm.acompletion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}], - stream=True, - ) - async for chunk in response: - complete_streaming_response += ( - chunk["choices"][0]["delta"]["content"] or "" - ) - print(complete_streaming_response) + response = await litellm.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}], + stream=True, + ) + async for chunk in response: + complete_streaming_response += chunk["choices"][0]["delta"]["content"] or "" + print(complete_streaming_response) - asyncio.run(call_gpt()) 
complete_streaming_response = complete_streaming_response.strip("'") response1 = tmp_function.complete_streaming_response_in_callback["choices"][0][ "message" @@ -130,7 +126,7 @@ def test_async_chat_openai_stream(): assert tmp_function.async_success == True except Exception as e: print(e) - pytest.fail(f"An error occurred - {str(e)}") + pytest.fail(f"An error occurred - {str(e)}\n\n{traceback.format_exc()}") # test_async_chat_openai_stream() diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index c32a55353..7eecca60b 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -117,6 +117,8 @@ def test_openai_azure_embedding_simple(): print("Calculated request cost=", request_cost) + assert isinstance(response.usage, litellm.Usage) + except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -204,6 +206,8 @@ def test_cohere_embedding(): input=["good morning from litellm", "this is another item"], ) print(f"response:", response) + + assert isinstance(response.usage, litellm.Usage) except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -269,6 +273,8 @@ def test_bedrock_embedding_titan(): assert end_time - start_time < 0.1 litellm.disable_cache() + + assert isinstance(response.usage, litellm.Usage) except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -295,6 +301,8 @@ def test_bedrock_embedding_cohere(): isinstance(x, float) for x in response["data"][0]["embedding"] ), "Expected response to be a list of floats" # print(f"response:", response) + + assert isinstance(response.usage, litellm.Usage) except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -331,6 +339,8 @@ def test_hf_embedding(): input=["good morning from litellm", "this is another item"], ) print(f"response:", response) + + assert isinstance(response.usage, litellm.Usage) except Exception as e: # Note: Huggingface inference API is unstable and fails with "model loading errors all the time" pass @@ -386,6 +396,8 @@ def test_aembedding_azure(): response._hidden_params["custom_llm_provider"], ) assert response._hidden_params["custom_llm_provider"] == "azure" + + assert isinstance(response.usage, litellm.Usage) except Exception as e: pytest.fail(f"Error occurred: {e}") @@ -440,6 +452,7 @@ def test_mistral_embeddings(): input=["good morning from litellm"], ) print(f"response: {response}") + assert isinstance(response.usage, litellm.Usage) except Exception as e: pytest.fail(f"Error occurred: {e}") diff --git a/litellm/tests/test_key_generate_prisma.py b/litellm/tests/test_key_generate_prisma.py index 151781beb..103b344f5 100644 --- a/litellm/tests/test_key_generate_prisma.py +++ b/litellm/tests/test_key_generate_prisma.py @@ -158,7 +158,7 @@ def test_call_with_invalid_key(prisma_client): async def test(): await litellm.proxy.proxy_server.prisma_client.connect() - generated_key = "bad-key" + generated_key = "sk-126666" bearer_token = "Bearer " + generated_key request = Request(scope={"type": "http"}, receive=None) @@ -173,7 +173,7 @@ def test_call_with_invalid_key(prisma_client): except Exception as e: print("Got Exception", e) print(e.message) - assert "Authentication Error" in e.message + assert "Authentication Error, Invalid token passed" in e.message pass diff --git a/litellm/utils.py b/litellm/utils.py index 3fb961c05..95b18421f 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -72,7 +72,7 @@ from .integrations.litedebugger import LiteDebugger from .proxy._types import KeyManagementSystem from openai import OpenAIError as OriginalError from 
openai._models import BaseModel as OpenAIObject -from .caching import S3Cache, RedisSemanticCache +from .caching import S3Cache, RedisSemanticCache, RedisCache from .exceptions import ( AuthenticationError, BadRequestError, @@ -1795,7 +1795,12 @@ class Logging: ) result = kwargs["async_complete_streaming_response"] # only add to cache once we have a complete streaming response - litellm.cache.add_cache(result, **kwargs) + if litellm.cache is not None and not isinstance( + litellm.cache.cache, S3Cache + ): + await litellm.cache.async_add_cache(result, **kwargs) + else: + litellm.cache.add_cache(result, **kwargs) if isinstance(callback, CustomLogger): # custom logger class print_verbose( f"Running Async success callback: {callback}; self.stream: {self.stream}; async_complete_streaming_response: {self.model_call_details.get('async_complete_streaming_response', None)} result={result}" @@ -2589,7 +2594,7 @@ def client(original_function): if ( kwargs.get("max_tokens", None) is not None and model is not None - and litellm.drop_params + and litellm.modify_params == True # user is okay with params being modified and ( call_type == CallTypes.acompletion.value @@ -2806,7 +2811,9 @@ def client(original_function): ): if len(cached_result) == 1 and cached_result[0] is None: cached_result = None - elif isinstance(litellm.cache.cache, RedisSemanticCache): + elif isinstance( + litellm.cache.cache, RedisSemanticCache + ) or isinstance(litellm.cache.cache, RedisCache): preset_cache_key = litellm.cache.get_cache_key(*args, **kwargs) kwargs["preset_cache_key"] = ( preset_cache_key # for streaming calls, we need to pass the preset_cache_key @@ -5375,6 +5382,17 @@ def get_llm_provider( # groq is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1 api_base = "https://api.groq.com/openai/v1" dynamic_api_key = get_secret("GROQ_API_KEY") + elif custom_llm_provider == "fireworks_ai": + # fireworks is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.groq.com/openai/v1 + if not model.startswith("accounts/fireworks/models"): + model = f"accounts/fireworks/models/{model}" + api_base = "https://api.fireworks.ai/inference/v1" + dynamic_api_key = ( + get_secret("FIREWORKS_API_KEY") + or get_secret("FIREWORKS_AI_API_KEY") + or get_secret("FIREWORKSAI_API_KEY") + or get_secret("FIREWORKS_AI_TOKEN") + ) elif custom_llm_provider == "mistral": # mistral is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.mistral.ai api_base = ( diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index ddacbf05c..0a90c91ca 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -631,6 +631,13 @@ "litellm_provider": "groq", "mode": "chat" }, + "groq/gemma-7b-it": { + "max_tokens": 8192, + "input_cost_per_token": 0.00000010, + "output_cost_per_token": 0.00000010, + "litellm_provider": "groq", + "mode": "chat" + }, "claude-instant-1.2": { "max_tokens": 100000, "max_output_tokens": 8191, diff --git a/proxy_server_config.yaml b/proxy_server_config.yaml index 32f12bd79..69da70153 100644 --- a/proxy_server_config.yaml +++ b/proxy_server_config.yaml @@ -45,13 +45,15 @@ litellm_settings: budget_duration: 30d num_retries: 5 request_timeout: 600 + cache: true + callbacks: ["batch_redis_requests"] general_settings: master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key 
(Authorization: Bearer sk-1234) proxy_budget_rescheduler_min_time: 60 proxy_budget_rescheduler_max_time: 64 # database_url: "postgresql://:@:/" # [OPTIONAL] use for token-based auth to proxy -environment_variables: +# environment_variables: # settings for using redis caching # REDIS_HOST: redis-16337.c322.us-east-1-2.ec2.cloud.redislabs.com # REDIS_PORT: "16337" diff --git a/pyproject.toml b/pyproject.toml index acbbeb1dd..d07e87500 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.31.12" +version = "1.31.16" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -76,7 +76,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.31.12" +version = "1.31.16" version_files = [ "pyproject.toml:^version" ] diff --git a/requirements.txt b/requirements.txt index adfec7bc6..eaff0fb71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,5 +34,4 @@ jinja2==3.1.3 # for prompt templates certifi>=2023.7.22 # [TODO] clean up aiohttp==3.9.0 # for network calls aioboto3==12.3.0 # for async sagemaker calls -argon2-cffi==23.1.0 # for checking secrets #### \ No newline at end of file
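
For context on the `batch_redis_requests` hook and the `async_scan_iter` / `async_get_cache_pipeline` helpers added above, here is a minimal, self-contained sketch of the underlying pattern: scan the keys under one namespace, fetch them in a single pipelined round trip, and keep the decoded results in a local dict. This is not the proxy's actual implementation — the Redis host/port, the namespace string, and the `bulk_get_namespace` helper are illustrative assumptions, and it uses redis-py's asyncio client directly rather than LiteLLM's `RedisCache` wrapper.

```python
# Sketch of the "batch Redis GET" pattern: one SCAN + one pipelined
# round trip instead of an individual GET per incoming request.
# Host, port, and namespace below are illustrative assumptions.
import asyncio
import json

import redis.asyncio as redis


async def bulk_get_namespace(namespace: str, limit: int = 100) -> dict:
    client = redis.Redis(host="localhost", port=6379)
    try:
        # Collect up to `limit` keys that share the namespace prefix.
        keys = []
        async for key in client.scan_iter(match=f"{namespace}:*", count=limit):
            keys.append(key)
            if len(keys) >= limit:
                break

        # Queue one GET per key, then execute them in a single round trip.
        async with client.pipeline(transaction=True) as pipe:
            for key in keys:
                pipe.get(key)
            values = await pipe.execute()

        # Decode bytes keys and JSON payloads where possible.
        result = {}
        for key, value in zip(keys, values):
            if value is None:
                continue
            try:
                result[key.decode("utf-8")] = json.loads(value)
            except (UnicodeDecodeError, json.JSONDecodeError):
                result[key.decode("utf-8", errors="replace")] = value
        return result
    finally:
        await client.close()


if __name__ == "__main__":
    cached = asyncio.run(bulk_get_namespace("litellm:my-api-key:gpt-3.5-turbo"))
    print(f"fetched {len(cached)} cached entries in one round trip")
```

The hook in this diff layers a short in-memory TTL (60 seconds) on top of this bulk read, so repeated lookups for the same key namespace are served from local memory instead of issuing further Redis GETs.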