Merge branch 'main' into litellm_call_id_in_response

Krish Dholakia 2024-07-11 21:54:49 -07:00 committed by GitHub
commit 72f1c9181d
119 changed files with 4737 additions and 1868 deletions

View file

@ -243,7 +243,102 @@ jobs:
command: |
pwd
ls
python -m pytest -vv tests/ -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests
no_output_timeout: 120m
# Store test results
- store_test_results:
path: test-results
proxy_log_to_otel_tests:
machine:
image: ubuntu-2204:2023.10.1
resource_class: xlarge
working_directory: ~/project
steps:
- checkout
- run:
name: Install Docker CLI (In case it's not already installed)
command: |
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
- run:
name: Install Python 3.9
command: |
curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda init bash
source ~/.bashrc
conda create -n myenv python=3.9 -y
conda activate myenv
python --version
- run:
name: Install Dependencies
command: |
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install aiohttp
pip install openai
python -m pip install --upgrade pip
python -m pip install -r .circleci/requirements.txt
pip install "pytest==7.3.1"
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install pyarrow
pip install numpydoc
pip install prisma
pip install fastapi
pip install jsonschema
pip install "httpx==0.24.1"
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
- run:
name: Run Docker container
# intentionally give bad redis credentials here
# the OTEL test - should get this as a trace
command: |
docker run -d \
-p 4000:4000 \
-e DATABASE_URL=$PROXY_DATABASE_URL \
-e REDIS_HOST=$REDIS_HOST \
-e REDIS_PASSWORD=$REDIS_PASSWORD \
-e REDIS_PORT=$REDIS_PORT \
-e LITELLM_MASTER_KEY="sk-1234" \
-e OPENAI_API_KEY=$OPENAI_API_KEY \
-e LITELLM_LICENSE=$LITELLM_LICENSE \
-e OTEL_EXPORTER="in_memory" \
--name my-app \
-v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \
my-app:latest \
--config /app/config.yaml \
--port 4000 \
--detailed_debug
- run:
name: Install curl and dockerize
command: |
sudo apt-get update
sudo apt-get install -y curl
sudo wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz
sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz
sudo rm dockerize-linux-amd64-v0.6.1.tar.gz
- run:
name: Start outputting logs
command: docker logs -f my-app
background: true
- run:
name: Wait for app to be ready
command: dockerize -wait http://localhost:4000 -timeout 5m
- run:
name: Run tests
command: |
pwd
ls
python -m pytest -vv tests/otel_tests/test_otel.py -x --junitxml=test-results/junit.xml --durations=5
no_output_timeout: 120m
# Store test results
@ -337,6 +432,12 @@ workflows:
only:
- main
- /litellm_.*/
- proxy_log_to_otel_tests:
filters:
branches:
only:
- main
- /litellm_.*/
- installing_litellm_on_python:
filters:
branches:
@ -347,6 +448,7 @@ workflows:
requires:
- local_testing
- build_and_test
- proxy_log_to_otel_tests
filters:
branches:
only:

View file

@ -1,88 +0,0 @@
apiVersion: v1
entries:
postgresql:
- annotations:
category: Database
images: |
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: postgres-exporter
image: docker.io/bitnami/postgres-exporter:0.15.0-debian-12-r14
- name: postgresql
image: docker.io/bitnami/postgresql:16.2.0-debian-12-r6
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-07-08T11:05:19.312515+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: PostgreSQL (Postgres) is an open source object-relational database
known for reliability and data integrity. ACID-compliant, it supports foreign
keys, joins, views, triggers and stored procedures.
digest: 3c8125526b06833df32e2f626db34aeaedb29d38f03d15349db6604027d4a167
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/postgresql/img/postgresql-stack-220x234.png
keywords:
- postgresql
- postgres
- database
- sql
- replication
- cluster
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: postgresql
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/postgresql
urls:
- https://berriai.github.io/litellm/charts/postgresql-14.3.1.tgz
version: 14.3.1
redis:
- annotations:
category: Database
images: |
- name: kubectl
image: docker.io/bitnami/kubectl:1.29.2-debian-12-r3
- name: os-shell
image: docker.io/bitnami/os-shell:12-debian-12-r16
- name: redis
image: docker.io/bitnami/redis:7.2.4-debian-12-r9
- name: redis-exporter
image: docker.io/bitnami/redis-exporter:1.58.0-debian-12-r4
- name: redis-sentinel
image: docker.io/bitnami/redis-sentinel:7.2.4-debian-12-r7
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-07-08T11:05:19.317065+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
tags:
- bitnami-common
version: 2.x.x
description: Redis(R) is an open source, advanced key-value store. It is often
referred to as a data structure server since keys can contain strings, hashes,
lists, sets and sorted sets.
digest: b2fa1835f673a18002ca864c54fadac3c33789b26f6c5e58e2851b0b14a8f984
home: https://bitnami.com
icon: https://bitnami.com/assets/stacks/redis/img/redis-stack-220x234.png
keywords:
- redis
- keyvalue
- database
maintainers:
- name: VMware, Inc.
url: https://github.com/bitnami/charts
name: redis
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/redis
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-07-08T11:05:19.308028+08:00"

View file

@ -0,0 +1,54 @@
# [BETA] Anthropic `/v1/messages`
Call 100+ LLMs in the Anthropic format.
1. Setup config.yaml
```yaml
model_list:
- model_name: my-test-model
litellm_params:
model: gpt-3.5-turbo
```
2. Start proxy
```bash
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/v1/messages' \
-H 'x-api-key: sk-1234' \
-H 'content-type: application/json' \
-d '{
"model": "my-test-model",
"max_tokens": 1024,
"messages": [
{"role": "user", "content": "Hello, world"}
]
}'
```
## Test with Anthropic SDK
```python
import os
from anthropic import Anthropic
client = Anthropic(api_key="sk-1234", base_url="http://0.0.0.0:4000") # 👈 CONNECT TO PROXY
message = client.messages.create(
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="my-test-model", # 👈 set 'model_name'
)
print(message.content)
```

View file

@ -26,6 +26,7 @@ Call an existing Assistant.
- Run the Assistant on the Thread to generate a response by calling the model and the tools.
### SDK + PROXY
<Tabs>
<TabItem value="sdk" label="SDK">
@ -281,3 +282,31 @@ curl -X POST 'http://0.0.0.0:4000/threads/{thread_id}/runs' \
</Tabs>
## [👉 Proxy API Reference](https://litellm-api.up.railway.app/#/assistants)
## OpenAI-Compatible APIs
To call OpenAI-compatible Assistants APIs (e.g. the Astra Assistants API), just add `openai/` to the model name:
**config**
```yaml
assistant_settings:
custom_llm_provider: openai
litellm_params:
api_key: os.environ/ASTRA_API_KEY
api_base: os.environ/ASTRA_API_BASE
```
**curl**
```bash
curl -X POST "http://localhost:4000/v1/assistants" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"name": "Math Tutor",
"tools": [{"type": "code_interpreter"}],
"model": "openai/<my-astra-model-name>"
}'
```

View file

@ -0,0 +1,34 @@
# Data Privacy and Security
## Security Measures
### LiteLLM Cloud
- We encrypt all data stored using your `LITELLM_MASTER_KEY` and in transit using TLS.
- Our database and application run on GCP, AWS infrastructure, partly managed by NeonDB.
- US data region: Northern California (AWS/GCP `us-west-1`) & Virginia (AWS `us-east-1`)
- EU data region: Germany/Frankfurt (AWS/GCP `eu-central-1`)
- All users have access to SSO (Single Sign-On) through OAuth 2.0 with Google, Okta, Microsoft, KeyCloak.
- Audit Logs with retention policy
- Control Allowed IP Addresses that can access your Cloud LiteLLM Instance
For security inquiries, please contact us at support@berri.ai
### Supported data regions for LiteLLM Cloud
LiteLLM supports the following data regions:
- US, Northern California (AWS/GCP `us-west-1`)
- Europe, Frankfurt, Germany (AWS/GCP `eu-central-1`)
All data, user accounts, and infrastructure are completely separated between these two regions.
### Security Vulnerability Reporting Guidelines
We value the security community's role in protecting our systems and users. To report a security vulnerability:
- Email support@berri.ai with details
- Include steps to reproduce the issue
- Provide any relevant additional information
We'll review all reports promptly. Note that we don't currently offer a bug bounty program.

View file

@ -24,6 +24,7 @@ This covers:
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [Control available public, private routes](./proxy/enterprise#control-available-public-private-routes)
- ✅ [[BETA] AWS Key Manager v2 - Key Decryption](./proxy/enterprise#beta-aws-key-manager---key-decryption)
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](./proxy/pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](./proxy/enterprise#enforce-required-params-for-llm-requests)

View file

@ -21,6 +21,14 @@ See our status page for [**live reliability**](https://status.litellm.ai/)
- **Reliable**: Our hosted proxy is tested on 1k requests per second, making it reliable for high load.
- **Secure**: LiteLLM is currently undergoing SOC-2 compliance, to make sure your data is as secure as possible.
## Data Privacy & Security
You can find our [data privacy & security policy for cloud litellm here](../docs/data_security#litellm-cloud)
## Supported data regions for LiteLLM Cloud
You can find the [supported data regions for LiteLLM Cloud here](../docs/data_security#supported-data-regions-for-litellm-cloud)
### Pricing
Pricing is based on usage. We can figure out a price that works for your team, on the call.

View file

@ -18,6 +18,7 @@ Features:
- ✅ [JWT-Auth](../docs/proxy/token_auth.md)
- ✅ [Control available public, private routes](#control-available-public-private-routes)
- ✅ [[BETA] AWS Key Manager v2 - Key Decryption](#beta-aws-key-manager---key-decryption)
- ✅ IP address-based access control lists
- ✅ Track Request IP Address
- ✅ [Use LiteLLM keys/authentication on Pass Through Endpoints](pass_through#✨-enterprise---use-litellm-keysauthentication-on-pass-through-endpoints)
- ✅ [Enforce Required Params for LLM Requests (ex. Reject requests missing ["metadata"]["generation_name"])](#enforce-required-params-for-llm-requests)

View file

@ -112,37 +112,52 @@ model_list:
mode: completion # 👈 ADD THIS
```
### Speech to Text Models
```yaml
model_list:
- model_name: whisper
litellm_params:
model: whisper-1
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription
```
## `/health/readiness`
Unprotected endpoint for checking if proxy is ready to accept requests
Example Request:
```bash
curl http://0.0.0.0:4000/health/readiness
```
Example Response:
*If proxy connected to a database*
```json
{
    "status": "connected",
    "db": "connected",
    "cache": null,
    "litellm_version": "1.40.21",
    "success_callbacks": [
        "langfuse",
        "_PROXY_track_cost_callback",
        "response_taking_too_long_callback",
        "_PROXY_MaxParallelRequestsHandler",
        "_PROXY_MaxBudgetLimiter",
        "_PROXY_CacheControlCheck",
        "ServiceLogging"
    ],
    "last_updated": "2024-07-10T18:59:10.616968"
}
```
*If proxy not connected to a database*
```json
{
"status": "healthy",
"db": "Not connected",
"litellm_version":"1.19.2",
}
```
If the proxy is not connected to a database, the `"db"` field will be `"Not connected"` instead of `"connected"`, and the `"last_updated"` field will not be present.
## `/health/liveliness`

View file

@ -1,27 +1,19 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# 🪢 Logging - Langfuse, OpenTelemetry, Custom Callbacks, DataDog, s3 Bucket, Sentry, Athina, Azure Content-Safety

Log Proxy input, output, and exceptions using Langfuse, OpenTelemetry, Custom Callbacks, DataDog, DynamoDB, s3 Bucket, and more.
## Table of Contents
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging with OpenTelemetry](#logging-proxy-inputoutput-in-opentelemetry-format)
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Galileo](#logging-llm-io-to-galileo)
- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
- [Logging to Sentry](#logging-proxy-inputoutput---sentry)
- [Logging to Athina](#logging-proxy-inputoutput-athina)
- [(BETA) Moderation with Azure Content-Safety](#moderation-with-azure-content-safety)
## Getting the LiteLLM Call ID
LiteLLM generates a unique `call_id` for each request. This `call_id` can be
@ -56,6 +48,7 @@ A number of these headers could be useful for troubleshooting, but the
components in your system, including in logging tools.
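For example, here is a minimal sketch of reading the call id off a proxy response with the OpenAI Python SDK. It assumes the id is surfaced in an `x-litellm-call-id` response header and that the proxy runs locally with master key `sk-1234`; adjust both for your deployment.

```python
# Hedged sketch: the "x-litellm-call-id" header name and the proxy URL/key are assumptions.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "ping"}],
)
print(raw.headers.get("x-litellm-call-id"))  # use this id to correlate the request across logging tools
response = raw.parse()  # the usual ChatCompletion object
print(response.choices[0].message.content)
```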
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`. This will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.
**Step 1** Install langfuse
@ -65,6 +58,7 @@ pip install langfuse>=2.0.0
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -75,6 +69,7 @@ litellm_settings:
```
**Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss"
@ -85,11 +80,13 @@ export LANGFUSE_HOST="https://xxx.langfuse.com"
**Step 4**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
litellm --test
```
@ -100,7 +97,6 @@ Expected output on Langfuse
### Logging Metadata to Langfuse
<Tabs>
<TabItem value="Curl" label="Curl Request">
@ -126,6 +122,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}
}'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
@ -159,6 +156,7 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
@ -201,7 +199,6 @@ print(response)
</TabItem>
</Tabs>
### Team based Logging to Langfuse
**Example:**
@ -290,6 +287,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}
}'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
@ -320,6 +318,7 @@ response = client.chat.completions.create(
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
@ -365,7 +364,6 @@ You will see `raw_request` in your Langfuse Metadata. This is the RAW CURL comma
<Image img={require('../../img/debug_langfuse.png')} />
## Logging Proxy Input/Output in OpenTelemetry format
:::info
@ -381,10 +379,8 @@ OTEL_SERVICE_NAME=<your-service-name>` # default="litellm"
<Tabs>
<TabItem value="Console Exporter" label="Log to console">
**Step 1:** Set callbacks and env vars
Add the following to your env
@ -400,7 +396,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -460,7 +455,6 @@ This is the Span from OTEL Logging
</TabItem>
<TabItem value="Honeycomb" label="Log to Honeycomb">
#### Quick Start - Log to Honeycomb
@ -482,7 +476,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -507,10 +500,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
<TabItem value="otel-col" label="Log to OTEL HTTP Collector">
#### Quick Start - Log to OTEL Collector
@ -532,7 +523,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -559,7 +549,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
<TabItem value="otel-col-grpc" label="Log to OTEL GRPC Collector">
#### Quick Start - Log to OTEL GRPC Collector
@ -581,7 +570,6 @@ litellm_settings:
callbacks: ["otel"]
```
**Step 2**: Start the proxy, make a test request
Start proxy
@ -606,7 +594,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
<TabItem value="traceloop" label="Log to Traceloop Cloud">
@ -629,7 +616,6 @@ environment_variables:
TRACELOOP_API_KEY: "XXXXX"
```
**Step 3**: Start the proxy, make a test request
Start proxy
@ -665,11 +651,15 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
❓ Use this when you want to **pass information about the incoming request in a distributed tracing system**
✅ Key change: Pass the **`traceparent` header** in your requests. [Read more about traceparent headers here](https://uptrace.dev/opentelemetry/opentelemetry-traceparent.html#what-is-traceparent-header)
```curl
traceparent: 00-80e1afed08e019fc1110464cfa66635c-7a085853722dc6d2-01
```
Example Usage
1. Make Request to LiteLLM Proxy with `traceparent` header
```python
import openai
import uuid
@ -693,7 +683,6 @@ response = client.chat.completions.create(
)
print(response)
```
```shell
@ -707,12 +696,12 @@ Search for Trace=`80e1afed08e019fc1110464cfa66635c` on your OTEL Collector
<Image img={require('../../img/otel_parent.png')} />
## Custom Callback Class [Async]
Use this when you want to run custom callbacks in `python`
#### Step 1 - Create your custom `litellm` callback class
We use `litellm.integrations.custom_logger` for this, **more details about litellm custom callbacks [here](https://docs.litellm.ai/docs/observability/custom_callback)**
Define your custom callback class in a python file.
@ -815,16 +804,17 @@ proxy_handler_instance = MyCustomHandler()
```
#### Step 2 - Pass your custom callback class in `config.yaml`
We pass the custom callback class defined in **Step 1** to the `config.yaml`.
Set `callbacks` to `python_filename.logger_instance_name`
In the config below, we pass
- python_filename: `custom_callbacks.py`
- logger_instance_name: `proxy_handler_instance`. This is defined in Step 1
`callbacks: custom_callbacks.proxy_handler_instance`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -837,6 +827,7 @@ litellm_settings:
```
#### Step 3 - Start proxy + test request
```shell
litellm --config proxy_config.yaml
```
@ -858,6 +849,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
```
#### Resulting Log on Proxy
```shell
On Success
Model: gpt-3.5-turbo,
@ -910,7 +902,6 @@ class MyCustomHandler(CustomLogger):
"max_tokens": 10
}
}
```
#### Logging `model_info` set in config.yaml
@ -928,11 +919,13 @@ class MyCustomHandler(CustomLogger):
```
**Expected Output**
```json
{'mode': 'embedding', 'input_cost_per_token': 0.002}
```
### Logging responses from proxy
Both `/chat/completions` and `/embeddings` responses are available as `response_obj`
**Note: for `/chat/completions`, both `stream=True` and non-stream responses are available as `response_obj`**
@ -946,6 +939,7 @@ class MyCustomHandler(CustomLogger):
```
**Expected Output /chat/completion [for both `stream` and `non-stream` responses]**
```json
ModelResponse(
id='chatcmpl-8Tfu8GoMElwOZuj2JlHBhNHG01PPo',
@ -972,6 +966,7 @@ ModelResponse(
```
**Expected Output /embeddings**
```json
{
'model': 'ada',
@ -991,7 +986,6 @@ ModelResponse(
}
```
## Custom Callback APIs [Async]
:::info
@ -1001,10 +995,12 @@ This is an Enterprise only feature [Get Started with Enterprise here](https://gi
:::
Use this if you:
- Want to use custom callbacks written in a non-Python programming language
- Want your callbacks to run on a different microservice
#### Step 1. Create your generic logging API endpoint
Set up a generic API endpoint that can receive data in JSON format. The data will be included within a "data" field.
Your server should support the following Request format:
@ -1067,11 +1063,8 @@ async def log_event(request: Request):
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="127.0.0.1", port=4000)
```
#### Step 2. Set your `GENERIC_LOGGER_ENDPOINT` to the endpoint + route we should send callback logs to
```shell
@ -1081,6 +1074,7 @@ os.environ["GENERIC_LOGGER_ENDPOINT"] = "http://localhost:4000/log-event"
#### Step 3. Create a `config.yaml` file and set `litellm_settings`: `success_callback` = ["generic"]
Example litellm proxy config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1092,8 +1086,8 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Galileo
[BETA]
Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)
@ -1116,6 +1110,7 @@ export GALILEO_PASSWORD=""
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
@ -1151,7 +1146,6 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## Logging Proxy Cost + Usage - OpenMeter
@ -1169,6 +1163,7 @@ export OPENMETER_API_KEY=""
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
@ -1204,13 +1199,14 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successfull LLM calls to DataDog
**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1230,6 +1226,7 @@ DD_SITE="us5.datadoghq.com" # your datadog base url
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
@ -1257,10 +1254,10 @@ Expected output on Datadog
<Image img={require('../../img/dd_small1.png')} />
## Logging Proxy Input/Output - s3 Buckets
We will use the `--config` to set
- `litellm.success_callback = ["s3"]`
This will log all successful LLM calls to the s3 bucket
@ -1274,6 +1271,7 @@ AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1293,11 +1291,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1317,6 +1317,7 @@ Your logs should be available on the specified s3 Bucket
## Logging Proxy Input/Output - DynamoDB
We will use the `--config` to set
- `litellm.success_callback = ["dynamodb"]`
- `litellm.dynamodb_table_name = "your-table-name"`
@ -1331,6 +1332,7 @@ AWS_REGION_NAME = ""
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1344,11 +1346,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1436,19 +1440,18 @@ Your logs should be available on DynamoDB
}
```
## Logging Proxy Input/Output - Sentry
If API calls fail (LLM/database), you can log them to Sentry:
**Step 1** Install Sentry
```shell
pip install --upgrade sentry-sdk
```
**Step 2**: Save your `SENTRY_DSN` and add `litellm_settings`: `failure_callback`
```shell
export SENTRY_DSN="your-sentry-dsn"
```
@ -1468,11 +1471,13 @@ general_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
litellm --test
```
@ -1490,6 +1495,7 @@ ATHINA_API_KEY = "your-athina-api-key"
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1502,11 +1508,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1538,6 +1546,7 @@ AZURE_CONTENT_SAFETY_KEY = "<your-azure-content-safety-key>"
```
**Step 2**: Create a `config.yaml` file and set `litellm_settings`: `success_callback`
```yaml
model_list:
- model_name: gpt-3.5-turbo
@ -1553,11 +1562,13 @@ litellm_settings:
**Step 3**: Start the proxy, make a test request
Start proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
@ -1573,7 +1584,8 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
```
An HTTP 400 error will be returned if the content is detected with a value greater than the threshold set in the `config.yaml`.
The details of the response will describe:
- The `source`: input text or LLM-generated text
- The `category`: the category of the content that triggered the moderation
- The `severity`: the severity, from 0 to 10

View file

@ -15,9 +15,9 @@ model_list:
metadata: "here's additional metadata on the model" # returned via GET /model/info
```
## Get Model Information - `/model/info`

Retrieve detailed information about each model listed in the `/model/info` endpoint, including descriptions from the `config.yaml` file, and additional model info (e.g. max tokens, cost per input token, etc.) pulled from the `model_info` you set and the litellm model cost map. Sensitive details like API keys are excluded for security purposes.
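For example, a minimal sketch of querying this endpoint from Python, assuming the proxy runs locally on port 4000 with master key `sk-1234` and that the response wraps the models in a `data` list:

```python
# Hedged sketch - the base URL, key, and exact response shape are assumptions.
import requests

resp = requests.get(
    "http://0.0.0.0:4000/model/info",
    headers={"Authorization": "Bearer sk-1234"},
)
resp.raise_for_status()
for model in resp.json().get("data", []):
    print(model.get("model_name"), model.get("model_info", {}))
```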
<Tabs
defaultValue="curl"

View file

@ -1,7 +1,7 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [OLD PROXY 👉 [NEW proxy here](./simple_proxy)] Local OpenAI Proxy Server
A fast, and lightweight OpenAI-compatible server to call 100+ LLM APIs.

View file

@ -117,6 +117,7 @@ const sidebars = {
"text_to_speech",
"assistants",
"batches",
"anthropic_completion"
],
},
{
@ -237,6 +238,7 @@ const sidebars = {
label: "Extras",
items: [
"extras/contributing",
"data_security",
"contributing",
"rules",
"proxy_server",

View file

@ -1,6 +1,25 @@
apiVersion: v1
entries:
litellm-helm:
- apiVersion: v2
appVersion: v1.41.8
created: "2024-07-10T00:59:11.1889+08:00"
dependencies:
- condition: db.deployStandalone
name: postgresql
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=13.3.0'
- condition: redis.enabled
name: redis
repository: oci://registry-1.docker.io/bitnamicharts
version: '>=18.0.0'
description: Call all LLM APIs using the OpenAI format
digest: eeff5e4e6cebb4c977cb7359c1ec6c773c66982f6aa39dbed94a674890144a43
name: litellm-helm
type: application
urls:
- https://berriai.github.io/litellm/litellm-helm-0.2.1.tgz
version: 0.2.1
- apiVersion: v2
appVersion: v1.35.38
created: "2024-05-06T10:22:24.384392-07:00"
@ -33,7 +52,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 16.2.0
created: "2024-05-06T10:22:24.387717-07:00"
created: "2024-07-10T00:59:11.191731+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -60,7 +79,7 @@ entries:
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/postgresql
urls:
- https://berriai.github.io/litellm/charts/postgresql-14.3.1.tgz
version: 14.3.1
redis:
- annotations:
@ -79,7 +98,7 @@ entries:
licenses: Apache-2.0
apiVersion: v2
appVersion: 7.2.4
created: "2024-05-06T10:22:24.391903-07:00"
created: "2024-07-10T00:59:11.195667+08:00"
dependencies:
- name: common
repository: oci://registry-1.docker.io/bitnamicharts
@ -103,6 +122,6 @@ entries:
sources:
- https://github.com/bitnami/charts/tree/main/bitnami/redis
urls:
- https://berriai.github.io/litellm/charts/redis-18.19.1.tgz
version: 18.19.1
generated: "2024-05-06T10:22:24.375026-07:00"
generated: "2024-07-10T00:59:11.179952+08:00"

BIN
litellm-helm-0.2.1.tgz Normal file

Binary file not shown.

View file

@ -364,7 +364,7 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "mistral":
mistral_chat_models.append(key)
elif value.get("litellm_provider") == "anthropic":
anthropic_models.append(key)
elif value.get("litellm_provider") == "empower":
empower_models.append(key)
elif value.get("litellm_provider") == "openrouter":
@ -789,6 +789,7 @@ from .utils import (
get_api_base,
get_first_chars_messages,
ModelResponse,
EmbeddingResponse,
ImageResponse,
get_provider_fields,
)
@ -879,5 +880,11 @@ from .proxy.proxy_cli import run_server
from .router import Router
from .assistants.main import *
from .batches.main import *
from .files.main import *
from .scheduler import *
from .cost_calculator import response_cost_calculator, cost_per_token
### ADAPTERS ###
from .types.adapter import AdapterItem
adapters: List[AdapterItem] = []

View file

@ -0,0 +1,50 @@
# What is this?
## Translates OpenAI call to Anthropic `/v1/messages` format
import json
import os
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
from pydantic import BaseModel
import litellm
from litellm import ChatCompletionRequest, verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.types.llms.anthropic import AnthropicMessagesRequest, AnthropicResponse
class AnthropicAdapter(CustomLogger):
def __init__(self) -> None:
super().__init__()
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
- translate params, where needed
- pass rest, as is
"""
request_body = AnthropicMessagesRequest(**kwargs) # type: ignore
translated_body = litellm.AnthropicConfig().translate_anthropic_to_openai(
anthropic_message_request=request_body
)
return translated_body
def translate_completion_output_params(
self, response: litellm.ModelResponse
) -> Optional[AnthropicResponse]:
return litellm.AnthropicConfig().translate_openai_response_to_anthropic(
response=response
)
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
return super().translate_completion_output_params_streaming()
anthropic_adapter = AnthropicAdapter()
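A hedged usage sketch for the adapter above. It assumes this module lives at `litellm.adapters.anthropic_adapter` and that `AdapterItem` (see the `adapters: List[AdapterItem] = []` hook added to `litellm/__init__.py` in this diff) carries `id` and `adapter` fields; neither is confirmed by this excerpt.

```python
# Hypothetical wiring - the module path and AdapterItem field names are assumptions.
import litellm
from litellm.adapters.anthropic_adapter import anthropic_adapter

# register the adapter so Anthropic /v1/messages-style requests can be translated
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]

# translate an Anthropic-format request body into litellm.completion() kwargs
translated = anthropic_adapter.translate_completion_input_params(
    {
        "model": "gpt-3.5-turbo",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello, world"}],
    }
)
print(translated)
```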

View file

@ -10,296 +10,37 @@ https://platform.openai.com/docs/api-reference/batch
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm.utils import supports_httpx_timeout
from ..llms.openai import OpenAIBatchesAPI, OpenAIFilesAPI
from ..types.llms.openai import (
Batch,
CancelBatchRequest,
CreateBatchRequest,
CreateFileRequest,
FileContentRequest,
FileObject,
FileTypes,
HttpxBinaryResponseContent,
RetrieveBatchRequest,
)
from ..types.router import *
####### ENVIRONMENT VARIABLES ###################
openai_batches_instance = OpenAIBatchesAPI()
openai_files_instance = OpenAIFilesAPI()
#################################################
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["acreate_file"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_file,
file,
purpose,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
"""
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, HttpxBinaryResponseContent]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["afile_content"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_content,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]]:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_file_content_request = FileContentRequest(
file_id=file_id,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("afile_content", False) is True
response = openai_files_instance.file_content(
_is_async=_is_async,
file_content_request=_file_content_request,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def acreate_batch(
completion_window: Literal["24h"],
endpoint: Literal["/v1/chat/completions", "/v1/embeddings", "/v1/completions"],

View file

@ -15,10 +15,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_per_token as google_cost_per_token,
)
from litellm.litellm_core_utils.llm_cost_calc.google import (
cost_router as google_cost_router,
)
from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
from litellm.utils import (
CallTypes,
CostPerToken,
@ -160,22 +162,32 @@ def cost_per_token(
# see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
print_verbose(f"Looking up model={model} in model_cost_map")
if custom_llm_provider == "vertex_ai" and "claude" in model:
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
if custom_llm_provider == "vertex_ai":
cost_router = google_cost_router(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
call_type=call_type,
)
if cost_router == "cost_per_character":
return google_cost_per_character(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_characters=prompt_characters,
completion_characters=completion_characters,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif cost_router == "cost_per_token":
return google_cost_per_token(
model=model_without_prefix,
custom_llm_provider=custom_llm_provider,
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
)
elif custom_llm_provider == "gemini":
return google_cost_per_token(
model=model_without_prefix,

659
litellm/files/main.py Normal file
View file

@ -0,0 +1,659 @@
"""
Main File for Files API implementation
https://platform.openai.com/docs/api-reference/files
"""
import asyncio
import contextvars
import os
from functools import partial
from typing import Any, Coroutine, Dict, Literal, Optional, Union
import httpx
import litellm
from litellm import client
from litellm.llms.openai import FileDeleted, FileObject, OpenAIFilesAPI
from litellm.types.llms.openai import (
Batch,
CreateFileRequest,
FileContentRequest,
FileTypes,
HttpxBinaryResponseContent,
)
from litellm.types.router import *
from litellm.utils import supports_httpx_timeout
####### ENVIRONMENT VARIABLES ###################
openai_files_instance = OpenAIFilesAPI()
#################################################
async def afile_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_retrieve,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_retrieve(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FileObject:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.retrieve_file(
file_id=file_id,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
# Delete file
async def afile_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Delete file
LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_delete,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_delete(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> FileDeleted:
"""
Delete file
LiteLLM Equivalent of DELETE https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.delete_file(
file_id=file_id,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
# List files
async def afile_list(
custom_llm_provider: Literal["openai"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
Async: List files
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["is_async"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_list,
custom_llm_provider,
purpose,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_list(
custom_llm_provider: Literal["openai"] = "openai",
purpose: Optional[str] = None,
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
):
"""
List files
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_is_async = kwargs.pop("is_async", False) is True
response = openai_files_instance.list_files(
purpose=purpose,
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'file_list'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="file_list", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def acreate_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, FileObject]:
"""
Async: Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["acreate_file"] = True
# Use a partial function to pass your keyword arguments
func = partial(
create_file,
file,
purpose,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def create_file(
file: FileTypes,
purpose: Literal["assistants", "batch", "fine-tune"],
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[FileObject, Coroutine[Any, Any, FileObject]]:
"""
Files are used to upload documents that can be used with features like Assistants, Fine-tuning, and Batch API.
LiteLLM Equivalent of POST: POST https://api.openai.com/v1/files
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_create_file_request = CreateFileRequest(
file=file,
purpose=purpose,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("acreate_file", False) is True
response = openai_files_instance.create_file(
_is_async=_is_async,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
create_file_data=_create_file_request,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
async def afile_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Coroutine[Any, Any, HttpxBinaryResponseContent]:
"""
Async: Get file contents
LiteLLM Equivalent of GET https://api.openai.com/v1/files
"""
try:
loop = asyncio.get_event_loop()
kwargs["afile_content"] = True
# Use a partial function to pass your keyword arguments
func = partial(
file_content,
file_id,
custom_llm_provider,
extra_headers,
extra_body,
**kwargs,
)
# Add the context to the function
ctx = contextvars.copy_context()
func_with_context = partial(ctx.run, func)
init_response = await loop.run_in_executor(None, func_with_context)
if asyncio.iscoroutine(init_response):
response = await init_response
else:
response = init_response # type: ignore
return response
except Exception as e:
raise e
def file_content(
file_id: str,
custom_llm_provider: Literal["openai"] = "openai",
extra_headers: Optional[Dict[str, str]] = None,
extra_body: Optional[Dict[str, str]] = None,
**kwargs,
) -> Union[HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent]]:
"""
Returns the contents of the specified file.
LiteLLM Equivalent of GET https://api.openai.com/v1/files/{file_id}/content
"""
try:
optional_params = GenericLiteLLMParams(**kwargs)
if custom_llm_provider == "openai":
# for deepinfra/perplexity/anyscale/groq we check in get_llm_provider and pass in the api base from there
api_base = (
optional_params.api_base
or litellm.api_base
or os.getenv("OPENAI_API_BASE")
or "https://api.openai.com/v1"
)
organization = (
optional_params.organization
or litellm.organization
or os.getenv("OPENAI_ORGANIZATION", None)
or None # default - https://github.com/openai/openai-python/blob/284c1799070c723c6a553337134148a7ab088dd8/openai/util.py#L105
)
# set API KEY
api_key = (
optional_params.api_key
or litellm.api_key # for deepinfra/perplexity/anyscale we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or os.getenv("OPENAI_API_KEY")
)
### TIMEOUT LOGIC ###
timeout = (
optional_params.timeout or kwargs.get("request_timeout", 600) or 600
)
# set timeout for 10 minutes by default
if (
timeout is not None
and isinstance(timeout, httpx.Timeout)
and supports_httpx_timeout(custom_llm_provider) == False
):
read_timeout = timeout.read or 600
timeout = read_timeout # default 10 min timeout
elif timeout is not None and not isinstance(timeout, httpx.Timeout):
timeout = float(timeout) # type: ignore
elif timeout is None:
timeout = 600.0
_file_content_request = FileContentRequest(
file_id=file_id,
extra_headers=extra_headers,
extra_body=extra_body,
)
_is_async = kwargs.pop("afile_content", False) is True
response = openai_files_instance.file_content(
_is_async=_is_async,
file_content_request=_file_content_request,
api_base=api_base,
api_key=api_key,
timeout=timeout,
max_retries=optional_params.max_retries,
organization=organization,
)
else:
raise litellm.exceptions.BadRequestError(
message="LiteLLM doesn't support {} for 'create_batch'. Only 'openai' is supported.".format(
custom_llm_provider
),
model="n/a",
llm_provider=custom_llm_provider,
response=httpx.Response(
status_code=400,
content="Unsupported provider",
request=httpx.Request(method="create_thread", url="https://github.com/BerriAI/litellm"), # type: ignore
),
)
return response
except Exception as e:
raise e
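And the synchronous counterpart, again with a placeholder file id and assuming a top-level litellm.file_content export.

import litellm

content = litellm.file_content(
    file_id="file-abc123",
    custom_llm_provider="openai",
)
with open("downloaded.jsonl", "wb") as f:
    f.write(content.content)   # raw bytes of the stored file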

View file

@ -5,9 +5,12 @@ import traceback
from typing import Literal, Optional, Union
import dotenv
from pydantic import BaseModel
from litellm.caching import DualCache
from litellm.proxy._types import UserAPIKeyAuth
from litellm.types.llms.openai import ChatCompletionRequest
from litellm.types.utils import ModelResponse
class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callback#callback-class
@ -55,6 +58,30 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
def pre_call_check(self, deployment: dict) -> Optional[dict]:
pass
#### ADAPTERS #### Allow calling 100+ LLMs in custom format - https://github.com/BerriAI/litellm/pulls
def translate_completion_input_params(
self, kwargs
) -> Optional[ChatCompletionRequest]:
"""
Translates the input params from the provider's native format to the litellm.completion() format.
"""
pass
def translate_completion_output_params(
self, response: ModelResponse
) -> Optional[BaseModel]:
"""
Translates the output params from the OpenAI format to the custom format.
"""
pass
def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
"""
Translates the streaming chunk from the OpenAI format to the custom format.
"""
pass
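A minimal sketch of what an adapter built on these hooks could look like; the class name, response schema, and field mapping below are invented for illustration and assume the code sits alongside the CustomLogger class shown above.

from typing import Optional
from pydantic import BaseModel   # already imported at the top of this file

class MyCustomResponse(BaseModel):
    text: Optional[str] = None

class MyFormatAdapter(CustomLogger):
    def translate_completion_input_params(self, kwargs) -> Optional[dict]:
        # map a provider-native payload into litellm.completion() kwargs
        return {"model": kwargs.get("model"), "messages": kwargs.get("messages", [])}

    def translate_completion_output_params(self, response) -> Optional[BaseModel]:
        # lift the first choice's text out of the OpenAI-format ModelResponse
        return MyCustomResponse(text=response.choices[0].message.content)

    def translate_completion_output_params_streaming(self) -> Optional[BaseModel]:
        # streaming translation is omitted from this sketch
        return None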
#### CALL HOOKS - proxy only ####
"""
Control / modify the incoming / outgoing data before calling the model

View file

@ -326,7 +326,12 @@ class LangFuseLogger:
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
try:
new_metadata[key] = copy.deepcopy(value)
except Exception as e:
verbose_logger.error(
f"Langfuse [Non-blocking error] - error copying metadata: {str(e)}"
)
metadata = new_metadata
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")

View file

@ -52,6 +52,12 @@ class OpenTelemetryConfig:
OTEL_HEADERS gets sent as headers = {"x-honeycomb-team": "B85YgLm96******"}
"""
from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
InMemorySpanExporter,
)
if os.getenv("OTEL_EXPORTER") == "in_memory":
return cls(exporter=InMemorySpanExporter())
return cls(
exporter=os.getenv("OTEL_EXPORTER", "console"),
endpoint=os.getenv("OTEL_ENDPOINT"),

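For context, a generic OpenTelemetry snippet showing what the "in_memory" exporter selected above enables in tests; the tracer wiring here is standard SDK usage, not this project's own setup code.

from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

exporter = InMemorySpanExporter()
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(exporter))

tracer = provider.get_tracer("otel-smoke-test")
with tracer.start_as_current_span("proxy_request"):
    pass   # spans emitted here stay in memory instead of going to a backend

spans = exporter.get_finished_spans()
assert spans[0].name == "proxy_request"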
View file

@ -675,7 +675,7 @@ class SlackAlerting(CustomLogger):
async def failed_tracking_alert(self, error_message: str):
"""Raise alert when tracking failed for specific model"""
_cache: DualCache = self.internal_usage_cache
message = "Failed Tracking Cost for" + error_message
message = "Failed Tracking Cost for " + error_message
_cache_key = "budget_alerts:failed_tracking:{}".format(message)
result = await _cache.async_get_cache(key=_cache_key)
if result is None:
@ -1530,15 +1530,19 @@ Model Info:
"""Log deployment latency"""
try:
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
litellm_params = kwargs.get("litellm_params", {}) or {}
model_info = litellm_params.get("model_info", {}) or {}
model_id = model_info.get("id", "") or ""
response_s: timedelta = end_time - start_time
final_value = response_s
total_tokens = 0
if isinstance(response_obj, litellm.ModelResponse):
if isinstance(response_obj, litellm.ModelResponse) and (
hasattr(response_obj, "usage")
and response_obj.usage is not None
and hasattr(response_obj.usage, "completion_tokens")
):
completion_tokens = response_obj.usage.completion_tokens
if completion_tokens is not None and completion_tokens > 0:
final_value = float(
@ -1557,8 +1561,7 @@ Model Info:
)
except Exception as e:
verbose_proxy_logger.error(
"[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ",
e,
f"[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: {str(e)}"
)
pass

View file

@ -1275,7 +1275,7 @@ class Logging:
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.error(
verbose_logger.warning(
f"Model={self.model} not found in completion cost map. Setting 'response_cost' to None"
)
self.model_call_details["response_cost"] = None

View file

@ -1,7 +1,7 @@
# What is this?
## Cost calculation for Google AI Studio / Vertex AI models
import traceback
from typing import List, Literal, Optional, Tuple
from typing import List, Literal, Optional, Tuple, Union
import litellm
from litellm import verbose_logger
@ -29,6 +29,32 @@ def _is_above_128k(tokens: float) -> bool:
return False
def cost_router(
model: str,
custom_llm_provider: str,
prompt_tokens: float,
completion_tokens: float,
prompt_characters: float,
completion_characters: float,
call_type: Union[Literal["embedding", "aembedding"], str],
) -> Literal["cost_per_character", "cost_per_token"]:
"""
Route the cost calc to the right place, based on model/call_type/etc.
Returns
- str, the specific google cost calc function it should route to.
"""
if custom_llm_provider == "vertex_ai" and "claude" in model:
return "cost_per_token"
elif custom_llm_provider == "gemini":
return "cost_per_token"
elif custom_llm_provider == "vertex_ai" and (
call_type == "embedding" or call_type == "aembedding"
):
return "cost_per_token"
return "cost_per_character"
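Two illustrative calls against the router above (token and character counts are placeholders); a plain Gemini call routes per-token, while a non-Claude, non-embedding Vertex AI call falls through to per-character pricing.

# placeholder counts; only model / provider / call_type drive the routing decision
assert cost_router(
    model="gemini-1.5-pro",
    custom_llm_provider="gemini",
    prompt_tokens=100,
    completion_tokens=50,
    prompt_characters=400,
    completion_characters=200,
    call_type="completion",
) == "cost_per_token"

assert cost_router(
    model="gemini-1.5-pro",
    custom_llm_provider="vertex_ai",
    prompt_tokens=100,
    completion_tokens=50,
    prompt_characters=400,
    completion_characters=200,
    call_type="completion",
) == "cost_per_character"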
def cost_per_character(
model: str,
custom_llm_provider: str,

View file

@ -1,11 +1,16 @@
import os, types, traceback
import json
import os
import time # type: ignore
import traceback
import types
from enum import Enum
import requests # type: ignore
import time, httpx # type: ignore
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message
import httpx
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse
class AI21Error(Exception):
@ -185,7 +190,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise AI21Error(
message=traceback.format_exc(), status_code=response.status_code
@ -197,13 +202,17 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
model_response.created = int(time.time())
model_response.model = model
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
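The same attribute-style refactor recurs throughout the provider handlers in this commit; in isolation the pattern is roughly the following (model name and token counts are placeholders).

import time
import litellm
from litellm.utils import ModelResponse

# sketch of the pattern only: attribute access plus a typed Usage object
# instead of dict-style assignment on the ModelResponse
model_response = ModelResponse()
model_response.created = int(time.time())
model_response.model = "j2-ultra"
setattr(
    model_response,
    "usage",
    litellm.Usage(prompt_tokens=10, completion_tokens=5, total_tokens=15),
)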

View file

@ -1,12 +1,15 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
from litellm.utils import ModelResponse, Choices, Message, Usage
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class AlephAlphaError(Exception):
@ -275,7 +278,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except:
raise AlephAlphaError(
message=json.dumps(completion_response),
@ -291,8 +294,8 @@ def completion(
)
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -20,19 +20,43 @@ from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client,
)
from litellm.types.llms.anthropic import (
AnthopicMessagesAssistantMessageParam,
AnthropicFinishReason,
AnthropicMessagesRequest,
AnthropicMessagesTool,
AnthropicMessagesToolChoice,
AnthropicMessagesUserMessageParam,
AnthropicResponse,
AnthropicResponseContentBlockText,
AnthropicResponseContentBlockToolUse,
AnthropicResponseUsageBlock,
ContentBlockDelta,
ContentBlockStart,
MessageBlockDelta,
MessageStartBlock,
)
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionImageObject,
ChatCompletionImageUrlObject,
ChatCompletionRequest,
ChatCompletionResponseMessage,
ChatCompletionSystemMessage,
ChatCompletionTextObject,
ChatCompletionToolCallChunk,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolChoiceFunctionParam,
ChatCompletionToolChoiceObjectParam,
ChatCompletionToolChoiceValues,
ChatCompletionToolMessage,
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
ChatCompletionUsageBlock,
ChatCompletionUserMessage,
)
from litellm.types.utils import GenericStreamingChunk
from litellm.types.utils import Choices, GenericStreamingChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
@ -168,6 +192,287 @@ class AnthropicConfig:
optional_params["top_p"] = value
return optional_params
### FOR [BETA] `/v1/messages` endpoint support
def translatable_anthropic_params(self) -> List:
"""
Which anthropic params we need to translate to the openai format.
"""
return ["messages", "metadata", "system", "tool_choice", "tools"]
def translate_anthropic_messages_to_openai(
self,
messages: List[
Union[
AnthropicMessagesUserMessageParam,
AnthopicMessagesAssistantMessageParam,
]
],
) -> List:
new_messages: List[AllMessageValues] = []
for m in messages:
user_message: Optional[ChatCompletionUserMessage] = None
tool_message_list: List[ChatCompletionToolMessage] = []
## USER MESSAGE ##
if m["role"] == "user":
## translate user message
if isinstance(m["content"], str):
user_message = ChatCompletionUserMessage(
role="user", content=m["content"]
)
elif isinstance(m["content"], list):
new_user_content_list: List[
Union[ChatCompletionTextObject, ChatCompletionImageObject]
] = []
for content in m["content"]:
if content["type"] == "text":
text_obj = ChatCompletionTextObject(
type="text", text=content["text"]
)
new_user_content_list.append(text_obj)
elif content["type"] == "image":
image_url = ChatCompletionImageUrlObject(
url=f"data:{content['type']};base64,{content['source']}"
)
image_obj = ChatCompletionImageObject(
type="image_url", image_url=image_url
)
new_user_content_list.append(image_obj)
elif content["type"] == "tool_result":
if "content" not in content:
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content="",
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], str):
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=content["content"],
)
tool_message_list.append(tool_result)
elif isinstance(content["content"], list):
for c in content["content"]:
if c["type"] == "text":
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=c["text"],
)
tool_message_list.append(tool_result)
elif c["type"] == "image":
image_str = (
f"data:{c['type']};base64,{c['source']}"
)
tool_result = ChatCompletionToolMessage(
role="tool",
tool_call_id=content["tool_use_id"],
content=image_str,
)
tool_message_list.append(tool_result)
if user_message is not None:
new_messages.append(user_message)
if len(tool_message_list) > 0:
new_messages.extend(tool_message_list)
## ASSISTANT MESSAGE ##
assistant_message_str: Optional[str] = None
tool_calls: List[ChatCompletionAssistantToolCall] = []
if m["role"] == "assistant":
if isinstance(m["content"], str):
assistant_message_str = m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
if assistant_message_str is None:
assistant_message_str = content["text"]
else:
assistant_message_str += content["text"]
elif content["type"] == "tool_use":
function_chunk = ChatCompletionToolCallFunctionChunk(
name=content["name"],
arguments=json.dumps(content["input"]),
)
tool_calls.append(
ChatCompletionAssistantToolCall(
id=content["id"],
type="function",
function=function_chunk,
)
)
if assistant_message_str is not None or len(tool_calls) > 0:
assistant_message = ChatCompletionAssistantMessage(
role="assistant",
content=assistant_message_str,
)
if len(tool_calls) > 0:
assistant_message["tool_calls"] = tool_calls
new_messages.append(assistant_message)
return new_messages
def translate_anthropic_tool_choice_to_openai(
self, tool_choice: AnthropicMessagesToolChoice
) -> ChatCompletionToolChoiceValues:
if tool_choice["type"] == "any":
return "required"
elif tool_choice["type"] == "auto":
return "auto"
elif tool_choice["type"] == "tool":
tc_function_param = ChatCompletionToolChoiceFunctionParam(
name=tool_choice.get("name", "")
)
return ChatCompletionToolChoiceObjectParam(
type="function", function=tc_function_param
)
else:
raise ValueError(
"Incompatible tool choice param submitted - {}".format(tool_choice)
)
def translate_anthropic_tools_to_openai(
self, tools: List[AnthropicMessagesTool]
) -> List[ChatCompletionToolParam]:
new_tools: List[ChatCompletionToolParam] = []
for tool in tools:
function_chunk = ChatCompletionToolParamFunctionChunk(
name=tool["name"],
parameters=tool["input_schema"],
)
if "description" in tool:
function_chunk["description"] = tool["description"]
new_tools.append(
ChatCompletionToolParam(type="function", function=function_chunk)
)
return new_tools
def translate_anthropic_to_openai(
self, anthropic_message_request: AnthropicMessagesRequest
) -> ChatCompletionRequest:
"""
This is used by the beta Anthropic Adapter, for translating anthropic `/v1/messages` requests to the openai format.
"""
new_messages: List[AllMessageValues] = []
## CONVERT ANTHROPIC MESSAGES TO OPENAI
new_messages = self.translate_anthropic_messages_to_openai(
messages=anthropic_message_request["messages"]
)
## ADD SYSTEM MESSAGE TO MESSAGES
if "system" in anthropic_message_request:
new_messages.insert(
0,
ChatCompletionSystemMessage(
role="system", content=anthropic_message_request["system"]
),
)
new_kwargs: ChatCompletionRequest = {
"model": anthropic_message_request["model"],
"messages": new_messages,
}
## CONVERT METADATA (user_id)
if "metadata" in anthropic_message_request:
if "user_id" in anthropic_message_request["metadata"]:
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
## CONVERT TOOL CHOICE
if "tool_choice" in anthropic_message_request:
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
tool_choice=anthropic_message_request["tool_choice"]
)
## CONVERT TOOLS
if "tools" in anthropic_message_request:
new_kwargs["tools"] = self.translate_anthropic_tools_to_openai(
tools=anthropic_message_request["tools"]
)
translatable_params = self.translatable_anthropic_params()
for k, v in anthropic_message_request.items():
if k not in translatable_params: # pass remaining params as is
new_kwargs[k] = v # type: ignore
return new_kwargs
def _translate_openai_content_to_anthropic(
self, choices: List[Choices]
) -> List[
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
]:
new_content: List[
Union[
AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse
]
] = []
for choice in choices:
if (
choice.message.tool_calls is not None
and len(choice.message.tool_calls) > 0
):
for tool_call in choice.message.tool_calls:
new_content.append(
AnthropicResponseContentBlockToolUse(
type="tool_use",
id=tool_call.id,
name=tool_call.function.name or "",
input=json.loads(tool_call.function.arguments),
)
)
elif choice.message.content is not None:
new_content.append(
AnthropicResponseContentBlockText(
type="text", text=choice.message.content
)
)
return new_content
def _translate_openai_finish_reason_to_anthropic(
self, openai_finish_reason: str
) -> AnthropicFinishReason:
if openai_finish_reason == "stop":
return "end_turn"
elif openai_finish_reason == "length":
return "max_tokens"
elif openai_finish_reason == "tool_calls":
return "tool_use"
return "end_turn"
def translate_openai_response_to_anthropic(
self, response: litellm.ModelResponse
) -> AnthropicResponse:
## translate content block
anthropic_content = self._translate_openai_content_to_anthropic(choices=response.choices) # type: ignore
## extract finish reason
anthropic_finish_reason = self._translate_openai_finish_reason_to_anthropic(
openai_finish_reason=response.choices[0].finish_reason # type: ignore
)
# extract usage
usage: litellm.Usage = getattr(response, "usage")
anthropic_usage = AnthropicResponseUsageBlock(
input_tokens=usage.prompt_tokens, output_tokens=usage.completion_tokens
)
translated_obj = AnthropicResponse(
id=response.id,
type="message",
role="assistant",
model=response.model or "unknown-model",
stop_sequence=None,
usage=anthropic_usage,
content=anthropic_content,
stop_reason=anthropic_finish_reason,
)
return translated_obj
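A hedged sketch of exercising the request-side translation above; the /v1/messages payload is made up for illustration, and the assertions only restate what the code shown here does.

config = AnthropicConfig()

anthropic_request = {
    "model": "claude-3-opus-20240229",   # illustrative model name
    "max_tokens": 256,
    "system": "You are a terse assistant.",
    "messages": [{"role": "user", "content": "Say hi"}],
}

openai_request = config.translate_anthropic_to_openai(
    anthropic_message_request=anthropic_request  # type: ignore
)
# the system prompt becomes the first OpenAI message; untranslated params pass through
assert openai_request["messages"][0]["role"] == "system"
assert openai_request["max_tokens"] == 256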
# makes headers for API call
def validate_environment(api_key, user_headers):
@ -231,121 +536,6 @@ class AnthropicChatCompletion(BaseLLM):
def __init__(self) -> None:
super().__init__()
# def process_streaming_response(
# self,
# model: str,
# response: Union[requests.Response, httpx.Response],
# model_response: ModelResponse,
# stream: bool,
# logging_obj: litellm.litellm_core_utils.litellm_logging.Logging,
# optional_params: dict,
# api_key: str,
# data: Union[dict, str],
# messages: List,
# print_verbose,
# encoding,
# ) -> CustomStreamWrapper:
# """
# Return stream object for tool-calling + streaming
# """
# ## LOGGING
# logging_obj.post_call(
# input=messages,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# try:
# completion_response = response.json()
# except:
# raise AnthropicError(
# message=response.text, status_code=response.status_code
# )
# text_content = ""
# tool_calls = []
# for content in completion_response["content"]:
# if content["type"] == "text":
# text_content += content["text"]
# ## TOOL CALLING
# elif content["type"] == "tool_use":
# tool_calls.append(
# {
# "id": content["id"],
# "type": "function",
# "function": {
# "name": content["name"],
# "arguments": json.dumps(content["input"]),
# },
# }
# )
# if "error" in completion_response:
# raise AnthropicError(
# message=str(completion_response["error"]),
# status_code=response.status_code,
# )
# _message = litellm.Message(
# tool_calls=tool_calls,
# content=text_content or None,
# )
# model_response.choices[0].message = _message # type: ignore
# model_response._hidden_params["original_response"] = completion_response[
# "content"
# ] # allow user to access raw anthropic tool calling response
# model_response.choices[0].finish_reason = map_finish_reason(
# completion_response["stop_reason"]
# )
# print_verbose("INSIDE ANTHROPIC STREAMING TOOL CALLING CONDITION BLOCK")
# # return an iterator
# streaming_model_response = ModelResponse(stream=True)
# streaming_model_response.choices[0].finish_reason = model_response.choices[ # type: ignore
# 0
# ].finish_reason
# # streaming_model_response.choices = [litellm.utils.StreamingChoices()]
# streaming_choice = litellm.utils.StreamingChoices()
# streaming_choice.index = model_response.choices[0].index
# _tool_calls = []
# print_verbose(
# f"type of model_response.choices[0]: {type(model_response.choices[0])}"
# )
# print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
# if isinstance(model_response.choices[0], litellm.Choices):
# if getattr(
# model_response.choices[0].message, "tool_calls", None
# ) is not None and isinstance(
# model_response.choices[0].message.tool_calls, list
# ):
# for tool_call in model_response.choices[0].message.tool_calls:
# _tool_call = {**tool_call.dict(), "index": 0}
# _tool_calls.append(_tool_call)
# delta_obj = litellm.utils.Delta(
# content=getattr(model_response.choices[0].message, "content", None),
# role=model_response.choices[0].message.role,
# tool_calls=_tool_calls,
# )
# streaming_choice.delta = delta_obj
# streaming_model_response.choices = [streaming_choice]
# completion_stream = ModelResponseIterator(
# model_response=streaming_model_response
# )
# print_verbose(
# "Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
# )
# return CustomStreamWrapper(
# completion_stream=completion_stream,
# model=model,
# custom_llm_provider="cached_response",
# logging_obj=logging_obj,
# )
# else:
# raise AnthropicError(
# status_code=422,
# message="Unprocessable response object - {}".format(response.text),
# )
def process_response(
self,
model: str,
@ -417,8 +607,8 @@ class AnthropicChatCompletion(BaseLLM):
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,15 +1,19 @@
import os, types
import json
from enum import Enum
import requests
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
import httpx
from .base import BaseLLM
import requests
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class AnthropicConstants(Enum):
@ -117,9 +121,9 @@ class AnthropicTextCompletion(BaseLLM):
)
else:
if len(completion_response["completion"]) > 0:
model_response["choices"][0]["message"]["content"] = (
completion_response["completion"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"completion"
]
model_response.choices[0].finish_reason = completion_response["stop_reason"]
## CALCULATING USAGE
@ -130,8 +134,8 @@ class AnthropicTextCompletion(BaseLLM):
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
) ##[TODO] use the anthropic tokenizer here
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,9 +1,11 @@
import os
import json
from enum import Enum
import requests # type: ignore
import os
import time
from enum import Enum
from typing import Callable
import requests # type: ignore
from litellm.utils import ModelResponse, Usage
@ -106,28 +108,32 @@ def completion(
and "data" in completion_response["model_output"]
and isinstance(completion_response["model_output"]["data"], list)
):
model_response["choices"][0]["message"]["content"] = (
completion_response["model_output"]["data"][0]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"model_output"
][
"data"
][
0
]
elif isinstance(completion_response["model_output"], str):
model_response["choices"][0]["message"]["content"] = (
completion_response["model_output"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"model_output"
]
elif "completion" in completion_response and isinstance(
completion_response["completion"], str
):
model_response["choices"][0]["message"]["content"] = (
completion_response["completion"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"completion"
]
elif isinstance(completion_response, list) and len(completion_response) > 0:
if "generated_text" not in completion_response:
raise BasetenError(
message=f"Unable to parse response. Original response: {response.text}",
status_code=response.status_code,
)
model_response["choices"][0]["message"]["content"] = (
completion_response[0]["generated_text"]
)
model_response.choices[0].message.content = completion_response[0][ # type: ignore
"generated_text"
]
## GETTING LOGPROBS
if (
"details" in completion_response[0]
@ -139,7 +145,7 @@ def completion(
sum_logprob = 0
for token in completion_response[0]["details"]["tokens"]:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprobs = sum_logprob
model_response.choices[0].logprobs = sum_logprob
else:
raise BasetenError(
message=f"Unable to parse response. Original response: {response.text}",
@ -152,8 +158,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1122,7 +1122,7 @@ def completion(
logging_obj=logging_obj,
)
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_body["stop_reason"]
)
_usage = litellm.Usage(
@ -1134,14 +1134,16 @@ def completion(
setattr(model_response, "usage", _usage)
else:
outputText = response_body["completion"]
model_response["finish_reason"] = response_body["stop_reason"]
model_response.choices[0].finish_reason = response_body["stop_reason"]
elif provider == "cohere":
outputText = response_body["generations"][0]["text"]
elif provider == "meta":
outputText = response_body["generation"]
elif provider == "mistral":
outputText = response_body["outputs"][0]["text"]
model_response["finish_reason"] = response_body["outputs"][0]["stop_reason"]
model_response.choices[0].finish_reason = response_body["outputs"][0][
"stop_reason"
]
else: # amazon titan
outputText = response_body.get("results")[0].get("outputText")
@ -1160,7 +1162,7 @@ def completion(
and getattr(model_response.choices[0].message, "tool_calls", None)
is None
):
model_response["choices"][0]["message"]["content"] = outputText
model_response.choices[0].message.content = outputText
elif (
hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
@ -1199,8 +1201,8 @@ def completion(
)
setattr(model_response, "usage", usage)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
model_response._hidden_params["region_name"] = client.meta.region_name
print_verbose(f"model_response._hidden_params: {model_response._hidden_params}")
@ -1323,9 +1325,9 @@ def _embedding_func_single(
def embedding(
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
@ -1391,9 +1393,9 @@ def embedding(
"embedding": embedding,
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
input_tokens = 0
input_str = "".join(input)

View file

@ -521,7 +521,7 @@ class BedrockLLM(BaseLLM):
outputText = completion_response["text"] # type: ignore
elif "generations" in completion_response:
outputText = completion_response["generations"][0]["text"]
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
completion_response["generations"][0]["finish_reason"]
)
elif provider == "anthropic":
@ -625,7 +625,7 @@ class BedrockLLM(BaseLLM):
logging_obj=logging_obj,
)
model_response["finish_reason"] = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
completion_response.get("stop_reason", "")
)
_usage = litellm.Usage(
@ -638,7 +638,9 @@ class BedrockLLM(BaseLLM):
else:
outputText = completion_response["completion"]
model_response["finish_reason"] = completion_response["stop_reason"]
model_response.choices[0].finish_reason = completion_response[
"stop_reason"
]
elif provider == "ai21":
outputText = (
completion_response.get("completions")[0].get("data").get("text")
@ -647,9 +649,9 @@ class BedrockLLM(BaseLLM):
outputText = completion_response["generation"]
elif provider == "mistral":
outputText = completion_response["outputs"][0]["text"]
model_response["finish_reason"] = completion_response["outputs"][0][
"stop_reason"
]
model_response.choices[0].finish_reason = completion_response[
"outputs"
][0]["stop_reason"]
else: # amazon titan
outputText = completion_response.get("results")[0].get("outputText")
except Exception as e:
@ -667,7 +669,7 @@ class BedrockLLM(BaseLLM):
and getattr(model_response.choices[0].message, "tool_calls", None)
is None
):
model_response["choices"][0]["message"]["content"] = outputText
model_response.choices[0].message.content = outputText
elif (
hasattr(model_response.choices[0], "message")
and getattr(model_response.choices[0].message, "tool_calls", None)
@ -723,8 +725,8 @@ class BedrockLLM(BaseLLM):
)
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -1066,7 +1068,7 @@ class BedrockLLM(BaseLLM):
if response.status_code != 200:
raise BedrockError(
status_code=response.status_code, message=response.text
status_code=response.status_code, message=response.read()
)
decoder = AWSEventStreamDecoder(model=model)
@ -1446,8 +1448,8 @@ class BedrockConverseLLM(BaseLLM):
message=litellm.Message(**chat_completion_message),
)
]
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens,
completion_tokens=output_tokens,

View file

@ -1,13 +1,18 @@
import os, types, traceback
import json
import requests
import os
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage, Choices, Message, CustomStreamWrapper
import litellm
import httpx
import requests
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class ClarifaiError(Exception):
@ -87,7 +92,14 @@ def completions_to_model(payload):
def process_response(
model, prompt, response, model_response, api_key, data, encoding, logging_obj
model,
prompt,
response,
model_response: litellm.ModelResponse,
api_key,
data,
encoding,
logging_obj,
):
logging_obj.post_call(
input=prompt,
@ -116,7 +128,7 @@ def process_response(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise ClarifaiError(
@ -128,11 +140,15 @@ def process_response(
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["model"] = model
model_response["usage"] = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
model_response.model = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -202,7 +218,7 @@ async def async_completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise ClarifaiError(
@ -214,11 +230,15 @@ async def async_completion(
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["model"] = model
model_response["usage"] = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
model_response.model = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response

View file

@ -1,13 +1,17 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class CloudflareError(Exception):
@ -147,9 +151,9 @@ def completion(
)
completion_response = response.json()
model_response["choices"][0]["message"]["content"] = completion_response[
"result"
]["response"]
model_response.choices[0].message.content = completion_response["result"][ # type: ignore
"response"
]
## CALCULATING USAGE
print_verbose(
@ -160,8 +164,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = "cloudflare/" + model
model_response.created = int(time.time())
model_response.model = "cloudflare/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,12 +1,16 @@
import os, types
import json
import os
import time
import traceback
import types
from enum import Enum
import requests # type: ignore
import time, traceback
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class CohereError(Exception):
@ -117,7 +121,7 @@ class CohereConfig:
def validate_environment(api_key):
headers = {
"Request-Source":"unspecified:litellm",
"Request-Source": "unspecified:litellm",
"accept": "application/json",
"content-type": "application/json",
}
@ -219,7 +223,7 @@ def completion(
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
raise CohereError(
message=response.text, status_code=response.status_code
@ -231,8 +235,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -245,9 +249,9 @@ def completion(
def embedding(
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
encoding=None,
optional_params=None,
):
@ -294,14 +298,18 @@ def embedding(
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response

View file

@ -305,8 +305,8 @@ def completion(
prompt_tokens = billed_units.get("input_tokens", 0)
completion_tokens = billed_units.get("output_tokens", 0)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,26 +1,26 @@
# What is this?
## Handler file for databricks API https://docs.databricks.com/en/machine-learning/foundation-models/api-reference.html#chat-request
from functools import partial
import os, types
import copy
import json
from enum import Enum
import requests, copy # type: ignore
import os
import time
from typing import Callable, Optional, List, Union, Tuple, Literal
from litellm.utils import (
ModelResponse,
Usage,
CustomStreamWrapper,
EmbeddingResponse,
)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from .base import BaseLLM
import types
from enum import Enum
from functools import partial
from typing import Callable, List, Literal, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.databricks import GenericStreamingChunk
from litellm.types.utils import ProviderField
from litellm.utils import CustomStreamWrapper, EmbeddingResponse, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class DatabricksError(Exception):
@ -354,8 +354,8 @@ class DatabricksChatCompletion(BaseLLM):
completion_tokens = completion_response["usage"]["output_tokens"]
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,7 +1,7 @@
####################################
######### DEPRECATED FILE ##########
####################################
# logic moved to `vertex_httpx.py` #
# ####################################
# ######### DEPRECATED FILE ##########
# ####################################
# # logic moved to `vertex_httpx.py` #
import copy
import time
@ -92,332 +92,332 @@ class GeminiConfig:
}
class TextStreamer:
"""
A class designed to return an async stream from AsyncGenerateContentResponse object.
"""
# class TextStreamer:
# """
# A class designed to return an async stream from AsyncGenerateContentResponse object.
# """
def __init__(self, response):
self.response = response
self._aiter = self.response.__aiter__()
# def __init__(self, response):
# self.response = response
# self._aiter = self.response.__aiter__()
async def __aiter__(self):
while True:
try:
# This will manually advance the async iterator.
# In the case the next object doesn't exists, __anext__() will simply raise a StopAsyncIteration exception
next_object = await self._aiter.__anext__()
yield next_object
except StopAsyncIteration:
# After getting all items from the async iterator, stop iterating
break
# async def __aiter__(self):
# while True:
# try:
# # This will manually advance the async iterator.
# # In the case the next object doesn't exists, __anext__() will simply raise a StopAsyncIteration exception
# next_object = await self._aiter.__anext__()
# yield next_object
# except StopAsyncIteration:
# # After getting all items from the async iterator, stop iterating
# break
def supports_system_instruction():
import google.generativeai as genai
# def supports_system_instruction():
# import google.generativeai as genai
gemini_pkg_version = Version(genai.__version__)
return gemini_pkg_version >= Version("0.5.0")
# gemini_pkg_version = Version(genai.__version__)
# return gemini_pkg_version >= Version("0.5.0")
def completion(
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
api_key,
encoding,
logging_obj,
custom_prompt_dict: dict,
acompletion: bool = False,
optional_params=None,
litellm_params=None,
logger_fn=None,
):
try:
import google.generativeai as genai # type: ignore
except:
raise Exception(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
)
genai.configure(api_key=api_key)
system_prompt = ""
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details["roles"],
initial_prompt_value=model_prompt_details["initial_prompt_value"],
final_prompt_value=model_prompt_details["final_prompt_value"],
messages=messages,
)
else:
system_prompt, messages = get_system_prompt(messages=messages)
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="gemini"
)
# def completion(
# model: str,
# messages: list,
# model_response: ModelResponse,
# print_verbose: Callable,
# api_key,
# encoding,
# logging_obj,
# custom_prompt_dict: dict,
# acompletion: bool = False,
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# try:
# import google.generativeai as genai # type: ignore
# except:
# raise Exception(
# "Importing google.generativeai failed, please run 'pip install -q google-generativeai"
# )
# genai.configure(api_key=api_key)
# system_prompt = ""
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details["roles"],
# initial_prompt_value=model_prompt_details["initial_prompt_value"],
# final_prompt_value=model_prompt_details["final_prompt_value"],
# messages=messages,
# )
# else:
# system_prompt, messages = get_system_prompt(messages=messages)
# prompt = prompt_factory(
# model=model, messages=messages, custom_llm_provider="gemini"
# )
## Load Config
inference_params = copy.deepcopy(optional_params)
stream = inference_params.pop("stream", None)
# ## Load Config
# inference_params = copy.deepcopy(optional_params)
# stream = inference_params.pop("stream", None)
# Handle safety settings
safety_settings_param = inference_params.pop("safety_settings", None)
safety_settings = None
if safety_settings_param:
safety_settings = [
genai.types.SafetySettingDict(x) for x in safety_settings_param
]
# # Handle safety settings
# safety_settings_param = inference_params.pop("safety_settings", None)
# safety_settings = None
# if safety_settings_param:
# safety_settings = [
# genai.types.SafetySettingDict(x) for x in safety_settings_param
# ]
config = litellm.GeminiConfig.get_config()
for k, v in config.items():
if (
k not in inference_params
): # completion(top_k=3) > gemini_config(top_k=3) <- allows for dynamic variables to be passed in
inference_params[k] = v
# config = litellm.GeminiConfig.get_config()
# for k, v in config.items():
# if (
# k not in inference_params
# ): # completion(top_k=3) > gemini_config(top_k=3) <- allows for dynamic variables to be passed in
# inference_params[k] = v
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={
"complete_input_dict": {
"inference_params": inference_params,
"system_prompt": system_prompt,
}
},
)
## COMPLETION CALL
try:
_params = {"model_name": "models/{}".format(model)}
_system_instruction = supports_system_instruction()
if _system_instruction and len(system_prompt) > 0:
_params["system_instruction"] = system_prompt
_model = genai.GenerativeModel(**_params)
if stream is True:
if acompletion is True:
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key="",
# additional_args={
# "complete_input_dict": {
# "inference_params": inference_params,
# "system_prompt": system_prompt,
# }
# },
# )
# ## COMPLETION CALL
# try:
# _params = {"model_name": "models/{}".format(model)}
# _system_instruction = supports_system_instruction()
# if _system_instruction and len(system_prompt) > 0:
# _params["system_instruction"] = system_prompt
# _model = genai.GenerativeModel(**_params)
# if stream is True:
# if acompletion is True:
async def async_streaming():
try:
response = await _model.generate_content_async(
contents=prompt,
generation_config=genai.types.GenerationConfig(
**inference_params
),
safety_settings=safety_settings,
stream=True,
)
# async def async_streaming():
# try:
# response = await _model.generate_content_async(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(
# **inference_params
# ),
# safety_settings=safety_settings,
# stream=True,
# )
response = litellm.CustomStreamWrapper(
TextStreamer(response),
model,
custom_llm_provider="gemini",
logging_obj=logging_obj,
)
return response
except Exception as e:
raise GeminiError(status_code=500, message=str(e))
# response = litellm.CustomStreamWrapper(
# TextStreamer(response),
# model,
# custom_llm_provider="gemini",
# logging_obj=logging_obj,
# )
# return response
# except Exception as e:
# raise GeminiError(status_code=500, message=str(e))
return async_streaming()
response = _model.generate_content(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
safety_settings=safety_settings,
stream=True,
)
return response
elif acompletion == True:
return async_completion(
_model=_model,
model=model,
prompt=prompt,
inference_params=inference_params,
safety_settings=safety_settings,
logging_obj=logging_obj,
print_verbose=print_verbose,
model_response=model_response,
messages=messages,
encoding=encoding,
)
else:
params = {
"contents": prompt,
"generation_config": genai.types.GenerationConfig(**inference_params),
"safety_settings": safety_settings,
}
response = _model.generate_content(**params)
except Exception as e:
raise GeminiError(
message=str(e),
status_code=500,
)
# return async_streaming()
# response = _model.generate_content(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(**inference_params),
# safety_settings=safety_settings,
# stream=True,
# )
# return response
# elif acompletion == True:
# return async_completion(
# _model=_model,
# model=model,
# prompt=prompt,
# inference_params=inference_params,
# safety_settings=safety_settings,
# logging_obj=logging_obj,
# print_verbose=print_verbose,
# model_response=model_response,
# messages=messages,
# encoding=encoding,
# )
# else:
# params = {
# "contents": prompt,
# "generation_config": genai.types.GenerationConfig(**inference_params),
# "safety_settings": safety_settings,
# }
# response = _model.generate_content(**params)
# except Exception as e:
# raise GeminiError(
# message=str(e),
# status_code=500,
# )
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response,
additional_args={"complete_input_dict": {}},
)
print_verbose(f"raw model_response: {response}")
## RESPONSE OBJECT
completion_response = response
try:
choices_list = []
for idx, item in enumerate(completion_response.candidates):
if len(item.content.parts) > 0:
message_obj = Message(content=item.content.parts[0].text)
else:
message_obj = Message(content=None)
choice_obj = Choices(index=idx, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
except Exception as e:
verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
verbose_logger.debug(traceback.format_exc())
raise GeminiError(
message=traceback.format_exc(), status_code=response.status_code
)
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key="",
# original_response=response,
# additional_args={"complete_input_dict": {}},
# )
# print_verbose(f"raw model_response: {response}")
# ## RESPONSE OBJECT
# completion_response = response
# try:
# choices_list = []
# for idx, item in enumerate(completion_response.candidates):
# if len(item.content.parts) > 0:
# message_obj = Message(content=item.content.parts[0].text)
# else:
# message_obj = Message(content=None)
# choice_obj = Choices(index=idx, message=message_obj)
# choices_list.append(choice_obj)
# model_response.choices = choices_list
# except Exception as e:
# verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
# verbose_logger.debug(traceback.format_exc())
# raise GeminiError(
# message=traceback.format_exc(), status_code=response.status_code
# )
try:
completion_response = model_response["choices"][0]["message"].get("content")
if completion_response is None:
raise Exception
except:
original_response = f"response: {response}"
if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}"
if "SAFETY" in original_response:
original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response:
original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError(
status_code=400,
message=f"No response received. Original response - {original_response}",
)
# try:
# completion_response = model_response["choices"][0]["message"].get("content")
# if completion_response is None:
# raise Exception
# except:
# original_response = f"response: {response}"
# if hasattr(response, "candidates"):
# original_response = f"response: {response.candidates}"
# if "SAFETY" in original_response:
# original_response += (
# "\nThe candidate content was flagged for safety reasons."
# )
# elif "RECITATION" in original_response:
# original_response += (
# "\nThe candidate content was flagged for recitation reasons."
# )
# raise GeminiError(
# status_code=400,
# message=f"No response received. Original response - {original_response}",
# )
## CALCULATING USAGE
prompt_str = ""
for m in messages:
if isinstance(m["content"], str):
prompt_str += m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
prompt_str += content["text"]
prompt_tokens = len(encoding.encode(prompt_str))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
# ## CALCULATING USAGE
# prompt_str = ""
# for m in messages:
# if isinstance(m["content"], str):
# prompt_str += m["content"]
# elif isinstance(m["content"], list):
# for content in m["content"]:
# if content["type"] == "text":
# prompt_str += content["text"]
# prompt_tokens = len(encoding.encode(prompt_str))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
model_response["created"] = int(time.time())
model_response["model"] = "gemini/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
# model_response.created = int(time.time())
# model_response.model = "gemini/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
async def async_completion(
_model,
model,
prompt,
inference_params,
safety_settings,
logging_obj,
print_verbose,
model_response,
messages,
encoding,
):
import google.generativeai as genai # type: ignore
# async def async_completion(
# _model,
# model,
# prompt,
# inference_params,
# safety_settings,
# logging_obj,
# print_verbose,
# model_response,
# messages,
# encoding,
# ):
# import google.generativeai as genai # type: ignore
response = await _model.generate_content_async(
contents=prompt,
generation_config=genai.types.GenerationConfig(**inference_params),
safety_settings=safety_settings,
)
# response = await _model.generate_content_async(
# contents=prompt,
# generation_config=genai.types.GenerationConfig(**inference_params),
# safety_settings=safety_settings,
# )
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response,
additional_args={"complete_input_dict": {}},
)
print_verbose(f"raw model_response: {response}")
## RESPONSE OBJECT
completion_response = response
try:
choices_list = []
for idx, item in enumerate(completion_response.candidates):
if len(item.content.parts) > 0:
message_obj = Message(content=item.content.parts[0].text)
else:
message_obj = Message(content=None)
choice_obj = Choices(index=idx, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
except Exception as e:
verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
verbose_logger.debug(traceback.format_exc())
raise GeminiError(
message=traceback.format_exc(), status_code=response.status_code
)
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key="",
# original_response=response,
# additional_args={"complete_input_dict": {}},
# )
# print_verbose(f"raw model_response: {response}")
# ## RESPONSE OBJECT
# completion_response = response
# try:
# choices_list = []
# for idx, item in enumerate(completion_response.candidates):
# if len(item.content.parts) > 0:
# message_obj = Message(content=item.content.parts[0].text)
# else:
# message_obj = Message(content=None)
# choice_obj = Choices(index=idx, message=message_obj)
# choices_list.append(choice_obj)
# model_response["choices"] = choices_list
# except Exception as e:
# verbose_logger.error("LiteLLM.gemini.py: Exception occurred - {}".format(str(e)))
# verbose_logger.debug(traceback.format_exc())
# raise GeminiError(
# message=traceback.format_exc(), status_code=response.status_code
# )
try:
completion_response = model_response["choices"][0]["message"].get("content")
if completion_response is None:
raise Exception
except:
original_response = f"response: {response}"
if hasattr(response, "candidates"):
original_response = f"response: {response.candidates}"
if "SAFETY" in original_response:
original_response += (
"\nThe candidate content was flagged for safety reasons."
)
elif "RECITATION" in original_response:
original_response += (
"\nThe candidate content was flagged for recitation reasons."
)
raise GeminiError(
status_code=400,
message=f"No response received. Original response - {original_response}",
)
# try:
# completion_response = model_response["choices"][0]["message"].get("content")
# if completion_response is None:
# raise Exception
# except:
# original_response = f"response: {response}"
# if hasattr(response, "candidates"):
# original_response = f"response: {response.candidates}"
# if "SAFETY" in original_response:
# original_response += (
# "\nThe candidate content was flagged for safety reasons."
# )
# elif "RECITATION" in original_response:
# original_response += (
# "\nThe candidate content was flagged for recitation reasons."
# )
# raise GeminiError(
# status_code=400,
# message=f"No response received. Original response - {original_response}",
# )
## CALCULATING USAGE
prompt_str = ""
for m in messages:
if isinstance(m["content"], str):
prompt_str += m["content"]
elif isinstance(m["content"], list):
for content in m["content"]:
if content["type"] == "text":
prompt_str += content["text"]
prompt_tokens = len(encoding.encode(prompt_str))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
# ## CALCULATING USAGE
# prompt_str = ""
# for m in messages:
# if isinstance(m["content"], str):
# prompt_str += m["content"]
# elif isinstance(m["content"], list):
# for content in m["content"]:
# if content["type"] == "text":
# prompt_str += content["text"]
# prompt_tokens = len(encoding.encode(prompt_str))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
model_response["created"] = int(time.time())
model_response["model"] = "gemini/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
return model_response
# model_response["created"] = int(time.time())
# model_response["model"] = "gemini/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# model_response.usage = usage
# return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass

View file

@ -1,17 +1,22 @@
## Uses the huggingface text generation inference API
import os, copy, types
import json
from enum import Enum
import httpx, requests
from .base import BaseLLM
import time
import litellm
from typing import Callable, Dict, List, Any, Literal, Tuple
from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, Usage
from typing import Optional
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.types.completion import ChatCompletionMessageToolCallParam
import copy
import enum
import json
import os
import time
import types
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
import httpx
import requests
import litellm
from litellm.types.completion import ChatCompletionMessageToolCallParam
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class HuggingfaceError(Exception):
@ -269,7 +274,7 @@ class Huggingface(BaseLLM):
def convert_to_model_response_object(
self,
completion_response,
model_response,
model_response: litellm.ModelResponse,
task: hf_tasks,
optional_params,
encoding,
@ -278,11 +283,9 @@ class Huggingface(BaseLLM):
):
if task == "conversational":
if len(completion_response["generated_text"]) > 0: # type: ignore
model_response["choices"][0]["message"][
"content"
] = completion_response[
model_response.choices[0].message.content = completion_response[ # type: ignore
"generated_text"
] # type: ignore
]
elif task == "text-generation-inference":
if (
not isinstance(completion_response, list)
@ -295,7 +298,7 @@ class Huggingface(BaseLLM):
)
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = output_parser(
model_response.choices[0].message.content = output_parser( # type: ignore
completion_response[0]["generated_text"]
)
## GETTING LOGPROBS + FINISH REASON
@ -310,7 +313,7 @@ class Huggingface(BaseLLM):
for token in completion_response[0]["details"]["tokens"]:
if token["logprob"] != None:
sum_logprob += token["logprob"]
model_response["choices"][0]["message"]._logprob = sum_logprob
setattr(model_response.choices[0].message, "_logprob", sum_logprob) # type: ignore
if "best_of" in optional_params and optional_params["best_of"] > 1:
if (
"details" in completion_response[0]
@ -337,14 +340,14 @@ class Huggingface(BaseLLM):
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
model_response.choices.extend(choices_list)
elif task == "text-classification":
model_response["choices"][0]["message"]["content"] = json.dumps(
model_response.choices[0].message.content = json.dumps( # type: ignore
completion_response
)
else:
if len(completion_response[0]["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = output_parser(
model_response.choices[0].message.content = output_parser( # type: ignore
completion_response[0]["generated_text"]
)
## CALCULATING USAGE
@ -371,14 +374,14 @@ class Huggingface(BaseLLM):
else:
completion_tokens = 0
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
model_response.usage = usage
setattr(model_response, "usage", usage)
model_response._hidden_params["original_response"] = completion_response
return model_response
@ -763,10 +766,10 @@ class Huggingface(BaseLLM):
self,
model: str,
input: list,
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
model_response=None,
encoding=None,
):
super().embedding()
@ -867,15 +870,21 @@ class Huggingface(BaseLLM):
], # flatten list returned from hf
}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = {
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
setattr(
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
),
)
return model_response
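
The change running through this file (and the provider files that follow) swaps dict-style access on ModelResponse for attribute access, and attaches usage via setattr. A minimal sketch of the target pattern, assuming litellm's public ModelResponse and Usage objects behave as they do in this diff; the model id and token counts are illustrative:

import time

import litellm
from litellm.utils import Usage

model_response = litellm.ModelResponse()
# attribute access replaces model_response["choices"][0]["message"]["content"]
model_response.choices[0].message.content = "example output"
model_response.choices[0].finish_reason = "stop"
model_response.created = int(time.time())
model_response.model = "provider/example-model"  # hypothetical model id
# usage is attached with setattr, mirroring the replacements above
setattr(
    model_response,
    "usage",
    Usage(prompt_tokens=3, completion_tokens=2, total_tokens=5),
)
print(model_response.usage.total_tokens)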

View file

@ -1,11 +1,15 @@
import os, types
import json
import os
import time
import traceback
import types
from enum import Enum
from typing import Callable, List, Optional
import requests # type: ignore
import time, traceback
from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
from litellm.utils import Choices, Message, ModelResponse, Usage
class MaritalkError(Exception):
@ -152,9 +156,9 @@ def completion(
else:
try:
if len(completion_response["answer"]) > 0:
model_response["choices"][0]["message"]["content"] = (
completion_response["answer"]
)
model_response.choices[0].message.content = completion_response[ # type: ignore
"answer"
]
except Exception as e:
raise MaritalkError(
message=response.text, status_code=response.status_code
@ -167,8 +171,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,9 +1,12 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
@ -185,7 +188,7 @@ def completion(
else:
try:
if len(completion_response["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = (
model_response.choices[0].message.content = ( # type: ignore
completion_response["generated_text"]
)
except:
@ -198,8 +201,8 @@ def completion(
prompt_tokens = completion_response["nb_input_tokens"]
completion_tokens = completion_response["nb_generated_tokens"]
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,13 +1,21 @@
from itertools import chain
import requests, types, time # type: ignore
import json, uuid
import asyncio
import json
import time
import traceback
from typing import Optional, List
import types
import uuid
from itertools import chain
from typing import List, Optional
import aiohttp
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.types.utils import ProviderField
import httpx, aiohttp, asyncio # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm import verbose_logger
from litellm.types.utils import ProviderField
from .prompt_templates.factory import custom_prompt, prompt_factory
class OllamaError(Exception):
@ -138,7 +146,6 @@ class OllamaConfig:
)
]
def get_supported_openai_params(
self,
):
@ -157,7 +164,8 @@ class OllamaConfig:
# ollama wants plain base64 jpeg/png files as images. strip any leading dataURI
# and convert to jpeg if necessary.
def _convert_image(image):
import base64, io
import base64
import io
try:
from PIL import Image
@ -183,13 +191,13 @@ def _convert_image(image):
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
api_base="http://localhost:11434",
model="llama2",
prompt="Why is the sky blue?",
optional_params=None,
logging_obj=None,
acompletion: bool = False,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/generate"):
@ -271,7 +279,7 @@ def get_ollama_response(
response_json = response.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
@ -287,20 +295,24 @@ def get_ollama_response(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json["response"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
model_response.choices[0].message.content = response_json["response"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
completion_tokens = response_json.get(
"eval_count", len(response_json.get("message", dict()).get("content", ""))
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -346,8 +358,8 @@ def ollama_completion_stream(url, data, logging_obj):
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
for transformed_chunk in streamwrapper:
@ -401,8 +413,8 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
async for transformed_chunk in streamwrapper:
@ -418,7 +430,9 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
raise e
async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
async def ollama_acompletion(
url, data, model_response: litellm.ModelResponse, encoding, logging_obj
):
data["stream"] = False
try:
timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes
@ -442,7 +456,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
response_json = await resp.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message(
@ -451,30 +465,34 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"name": function_call.get("name", function_call.get("function", None)),
"name": function_call.get(
"name", function_call.get("function", None)
),
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function",
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json[
"response"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
model_response.choices[0].message.content = response_json["response"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
completion_tokens = response_json.get(
"eval_count",
len(response_json.get("message", dict()).get("content", "")),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
except Exception as e:
@ -491,9 +509,9 @@ async def ollama_aembeddings(
api_base: str,
model: str,
prompts: list,
model_response: litellm.EmbeddingResponse,
optional_params=None,
logging_obj=None,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/embeddings"):
@ -554,13 +572,19 @@ async def ollama_aembeddings(
input_tokens = len(encoding.encode(prompt))
total_input_tokens += input_tokens
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
model_response.object = "list"
model_response.data = output_data
model_response.model = model
setattr(
model_response,
"usage",
litellm.Usage(
**{
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
),
)
return model_response
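
A hedged sketch of the format=json branch above: the raw Ollama response string is parsed and re-wrapped as an OpenAI-style tool call on the ModelResponse. The payload is an invented example; the field names come straight from the code above:

import json
import uuid

import litellm

# invented example of an Ollama /api/generate response when format=json was requested
response_json = {
    "response": json.dumps({"name": "get_weather", "arguments": {"city": "Paris"}})
}

function_call = json.loads(response_json["response"])
message = litellm.Message(
    content=None,
    tool_calls=[
        {
            "id": f"call_{str(uuid.uuid4())}",
            "function": {
                "name": function_call.get("name", function_call.get("function", None)),
                "arguments": json.dumps(function_call["arguments"]),
            },
            "type": "function",
        }
    ],
)
model_response = litellm.ModelResponse()
model_response.choices[0].message = message
model_response.choices[0].finish_reason = "tool_calls"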

View file

@ -1,15 +1,17 @@
from itertools import chain
import requests
import types
import time
import json
import uuid
import time
import traceback
import types
import uuid
from itertools import chain
from typing import Optional
from litellm import verbose_logger
import litellm
import httpx
import aiohttp
import httpx
import requests
import litellm
from litellm import verbose_logger
class OllamaError(Exception):
@ -195,6 +197,7 @@ class OllamaChatConfig:
# ollama implementation
def get_ollama_response(
model_response: litellm.ModelResponse,
api_base="http://localhost:11434",
api_key: Optional[str] = None,
model="llama2",
@ -202,7 +205,6 @@ def get_ollama_response(
optional_params=None,
logging_obj=None,
acompletion: bool = False,
model_response=None,
encoding=None,
):
if api_base.endswith("/api/chat"):
@ -295,7 +297,7 @@ def get_ollama_response(
response_json = response.json()
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
@ -311,22 +313,24 @@ def get_ollama_response(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json["message"][
"content"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
model_response.choices[0].message.content = response_json["message"]["content"] # type: ignore
model_response.created = int(time.time())
model_response.model = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=messages)) # type: ignore
completion_tokens = response_json.get(
"eval_count", litellm.token_counter(text=response_json["message"]["content"])
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
@ -379,8 +383,8 @@ def ollama_completion_stream(url, api_key, data, logging_obj):
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
for transformed_chunk in streamwrapper:
@ -434,7 +438,9 @@ async def ollama_async_streaming(
{
"id": f"call_{str(uuid.uuid4())}",
"function": {
"name": function_call.get("name", function_call.get("function", None)),
"name": function_call.get(
"name", function_call.get("function", None)
),
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function",
@ -442,8 +448,8 @@ async def ollama_async_streaming(
],
)
model_response = first_chunk
model_response["choices"][0]["delta"] = delta
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].delta = delta # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
yield model_response
else:
async for transformed_chunk in streamwrapper:
@ -457,7 +463,7 @@ async def ollama_acompletion(
url,
api_key: Optional[str],
data,
model_response,
model_response: litellm.ModelResponse,
encoding,
logging_obj,
function_name,
@ -492,7 +498,7 @@ async def ollama_acompletion(
)
## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop"
model_response.choices[0].finish_reason = "stop"
if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message(
@ -510,15 +516,17 @@ async def ollama_acompletion(
}
],
)
model_response["choices"][0]["message"] = message
model_response["choices"][0]["finish_reason"] = "tool_calls"
model_response.choices[0].message = message # type: ignore
model_response.choices[0].finish_reason = "tool_calls"
else:
model_response["choices"][0]["message"]["content"] = response_json[
model_response.choices[0].message.content = response_json[ # type: ignore
"message"
]["content"]
][
"content"
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama_chat/" + data["model"]
model_response.created = int(time.time())
model_response.model = "ollama_chat/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", litellm.token_counter(messages=data["messages"])) # type: ignore
completion_tokens = response_json.get(
"eval_count",
@ -526,10 +534,14 @@ async def ollama_acompletion(
text=response_json["message"]["content"], count_response_tokens=True
),
)
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
setattr(
model_response,
"usage",
litellm.Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
except Exception as e:

View file

@ -1,11 +1,14 @@
import os
import json
from enum import Enum
import requests # type: ignore
import os
import time
from enum import Enum
from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
import requests # type: ignore
from litellm.utils import EmbeddingResponse, ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class OobaboogaError(Exception):
@ -99,17 +102,15 @@ def completion(
)
else:
try:
model_response["choices"][0]["message"]["content"] = (
completion_response["choices"][0]["message"]["content"]
)
model_response.choices[0].message.content = completion_response["choices"][0]["message"]["content"] # type: ignore
except:
raise OobaboogaError(
message=json.dumps(completion_response),
status_code=response.status_code,
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=completion_response["usage"]["prompt_tokens"],
completion_tokens=completion_response["usage"]["completion_tokens"],
@ -122,10 +123,10 @@ def completion(
def embedding(
model: str,
input: list,
model_response: EmbeddingResponse,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
@ -166,7 +167,7 @@ def embedding(
)
# Process response data
model_response["data"] = [
model_response.data = [
{
"embedding": completion_response["data"][0]["embedding"],
"index": 0,
@ -176,8 +177,12 @@ def embedding(
num_tokens = len(completion_response["data"][0]["embedding"])
# Adding metadata to response
model_response.usage = Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)
model_response["object"] = "list"
model_response["model"] = model
setattr(
model_response,
"usage",
Usage(prompt_tokens=num_tokens, total_tokens=num_tokens),
)
model_response.object = "list"
model_response.model = model
return model_response

View file

@ -18,6 +18,7 @@ import httpx
import openai
from openai import AsyncOpenAI, OpenAI
from openai.types.beta.assistant_deleted import AssistantDeleted
from openai.types.file_deleted import FileDeleted
from pydantic import BaseModel
from typing_extensions import overload, override
@ -2064,6 +2065,151 @@ class OpenAIFilesAPI(BaseLLM):
return response
async def aretrieve_file(
self,
file_id: str,
openai_client: AsyncOpenAI,
) -> FileObject:
response = await openai_client.files.retrieve(file_id=file_id)
return response
def retrieve_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.aretrieve_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.retrieve(file_id=file_id)
return response
async def adelete_file(
self,
file_id: str,
openai_client: AsyncOpenAI,
) -> FileDeleted:
response = await openai_client.files.delete(file_id=file_id)
return response
def delete_file(
self,
_is_async: bool,
file_id: str,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.adelete_file( # type: ignore
file_id=file_id,
openai_client=openai_client,
)
response = openai_client.files.delete(file_id=file_id)
return response
async def alist_files(
self,
openai_client: AsyncOpenAI,
purpose: Optional[str] = None,
):
if isinstance(purpose, str):
response = await openai_client.files.list(purpose=purpose)
else:
response = await openai_client.files.list()
return response
def list_files(
self,
_is_async: bool,
api_base: str,
api_key: Optional[str],
timeout: Union[float, httpx.Timeout],
max_retries: Optional[int],
organization: Optional[str],
purpose: Optional[str] = None,
client: Optional[Union[OpenAI, AsyncOpenAI]] = None,
):
openai_client: Optional[Union[OpenAI, AsyncOpenAI]] = self.get_openai_client(
api_key=api_key,
api_base=api_base,
timeout=timeout,
max_retries=max_retries,
organization=organization,
client=client,
_is_async=_is_async,
)
if openai_client is None:
raise ValueError(
"OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment."
)
if _is_async is True:
if not isinstance(openai_client, AsyncOpenAI):
raise ValueError(
"OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client."
)
return self.alist_files( # type: ignore
purpose=purpose,
openai_client=openai_client,
)
if isinstance(purpose, str):
response = openai_client.files.list(purpose=purpose)
else:
response = openai_client.files.list()
return response
class OpenAIBatchesAPI(BaseLLM):
"""

View file

@ -1,12 +1,14 @@
import types
import traceback
import copy
import time
import traceback
import types
from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm
import httpx
import litellm
from litellm import verbose_logger
from litellm.utils import Choices, Message, ModelResponse, Usage
class PalmError(Exception):
@ -164,7 +166,7 @@ def completion(
message_obj = Message(content=None)
choice_obj = Choices(index=idx + 1, message=message_obj)
choices_list.append(choice_obj)
model_response["choices"] = choices_list
model_response.choices = choices_list # type: ignore
except Exception as e:
verbose_logger.error(
"litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
@ -188,8 +190,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = "palm/" + model
model_response.created = int(time.time())
model_response.model = "palm/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,12 +1,16 @@
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class PetalsError(Exception):
@ -151,8 +155,8 @@ def completion(
else:
try:
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM # type: ignore
from transformers import AutoTokenizer
except:
raise Exception(
"Importing torch, transformers, petals failed\nTry pip installing petals \npip install git+https://github.com/bigscience-workshop/petals"
@ -189,15 +193,15 @@ def completion(
output_text = tokenizer.decode(outputs[0])
if len(output_text) > 0:
model_response["choices"][0]["message"]["content"] = output_text
model_response.choices[0].message.content = output_text # type: ignore
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content"))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -279,7 +279,7 @@ class PredibaseChatCompletion(BaseLLM):
message=f"'generated_text' is not a key response dictionary - {completion_response}",
)
if len(completion_response["generated_text"]) > 0:
model_response["choices"][0]["message"]["content"] = self.output_parser(
model_response.choices[0].message.content = self.output_parser( # type: ignore
completion_response["generated_text"]
)
## GETTING LOGPROBS + FINISH REASON
@ -294,10 +294,10 @@ class PredibaseChatCompletion(BaseLLM):
for token in completion_response["details"]["tokens"]:
if token["logprob"] is not None:
sum_logprob += token["logprob"]
model_response["choices"][0][
"message"
]._logprob = (
sum_logprob # [TODO] move this to using the actual logprobs
setattr(
model_response.choices[0].message, # type: ignore
"_logprob",
sum_logprob, # [TODO] move this to using the actual logprobs
)
if "best_of" in optional_params and optional_params["best_of"] > 1:
if (
@ -325,7 +325,7 @@ class PredibaseChatCompletion(BaseLLM):
message=message_obj,
)
choices_list.append(choice_obj)
model_response["choices"].extend(choices_list)
model_response.choices.extend(choices_list)
## CALCULATING USAGE
prompt_tokens = 0
@ -351,8 +351,8 @@ class PredibaseChatCompletion(BaseLLM):
total_tokens = prompt_tokens + completion_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -388,7 +388,7 @@ def process_response(
## Building RESPONSE OBJECT
if len(result) > 1:
model_response["choices"][0]["message"]["content"] = result
model_response.choices[0].message.content = result # type: ignore
# Calculate usage
prompt_tokens = len(encoding.encode(prompt, disallowed_special=()))
@ -398,7 +398,7 @@ def process_response(
disallowed_special=(),
)
)
model_response["model"] = "replicate/" + model
model_response.model = "replicate/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -498,7 +498,7 @@ def completion(
## Step1: Start Prediction: gets a prediction url
## Step2: Poll prediction url for response
## Step2: is handled with and without streaming
model_response["created"] = int(
model_response.created = int(
time.time()
) # for pricing this must remain right before calling api

View file

@ -1,16 +1,21 @@
import os, types, traceback
from enum import Enum
import json
import requests # type: ignore
import time
from typing import Callable, Optional, Any
import litellm
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
import sys
from copy import deepcopy
import httpx # type: ignore
import io
from .prompt_templates.factory import prompt_factory, custom_prompt
import json
import os
import sys
import time
import traceback
import types
from copy import deepcopy
from enum import Enum
from typing import Any, Callable, Optional
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import EmbeddingResponse, ModelResponse, Usage, get_secret
from .prompt_templates.factory import custom_prompt, prompt_factory
class SagemakerError(Exception):
@ -377,7 +382,7 @@ def completion(
if completion_output.startswith(prompt) and "<s>" in prompt:
completion_output = completion_output.replace(prompt, "", 1)
model_response["choices"][0]["message"]["content"] = completion_output
model_response.choices[0].message.content = completion_output # type: ignore
except:
raise SagemakerError(
message=f"LiteLLM Error: Unable to parse sagemaker RAW RESPONSE {json.dumps(completion_response)}",
@ -390,8 +395,8 @@ def completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -597,7 +602,7 @@ async def async_completion(
if completion_output.startswith(data["inputs"]) and "<s>" in data["inputs"]:
completion_output = completion_output.replace(data["inputs"], "", 1)
model_response["choices"][0]["message"]["content"] = completion_output
model_response.choices[0].message.content = completion_output # type: ignore
except:
raise SagemakerError(
message=f"LiteLLM Error: Unable to parse sagemaker RAW RESPONSE {json.dumps(completion_response)}",
@ -610,8 +615,8 @@ async def async_completion(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -741,16 +746,20 @@ def embedding(
{"object": "embedding", "index": idx, "embedding": embedding}
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response.object = "list"
model_response.data = output_data
model_response.model = model
input_tokens = 0
for text in input:
input_tokens += len(encoding.encode(text))
model_response["usage"] = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
setattr(
model_response,
"usage",
Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
),
)
return model_response

View file

@ -3,16 +3,20 @@ Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import os, types
import json
from enum import Enum
import requests # type: ignore
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import litellm
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
class TogetherAIError(Exception):
@ -91,145 +95,145 @@ class TogetherAIConfig:
}
def validate_environment(api_key):
if api_key is None:
raise ValueError(
"Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
)
headers = {
"accept": "application/json",
"content-type": "application/json",
"Authorization": "Bearer " + api_key,
}
return headers
# def validate_environment(api_key):
# if api_key is None:
# raise ValueError(
# "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
# )
# headers = {
# "accept": "application/json",
# "content-type": "application/json",
# "Authorization": "Bearer " + api_key,
# }
# return headers
def completion(
model: str,
messages: list,
api_base: str,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
custom_prompt_dict={},
optional_params=None,
litellm_params=None,
logger_fn=None,
):
headers = validate_environment(api_key)
# def completion(
# model: str,
# messages: list,
# api_base: str,
# model_response: ModelResponse,
# print_verbose: Callable,
# encoding,
# api_key,
# logging_obj,
# custom_prompt_dict={},
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# headers = validate_environment(api_key)
## Load Config
config = litellm.TogetherAIConfig.get_config()
for k, v in config.items():
if (
k not in optional_params
): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
# ## Load Config
# config = litellm.TogetherAIConfig.get_config()
# for k, v in config.items():
# if (
# k not in optional_params
# ): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
# optional_params[k] = v
print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_details = custom_prompt_dict[model]
prompt = custom_prompt(
role_dict=model_prompt_details.get("roles", {}),
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
bos_token=model_prompt_details.get("bos_token", ""),
eos_token=model_prompt_details.get("eos_token", ""),
messages=messages,
)
else:
prompt = prompt_factory(
model=model,
messages=messages,
api_key=api_key,
custom_llm_provider="together_ai",
) # api key required to query together ai model list
# print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details.get("roles", {}),
# initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
# final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
# bos_token=model_prompt_details.get("bos_token", ""),
# eos_token=model_prompt_details.get("eos_token", ""),
# messages=messages,
# )
# else:
# prompt = prompt_factory(
# model=model,
# messages=messages,
# api_key=api_key,
# custom_llm_provider="together_ai",
# ) # api key required to query together ai model list
data = {
"model": model,
"prompt": prompt,
"request_type": "language-model-inference",
**optional_params,
}
# data = {
# "model": model,
# "prompt": prompt,
# "request_type": "language-model-inference",
# **optional_params,
# }
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=api_key,
additional_args={
"complete_input_dict": data,
"headers": headers,
"api_base": api_base,
},
)
## COMPLETION CALL
if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
response = requests.post(
api_base,
headers=headers,
data=json.dumps(data),
stream=optional_params["stream_tokens"],
)
return response.iter_lines()
else:
response = requests.post(api_base, headers=headers, data=json.dumps(data))
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
if response.status_code != 200:
raise TogetherAIError(
status_code=response.status_code, message=response.text
)
completion_response = response.json()
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key=api_key,
# additional_args={
# "complete_input_dict": data,
# "headers": headers,
# "api_base": api_base,
# },
# )
# ## COMPLETION CALL
# if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
# response = requests.post(
# api_base,
# headers=headers,
# data=json.dumps(data),
# stream=optional_params["stream_tokens"],
# )
# return response.iter_lines()
# else:
# response = requests.post(api_base, headers=headers, data=json.dumps(data))
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# if response.status_code != 200:
# raise TogetherAIError(
# status_code=response.status_code, message=response.text
# )
# completion_response = response.json()
if "error" in completion_response:
raise TogetherAIError(
message=json.dumps(completion_response),
status_code=response.status_code,
)
elif "error" in completion_response["output"]:
raise TogetherAIError(
message=json.dumps(completion_response["output"]),
status_code=response.status_code,
)
# if "error" in completion_response:
# raise TogetherAIError(
# message=json.dumps(completion_response),
# status_code=response.status_code,
# )
# elif "error" in completion_response["output"]:
# raise TogetherAIError(
# message=json.dumps(completion_response["output"]),
# status_code=response.status_code,
# )
if len(completion_response["output"]["choices"][0]["text"]) >= 0:
model_response["choices"][0]["message"]["content"] = completion_response[
"output"
]["choices"][0]["text"]
# if len(completion_response["output"]["choices"][0]["text"]) >= 0:
# model_response.choices[0].message.content = completion_response["output"][
# "choices"
# ][0]["text"]
## CALCULATING USAGE
print_verbose(
f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
)
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
)
if "finish_reason" in completion_response["output"]["choices"][0]:
model_response.choices[0].finish_reason = completion_response["output"][
"choices"
][0]["finish_reason"]
model_response["created"] = int(time.time())
model_response["model"] = "together_ai/" + model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
setattr(model_response, "usage", usage)
return model_response
# ## CALCULATING USAGE
# print_verbose(
# f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
# )
# prompt_tokens = len(encoding.encode(prompt))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
# if "finish_reason" in completion_response["output"]["choices"][0]:
# model_response.choices[0].finish_reason = completion_response["output"][
# "choices"
# ][0]["finish_reason"]
# model_response["created"] = int(time.time())
# model_response["model"] = "together_ai/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass
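
With the raw HTTP path above commented out, Together AI requests are expected to go through litellm.completion(), which routes them via the OpenAI-compatible client referenced in the docstring. A hedged sketch; the model id and key are placeholders, and an environment key such as TOGETHERAI_API_KEY (as read elsewhere in this diff) also works:

import litellm

response = litellm.completion(
    model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",  # example Together AI model id
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    api_key="your-together-ai-key",  # placeholder; or set TOGETHERAI_API_KEY
)
print(response.choices[0].message.content)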

View file

@ -852,16 +852,14 @@ def completion(
## RESPONSE OBJECT
if isinstance(completion_response, litellm.Message):
model_response["choices"][0]["message"] = completion_response
model_response.choices[0].message = completion_response # type: ignore
elif len(str(completion_response)) > 0:
model_response["choices"][0]["message"]["content"] = str(
completion_response
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices[0].message.content = str(completion_response) # type: ignore
model_response.created = int(time.time())
model_response.model = model
## CALCULATING USAGE
if model in litellm.vertex_language_models and response_obj is not None:
model_response["choices"][0].finish_reason = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_obj.candidates[0].finish_reason.name
)
usage = Usage(
@ -912,7 +910,7 @@ async def async_completion(
request_str: str,
print_verbose: Callable,
logging_obj,
encoding=None,
encoding,
client_options=None,
instances=None,
vertex_project=None,
@ -1088,16 +1086,16 @@ async def async_completion(
## RESPONSE OBJECT
if isinstance(completion_response, litellm.Message):
model_response["choices"][0]["message"] = completion_response
model_response.choices[0].message = completion_response # type: ignore
elif len(str(completion_response)) > 0:
model_response["choices"][0]["message"]["content"] = str(
model_response.choices[0].message.content = str( # type: ignore
completion_response
)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
## CALCULATING USAGE
if model in litellm.vertex_language_models and response_obj is not None:
model_response["choices"][0].finish_reason = map_finish_reason(
model_response.choices[0].finish_reason = map_finish_reason(
response_obj.candidates[0].finish_reason.name
)
usage = Usage(
@ -1377,16 +1375,16 @@ class VertexAITextEmbeddingConfig(BaseModel):
def embedding(
model: str,
input: Union[list, str],
print_verbose,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
vertex_project=None,
vertex_location=None,
vertex_credentials=None,
aembedding=False,
print_verbose=None,
):
# logic for parsing in - calling - parsing out model embedding calls
try:
@ -1484,15 +1482,15 @@ def embedding(
"embedding": embedding.values,
}
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens += embedding.statistics.token_count # type: ignore
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
setattr(model_response, "usage", usage)
return model_response
@ -1500,8 +1498,8 @@ def embedding(
async def async_embedding(
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
client=None,
@ -1541,11 +1539,11 @@ async def async_embedding(
)
input_tokens += embedding.statistics.token_count
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
model_response.object = "list"
model_response.data = embedding_response
model_response.model = model
usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
model_response.usage = usage
setattr(model_response, "usage", usage)
return model_response

View file

@ -367,8 +367,8 @@ async def async_completion(
prompt_tokens = message.usage.input_tokens
completion_tokens = message.usage.output_tokens
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -1,11 +1,15 @@
import os
import json
import os
import time # type: ignore
from enum import Enum
from typing import Any, Callable
import httpx
import requests # type: ignore
import time, httpx # type: ignore
from typing import Callable, Any
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt
from .prompt_templates.factory import custom_prompt, prompt_factory
llm = None
@ -91,14 +95,14 @@ def completion(
)
print_verbose(f"raw model_response: {outputs}")
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = outputs[0].outputs[0].text
model_response.choices[0].message.content = outputs[0].outputs[0].text # type: ignore
## CALCULATING USAGE
prompt_tokens = len(outputs[0].prompt_token_ids)
completion_tokens = len(outputs[0].outputs[0].token_ids)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -173,14 +177,14 @@ def batch_completions(
for output in outputs:
model_response = ModelResponse()
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = output.outputs[0].text
model_response.choices[0].message.content = output.outputs[0].text # type: ignore
## CALCULATING USAGE
prompt_tokens = len(output.prompt_token_ids)
completion_tokens = len(output.outputs[0].token_ids)
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.created = int(time.time())
model_response.model = model
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,

View file

@ -25,7 +25,13 @@ import requests # type: ignore
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.utils import ModelResponse, Usage, get_secret
from litellm.utils import (
EmbeddingResponse,
ModelResponse,
Usage,
get_secret,
map_finish_reason,
)
from .base import BaseLLM
from .prompt_templates import factory as ptf
@ -414,14 +420,16 @@ class IBMWatsonXAI(BaseLLM):
generated_text = json_resp["results"][0]["generated_text"]
prompt_tokens = json_resp["results"][0]["input_token_count"]
completion_tokens = json_resp["results"][0]["generated_token_count"]
model_response["choices"][0]["message"]["content"] = generated_text
model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
model_response.choices[0].message.content = generated_text # type: ignore
model_response.choices[0].finish_reason = map_finish_reason(
json_resp["results"][0]["stop_reason"]
)
if json_resp.get("created_at"):
model_response["created"] = datetime.fromisoformat(
json_resp["created_at"]
).timestamp()
model_response.created = int(
datetime.fromisoformat(json_resp["created_at"]).timestamp()
)
else:
model_response["created"] = int(time.time())
model_response.created = int(time.time())
usage = Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
@ -463,7 +471,7 @@ class IBMWatsonXAI(BaseLLM):
prompt = convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
)
model_response["model"] = model
model_response.model = model
def process_stream_response(
stream_resp: Union[Iterator[str], AsyncIterator],
@ -551,10 +559,10 @@ class IBMWatsonXAI(BaseLLM):
raise WatsonXAIError(status_code=500, message=str(e))
def _process_embedding_response(
self, json_resp: dict, model_response: Union[ModelResponse, None] = None
) -> ModelResponse:
self, json_resp: dict, model_response: Optional[EmbeddingResponse] = None
) -> EmbeddingResponse:
if model_response is None:
model_response = ModelResponse(model=json_resp.get("model_id", None))
model_response = EmbeddingResponse(model=json_resp.get("model_id", None))
results = json_resp.get("results", [])
embedding_response = []
for idx, result in enumerate(results):
@ -565,8 +573,8 @@ class IBMWatsonXAI(BaseLLM):
"embedding": result["embedding"],
}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response.object = "list"
model_response.data = embedding_response
input_tokens = json_resp.get("input_token_count", 0)
setattr(
model_response,
@ -583,9 +591,9 @@ class IBMWatsonXAI(BaseLLM):
self,
model: str,
input: Union[list, str],
model_response: litellm.EmbeddingResponse,
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
print_verbose=None,
@ -602,7 +610,7 @@ class IBMWatsonXAI(BaseLLM):
if k not in optional_params:
optional_params[k] = v
model_response["model"] = model
model_response.model = model
# Load auth variables from environment variables
if isinstance(input, str):
@ -635,12 +643,12 @@ class IBMWatsonXAI(BaseLLM):
}
request_manager = RequestManager(logging_obj)
def handle_embedding(request_params: dict) -> ModelResponse:
def handle_embedding(request_params: dict) -> EmbeddingResponse:
with request_manager.request(request_params, input=input) as resp:
json_resp = resp.json()
return self._process_embedding_response(json_resp, model_response)
async def handle_aembedding(request_params: dict) -> ModelResponse:
async def handle_aembedding(request_params: dict) -> EmbeddingResponse:
async with request_manager.async_request(
request_params, input=input
) as resp:

View file

@ -38,6 +38,7 @@ import dotenv
import httpx
import openai
import tiktoken
from pydantic import BaseModel
from typing_extensions import overload
import litellm
@ -48,6 +49,7 @@ from litellm import ( # type: ignore
get_litellm_params,
get_optional_params,
)
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.utils import (
CustomStreamWrapper,
@ -520,7 +522,7 @@ def mock_completion(
)
return response
if n is None:
model_response["choices"][0]["message"]["content"] = mock_response
model_response.choices[0].message.content = mock_response # type: ignore
else:
_all_choices = []
for i in range(n):
@ -531,12 +533,12 @@ def mock_completion(
),
)
_all_choices.append(_choice)
model_response["choices"] = _all_choices
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices = _all_choices # type: ignore
model_response.created = int(time.time())
model_response.model = model
if mock_tool_calls:
model_response["choices"][0]["message"]["tool_calls"] = [
model_response.choices[0].message.tool_calls = [ # type: ignore
ChatCompletionMessageToolCall(**tool_call)
for tool_call in mock_tool_calls
]
@ -1932,51 +1934,7 @@ def completion(
"""
Deprecated. We now do together ai calls via the openai client - https://docs.together.ai/docs/openai-api-compatibility
"""
custom_llm_provider = "together_ai"
together_ai_key = (
api_key
or litellm.togetherai_api_key
or get_secret("TOGETHER_AI_TOKEN")
or get_secret("TOGETHERAI_API_KEY")
or litellm.api_key
)
api_base = (
api_base
or litellm.api_base
or get_secret("TOGETHERAI_API_BASE")
or "https://api.together.xyz/inference"
)
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
model_response = together_ai.completion(
model=model,
messages=messages,
api_base=api_base,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
api_key=together_ai_key,
logging_obj=logging,
custom_prompt_dict=custom_prompt_dict,
)
if (
"stream_tokens" in optional_params
and optional_params["stream_tokens"] == True
):
# don't try to access stream object,
response = CustomStreamWrapper(
model_response,
model,
custom_llm_provider="together_ai",
logging_obj=logging,
)
return response
response = model_response
pass
elif custom_llm_provider == "palm":
palm_api_key = api_key or get_secret("PALM_API_KEY") or litellm.api_key
@ -2459,10 +2417,10 @@ def completion(
## LOGGING
generator = ollama.get_ollama_response(
api_base,
model,
prompt,
optional_params,
api_base=api_base,
model=model,
prompt=prompt,
optional_params=optional_params,
logging_obj=logging,
acompletion=acompletion,
model_response=model_response,
@ -2488,11 +2446,11 @@ def completion(
)
## LOGGING
generator = ollama_chat.get_ollama_response(
api_base,
api_key,
model,
messages,
optional_params,
api_base=api_base,
api_key=api_key,
model=model,
messages=messages,
optional_params=optional_params,
logging_obj=logging,
acompletion=acompletion,
model_response=model_response,
@ -2670,9 +2628,9 @@ def completion(
"""
string_response = response_json["data"][0]["output"][0]
## RESPONSE OBJECT
model_response["choices"][0]["message"]["content"] = string_response
model_response["created"] = int(time.time())
model_response["model"] = model
model_response.choices[0].message.content = string_response # type: ignore
model_response.created = int(time.time())
model_response.model = model
response = model_response
else:
raise ValueError(
@ -3463,7 +3421,7 @@ def embedding(
or api_base
or get_secret("OLLAMA_API_BASE")
or "http://localhost:11434"
)
) # type: ignore
if isinstance(input, str):
input = [input]
if not all(isinstance(item, str) for item in input):
@ -3473,9 +3431,11 @@ def embedding(
llm_provider="ollama", # type: ignore
)
ollama_embeddings_fn = (
ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
ollama.ollama_aembeddings
if aembedding is True
else ollama.ollama_embeddings
)
response = ollama_embeddings_fn(
response = ollama_embeddings_fn( # type: ignore
api_base=api_base,
model=model,
prompts=input,
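The embedding hunk above now selects ollama.ollama_aembeddings only when aembedding is True. A hedged sketch of hitting both branches against a local Ollama server; the model name is illustrative and the api_base matches the fallback default shown above:

```python
# Sketch: sync vs async Ollama embeddings, exercising the selection above.
# Model name is illustrative; the api_base matches the fallback default above.
import asyncio
import litellm

sync_resp = litellm.embedding(           # aembedding is False -> ollama_embeddings
    model="ollama/nomic-embed-text",
    input=["hello world"],
    api_base="http://localhost:11434",
)
print(len(sync_resp.data[0]["embedding"]))

async def main():                        # aembedding is True -> ollama_aembeddings
    return await litellm.aembedding(
        model="ollama/nomic-embed-text",
        input=["hello world"],
        api_base="http://localhost:11434",
    )

asyncio.run(main())
```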
@@ -3943,6 +3903,63 @@ def text_completion(
return text_completion_response
###### Adapter Completion ################
async def aadapter_completion(*, adapter_id: str, **kwargs) -> Optional[BaseModel]:
"""
Implemented to handle async calls for adapter_completion()
"""
try:
translation_obj: Optional[CustomLogger] = None
for item in litellm.adapters:
if item["id"] == adapter_id:
translation_obj = item["adapter"]
if translation_obj is None:
raise ValueError(
"No matching adapter given. Received 'adapter_id'={}, litellm.adapters={}".format(
adapter_id, litellm.adapters
)
)
new_kwargs = translation_obj.translate_completion_input_params(kwargs=kwargs)
response: ModelResponse = await acompletion(**new_kwargs) # type: ignore
translated_response = translation_obj.translate_completion_output_params(
response=response
)
return translated_response
except Exception as e:
raise e
def adapter_completion(*, adapter_id: str, **kwargs) -> Optional[BaseModel]:
translation_obj: Optional[CustomLogger] = None
for item in litellm.adapters:
if item["id"] == adapter_id:
translation_obj = item["adapter"]
if translation_obj is None:
raise ValueError(
"No matching adapter given. Received 'adapter_id'={}, litellm.adapters={}".format(
adapter_id, litellm.adapters
)
)
new_kwargs = translation_obj.translate_completion_input_params(kwargs=kwargs)
response: ModelResponse = completion(**new_kwargs) # type: ignore
translated_response = translation_obj.translate_completion_output_params(
response=response
)
return translated_response
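The new aadapter_completion/adapter_completion helpers look up a registered adapter by id, translate the incoming request, call (a)completion, and translate the response back. A minimal sketch of registering and using an adapter; MyAdapter is hypothetical, while the anthropic_adapter wired up later in this diff follows the same translate-in / translate-out shape:

```python
# Sketch: registering a custom adapter and routing a request through
# adapter_completion(). MyAdapter is hypothetical.
import litellm
from litellm.integrations.custom_logger import CustomLogger

class MyAdapter(CustomLogger):
    def translate_completion_input_params(self, kwargs):
        # map the caller's schema into litellm.completion kwargs
        return {"model": kwargs["model"], "messages": kwargs["messages"]}

    def translate_completion_output_params(self, response):
        # map the ModelResponse back into the caller's schema (pass-through here)
        return response

litellm.adapters = [{"id": "my-adapter", "adapter": MyAdapter()}]

translated = litellm.adapter_completion(
    adapter_id="my-adapter",
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
print(translated)
```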
##### Moderation #######################

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[48951,[\"665\",\"static/chunks/3014691f-589a5f4865c3822f.js\",\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-6a03368053f9d26d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-83a8bdddfe32b5d9.js\",\"777\",\"static/chunks/777-3264d0959a54279d.js\",\"931\",\"static/chunks/app/page-0cfbdaa2bf8fb022.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"LmgW0mreu0hjU2N9CAPDM\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-f960ab1e6d32b002.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-04708d7d4a17c1ee.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-1c3809c50f029674.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[48951,[\"665\",\"static/chunks/3014691f-589a5f4865c3822f.js\",\"936\",\"static/chunks/2f6dbc85-052c4579f80d66ae.js\",\"294\",\"static/chunks/294-0e35509d5ca95267.js\",\"131\",\"static/chunks/131-19b05e5ce40fa85d.js\",\"684\",\"static/chunks/684-bb2d2f93d92acb0b.js\",\"759\",\"static/chunks/759-d7572f2a46f911d5.js\",\"777\",\"static/chunks/777-3264d0959a54279d.js\",\"931\",\"static/chunks/app/page-1cc1412fb406fc70.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/275ab6ee150b4fea.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"aCz2wdplG6aqWrQnod4_6\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI 
Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[48951,["665","static/chunks/3014691f-589a5f4865c3822f.js","936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-83a8bdddfe32b5d9.js","777","static/chunks/777-3264d0959a54279d.js","931","static/chunks/app/page-0cfbdaa2bf8fb022.js"],""]
3:I[48951,["665","static/chunks/3014691f-589a5f4865c3822f.js","936","static/chunks/2f6dbc85-052c4579f80d66ae.js","294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-19b05e5ce40fa85d.js","684","static/chunks/684-bb2d2f93d92acb0b.js","759","static/chunks/759-d7572f2a46f911d5.js","777","static/chunks/777-3264d0959a54279d.js","931","static/chunks/app/page-1cc1412fb406fc70.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-6a03368053f9d26d.js","777","static/chunks/777-3264d0959a54279d.js","418","static/chunks/app/model_hub/page-6575356e2cde4d07.js"],""]
3:I[87494,["294","static/chunks/294-0e35509d5ca95267.js","131","static/chunks/131-19b05e5ce40fa85d.js","777","static/chunks/777-3264d0959a54279d.js","418","static/chunks/app/model_hub/page-6575356e2cde4d07.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["model_hub",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["model_hub",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","model_hub","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

File diff suppressed because one or more lines are too long

View file

@@ -2,6 +2,6 @@
3:I[667,["665","static/chunks/3014691f-589a5f4865c3822f.js","294","static/chunks/294-0e35509d5ca95267.js","684","static/chunks/684-bb2d2f93d92acb0b.js","777","static/chunks/777-3264d0959a54279d.js","461","static/chunks/app/onboarding/page-c73480cdcfdbe5ac.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["LmgW0mreu0hjU2N9CAPDM",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["aCz2wdplG6aqWrQnod4_6",[[["",{"children":["onboarding",{"children":["__PAGE__",{}]}]},"$undefined","$undefined",true],["",{"children":["onboarding",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children","onboarding","children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":"$undefined","notFoundStyles":"$undefined","styles":null}]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/275ab6ee150b4fea.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@@ -1,19 +1,14 @@
model_list:
- model_name: "*"
- model_name: azure-ai-mistral
litellm_params:
model: "openai/*"
- model_name: gemini-1.5-flash
api_base: os.environ/AZURE_AI_MISTRAL_API_BASE
api_key: os.environ/AZURE_AI_MISTRAL_API_KEY
model: azure_ai/Mistral-large-nmefg
- model_name: azure-ai-phi
litellm_params:
model: gemini/gemini-1.5-flash
- model_name: whisper
litellm_params:
model: azure/azure-whisper
api_version: 2024-02-15-preview
api_base: os.environ/AZURE_EUROPE_API_BASE
api_key: os.environ/AZURE_EUROPE_API_KEY
model_info:
mode: audio_transcription
api_base: os.environ/AZURE_AI_PHI_API_BASE
api_key: os.environ/AZURE_AI_PHI_API_KEY
model: azure_ai/Phi-3-medium-128k-instruct-fpmvj
general_settings:

View file

@@ -204,6 +204,10 @@ class LiteLLMRoutes(enum.Enum):
# files
"/v1/files",
"/files",
"/v1/files/{file_id}",
"/files/{file_id}",
"/v1/files/{file_id}/content",
"/files/{file_id}/content",
# assistants-related routes
"/assistants",
"/v1/assistants",

View file

@@ -71,6 +71,11 @@ azure_api_key_header = APIKeyHeader(
auto_error=False,
description="Some older versions of the openai Python package will send an API-Key header with just the API key ",
)
anthropic_api_key_header = APIKeyHeader(
name="x-api-key",
auto_error=False,
description="If anthropic client used.",
)
def _get_bearer_token(
@@ -87,6 +92,9 @@ async def user_api_key_auth(
request: Request,
api_key: str = fastapi.Security(api_key_header),
azure_api_key_header: str = fastapi.Security(azure_api_key_header),
anthropic_api_key_header: Optional[str] = fastapi.Security(
anthropic_api_key_header
),
) -> UserAPIKeyAuth:
from litellm.proxy.proxy_server import (
@@ -114,6 +122,9 @@ async def user_api_key_auth(
elif isinstance(azure_api_key_header, str):
api_key = azure_api_key_header
elif isinstance(anthropic_api_key_header, str):
api_key = anthropic_api_key_header
parent_otel_span: Optional[Span] = None
if open_telemetry_logger is not None:
parent_otel_span = open_telemetry_logger.tracer.start_span(
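With anthropic_api_key_header wired into user_api_key_auth above, a request that authenticates the way the Anthropic SDK does (via x-api-key) is now accepted. A hedged sketch against the proxy; the URL, key, and model mirror test values that appear elsewhere in this diff:

```python
# Sketch: authenticating with the Anthropic-style `x-api-key` header picked up
# by the new anthropic_api_key_header above. URL/key/model mirror test values
# used elsewhere in this diff.
import httpx

resp = httpx.post(
    "http://localhost:4000/v1/messages",
    headers={
        "x-api-key": "sk-1234",            # read by user_api_key_auth
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    },
    json={
        "model": "fake-openai-endpoint",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.status_code, resp.json())
```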

View file

@@ -25,3 +25,38 @@ if os.environ.get("LITELLM_PROFILE", "false").lower() == "true":
result.append(f"{stat.traceback.format()}: {stat.size / 1024} KiB")
return {"top_50_memory_usage": result}
@router.get("/otel-spans", include_in_schema=False)
async def get_otel_spans():
from litellm.integrations.opentelemetry import OpenTelemetry
from litellm.proxy.proxy_server import open_telemetry_logger
open_telemetry_logger: OpenTelemetry = open_telemetry_logger
otel_exporter = open_telemetry_logger.OTEL_EXPORTER
recorded_spans = otel_exporter.get_finished_spans()
print("Spans: ", recorded_spans) # noqa
most_recent_parent = None
most_recent_start_time = 1000000
spans_grouped_by_parent = {}
for span in recorded_spans:
if span.parent is not None:
parent_trace_id = span.parent.trace_id
if parent_trace_id not in spans_grouped_by_parent:
spans_grouped_by_parent[parent_trace_id] = []
spans_grouped_by_parent[parent_trace_id].append(span.name)
# check time of span
if span.start_time > most_recent_start_time:
most_recent_parent = parent_trace_id
most_recent_start_time = span.start_time
# these are otel spans - get the span name
span_names = [span.name for span in recorded_spans]
return {
"otel_spans": span_names,
"spans_grouped_by_parent": spans_grouped_by_parent,
"most_recent_parent": most_recent_parent,
}
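A hedged sketch of reading this debug route; it assumes the proxy's OTEL_EXPORTER is an in-memory exporter, since the handler calls get_finished_spans() on it, and the URL is illustrative:

```python
# Sketch: polling the /otel-spans debug route added above. Assumes the proxy's
# OTEL_EXPORTER is an in-memory exporter, since the handler calls
# get_finished_spans() on it. URL is illustrative.
import httpx

data = httpx.get("http://localhost:4000/otel-spans").json()
print(data["otel_spans"])               # flat list of recorded span names
print(data["spans_grouped_by_parent"])  # {parent trace id: [span names]}
print(data["most_recent_parent"])       # parent trace id of the newest span
```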

View file

@@ -35,6 +35,10 @@ def initialize_callbacks_on_proxy(
open_telemetry_logger = OpenTelemetry()
# Add Otel as a service callback
if "otel" not in litellm.service_callback:
litellm.service_callback.append("otel")
imported_list.append(open_telemetry_logger)
setattr(proxy_server, "open_telemetry_logger", open_telemetry_logger)
elif isinstance(callback, str) and callback == "presidio":

View file

@@ -0,0 +1,11 @@
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
cache: true
callbacks: ["otel"]

View file

@@ -406,6 +406,19 @@ async def active_callbacks():
}
def callback_name(callback):
if isinstance(callback, str):
return callback
try:
return callback.__name__
except AttributeError:
try:
return callback.__class__.__name__
except AttributeError:
return str(callback)
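callback_name() normalizes the mixed entries litellm.success_callback can hold (strings, plain functions, class instances) into readable names for /health/readiness. A tiny sketch of its behaviour, assuming the helper above is in scope; log_fn and MyLogger are hypothetical stand-ins for real callbacks:

```python
# Sketch of callback_name() behaviour; assumes the helper defined above is in
# scope. log_fn and MyLogger are hypothetical stand-ins for real callbacks.
def log_fn(kwargs):
    pass

class MyLogger:
    pass

print(callback_name("langfuse"))   # "langfuse" - plain string passes through
print(callback_name(log_fn))       # "log_fn"   - function __name__
print(callback_name(MyLogger()))   # "MyLogger" - falls back to __class__.__name__
```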
@router.get(
"/health/readiness",
tags=["health"],
@@ -424,8 +437,8 @@ async def health_readiness():
try:
# this was returning a JSON of the values in some of the callbacks
# all we need is the callback name, hence we do str(callback)
success_callback_names = [str(x) for x in litellm.success_callback]
except:
success_callback_names = [callback_name(x) for x in litellm.success_callback]
except AttributeError:
# don't let this block the /health/readiness response, if we can't convert to str -> return litellm.success_callback
success_callback_names = litellm.success_callback

View file

@@ -0,0 +1,599 @@
######################################################################
# /v1/files Endpoints
# Equivalent of https://platform.openai.com/docs/api-reference/files
######################################################################
import asyncio
import traceback
from datetime import datetime, timedelta, timezone
from typing import List, Optional
import fastapi
import httpx
from fastapi import (
APIRouter,
Depends,
File,
Form,
Header,
HTTPException,
Request,
Response,
UploadFile,
status,
)
import litellm
from litellm import CreateFileRequest, FileContentRequest
from litellm._logging import verbose_proxy_logger
from litellm.batches.main import FileObject
from litellm.proxy._types import *
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
router = APIRouter()
@router.post(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.post(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def create_file(
request: Request,
fastapi_response: Response,
purpose: str = Form(...),
file: UploadFile = File(...),
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Upload a file that can be used across the Assistants API and Batch API
This is the equivalent of POST https://api.openai.com/v1/files
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/create
Example Curl
```
curl http://localhost:4000/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Use orjson to parse JSON data, orjson speeds up requests significantly
# Read the file content
file_content = await file.read()
# Prepare the data for forwarding
data = {"purpose": purpose}
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# Prepare the file data according to FileTypes
file_data = (file.filename, file_content, file.content_type)
_create_file_request = CreateFileRequest(file=file_data, **data)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.acreate_file(
custom_llm_provider="openai", **_create_file_request
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.create_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
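Beyond the curl in the docstring, a hedged sketch of exercising this route with the OpenAI SDK pointed at the proxy; the base URL and key mirror the docstring's example, and the file name is illustrative:

```python
# Sketch: uploading through the /v1/files route above with the OpenAI SDK.
# Base URL and key mirror the docstring's curl example; file name is illustrative.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

with open("mydata.jsonl", "rb") as f:
    file_obj = client.files.create(file=f, purpose="batch")

print(file_obj.id, file_obj.purpose)
```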
@router.get(
"/v1/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def get_file(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Returns information about a specific file that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/{file_id}
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/retrieve
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123 \
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_retrieve(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.delete(
"/v1/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.delete(
"/files/{file_id:path}",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def delete_file(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Deletes a specified file that can be used across the Assistants API and Batch API
This is the equivalent of DELETE https://api.openai.com/v1/files/{file_id}
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/delete
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123 \
-X DELETE \
-H "Authorization: Bearer $OPENAI_API_KEY"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_delete(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.get(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def list_files(
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
purpose: Optional[str] = None,
):
"""
Returns a list of files that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/list
Example Curl
```
curl http://localhost:4000/v1/files\
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_list(
custom_llm_provider="openai", purpose=purpose, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.list_files(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.get(
"/v1/files/{file_id:path}/content",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.get(
"/files/{file_id:path}/content",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def get_file_content(
request: Request,
fastapi_response: Response,
file_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Returns the contents of a specified file that can be used across the Assistants API and Batch API
This is the equivalent of GET https://api.openai.com/v1/files/{file_id}/content
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/retrieve-contents
Example Curl
```
curl http://localhost:4000/v1/files/file-abc123/content \
-H "Authorization: Bearer sk-1234"
```
"""
from litellm.proxy.proxy_server import (
add_litellm_data_to_request,
general_settings,
get_custom_headers,
proxy_config,
proxy_logging_obj,
version,
)
data: Dict = {}
try:
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.afile_content(
custom_llm_provider="openai", file_id=file_id, **data
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
httpx_response: Optional[httpx.Response] = getattr(response, "response", None)
if httpx_response is None:
raise ValueError(
f"Invalid response - response.response is None - got {response}"
)
return Response(
content=httpx_response.content,
status_code=httpx_response.status_code,
headers=httpx_response.headers,
)
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.retrieve_file_content(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
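Taken together, the routes above cover the OpenAI files lifecycle. A hedged sketch of the remaining operations through the proxy with the OpenAI SDK; the file id is illustrative, and the base URL and key mirror the docstring examples:

```python
# Sketch: list / retrieve / content / delete against the proxy routes above.
# The file id is illustrative; base URL and key mirror the docstring examples.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

print(client.files.list(purpose="batch"))       # GET    /v1/files
info = client.files.retrieve("file-abc123")     # GET    /v1/files/{file_id}
content = client.files.content("file-abc123")   # GET    /v1/files/{file_id}/content
print(info.filename, len(content.read()))
client.files.delete("file-abc123")              # DELETE /v1/files/{file_id}
```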

View file

@@ -4,47 +4,14 @@ model_list:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: llama3
- model_name: gemini-flash
litellm_params:
model: groq/llama3-8b-8192
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
- model_name: "*"
litellm_params:
model: openai/*
api_key: os.environ/OPENAI_API_KEY
- model_name: mistral-embed
litellm_params:
model: mistral/mistral-embed
model: gemini/gemini-1.5-flash
general_settings:
pass_through_endpoints:
- path: "/v1/rerank"
target: "https://api.cohere.com/v1/rerank"
auth: true # 👈 Key change to use LiteLLM Auth / Keys
headers:
Authorization: "bearer os.environ/COHERE_API_KEY"
content-type: application/json
accept: application/json
- path: "/api/public/ingestion"
target: "https://us.cloud.langfuse.com/api/public/ingestion"
auth: true
headers:
LANGFUSE_PUBLIC_KEY: "os.environ/LANGFUSE_DEV_PUBLIC_KEY"
LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY"
general_settings:
master_key: sk-1234
litellm_settings:
guardrails:
- prompt_injection:
callbacks: [lakera_prompt_injection, hide_secrets]
default_on: true
- hide_secrets:
callbacks: [hide_secrets]
default_on: true
assistant_settings:
custom_llm_provider: openai
litellm_params:
api_key: os.environ/OPENAI_API_KEY
cache: true
callbacks: ["otel"]

View file

@@ -1,24 +1,18 @@
import ast
import asyncio
import copy
import hashlib
import importlib
import inspect
import os
import platform
import random
import re
import secrets
import shutil
import subprocess
import sys
import threading
import time
import traceback
import uuid
import warnings
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Set, get_args
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any, List, Optional
import requests
@@ -106,7 +100,6 @@ import litellm
from litellm import (
CancelBatchRequest,
CreateBatchRequest,
CreateFileRequest,
ListBatchRequest,
RetrieveBatchRequest,
)
@@ -174,6 +167,9 @@ from litellm.proxy.management_endpoints.key_management_endpoints import (
router as key_management_router,
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
)
from litellm.proxy.pass_through_endpoints.pass_through_endpoints import (
initialize_pass_through_endpoints,
)
@@ -213,6 +209,12 @@ from litellm.router import (
from litellm.router import ModelInfo as RouterModelInfo
from litellm.router import updateDeployment
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler
from litellm.types.llms.anthropic import (
AnthropicMessagesRequest,
AnthropicResponse,
AnthropicResponseContentBlockText,
AnthropicResponseUsageBlock,
)
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import RouterGeneralSettings
@@ -2667,6 +2669,11 @@ async def startup_event():
def model_list(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Use `/model/info` - to get detailed model information, example - pricing, mode, etc.
This is just for compatibility with openai projects like aider.
"""
global llm_model_list, general_settings
all_models = []
## CHECK IF MODEL RESTRICTIONS ARE SET AT KEY/TEAM LEVEL ##
@@ -2791,7 +2798,7 @@ async def chat_completion(
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
## IMPORTANT Note: - initialize this before running pre-call checks. Ensures we log rejected requests to langfuse.
data["litellm_call_id"] = str(uuid.uuid4())
data["litellm_call_id"] = request.headers.get('x-litellm-call-id', str(uuid.uuid4()))
logging_obj, data = litellm.utils.function_setup(
original_function="acompletion",
rules_obj=litellm.utils.Rules(),
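With this change the proxy reuses a caller-supplied x-litellm-call-id instead of always generating one, so logs and callbacks for the request carry an id the client already knows. A hedged sketch with the OpenAI SDK's extra_headers; the base URL, key, and model mirror values used elsewhere in this diff:

```python
# Sketch: supplying the x-litellm-call-id correlation header read by the
# chat_completion hunk above. Base URL, key, and model mirror values used
# elsewhere in this diff.
import uuid
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-1234")

call_id = str(uuid.uuid4())
resp = client.chat.completions.create(
    model="fake-openai-endpoint",
    messages=[{"role": "user", "content": "hi"}],
    extra_headers={"x-litellm-call-id": call_id},
)
print(call_id, resp.id)
```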
@@ -3243,6 +3250,12 @@ async def completion(
response_class=ORJSONResponse,
tags=["embeddings"],
)
@router.post(
"/engines/{model:path}/embeddings",
dependencies=[Depends(user_api_key_auth)],
response_class=ORJSONResponse,
tags=["embeddings"],
) # azure compatible endpoint
@router.post(
"/openai/deployments/{model:path}/embeddings",
dependencies=[Depends(user_api_key_auth)],
@@ -4891,117 +4904,6 @@ async def retrieve_batch(
######################################################################
######################################################################
# /v1/files Endpoints
######################################################################
@router.post(
"/v1/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
@router.post(
"/files",
dependencies=[Depends(user_api_key_auth)],
tags=["files"],
)
async def create_file(
request: Request,
fastapi_response: Response,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Upload a file that can be used across - Assistants API, Batch API
This is the equivalent of POST https://api.openai.com/v1/files
Supports Identical Params as: https://platform.openai.com/docs/api-reference/files/create
Example Curl
```
curl https://api.openai.com/v1/files \
-H "Authorization: Bearer sk-1234" \
-F purpose="batch" \
-F file="@mydata.jsonl"
```
"""
global proxy_logging_obj
data: Dict = {}
try:
# Use orjson to parse JSON data, orjson speeds up requests significantly
form_data = await request.form()
data = {key: value for key, value in form_data.items() if key != "file"}
# Include original request and headers in the data
data = await add_litellm_data_to_request(
data=data,
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
_create_file_request = CreateFileRequest()
# for now use custom_llm_provider=="openai" -> this will change as LiteLLM adds more providers for acreate_batch
response = await litellm.acreate_file(
custom_llm_provider="openai", **_create_file_request
)
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
### RESPONSE HEADERS ###
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
)
)
return response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.create_file(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "message", str(e.detail)),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_400_BAD_REQUEST),
)
else:
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
@router.post(
"/v1/moderations",
@@ -5150,6 +5052,198 @@ async def moderations(
)
#### ANTHROPIC ENDPOINTS ####
@router.post(
"/v1/messages",
tags=["[beta] Anthropic `/v1/messages`"],
dependencies=[Depends(user_api_key_auth)],
response_model=AnthropicResponse,
)
async def anthropic_response(
anthropic_data: AnthropicMessagesRequest,
fastapi_response: Response,
request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
from litellm import adapter_completion
from litellm.adapters.anthropic_adapter import anthropic_adapter
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
global user_temperature, user_request_timeout, user_max_tokens, user_api_base
data: dict = {**anthropic_data, "adapter_id": "anthropic"}
try:
data["model"] = (
general_settings.get("completion_model", None) # server default
or user_model # model name passed via cli args
or data["model"] # default passed in http request
)
if user_model:
data["model"] = user_model
data = await add_litellm_data_to_request(
data=data, # type: ignore
request=request,
general_settings=general_settings,
user_api_key_dict=user_api_key_dict,
version=version,
proxy_config=proxy_config,
)
# override with user settings, these are params passed via cli
if user_temperature:
data["temperature"] = user_temperature
if user_request_timeout:
data["request_timeout"] = user_request_timeout
if user_max_tokens:
data["max_tokens"] = user_max_tokens
if user_api_base:
data["api_base"] = user_api_base
### MODEL ALIAS MAPPING ###
# check if model name in model alias map
# get the actual model name
if data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]]
### CALL HOOKS ### - modify incoming data before calling the model
data = await proxy_logging_obj.pre_call_hook( # type: ignore
user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion"
)
### ROUTE THE REQUESTs ###
router_model_names = llm_router.model_names if llm_router is not None else []
# skip router if user passed their key
if "api_key" in data:
llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
elif (
llm_router is not None and data["model"] in router_model_names
): # model in router model list
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None
and llm_router.model_group_alias is not None
and data["model"] in llm_router.model_group_alias
): # model set in model_group_alias
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None and data["model"] in llm_router.deployment_names
): # model in router deployments, calling a specific deployment on the router
llm_response = asyncio.create_task(
llm_router.aadapter_completion(**data, specific_deployment=True)
)
elif (
llm_router is not None and data["model"] in llm_router.get_model_ids()
): # model in router model list
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif (
llm_router is not None
and data["model"] not in router_model_names
and llm_router.default_deployment is not None
): # model in router deployments, calling a specific deployment on the router
llm_response = asyncio.create_task(llm_router.aadapter_completion(**data))
elif user_model is not None: # `litellm --model <your-model-name>`
llm_response = asyncio.create_task(litellm.aadapter_completion(**data))
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail={
"error": "completion: Invalid model name passed in model="
+ data.get("model", "")
},
)
# Await the llm_response task
response = await llm_response
hidden_params = getattr(response, "_hidden_params", {}) or {}
model_id = hidden_params.get("model_id", None) or ""
cache_key = hidden_params.get("cache_key", None) or ""
api_base = hidden_params.get("api_base", None) or ""
response_cost = hidden_params.get("response_cost", None) or ""
### ALERTING ###
asyncio.create_task(
proxy_logging_obj.update_request_status(
litellm_call_id=data.get("litellm_call_id", ""), status="success"
)
)
verbose_proxy_logger.debug("final response: %s", response)
fastapi_response.headers.update(
get_custom_headers(
user_api_key_dict=user_api_key_dict,
model_id=model_id,
cache_key=cache_key,
api_base=api_base,
version=version,
response_cost=response_cost,
)
)
verbose_proxy_logger.info("\nResponse from Litellm:\n{}".format(response))
return response
except RejectedRequestError as e:
_data = e.request_data
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict,
original_exception=e,
request_data=_data,
)
if _data.get("stream", None) is not None and _data["stream"] == True:
_chat_response = litellm.ModelResponse()
_usage = litellm.Usage(
prompt_tokens=0,
completion_tokens=0,
total_tokens=0,
)
_chat_response.usage = _usage # type: ignore
_chat_response.choices[0].message.content = e.message # type: ignore
_iterator = litellm.utils.ModelResponseIterator(
model_response=_chat_response, convert_to_delta=True
)
_streaming_response = litellm.TextCompletionStreamWrapper(
completion_stream=_iterator,
model=_data.get("model", ""),
)
selected_data_generator = select_data_generator(
response=_streaming_response,
user_api_key_dict=user_api_key_dict,
request_data=data,
)
return StreamingResponse(
selected_data_generator,
media_type="text/event-stream",
headers={},
)
else:
_response = litellm.TextCompletionResponse()
_response.choices[0].text = e.message
return _response
except Exception as e:
await proxy_logging_obj.post_call_failure_hook(
user_api_key_dict=user_api_key_dict, original_exception=e, request_data=data
)
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.completion(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
error_msg = f"{str(e)}"
raise ProxyException(
message=getattr(e, "message", error_msg),
type=getattr(e, "type", "None"),
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", 500),
)
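For context, a minimal sketch of calling the beta /v1/messages route above. The payload follows the AnthropicMessagesRequest shape added later in this diff; the base URL, key, and model name are placeholders for a local proxy deployment.
```
# Hypothetical usage sketch: send an Anthropic-style request to the proxy.
# URL, key, and model name are placeholders, not values defined by this diff.
import httpx

resp = httpx.post(
    "http://localhost:4000/v1/messages",
    headers={"Authorization": "Bearer sk-1234"},
    json={
        "model": "claude-3-5-sonnet-20240620",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    },
    timeout=60,
)
print(resp.json())
```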
#### DEV UTILS ####
# @router.get(
@ -9302,3 +9396,4 @@ app.include_router(caching_router)
app.include_router(analytics_router)
app.include_router(debugging_endpoints_router)
app.include_router(ui_crud_endpoints_router)
app.include_router(openai_files_router)

View file

@ -1765,6 +1765,125 @@ class Router:
self.fail_calls[model] += 1
raise e
async def aadapter_completion(
self,
adapter_id: str,
model: str,
is_retry: Optional[bool] = False,
is_fallback: Optional[bool] = False,
is_async: Optional[bool] = False,
**kwargs,
):
try:
kwargs["model"] = model
kwargs["adapter_id"] = adapter_id
kwargs["original_function"] = self._aadapter_completion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model})
response = await self.async_function_with_fallbacks(**kwargs)
return response
except Exception as e:
asyncio.create_task(
send_llm_exception_alert(
litellm_router_instance=self,
request_kwargs=kwargs,
error_traceback_str=traceback.format_exc(),
original_exception=e,
)
)
raise e
async def _aadapter_completion(self, adapter_id: str, model: str, **kwargs):
try:
verbose_router_logger.debug(
f"Inside _aadapter_completion()- model: {model}; kwargs: {kwargs}"
)
deployment = await self.async_get_available_deployment(
model=model,
messages=[{"role": "user", "content": "default text"}],
specific_deployment=kwargs.pop("specific_deployment", None),
)
kwargs.setdefault("metadata", {}).update(
{
"deployment": deployment["litellm_params"]["model"],
"model_info": deployment.get("model_info", {}),
"api_base": deployment.get("litellm_params", {}).get("api_base"),
}
)
kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy()
model_name = data["model"]
for k, v in self.default_litellm_params.items():
if (
k not in kwargs
): # prioritize model-specific params > default router params
kwargs[k] = v
elif k == "metadata":
kwargs[k].update(v)
potential_model_client = self._get_client(
deployment=deployment, kwargs=kwargs, client_type="async"
)
# check if provided keys == client keys #
dynamic_api_key = kwargs.get("api_key", None)
if (
dynamic_api_key is not None
and potential_model_client is not None
and dynamic_api_key != potential_model_client.api_key
):
model_client = None
else:
model_client = potential_model_client
self.total_calls[model_name] += 1
response = litellm.aadapter_completion(
**{
**data,
"adapter_id": adapter_id,
"caching": self.cache_responses,
"client": model_client,
"timeout": self.timeout,
**kwargs,
}
)
rpm_semaphore = self._get_client(
deployment=deployment,
kwargs=kwargs,
client_type="max_parallel_requests",
)
if rpm_semaphore is not None and isinstance(
rpm_semaphore, asyncio.Semaphore
):
async with rpm_semaphore:
"""
- Check rpm limits before making the call
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
)
response = await response # type: ignore
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
response = await response # type: ignore
self.success_calls[model_name] += 1
verbose_router_logger.info(
f"litellm.aadapter_completion(model={model_name})\033[32m 200 OK\033[0m"
)
return response
except Exception as e:
verbose_router_logger.info(
f"litellm.aadapter_completion(model={model})\033[31m Exception {str(e)}\033[0m"
)
if model is not None:
self.fail_calls[model] += 1
raise e
def embedding(
self,
model: str,

File diff suppressed because one or more lines are too long

View file

@ -237,6 +237,8 @@ async def test_langfuse_logging_without_request_response(stream, langfuse_client
assert _trace_data[0].output == {
"role": "assistant",
"content": "redacted-by-litellm",
"function_call": None,
"tool_calls": None,
}
except Exception as e:
@ -273,7 +275,12 @@ async def test_langfuse_masked_input_output(langfuse_client):
expected_output = (
"redacted-by-litellm"
if mask_value
else {"content": "This is a test response", "role": "assistant"}
else {
"content": "This is a test response",
"role": "assistant",
"function_call": None,
"tool_calls": None,
}
)
langfuse_client.flush()
await asyncio.sleep(2)

View file

@ -0,0 +1,103 @@
# What is this?
## Unit tests for Anthropic Adapter
import asyncio
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
import io
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from unittest.mock import MagicMock, patch
import pytest
import litellm
from litellm import AnthropicConfig, Router, adapter_completion
from litellm.adapters.anthropic_adapter import anthropic_adapter
from litellm.types.llms.anthropic import AnthropicResponse
def test_anthropic_completion_messages_translation():
messages = [{"role": "user", "content": "Hey, how's it going?"}]
translated_messages = AnthropicConfig().translate_anthropic_messages_to_openai(messages=messages) # type: ignore
assert translated_messages == [{"role": "user", "content": "Hey, how's it going?"}]
def test_anthropic_completion_input_translation():
data = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
}
translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
assert translated_input is not None
assert translated_input["model"] == "gpt-3.5-turbo"
assert translated_input["messages"] == [
{"role": "user", "content": "Hey, how's it going?"}
]
def test_anthropic_completion_e2e():
litellm.set_verbose = True
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = adapter_completion(
model="gpt-3.5-turbo",
messages=messages,
adapter_id="anthropic",
mock_response="This is a fake call",
)
print("Response: {}".format(response))
assert response is not None
assert isinstance(response, AnthropicResponse)
@pytest.mark.asyncio
async def test_anthropic_router_completion_e2e():
litellm.set_verbose = True
litellm.adapters = [{"id": "anthropic", "adapter": anthropic_adapter}]
router = Router(
model_list=[
{
"model_name": "claude-3-5-sonnet-20240620",
"litellm_params": {
"model": "gpt-3.5-turbo",
"mock_response": "hi this is macintosh.",
},
}
]
)
messages = [{"role": "user", "content": "Hey, how's it going?"}]
response = await router.aadapter_completion(
model="claude-3-5-sonnet-20240620",
messages=messages,
adapter_id="anthropic",
mock_response="This is a fake call",
)
print("Response: {}".format(response))
assert response is not None
assert isinstance(response, AnthropicResponse)
assert response.model == "gpt-3.5-turbo"

View file

@ -1,21 +1,20 @@
import asyncio
import litellm
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from litellm._logging import verbose_logger
import logging
import time
import pytest
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
verbose_logger.setLevel(logging.DEBUG)
@pytest.mark.skip(
reason="new test. WIP. works locally but not on CI. Still figuring this out"
)
@pytest.mark.asyncio
async def test_otel_callback():
@pytest.mark.skip(reason="Local only test. WIP.")
async def test_async_otel_callback():
exporter = InMemorySpanExporter()
litellm.set_verbose = True
litellm.callbacks = [OpenTelemetry(OpenTelemetryConfig(exporter=exporter))]

View file

@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
# litellm.num_retries = 3
# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"
@ -3065,32 +3065,38 @@ def response_format_tests(response: litellm.ModelResponse):
@pytest.mark.asyncio
async def test_completion_bedrock_httpx_models(sync_mode, model):
litellm.set_verbose = True
try:
if sync_mode:
response = completion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
)
if sync_mode:
response = completion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=200,
)
assert isinstance(response, litellm.ModelResponse)
assert isinstance(response, litellm.ModelResponse)
response_format_tests(response=response)
else:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
)
response_format_tests(response=response)
else:
response = await litellm.acompletion(
model=model,
messages=[{"role": "user", "content": "Hey! how's it going?"}],
temperature=0.2,
max_tokens=100,
)
assert isinstance(response, litellm.ModelResponse)
assert isinstance(response, litellm.ModelResponse)
print(f"response: {response}")
response_format_tests(response=response)
print(f"response: {response}")
response_format_tests(response=response)
print(f"response: {response}")
except litellm.RateLimitError as e:
print("got rate limit error=", e)
pass
except Exception as e:
pytest.fail(f"An error occurred - {str(e)}")
def test_completion_bedrock_titan_null_response():

View file

@ -712,6 +712,79 @@ def test_vertex_ai_claude_completion_cost():
assert cost == predicted_cost
def test_vertex_ai_embedding_completion_cost(caplog):
"""
Relevant issue - https://github.com/BerriAI/litellm/issues/4630
"""
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
text = "The quick brown fox jumps over the lazy dog."
input_tokens = litellm.token_counter(
model="vertex_ai/textembedding-gecko", text=text
)
model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
print("\nExpected model info:\n{}\n\n".format(model_info))
expected_input_cost = input_tokens * model_info["input_cost_per_token"]
## CALCULATED COST
calculated_input_cost, calculated_output_cost = cost_per_token(
model="textembedding-gecko",
custom_llm_provider="vertex_ai",
prompt_tokens=input_tokens,
call_type="aembedding",
)
assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
print("expected_input_cost: {}".format(expected_input_cost))
print("calculated_input_cost: {}".format(calculated_input_cost))
captured_logs = [rec.message for rec in caplog.records]
for item in captured_logs:
print("\nitem:{}\n".format(item))
if (
"litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Exception occured "
in item
):
raise Exception("Error log raised for calculating embedding cost")
# def test_vertex_ai_embedding_completion_cost_e2e():
# """
# Relevant issue - https://github.com/BerriAI/litellm/issues/4630
# """
# from litellm.tests.test_amazing_vertex_completion import load_vertex_ai_credentials
# load_vertex_ai_credentials()
# os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
# litellm.model_cost = litellm.get_model_cost_map(url="")
# text = "The quick brown fox jumps over the lazy dog."
# input_tokens = litellm.token_counter(
# model="vertex_ai/textembedding-gecko", text=text
# )
# model_info = litellm.get_model_info(model="vertex_ai/textembedding-gecko")
# print("\nExpected model info:\n{}\n\n".format(model_info))
# expected_input_cost = input_tokens * model_info["input_cost_per_token"]
# ## CALCULATED COST
# resp = litellm.embedding(model="textembedding-gecko", input=[text])
# calculated_input_cost = resp._hidden_params["response_cost"]
# assert round(expected_input_cost, 6) == round(calculated_input_cost, 6)
# print("expected_input_cost: {}".format(expected_input_cost))
# print("calculated_input_cost: {}".format(calculated_input_cost))
# assert False
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_cost_hidden_params(sync_mode):

View file

@ -1,13 +1,16 @@
# What is this?
## Unit testing for the 'get_model_info()' function
import os, sys, traceback
import os
import sys
import traceback
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest
import litellm
from litellm import get_model_info
import pytest
def test_get_model_info_simple_model_name():
@ -37,3 +40,9 @@ def test_get_model_info_custom_llm_with_same_name_vllm():
pytest.fail("Expected get model info to fail for an unmapped model/provider")
except Exception:
pass
def test_get_model_info_shows_correct_supports_vision():
info = litellm.get_model_info("gemini/gemini-1.5-flash")
print("info", info)
assert info["supports_vision"] is True

View file

@ -1,22 +1,26 @@
# What is this?
## Unit Tests for OpenAI Batches API
import sys, os, json
import traceback
import asyncio
import json
import os
import sys
import traceback
from dotenv import load_dotenv
load_dotenv()
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import pytest, logging, asyncio
import litellm
from litellm import (
create_batch,
create_file,
)
import asyncio
import logging
import time
import pytest
import litellm
from litellm import create_batch, create_file
def test_create_batch():
"""
@ -144,6 +148,28 @@ async def test_async_create_batch():
print("file content = ", file_content)
# file obj
file_obj = await litellm.afile_retrieve(
file_id=batch_input_file_id, custom_llm_provider="openai"
)
print("file obj = ", file_obj)
assert file_obj.id == batch_input_file_id
# delete file
delete_file_response = await litellm.afile_delete(
file_id=batch_input_file_id, custom_llm_provider="openai"
)
print("delete file response = ", delete_file_response)
assert delete_file_response.id == batch_input_file_id
all_files_list = await litellm.afile_list(
custom_llm_provider="openai",
)
print("all_files_list = ", all_files_list)
# # write this file content to a file
# with open("file_content.json", "w") as f:
# json.dump(file_content, f)

View file

@ -20,7 +20,7 @@ import pytest
import litellm
from litellm.proxy._types import LiteLLMRoutes
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.proxy_server import router
from litellm.proxy.proxy_server import app
# Configure logging
logging.basicConfig(
@ -37,7 +37,7 @@ def test_routes_on_litellm_proxy():
this prevents accidentally deleting /threads, /batches, etc.
"""
_all_routes = []
for route in router.routes:
for route in app.routes:
_path_as_str = str(route.path)
if ":path" in _path_as_str:

View file

@ -21,6 +21,8 @@ sys.path.insert(
from dotenv import load_dotenv
load_dotenv()
import random
import litellm
from litellm import (
AuthenticationError,
@ -1373,7 +1375,8 @@ async def test_bedrock_httpx_streaming(sync_mode, model):
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"completion_response: {complete_response}\n\nFinalChunk: {final_chunk}")
except RateLimitError:
except RateLimitError as e:
print("got rate limit error=", e)
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@ -3037,8 +3040,11 @@ def test_completion_claude_3_function_call_with_streaming():
@pytest.mark.parametrize(
"model", ["gemini/gemini-1.5-flash"]
) # "claude-3-opus-20240229",
"model",
[
"gemini/gemini-1.5-flash",
], # "claude-3-opus-20240229"
) #
@pytest.mark.asyncio
async def test_acompletion_claude_3_function_call_with_streaming(model):
litellm.set_verbose = True
@ -3046,41 +3052,45 @@ async def test_acompletion_claude_3_function_call_with_streaming(model):
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"name": "generate_series_of_questions",
"description": "Generate a series of questions, given a topic.",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
"questions": {
"type": "array",
"description": "The questions to be generated.",
"items": {"type": "string"},
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
"required": ["questions"],
},
},
}
},
]
SYSTEM_PROMPT = "You are an AI assistant"
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{
"role": "user",
"content": "What's the weather like in Boston today in fahrenheit?",
}
"content": "Generate 3 questions about civil engineering.",
},
]
try:
# test without max tokens
response = await acompletion(
model=model,
# model="claude-3-5-sonnet-20240620",
messages=messages,
tools=tools,
tool_choice="required",
stream=True,
temperature=0.75,
tools=tools,
stream_options={"include_usage": True},
)
idx = 0
print(f"response: {response}")
async for chunk in response:
# print(f"chunk: {chunk}")
print(f"chunk in test: {chunk}")
if idx == 0:
assert (
chunk.choices[0].delta.tool_calls[0].function.arguments is not None
@ -3510,3 +3520,56 @@ def test_unit_test_custom_stream_wrapper_function_call():
if chunk.choices[0].finish_reason is not None:
finish_reason = chunk.choices[0].finish_reason
assert finish_reason == "tool_calls"
## UNIT TEST RECREATING MODEL RESPONSE
from litellm.types.utils import (
ChatCompletionDeltaToolCall,
Delta,
Function,
StreamingChoices,
Usage,
)
initial_model_response = litellm.ModelResponse(
id="chatcmpl-842826b6-75a1-4ed4-8a68-7655e60654b3",
choices=[
StreamingChoices(
finish_reason=None,
index=0,
delta=Delta(
content="",
role="assistant",
function_call=None,
tool_calls=[
ChatCompletionDeltaToolCall(
id="7ee88721-bfee-4584-8662-944a23d4c7a5",
function=Function(
arguments='{"questions": ["What are the main challenges facing civil engineers today?", "How has technology impacted the field of civil engineering?", "What are some of the most innovative projects in civil engineering in recent years?"]}',
name="generate_series_of_questions",
),
type="function",
index=0,
)
],
),
logprobs=None,
)
],
created=1720755257,
model="gemini-1.5-flash",
object="chat.completion.chunk",
system_fingerprint=None,
usage=Usage(prompt_tokens=67, completion_tokens=55, total_tokens=122),
stream=True,
)
obj_dict = initial_model_response.dict()
if "usage" in obj_dict:
del obj_dict["usage"]
new_model = response.model_response_creator(chunk=obj_dict)
print("\n\n{}\n\n".format(new_model))
assert len(new_model.choices[0].delta.tool_calls) > 0

View file

@ -258,6 +258,13 @@ def test_validate_environment_empty_model():
raise Exception()
def test_validate_environment_api_key():
response_obj = validate_environment(model="gpt-3.5-turbo", api_key="sk-my-test-key")
assert (
response_obj["keys_in_environment"] is True
), f"Missing keys={response_obj['missing_keys']}"
@mock.patch.dict(os.environ, {"OLLAMA_API_BASE": "foo"}, clear=True)
def test_validate_environment_ollama():
for provider in ["ollama", "ollama_chat"]:

litellm/types/adapter.py
View file

@ -0,0 +1,10 @@
from typing import List
from typing_extensions import Dict, Required, TypedDict, override
from litellm.integrations.custom_logger import CustomLogger
class AdapterItem(TypedDict):
id: str
adapter: CustomLogger
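A minimal sketch of how an AdapterItem entry is used, mirroring the registration done in the proxy handler and the adapter tests elsewhere in this diff; the import path for AdapterItem is taken from the new litellm/types/adapter.py file.
```
# Sketch: register the anthropic adapter as an AdapterItem entry.
import litellm
from litellm.adapters.anthropic_adapter import anthropic_adapter
from litellm.types.adapter import AdapterItem

adapter_entry: AdapterItem = {"id": "anthropic", "adapter": anthropic_adapter}
litellm.adapters = [adapter_entry]
```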

View file

@ -9,25 +9,27 @@ class AnthropicMessagesToolChoice(TypedDict, total=False):
name: str
class AnthopicMessagesAssistantMessageTextContentParam(TypedDict, total=False):
type: Required[Literal["text"]]
class AnthropicMessagesTool(TypedDict, total=False):
name: Required[str]
description: str
input_schema: Required[dict]
class AnthropicMessagesTextParam(TypedDict):
type: Literal["text"]
text: str
class AnthopicMessagesAssistantMessageToolCallParam(TypedDict, total=False):
type: Required[Literal["tool_use"]]
class AnthropicMessagesToolUseParam(TypedDict):
type: Literal["tool_use"]
id: str
name: str
input: dict
AnthropicMessagesAssistantMessageValues = Union[
AnthopicMessagesAssistantMessageTextContentParam,
AnthopicMessagesAssistantMessageToolCallParam,
AnthropicMessagesTextParam,
AnthropicMessagesToolUseParam,
]
@ -46,6 +48,72 @@ class AnthopicMessagesAssistantMessageParam(TypedDict, total=False):
"""
class AnthropicImageParamSource(TypedDict):
type: Literal["base64"]
media_type: str
data: str
class AnthropicMessagesImageParam(TypedDict):
type: Literal["image"]
source: AnthropicImageParamSource
class AnthropicMessagesToolResultContent(TypedDict):
type: Literal["text"]
text: str
class AnthropicMessagesToolResultParam(TypedDict, total=False):
type: Required[Literal["tool_result"]]
tool_use_id: Required[str]
is_error: bool
content: Union[
str,
Iterable[
Union[AnthropicMessagesToolResultContent, AnthropicMessagesImageParam]
],
]
AnthropicMessagesUserMessageValues = Union[
AnthropicMessagesTextParam,
AnthropicMessagesImageParam,
AnthropicMessagesToolResultParam,
]
class AnthropicMessagesUserMessageParam(TypedDict, total=False):
role: Required[Literal["user"]]
content: Required[Union[str, Iterable[AnthropicMessagesUserMessageValues]]]
class AnthropicMetadata(TypedDict, total=False):
user_id: str
class AnthropicMessagesRequest(TypedDict, total=False):
model: Required[str]
messages: Required[
List[
Union[
AnthropicMessagesUserMessageParam,
AnthopicMessagesAssistantMessageParam,
]
]
]
max_tokens: Required[int]
metadata: AnthropicMetadata
stop_sequences: List[str]
stream: bool
system: str
temperature: float
tool_choice: AnthropicMessagesToolChoice
tools: List[AnthropicMessagesTool]
top_k: int
top_p: float
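For reference, a minimal request dict that satisfies the AnthropicMessagesRequest shape above; only model, messages, and max_tokens are Required, the remaining keys are optional and shown here purely for illustration.
```
# Sketch: a valid AnthropicMessagesRequest payload.
from litellm.types.llms.anthropic import AnthropicMessagesRequest

request: AnthropicMessagesRequest = {
    "model": "claude-3-5-sonnet-20240620",
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "Hey, how's it going?"}],
    "temperature": 0.7,
    "stream": False,
}
```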
class ContentTextBlockDelta(TypedDict):
"""
'delta': {'type': 'text_delta', 'text': 'Hello'}
@ -155,3 +223,51 @@ class MessageStartBlock(TypedDict):
type: Literal["message_start"]
message: MessageChunk
class AnthropicResponseContentBlockText(BaseModel):
type: Literal["text"]
text: str
class AnthropicResponseContentBlockToolUse(BaseModel):
type: Literal["tool_use"]
id: str
name: str
input: dict
class AnthropicResponseUsageBlock(BaseModel):
input_tokens: int
output_tokens: int
AnthropicFinishReason = Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
class AnthropicResponse(BaseModel):
id: str
"""Unique object identifier."""
type: Literal["message"]
"""For Messages, this is always "message"."""
role: Literal["assistant"]
"""Conversational role of the generated message. This will always be "assistant"."""
content: List[
Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse]
]
"""Content generated by the model."""
model: str
"""The model that handled the request."""
stop_reason: Optional[AnthropicFinishReason]
"""The reason that we stopped."""
stop_sequence: Optional[str]
"""Which custom stop sequence was generated, if any."""
usage: AnthropicResponseUsageBlock
"""Billing and rate-limit usage."""

View file

@ -305,7 +305,13 @@ class ChatCompletionToolCallFunctionChunk(TypedDict, total=False):
arguments: str
class ChatCompletionToolCallChunk(TypedDict):
class ChatCompletionAssistantToolCall(TypedDict):
id: Optional[str]
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
class ChatCompletionToolCallChunk(TypedDict): # result of /chat/completions call
id: Optional[str]
type: Literal["function"]
function: ChatCompletionToolCallFunctionChunk
@ -319,6 +325,107 @@ class ChatCompletionDeltaToolCallChunk(TypedDict, total=False):
index: int
class ChatCompletionTextObject(TypedDict):
type: Literal["text"]
text: str
class ChatCompletionImageUrlObject(TypedDict, total=False):
url: Required[str]
detail: str
class ChatCompletionImageObject(TypedDict):
type: Literal["image_url"]
image_url: ChatCompletionImageUrlObject
class ChatCompletionUserMessage(TypedDict):
role: Literal["user"]
content: Union[
str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
]
class ChatCompletionAssistantMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
content: Optional[str]
name: str
tool_calls: List[ChatCompletionAssistantToolCall]
class ChatCompletionToolMessage(TypedDict):
role: Literal["tool"]
content: str
tool_call_id: str
class ChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[str]
name: str
AllMessageValues = Union[
ChatCompletionUserMessage,
ChatCompletionAssistantMessage,
ChatCompletionToolMessage,
ChatCompletionSystemMessage,
]
class ChatCompletionToolChoiceFunctionParam(TypedDict):
name: str
class ChatCompletionToolChoiceObjectParam(TypedDict):
type: Literal["function"]
function: ChatCompletionToolChoiceFunctionParam
ChatCompletionToolChoiceStringValues = Literal["none", "auto", "required"]
ChatCompletionToolChoiceValues = Union[
ChatCompletionToolChoiceStringValues, ChatCompletionToolChoiceObjectParam
]
class ChatCompletionToolParamFunctionChunk(TypedDict, total=False):
name: Required[str]
description: str
parameters: dict
class ChatCompletionToolParam(TypedDict):
type: Literal["function"]
function: ChatCompletionToolParamFunctionChunk
class ChatCompletionRequest(TypedDict, total=False):
model: Required[str]
messages: Required[List[AllMessageValues]]
frequency_penalty: float
logit_bias: dict
logprobs: bool
top_logprobs: int
max_tokens: int
n: int
presence_penalty: float
response_format: dict
seed: int
service_tier: str
stop: Union[str, List[str]]
stream_options: dict
temperature: float
top_p: float
tools: List[ChatCompletionToolParam]
tool_choice: ChatCompletionToolChoiceValues
parallel_tool_calls: bool
function_call: Union[str, dict]
functions: List
user: str
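To make the request shape concrete, a minimal sketch of a ChatCompletionRequest built from the message and tool types above; the import path is assumed to be the module this hunk belongs to.
```
# Sketch: a ChatCompletionRequest with system + user messages and one tool.
# Import path assumed; the types are the TypedDicts defined above.
from litellm.types.llms.openai import ChatCompletionRequest

request: ChatCompletionRequest = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "system", "content": "You are an AI assistant"},
        {"role": "user", "content": "Generate 3 questions about civil engineering."},
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "generate_series_of_questions",
                "description": "Generate a series of questions, given a topic.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "questions": {"type": "array", "items": {"type": "string"}}
                    },
                    "required": ["questions"],
                },
            },
        }
    ],
    "tool_choice": "auto",
    "max_tokens": 256,
}
```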
class ChatCompletionDeltaChunk(TypedDict, total=False):
content: Optional[str]
tool_calls: List[ChatCompletionDeltaToolCallChunk]

View file

@ -73,6 +73,7 @@ class ModelInfo(TypedDict, total=False):
supported_openai_params: Required[Optional[List[str]]]
supports_system_messages: Optional[bool]
supports_response_schema: Optional[bool]
supports_vision: Optional[bool]
class GenericStreamingChunk(TypedDict):
@ -166,7 +167,9 @@ class FunctionCall(OpenAIObject):
class Function(OpenAIObject):
arguments: str
name: Optional[str] = None
name: Optional[
str
] # can be None - openai e.g.: ChoiceDeltaToolCallFunction(arguments='{"', name=None), type=None)
def __init__(
self,
@ -280,29 +283,43 @@ class ChatCompletionMessageToolCall(OpenAIObject):
setattr(self, key, value)
"""
Reference:
ChatCompletionMessage(content='This is a test', role='assistant', function_call=None, tool_calls=None))
"""
class Message(OpenAIObject):
content: Optional[str]
role: Literal["assistant"]
tool_calls: Optional[List[ChatCompletionMessageToolCall]]
function_call: Optional[FunctionCall]
def __init__(
self,
content: Optional[str] = "default",
role="assistant",
logprobs=None,
content: Optional[str] = None,
role: Literal["assistant"] = "assistant",
function_call=None,
tool_calls=None,
**params,
):
super(Message, self).__init__(**params)
self.content = content
self.role = role
if function_call is not None:
self.function_call = FunctionCall(**function_call)
if tool_calls is not None:
self.tool_calls = []
for tool_call in tool_calls:
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
if logprobs is not None:
self._logprobs = ChoiceLogprobs(**logprobs)
init_values = {
"content": content,
"role": "assistant",
"function_call": (
FunctionCall(**function_call) if function_call is not None else None
),
"tool_calls": (
[ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls]
if tool_calls is not None
else None
),
}
super(Message, self).__init__(
**init_values,
**params,
)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -556,6 +573,8 @@ class ModelResponse(OpenAIObject):
_new_choice = choice # type: ignore
elif isinstance(choice, dict):
_new_choice = Choices(**choice) # type: ignore
else:
_new_choice = choice
new_choices.append(_new_choice)
choices = new_choices
else:
@ -608,10 +627,6 @@ class ModelResponse(OpenAIObject):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
def json(self, **kwargs):
try:
return self.model_dump() # noqa

View file

@ -4829,6 +4829,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
supports_vision=_model_info.get("supports_vision", False),
)
except Exception:
raise Exception(
@ -5048,12 +5049,15 @@ def create_proxy_transport_and_mounts():
return sync_proxy_mounts, async_proxy_mounts
def validate_environment(model: Optional[str] = None) -> dict:
def validate_environment(
model: Optional[str] = None, api_key: Optional[str] = None
) -> dict:
"""
Checks if the environment variables are valid for the given model.
Args:
model (Optional[str]): The name of the model. Defaults to None.
api_key (Optional[str]): An API key supplied by the caller, if any.
Returns:
dict: A dictionary containing the following keys:
@ -5329,6 +5333,13 @@ def validate_environment(model: Optional[str] = None) -> dict:
keys_in_environment = True
else:
missing_keys.append("NLP_CLOUD_API_KEY")
if api_key is not None:
new_missing_keys = []
for key in missing_keys:
if "api_key" not in key.lower():
new_missing_keys.append(key)
missing_keys = new_missing_keys
return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}
@ -8126,7 +8137,7 @@ class CustomStreamWrapper:
if chunk.startswith(self.complete_response):
# Remove last_sent_chunk only if it appears at the start of the new chunk
chunk = chunk[len(self.complete_response):]
chunk = chunk[len(self.complete_response) :]
self.complete_response += chunk
return chunk
@ -8940,7 +8951,16 @@ class CustomStreamWrapper:
model_response.system_fingerprint = self.system_fingerprint
model_response._hidden_params["custom_llm_provider"] = _logging_obj_llm_provider
model_response._hidden_params["created_at"] = time.time()
model_response.choices = [StreamingChoices(finish_reason=None)]
if (
len(model_response.choices) > 0
and hasattr(model_response.choices[0], "delta")
and model_response.choices[0].delta is not None
):
# do nothing, if object instantiated
pass
else:
model_response.choices = [StreamingChoices(finish_reason=None)]
return model_response
def is_delta_empty(self, delta: Delta) -> bool:
@ -9483,8 +9503,8 @@ class CustomStreamWrapper:
model_response.choices[0].delta = Delta(**_json_delta)
except Exception as e:
verbose_logger.error(
"litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}".format(
str(e)
"litellm.CustomStreamWrapper.chunk_creator(): Exception occured - {}\n{}".format(
str(e), traceback.format_exc()
)
)
verbose_logger.debug(traceback.format_exc())
@ -9881,7 +9901,6 @@ class CustomStreamWrapper:
self.rules.post_call_rules(
input=self.response_uptil_now, model=self.model
)
print_verbose(f"final returned processed chunk: {processed_chunk}")
self.chunks.append(processed_chunk)
if hasattr(
processed_chunk, "usage"
@ -9895,6 +9914,7 @@ class CustomStreamWrapper:
# Create a new object without the removed attribute
processed_chunk = self.model_response_creator(chunk=obj_dict)
print_verbose(f"final returned processed chunk: {processed_chunk}")
return processed_chunk
raise StopAsyncIteration
else: # temporary patch for non-aiohttp async calls
@ -10124,7 +10144,7 @@ def mock_completion_streaming_obj(
model_response, mock_response, model, n: Optional[int] = None
):
for i in range(0, len(mock_response), 3):
completion_obj = Delta(role="assistant", content=mock_response[i: i + 3])
completion_obj = Delta(role="assistant", content=mock_response[i : i + 3])
if n is None:
model_response.choices[0].delta = completion_obj
else:
@ -10133,7 +10153,7 @@ def mock_completion_streaming_obj(
_streaming_choice = litellm.utils.StreamingChoices(
index=j,
delta=litellm.utils.Delta(
role="assistant", content=mock_response[i: i + 3]
role="assistant", content=mock_response[i : i + 3]
),
)
_all_choices.append(_streaming_choice)
@ -10145,7 +10165,7 @@ async def async_mock_completion_streaming_obj(
model_response, mock_response, model, n: Optional[int] = None
):
for i in range(0, len(mock_response), 3):
completion_obj = Delta(role="assistant", content=mock_response[i: i + 3])
completion_obj = Delta(role="assistant", content=mock_response[i : i + 3])
if n is None:
model_response.choices[0].delta = completion_obj
else:
@ -10154,7 +10174,7 @@ async def async_mock_completion_streaming_obj(
_streaming_choice = litellm.utils.StreamingChoices(
index=j,
delta=litellm.utils.Delta(
role="assistant", content=mock_response[i: i + 3]
role="assistant", content=mock_response[i : i + 3]
),
)
_all_choices.append(_streaming_choice)

poetry.lock
View file

@ -225,13 +225,13 @@ aio = ["aiohttp (>=3.0)"]
[[package]]
name = "azure-identity"
version = "1.16.0"
version = "1.16.1"
description = "Microsoft Azure Identity Library for Python"
optional = true
python-versions = ">=3.8"
files = [
{file = "azure-identity-1.16.0.tar.gz", hash = "sha256:6ff1d667cdcd81da1ceab42f80a0be63ca846629f518a922f7317a7e3c844e1b"},
{file = "azure_identity-1.16.0-py3-none-any.whl", hash = "sha256:722fdb60b8fdd55fa44dc378b8072f4b419b56a5e54c0de391f644949f3a826f"},
{file = "azure-identity-1.16.1.tar.gz", hash = "sha256:6d93f04468f240d59246d8afde3091494a5040d4f141cad0f49fc0c399d0d91e"},
{file = "azure_identity-1.16.1-py3-none-any.whl", hash = "sha256:8fb07c25642cd4ac422559a8b50d3e77f73dcc2bbfaba419d06d6c9d7cff6726"},
]
[package.dependencies]

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.15"
version = "1.41.18"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,10 +91,16 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.41.15"
version = "1.41.18"
version_files = [
"pyproject.toml:^version"
]
[tool.mypy]
plugins = "pydantic.mypy"
[tool.prisma]
# cache engine binaries in a directory relative to your project
# binary_cache_dir = '.binaries'
home_dir = '.prisma'
nodeenv_cache_dir = '.nodeenv'

Some files were not shown because too many files have changed in this diff.