mirror of
https://github.com/BerriAI/litellm.git
synced 2025-04-25 02:34:29 +00:00
443 lines
11 KiB
Markdown
443 lines
11 KiB
Markdown
import Tabs from '@theme/Tabs';
|
|
import TabItem from '@theme/TabItem';
|
|
|
|
# LiteLLM - Getting Started
|
|
|
|
https://github.com/BerriAI/litellm
|
|
|
|
import QuickStart from '../src/components/QuickStart.js'
|
|
|
|
## **Call 100+ LLMs using the same Input/Output Format**
|
|
|
|
## Basic usage
|
|
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_Getting_Started.ipynb">
|
|
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
|
</a>
|
|
|
|
```shell
|
|
pip install litellm
|
|
```
|
|
<Tabs>
|
|
<TabItem value="openai" label="OpenAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
|
|
|
response = completion(
|
|
model="gpt-3.5-turbo",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="anthropic" label="Anthropic">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
|
|
|
|
response = completion(
|
|
model="claude-2",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="vertex" label="VertexAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
# auth: run 'gcloud auth application-default login'
|
|
os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
|
|
os.environ["VERTEX_LOCATION"] = "us-central1"
|
|
|
|
response = completion(
|
|
model="chat-bison",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="hugging" label="HuggingFace">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
|
|
|
# e.g. Call 'WizardLM/WizardCoder-Python-34B-V1.0' hosted on HF Inference endpoints
|
|
response = completion(
|
|
model="huggingface/WizardLM/WizardCoder-Python-34B-V1.0",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
api_base="https://my-endpoint.huggingface.cloud"
|
|
)
|
|
|
|
print(response)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="azure" label="Azure OpenAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["AZURE_API_KEY"] = ""
|
|
os.environ["AZURE_API_BASE"] = ""
|
|
os.environ["AZURE_API_VERSION"] = ""
|
|
|
|
# azure call
|
|
response = completion(
|
|
"azure/<your_deployment_name>",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}]
|
|
)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
|
|
<TabItem value="ollama" label="Ollama">
|
|
|
|
```python
|
|
from litellm import completion
|
|
|
|
response = completion(
|
|
model="ollama/llama2",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
|
api_base="http://localhost:11434"
|
|
)
|
|
```
|
|
</TabItem>
|
|
<TabItem value="or" label="Openrouter">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key"
|
|
|
|
response = completion(
|
|
model="openrouter/google/palm-2-chat-bison",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
|
)
|
|
```
|
|
</TabItem>
|
|
|
|
</Tabs>
|
|
|
|
## Streaming
|
|
Set `stream=True` in the `completion` args.
|
|
<Tabs>
|
|
<TabItem value="openai" label="OpenAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
|
|
|
response = completion(
|
|
model="gpt-3.5-turbo",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
|
|
</TabItem>
|
|
<TabItem value="anthropic" label="Anthropic">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["ANTHROPIC_API_KEY"] = "your-api-key"
|
|
|
|
response = completion(
|
|
model="claude-2",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="vertex" label="VertexAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
# auth: run 'gcloud auth application-default login'
|
|
os.environ["VERTEX_PROJECT"] = "hardy-device-386718"
|
|
os.environ["VERTEX_LOCATION"] = "us-central1"
|
|
|
|
response = completion(
|
|
model="chat-bison",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="hugging" label="HuggingFace">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
os.environ["HUGGINGFACE_API_KEY"] = "huggingface_api_key"
|
|
|
|
# e.g. Call 'WizardLM/WizardCoder-Python-34B-V1.0' hosted on HF Inference endpoints
|
|
response = completion(
|
|
model="huggingface/WizardLM/WizardCoder-Python-34B-V1.0",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}],
|
|
api_base="https://my-endpoint.huggingface.cloud",
|
|
stream=True,
|
|
)
|
|
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
<TabItem value="azure" label="Azure OpenAI">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["AZURE_API_KEY"] = ""
|
|
os.environ["AZURE_API_BASE"] = ""
|
|
os.environ["AZURE_API_VERSION"] = ""
|
|
|
|
# azure call
|
|
response = completion(
|
|
"azure/<your_deployment_name>",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
|
|
</TabItem>
|
|
|
|
|
|
<TabItem value="ollama" label="Ollama">
|
|
|
|
```python
|
|
from litellm import completion
|
|
|
|
response = completion(
|
|
model="ollama/llama2",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
|
api_base="http://localhost:11434",
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
</TabItem>
|
|
<TabItem value="or" label="Openrouter">
|
|
|
|
```python
|
|
from litellm import completion
|
|
import os
|
|
|
|
## set ENV variables
|
|
os.environ["OPENROUTER_API_KEY"] = "openrouter_api_key"
|
|
|
|
response = completion(
|
|
model="openrouter/google/palm-2-chat-bison",
|
|
messages = [{ "content": "Hello, how are you?","role": "user"}],
|
|
stream=True,
|
|
)
|
|
|
|
for chunk in response:
|
|
print(chunk)
|
|
```
|
|
</TabItem>
|
|
|
|
</Tabs>
|
|
|
|
## Exception handling
|
|
|
|
LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error handling you already have for OpenAI exceptions should work out of the box with LiteLLM.
|
|
|
|
```python
|
|
from openai.error import OpenAIError
|
|
from litellm import completion
|
|
|
|
os.environ["ANTHROPIC_API_KEY"] = "bad-key"
|
|
try:
|
|
# some code
|
|
completion(model="claude-instant-1", messages=[{"role": "user", "content": "Hey, how's it going?"}])
|
|
except OpenAIError as e:
|
|
print(e)
|
|
```
|
|
|
|
## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks))
|
|
LiteLLM exposes predefined callbacks to send data to Langfuse, LLMonitor, Helicone, Promptlayer, Traceloop, and Slack.
|
|
```python
|
|
from litellm import completion
|
|
|
|
## set env variables for logging tools
|
|
os.environ["LANGFUSE_PUBLIC_KEY"] = ""
|
|
os.environ["LANGFUSE_SECRET_KEY"] = ""
|
|
os.environ["LLMONITOR_APP_ID"] = "your-llmonitor-app-id"
|
|
|
|
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
|
|
|
# set callbacks
|
|
litellm.success_callback = ["langfuse", "llmonitor"] # log input/output to langfuse, llmonitor
|
|
|
|
#openai call
|
|
response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}])
|
|
```
|
|
|
|
## Calculate Costs, Usage, Latency
|
|
|
|
Pass the completion response to `litellm.completion_cost(completion_response=response)` to get the cost of the call.
|
|
|
|
```python
|
|
from litellm import completion, completion_cost
|
|
import os
|
|
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
|
|
|
response = completion(
|
|
model="gpt-3.5-turbo",
|
|
messages=[{ "content": "Hello, how are you?","role": "user"}]
|
|
)
|
|
|
|
cost = completion_cost(completion_response=response)
|
|
print("Cost for completion call with gpt-3.5-turbo: ", f"${float(cost):.10f}")
|
|
```
|
|
|
|
**Output**
|
|
```shell
|
|
Cost for completion call with gpt-3.5-turbo: $0.0000775000
|
|
```
|
|
|
|
### Track Costs, Usage, Latency for streaming
|
|
We use a custom callback function for this - more info on custom callbacks: https://docs.litellm.ai/docs/observability/custom_callback
|
|
- We define a callback function to calculate cost `def track_cost_callback()`
|
|
- In `def track_cost_callback()` we check if the stream is complete - `if "complete_streaming_response" in kwargs`
|
|
- Use `litellm.completion_cost()` to calculate cost, once the stream is complete
|
|
|
|
```python
|
|
import litellm
|
|
|
|
# track_cost_callback
|
|
def track_cost_callback(
|
|
kwargs, # kwargs to completion
|
|
completion_response, # response from completion
|
|
start_time, end_time # start/end time
|
|
):
|
|
try:
|
|
# check if it has collected an entire stream response
|
|
if "complete_streaming_response" in kwargs:
|
|
# for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
|
|
completion_response=kwargs["complete_streaming_response"]
|
|
input_text = kwargs["messages"]
|
|
output_text = completion_response["choices"][0]["message"]["content"]
|
|
response_cost = litellm.completion_cost(
|
|
model = kwargs["model"],
|
|
messages = input_text,
|
|
completion=output_text
|
|
)
|
|
print("streaming response_cost", response_cost)
|
|
except:
|
|
pass
|
|
# set callback
|
|
litellm.success_callback = [track_cost_callback] # set custom callback function
|
|
|
|
# litellm.completion() call
|
|
response = litellm.completion(
|
|
model="gpt-3.5-turbo",
|
|
messages=[
|
|
{
|
|
"role": "user",
|
|
"content": "Hi 👋 - i'm openai"
|
|
}
|
|
],
|
|
stream=True
|
|
)
|
|
```
|
|
|
|
## OpenAI Proxy
|
|
|
|
Track spend across multiple projects/people
|
|
|
|
The proxy provides:
|
|
1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth)
|
|
2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class)
|
|
3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend)
|
|
4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits)
|
|
|
|
### 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/)
|
|
|
|
### Quick Start Proxy - CLI
|
|
|
|
```shell
|
|
pip install 'litellm[proxy]'
|
|
```
|
|
|
|
#### Step 1: Start litellm proxy
|
|
```shell
|
|
$ litellm --model huggingface/bigcode/starcoder
|
|
|
|
#INFO: Proxy running on http://0.0.0.0:8000
|
|
```
|
|
|
|
#### Step 2: Make ChatCompletions Request to Proxy
|
|
```python
|
|
import openai # openai v1.0.0+
|
|
client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:8000") # set proxy to base_url
|
|
# request sent to model set on litellm proxy, `litellm --model`
|
|
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
|
|
{
|
|
"role": "user",
|
|
"content": "this is a test request, write a short poem"
|
|
}
|
|
])
|
|
|
|
print(response)
|
|
```
|
|
|
|
|
|
## More details
|
|
* [exception mapping](./exception_mapping.md)
|
|
* [retries + model fallbacks for completion()](./completion/reliable_completions.md)
|
|
* [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md)
|