## Use Azure OpenAI with LiteLLM

In [None]:
!pip install litellm

Pass `api_base`, `api_version`, and `api_key` directly to `litellm.completion()`

In [4]:
import litellm

# All Azure connection details are handed to completion() as keyword
# arguments in this cell; nothing is read from os.environ by this code.
response = litellm.completion(
    messages=[{"role": "user", "content": "good morning"}],
    model="azure/chatgpt-v-2",  # format: azure/<your deployment name>
    max_tokens=10,
    api_key="",  # azure api key (blank placeholder; supply your own)
    api_base="https://openai-gpt-4-test-v-1.openai.azure.com/",  # azure api base
    api_version="2023-05-15",  # azure api version
)
print(response)

{
  "id": "chatcmpl-877x4J2JUSReOuxVGE3THLjcmdrI8",
  "object": "chat.completion",
  "created": 1696709554,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "length",
      "message": {
        "role": "assistant",
        "content": "Good morning! How can I assist you today?"
      }
    }
  ],
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 10,
    "total_tokens": 20
  }
}


## Set environment variables for Azure with LiteLLM

In [5]:
import os

import litellm

# Azure settings are placed in the process environment instead of being
# passed to completion(). The values are blank placeholders to fill in.
os.environ.update(
    {
        "AZURE_API_KEY": "",
        "AZURE_API_BASE": "",
        "AZURE_API_VERSION": "",
    }
)

# No api_base / api_version / api_key arguments here; presumably litellm
# picks them up from the AZURE_* variables set above.
response = litellm.completion(
    messages=[{"role": "user", "content": "good morning"}],
    model="azure/chatgpt-v-2",  # format: azure/<your deployment name>
    max_tokens=10,
)
print(response)

{
  "id": "chatcmpl-877zB0GWZl4zswopLt12yQEzEfYWy",
  "object": "chat.completion",
  "created": 1696709685,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "length",
      "message": {
        "role": "assistant",
        "content": "Good morning! How can I assist you today?"
      }
    }
  ],
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 10,
    "total_tokens": 20
  }
}


## With Streaming

In [None]:
# Same request as the earlier cells, but with stream=True so the result is
# consumed by iterating over it. Requires `litellm` to be imported already
# and passes no Azure credentials itself.
response = litellm.completion(
    stream=True,
    messages=[{"role": "user", "content": "good morning"}],
    model="azure/chatgpt-v-2",
    max_tokens=10,
)

# Print every chunk as the iterator yields it.
for chunk in response:
    print(chunk)

## With Rate Limit Handler

In [8]:
from litellm import RateLimitManager

handler = RateLimitManager(max_requests_per_minute=10, max_tokens_per_minute=200)

response = await handler.acompletion(
    model = "azure/chatgpt-v-2",
    messages = [{"role": "user", "content": "good morning"}],
    max_tokens=10,
)
print(response)

{
  "id": "chatcmpl-8781gvDKwPbp44CliumABgAuIDnSf",
  "object": "chat.completion",
  "created": 1696709840,
  "model": "gpt-35-turbo",
  "choices": [
    {
      "index": 0,
      "finish_reason": "length",
      "message": {
        "role": "assistant",
        "content": "Good morning! How can I assist you today?"
      }
    }
  ],
  "usage": {
    "completion_tokens": 10,
    "prompt_tokens": 10,
    "total_tokens": 20
  }
}
