Merge branch 'main' into multi-class-krrish

This commit is contained in:
Krish Dholakia 2023-08-12 17:47:37 -07:00 committed by GitHub
commit 887350d082
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 696 additions and 3 deletions

View file

@ -13,7 +13,7 @@ a light package to simplify calling OpenAI, Azure, Cohere, Anthropic, Huggingfac
- exception mapping - common exceptions across providers are mapped to the [OpenAI exception types](https://help.openai.com/en/articles/6897213-openai-library-error-types-guidance)
# usage
Demo - https://litellm.ai/ \
Read the docs - https://litellm.readthedocs.io/en/latest/
Read the docs - https://docs.litellm.ai/docs/
## quick start
```

View file

@ -0,0 +1,179 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install litellm==0.1.385"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from litellm import completion\n",
"import asyncio"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup Messages"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"user_message = \"respond in 20 words. who are you?\"\n",
"messages = [{ \"content\": user_message,\"role\": \"user\"}]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Call Ollama - llama2 with chatGPT Input/Output using litellm.completion() "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<async_generator object get_ollama_response_stream at 0x1069198b0>\n"
]
}
],
"source": [
"response = completion(model=\"llama2\", messages=messages, custom_api_base=\"http://localhost:11434\", custom_llm_provider=\"ollama\", stream=True)\n",
"print(response)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Iterate through the generator - Streaming"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" I\n",
"{'role': 'assistant', 'content': ' I'}\n",
" am\n",
"{'role': 'assistant', 'content': ' am'}\n",
" L\n",
"{'role': 'assistant', 'content': ' L'}\n",
"La\n",
"{'role': 'assistant', 'content': 'La'}\n",
"MA\n",
"{'role': 'assistant', 'content': 'MA'}\n",
",\n",
"{'role': 'assistant', 'content': ','}\n",
" an\n",
"{'role': 'assistant', 'content': ' an'}\n",
" A\n",
"{'role': 'assistant', 'content': ' A'}\n",
"I\n",
"{'role': 'assistant', 'content': 'I'}\n",
" assistant\n",
"{'role': 'assistant', 'content': ' assistant'}\n",
" developed\n",
"{'role': 'assistant', 'content': ' developed'}\n",
" by\n",
"{'role': 'assistant', 'content': ' by'}\n",
" Meta\n",
"{'role': 'assistant', 'content': ' Meta'}\n",
" A\n",
"{'role': 'assistant', 'content': ' A'}\n",
"I\n",
"{'role': 'assistant', 'content': 'I'}\n",
" that\n",
"{'role': 'assistant', 'content': ' that'}\n",
" can\n",
"{'role': 'assistant', 'content': ' can'}\n",
" understand\n",
"{'role': 'assistant', 'content': ' understand'}\n",
" and\n",
"{'role': 'assistant', 'content': ' and'}\n",
" respond\n",
"{'role': 'assistant', 'content': ' respond'}\n",
" to\n",
"{'role': 'assistant', 'content': ' to'}\n",
" human\n",
"{'role': 'assistant', 'content': ' human'}\n",
" input\n",
"{'role': 'assistant', 'content': ' input'}\n",
" in\n",
"{'role': 'assistant', 'content': ' in'}\n",
" a\n",
"{'role': 'assistant', 'content': ' a'}\n",
" convers\n",
"{'role': 'assistant', 'content': ' convers'}\n",
"ational\n",
"{'role': 'assistant', 'content': 'ational'}\n",
" manner\n",
"{'role': 'assistant', 'content': ' manner'}\n",
".\n",
"{'role': 'assistant', 'content': '.'}\n"
]
}
],
"source": [
"\n",
"async def get_response(generator):\n",
" response = \"\"\n",
" async for elem in generator:\n",
" print(elem)\n",
" response += elem['choices'][0]['delta'][\"content\"]\n",
" return response\n",
"\n",
"string_response = await get_response(response)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,349 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "MZ01up0p7wOJ"
},
"source": [
"## 🚅 liteLLM Demo\n",
"### TLDR: Call 50+ LLM APIs using chatGPT Input/Output format\n",
"https://github.com/BerriAI/litellm\n",
"\n",
"liteLLM is a package to simplify calling **OpenAI, Azure, Llama2, Cohere, Anthropic, Huggingface API Endpoints**. LiteLLM manages\n",
"\n",
"* Translating inputs to the provider's `completion()` and `embedding()` endpoints\n",
"* Guaranteeing consistent output - text responses will always be available at `['choices'][0]['message']['content']`\n",
"* Exception mapping - common exceptions across providers are mapped to the OpenAI exception types\n",
"\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "RZtzCnQS7rW-"
},
"source": [
"## Installation and setting Params"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rsrN5W-N7L8d"
},
"outputs": [],
"source": [
"!pip install litellm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "ArrWyG5b7QAG"
},
"outputs": [],
"source": [
"from litellm import completion\n",
"import os"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "bbhJRt34_NJ1"
},
"source": [
"## Set your API keys\n",
"- liteLLM reads your .env, env variables or key manager for Auth"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "-h8Ga5cR7SvV"
},
"outputs": [],
"source": [
"os.environ['OPENAI_API_KEY'] = \"\" #@param\n",
"os.environ[\"ANTHROPIC_API_KEY\"] = \"\" #@param\n",
"os.environ[\"AZURE_API_BASE\"] = \"\" #@param\n",
"os.environ[\"AZURE_API_VERSION\"] = \"\" #@param\n",
"os.environ[\"AZURE_API_KEY\"] = \"\" #@param\n",
"os.environ[\"REPLICATE_API_TOKEN\"] = \"\" #@param\n",
"os.environ[\"COHERE_API_KEY\"] = \"\" #@param\n",
"os.environ[\"HF_TOKEN\"] = \"\" #@param"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "MBujGiby8YBu"
},
"outputs": [],
"source": [
"messages = [{ \"content\": \"what's the weather in SF\",\"role\": \"user\"}]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "fhqpKv6L8fBj"
},
"source": [
"## Call chatGPT"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "speIkoX_8db4",
"outputId": "bc804d62-1d33-4198-b6d7-b721961694a3"
},
"outputs": [
{
"data": {
"text/plain": [
"<OpenAIObject chat.completion id=chatcmpl-7mrklZEq2zK3Z5pSkOR3Jn54gpN5A at 0x7f76df70e930> JSON: {\n",
" \"id\": \"chatcmpl-7mrklZEq2zK3Z5pSkOR3Jn54gpN5A\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691880727,\n",
" \"model\": \"gpt-3.5-turbo-0613\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"I'm sorry, but as an AI language model, I don't have real-time data. However, you can check the current weather in San Francisco by using a weather website or app, or by searching \\\"weather in San Francisco\\\" on a search engine.\"\n",
" },\n",
" \"finish_reason\": \"stop\"\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"prompt_tokens\": 13,\n",
" \"completion_tokens\": 52,\n",
" \"total_tokens\": 65\n",
" }\n",
"}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"completion(model=\"gpt-3.5-turbo\", messages=messages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "Q3jV1Uxv8zNo"
},
"source": [
"## Call Claude-2"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "V8yTWYzY8m9S",
"outputId": "8b6dd32d-f9bf-4e89-886d-47cb8020f025"
},
"outputs": [
{
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': \" Unfortunately I do not have enough context to provide the current weather in San Francisco. To get the most accurate weather report, it's helpful if I know details like:\\n\\n- Exact location (city name, zip code, etc)\\n- Time frame (current conditions, forecast for a certain day/week, etc)\\n\\nIf you can provide some more specifics about what weather information you need for San Francisco, I'd be happy to look that up for you!\"}}],\n",
" 'created': 1691880836.974166,\n",
" 'model': 'claude-2',\n",
" 'usage': {'prompt_tokens': 18, 'completion_tokens': 95, 'total_tokens': 113}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"completion(model=\"claude-2\", messages=messages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "yu0LPDmW9PJa"
},
"source": [
"## Call llama2 on replicate"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0GWV5mtO9Jbu",
"outputId": "38538825-b271-406d-a437-f5cf0eb7e548"
},
"outputs": [
{
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': ' I\\'m happy to help! However, I must point out that the question \"what\\'s the weather in SF\" doesn\\'t make sense as \"SF\" could refer to multiple locations (San Francisco, South Florida, San Fernando, etc.). Could you please provide more context or specify which location you\\'re referring to? That way, I can give you an accurate answer.'}}],\n",
" 'created': 1691880930.9003325,\n",
" 'model': 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1',\n",
" 'usage': {'prompt_tokens': 6, 'completion_tokens': 74, 'total_tokens': 80}}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = \"replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1\"\n",
"completion(model=model, messages=messages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "HXdj5SEe9iLK"
},
"source": [
"## Call Command-Nightly"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EaUq2xIx9fhr",
"outputId": "55fe6f52-b58b-4729-948a-74dac4b431b2"
},
"outputs": [
{
"data": {
"text/plain": [
"{'choices': [{'finish_reason': 'stop',\n",
" 'index': 0,\n",
" 'message': {'role': 'assistant',\n",
" 'content': ' The weather in San Francisco can be quite unpredictable. The city is known for its fog, which can'}}],\n",
" 'created': 1691880972.5565543,\n",
" 'model': 'command-nightly',\n",
" 'usage': {'prompt_tokens': 6, 'completion_tokens': 20, 'total_tokens': 26}}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"completion(model=\"command-nightly\", messages=messages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "1g9hSgsL9soJ"
},
"source": [
"## Call Azure OpenAI"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AvLjR-PF-lt0",
"outputId": "deff2db3-b003-48cd-ea62-c03a68a4464a"
},
"outputs": [
{
"data": {
"text/plain": [
"<OpenAIObject chat.completion id=chatcmpl-7mrtwvpx3okijXmbt9PEYdPMeE7lH at 0x7f76cfb356c0> JSON: {\n",
" \"id\": \"chatcmpl-7mrtwvpx3okijXmbt9PEYdPMeE7lH\",\n",
" \"object\": \"chat.completion\",\n",
" \"created\": 1691881296,\n",
" \"model\": \"gpt-35-turbo\",\n",
" \"choices\": [\n",
" {\n",
" \"index\": 0,\n",
" \"finish_reason\": \"stop\",\n",
" \"message\": {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"I'm sorry, as an AI language model, I don't have real-time data. However, you can check the weather forecast for San Francisco on websites such as AccuWeather or Weather Channel.\"\n",
" }\n",
" }\n",
" ],\n",
" \"usage\": {\n",
" \"completion_tokens\": 40,\n",
" \"prompt_tokens\": 14,\n",
" \"total_tokens\": 54\n",
" }\n",
"}"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"completion(deployment_id=\"chatgpt-test\", messages=messages, azure=True)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

BIN
cookbook/proxy-server/.DS_Store vendored Normal file

Binary file not shown.

View file

@ -10,6 +10,8 @@ from .llms.anthropic import AnthropicLLM
import tiktoken
from concurrent.futures import ThreadPoolExecutor
encoding = tiktoken.get_encoding("cl100k_base")
from litellm.utils import get_secret, install_and_import, CustomStreamWrapper, read_config_args
from litellm.utils import get_ollama_response_stream, stream_to_string
####### ENVIRONMENT VARIABLES ###################
dotenv.load_dotenv() # Loading env variables using dotenv
new_response = {
@ -388,6 +390,15 @@ def completion(
model_response["created"] = time.time()
model_response["model"] = model
response = model_response
elif custom_llm_provider == "ollama":
endpoint = litellm.api_base if litellm.api_base is not None else custom_api_base
prompt = " ".join([message["content"] for message in messages])
## LOGGING
logging(model=model, input=prompt, azure=azure, logger_fn=logger_fn)
generator = get_ollama_response_stream(endpoint, model, prompt)
# assume all responses are streamed
return generator
else:
## LOGGING
logging(model=model, input=messages, custom_llm_provider=custom_llm_provider, logger_fn=logger_fn)

View file

@ -0,0 +1,62 @@
###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
# import aiohttp
# import json
# import asyncio
# import requests
# async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
# session = aiohttp.ClientSession()
# url = f'{api_base}/api/generate'
# data = {
# "model": model,
# "prompt": prompt,
# }
# response = ""
# try:
# async with session.post(url, json=data) as resp:
# async for line in resp.content.iter_any():
# if line:
# try:
# json_chunk = line.decode("utf-8")
# chunks = json_chunk.split("\n")
# for chunk in chunks:
# if chunk.strip() != "":
# j = json.loads(chunk)
# if "response" in j:
# print(j["response"])
# yield {
# "role": "assistant",
# "content": j["response"]
# }
# # self.responses.append(j["response"])
# # yield "blank"
# except Exception as e:
# print(f"Error decoding JSON: {e}")
# finally:
# await session.close()
# # async def get_ollama_response_no_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
# # generator = get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?")
# # response = ""
# # async for elem in generator:
# # print(elem)
# # response += elem["content"]
# # return response
# # #generator = get_ollama_response_stream()
# # result = asyncio.run(get_ollama_response_no_stream())
# # print(result)
# # # return this generator to the client for streaming requests
# # async def get_response():
# # global generator
# # async for elem in generator:
# # print(elem)
# # asyncio.run(get_response())

View file

@ -0,0 +1,52 @@
###### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
# import sys, os
# import traceback
# from dotenv import load_dotenv
# load_dotenv()
# import os
# sys.path.insert(0, os.path.abspath('../..')) # Adds the parent directory to the system path
# import pytest
# import litellm
# from litellm import embedding, completion
# import asyncio
# user_message = "respond in 20 words. who are you?"
# messages = [{ "content": user_message,"role": "user"}]
# async def get_response(generator):
# response = ""
# async for elem in generator:
# print(elem)
# response += elem["content"]
# return response
# def test_completion_ollama():
# try:
# response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama")
# print(response)
# string_response = asyncio.run(get_response(response))
# print(string_response)
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# # test_completion_ollama()
# def test_completion_ollama_stream():
# try:
# response = completion(model="llama2", messages=messages, custom_api_base="http://localhost:11434", custom_llm_provider="ollama", stream=True)
# print(response)
# string_response = asyncio.run(get_response(response))
# print(string_response)
# except Exception as e:
# pytest.fail(f"Error occurred: {e}")
# test_completion_ollama_stream()

View file

@ -772,4 +772,44 @@ def read_config_args(config_path):
return config
except Exception as e:
print("An error occurred while reading config:", str(e))
raise e
raise e
########## ollama implementation ############################
import aiohttp
def _parse_ollama_line(raw_line):
    """Turn one complete newline-delimited line (bytes) into a streaming chunk.

    Returns ``{"choices": [{"delta": {"role": "assistant", "content": ...}}]}``
    when the line is a JSON object containing a "response" key. Returns
    ``None`` for blank lines, for JSON values without a "response" key, and
    for lines that cannot be decoded (the error is printed, not raised).
    """
    if not raw_line.strip():
        return None
    try:
        payload = json.loads(raw_line.decode("utf-8"))
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; keep the stream alive and report the bad line.
        print(f"Error decoding JSON: {e}")
        return None
    if not isinstance(payload, dict) or "response" not in payload:
        return None
    return {"choices": [{"delta": {"role": "assistant", "content": payload["response"]}}]}


async def get_ollama_response_stream(api_base="http://localhost:11434", model="llama2", prompt="Why is the sky blue?"):
    """Stream a completion from an Ollama server as OpenAI-style delta chunks.

    POSTs ``{"model": model, "prompt": prompt}`` to ``{api_base}/api/generate``
    and yields one ``{"choices": [{"delta": {"role": "assistant",
    "content": <text>}}]}`` dict for every JSON line in the response body that
    carries a "response" field.

    Args:
        api_base: Base URL of the Ollama server.
        model: Model name sent in the request body.
        prompt: Prompt text sent in the request body.

    Yields:
        dict: A chunk whose text lives at ``["choices"][0]["delta"]["content"]``.
    """
    url = f'{api_base}/api/generate'
    data = {
        "model": model,
        "prompt": prompt,
    }
    # `async with` closes the session on every exit path, including when the
    # consumer stops iterating early.
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=data) as resp:
            # iter_any() hands back whatever bytes have arrived, so a JSON line
            # (or even a multi-byte UTF-8 character) can be split across two
            # reads. Buffer raw bytes and only parse complete lines.
            pending = b""
            async for raw in resp.content.iter_any():
                if not raw:
                    continue
                pending += raw
                *complete_lines, pending = pending.split(b"\n")
                for raw_line in complete_lines:
                    parsed = _parse_ollama_line(raw_line)
                    if parsed is not None:
                        yield parsed
            # The body may end without a trailing newline; flush what is left.
            parsed = _parse_ollama_line(pending)
            if parsed is not None:
                yield parsed
async def stream_to_string(generator):
    """Consume an async stream of completion chunks and return the joined text.

    Accepts chunks in the shape yielded by ``get_ollama_response_stream``
    (``{"choices": [{"delta": {"content": ...}}]}``) and, for backward
    compatibility, flat ``{"role": ..., "content": ...}`` chunks.

    Args:
        generator: Async iterable of chunk dicts.

    Returns:
        str: Every chunk's content concatenated in arrival order ("" if the
        stream is empty).
    """
    pieces = []
    async for chunk in generator:
        if "choices" in chunk:
            # OpenAI-style streaming delta, as produced by the ollama generator.
            pieces.append(chunk["choices"][0]["delta"]["content"])
        else:
            # Older flat chunk shape.
            pieces.append(chunk["content"])
    return "".join(pieces)

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "0.1.383"
version = "0.1.385"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"