update proxy cli

Krrish Dholakia 2023-09-28 16:24:41 -07:00
parent 43dc41e2c4
commit 91521175f1
7 changed files with 92 additions and 35 deletions

View file

@@ -10,21 +10,55 @@ This works for async + streaming as well.
Works with **ALL MODELS** supported by LiteLLM. To see supported providers check out this list - [Provider List](https://docs.litellm.ai/docs/providers).
**Requirements** Make sure relevant keys are set in the local .env.
+[**Jump to tutorial**](#tutorial---using-with-aider)
## quick start
Call Huggingface models through your OpenAI proxy.
+**Start Proxy**
Run this in your CLI.
-```shell
+```python
$ pip install litellm
```
-```shell
-$ export HUGGINGFACE_API_KEY=your-api-key # [OPTIONAL]
-$ litellm --model huggingface/stabilityai/stablecode-instruct-alpha-3b
+```python
+$ litellm --model huggingface/bigcode/starcoder
+#INFO: Uvicorn running on http://0.0.0.0:8000
```
This will host a local proxy api at: **http://0.0.0.0:8000**
+**Test it**
+<Tabs>
+<TabItem value="openai" label="OpenAI">
+```python
+import openai
+openai.api_base = "http://0.0.0.0:8000"
+print(openai.ChatCompletion.create(model="test", messages=[{"role":"user", "content":"Hey!"}]))
+```
+</TabItem>
+<TabItem value="curl" label="curl">
+```curl
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "messages": [
+        {
+            "role": "user",
+            "content": "what do you know?"
+        }
+    ],
+}'
+```
+</TabItem>
+</Tabs>
Other supported models:
<Tabs>
<TabItem value="anthropic" label="Anthropic">
@@ -36,6 +70,15 @@ $ litellm --model claude-instant-1
</TabItem>
+<TabItem value="huggingface" label="Huggingface">
+```shell
+$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL]
+$ litellm --model claude-instant-1
+```
+</TabItem>
<TabItem value="together_ai" label="TogetherAI">
```shell
@@ -115,34 +158,12 @@ Pass in the api_base as well
litellm --model huggingface/meta-llama/llama2 --api_base https://my-endpoint.huggingface.cloud
```
-Other examples
-<Tabs>
-<TabItem value="ollama_2" label="Ollama">
+**Ollama example**
```shell
$ litellm --model ollama/llama2 --api_base http://localhost:11434
```
-</TabItem>
-</Tabs>
-## test it
-```curl
-curl --location 'http://0.0.0.0:8000/chat/completions' \
---header 'Content-Type: application/json' \
---data '{
-    "messages": [
-        {
-            "role": "user",
-            "content": "what do you know?"
-        }
-    ],
-}'
-```
## tutorial - using with aider
[Aider](https://github.com/paul-gauthier/aider) is an AI pair programming tool that runs in your terminal.
@@ -182,3 +203,4 @@ $ aider --openai-api-base http://0.0.0.0:8000
+And that's it!
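Since the hunk header above notes that the proxy "works for async + streaming as well", here is a hedged streaming variant of the docs' OpenAI test snippet. It assumes the quick-start proxy is running on http://0.0.0.0:8000 and the pre-1.0 `openai` client used in the snippet above; the dummy API key is only there to satisfy the client, since the proxy reads the real provider key from the local .env.

```python
# Sketch, not from the repo: streaming variant of the OpenAI test snippet shown above.
import openai

openai.api_base = "http://0.0.0.0:8000"
openai.api_key = "not-used-by-the-proxy"  # placeholder; the proxy holds the provider key

response = openai.ChatCompletion.create(
    model="test",  # the proxy substitutes the --model it was started with
    messages=[{"role": "user", "content": "Hey!"}],
    stream=True,
)
for chunk in response:
    print(chunk)
```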

View file

@@ -0,0 +1 @@
+from . import *

View file

@@ -6,9 +6,12 @@ load_dotenv()
@click.option('--port', default=8000, help='Port to bind the server to.')
@click.option('--api_base', default=None, help='API base URL.')
@click.option('--model', required=True, help='The model name to pass to litellm expects')
-def run_server(port, api_base, model):
+@click.option('--debug', is_flag=True, help='To debug the input')
+@click.option('--temperature', default=None, type=float, help='Set temperature for the model')
+@click.option('--max_tokens', default=None, help='Set max tokens for the model')
+def run_server(port, api_base, model, debug, temperature, max_tokens):
    from .proxy_server import app, initialize
-    initialize(model, api_base)
+    initialize(model, api_base, debug, temperature, max_tokens)
    try:
        import uvicorn
    except:
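To make the flow of the new flags concrete, here is a rough sketch, not the repo's code, of what run_server does once click has parsed the options: the values land in initialize() as module-level settings, and uvicorn then serves the FastAPI app. The absolute import path, the model name, and the flag values are illustrative assumptions.

```python
# Sketch only: assumes the proxy package is importable as litellm.proxy.proxy_server,
# mirroring the relative `from .proxy_server import app, initialize` shown above.
import uvicorn
from litellm.proxy.proxy_server import app, initialize

# Mirrors `litellm --model ... --debug --temperature 0.2 --max_tokens 256`
# (the model name and values are placeholders, not defaults from the repo).
initialize(
    model="huggingface/bigcode/starcoder",
    api_base=None,
    debug=True,
    temperature=0.2,
    max_tokens=256,
)
uvicorn.run(app, host="0.0.0.0", port=8000)
```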

View file

@@ -10,18 +10,36 @@ from fastapi.responses import StreamingResponse
import json
app = FastAPI()
user_api_base = None
user_model = None
+user_debug = False
+user_max_tokens = None
+user_temperature = None
-def initialize(model, api_base):
-    global user_model, user_api_base
+def print_verbose(print_statement):
+    global user_debug
+    print(f"user_debug: {user_debug}")
+    if user_debug:
+        print(print_statement)
+def initialize(model, api_base, debug, temperature, max_tokens):
+    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature
    user_model = model
    user_api_base = api_base
+    user_debug = debug
+    user_max_tokens = max_tokens
+    user_temperature = temperature
+    # if debug:
+    # litellm.set_verbose = True
# for streaming
def data_generator(response):
+    print("inside generator")
    for chunk in response:
-        print(f"chunk: {chunk}")
+        print_verbose(f"returned chunk: {chunk}")
        yield f"data: {json.dumps(chunk)}\n\n"
@app.get("/models") # if project requires model list
@@ -34,6 +52,7 @@ def model_list():
@app.post("/completions")
async def completion(request: Request):
    data = await request.json()
+    print_verbose(f"data passed in: {data}")
    if (user_model is None):
        raise ValueError("Proxy model needs to be set")
    data["model"] = user_model
@@ -47,12 +66,23 @@ async def completion(request: Request):
@app.post("/chat/completions")
async def chat_completion(request: Request):
    data = await request.json()
+    print_verbose(f"data passed in: {data}")
    if (user_model is None):
        raise ValueError("Proxy model needs to be set")
    data["model"] = user_model
+    # override with user settings
+    if user_temperature:
+        data["temperature"] = user_temperature
+    if user_max_tokens:
+        data["max_tokens"] = user_max_tokens
    if user_api_base:
        data["api_base"] = user_api_base
    response = litellm.completion(**data)
    if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
+        print("reaches stream")
        return StreamingResponse(data_generator(response), media_type='text/event-stream')
+    print_verbose(f"response: {response}")
    return response
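The net effect of the new override block is that proxy-wide settings passed on the command line win over whatever the client sends in the request body. A standalone illustration of that precedence (the helper name here is hypothetical, not something defined in the repo):

```python
# Hypothetical helper reproducing the override logic added above, for illustration only.
def apply_proxy_overrides(data, user_temperature=None, user_max_tokens=None, user_api_base=None):
    if user_temperature:
        data["temperature"] = user_temperature
    if user_max_tokens:
        data["max_tokens"] = user_max_tokens
    if user_api_base:
        data["api_base"] = user_api_base
    return data

# A client-supplied temperature of 1.0 is replaced by the proxy-wide --temperature value.
print(apply_proxy_overrides({"temperature": 1.0}, user_temperature=0.2))
# -> {'temperature': 0.2}
```

Because these are plain truthiness checks, an explicit 0 or 0.0 passed on the CLI would be treated as unset, which matches the diff as written.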

View file

@@ -2876,9 +2876,10 @@ class CustomStreamWrapper:
text = ""
is_finished = False
finish_reason = ""
+print_verbose(f"chunk: {chunk}")
if chunk.startswith("data:"):
    data_json = json.loads(chunk[5:])
-    print(f"data json: {data_json}")
+    print_verbose(f"data json: {data_json}")
    if "token" in data_json and "text" in data_json["token"]:
        text = data_json["token"]["text"]
        if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text