forked from phoenix/litellm-mirror
update proxy cli
commit 09b8c08cad (parent c5bed0c9fd)
7 changed files with 92 additions and 35 deletions
@@ -10,21 +10,55 @@ This works for async + streaming as well.
 Works with **ALL MODELS** supported by LiteLLM. To see supported providers check out this list - [Provider List](https://docs.litellm.ai/docs/providers).
 
 **Requirements** Make sure relevant keys are set in the local .env.
 
+[**Jump to tutorial**](#tutorial---using-with-aider)
 ## quick start
 Call Huggingface models through your OpenAI proxy.
 
+**Start Proxy**
 Run this in your CLI.
-```shell
+```python
 $ pip install litellm
 ```
-```shell
-$ export HUGGINGFACE_API_KEY=your-api-key # [OPTIONAL]
+```python
+$ litellm --model huggingface/bigcode/starcoder
 
-$ litellm --model huggingface/stabilityai/stablecode-instruct-alpha-3b
+#INFO: Uvicorn running on http://0.0.0.0:8000
 ```
 
 This will host a local proxy api at: **http://0.0.0.0:8000**
 
+**Test it**
+<Tabs>
+<TabItem value="openai" label="OpenAI">
+
+```python
+import openai
+
+openai.api_base = "http://0.0.0.0:8000"
+
+print(openai.ChatCompletion.create(model="test", messages=[{"role":"user", "content":"Hey!"}]))
+```
+
+</TabItem>
+
+<TabItem value="curl" label="curl">
+
+```curl
+curl --location 'http://0.0.0.0:8000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data '{
+    "messages": [
+      {
+        "role": "user",
+        "content": "what do you know?"
+      }
+    ],
+}'
+```
+</TabItem>
+</Tabs>
+
 Other supported models:
 <Tabs>
 <TabItem value="anthropic" label="Anthropic">
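The hunk header above notes that this works for async + streaming as well, but the new **Test it** tab only shows a blocking call. Here is a minimal sketch of the streaming variant, assuming the proxy from this commit is running locally on port 8000 and the pre-1.0 `openai` client used in the docs; the dummy API key is an assumption, not something the docs set.

```python
import openai

# Point the client at the local proxy, as in the docs above.
openai.api_base = "http://0.0.0.0:8000"
openai.api_key = "anything"  # assumption: the proxy does not check this value

# stream=True sends {"stream": true}, which routes the request through the
# proxy's StreamingResponse / data_generator path added in this commit.
for chunk in openai.ChatCompletion.create(
    model="test",
    messages=[{"role": "user", "content": "Hey!"}],
    stream=True,
):
    print(chunk)
```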
@@ -36,6 +70,15 @@ $ litellm --model claude-instant-1
 
 </TabItem>
 
+<TabItem value="huggingface" label="Huggingface">
+
+```shell
+$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL]
+$ litellm --model claude-instant-1
+```
+
+</TabItem>
+
 <TabItem value="together_ai" label="TogetherAI">
 
 ```shell
@@ -115,34 +158,12 @@ Pass in the api_base as well
 litellm --model huggingface/meta-llama/llama2 --api_base https://my-endpoint.huggingface.cloud
 ```
 
-Other examples
-<Tabs>
-<TabItem value="ollama_2" label="Ollama">
-
+**Ollama example**
 ```shell
 $ litellm --model ollama/llama2 --api_base http://localhost:11434
 ```
 
-
-</TabItem>
-
-</Tabs>
-
-## test it
-
-```curl
-curl --location 'http://0.0.0.0:8000/chat/completions' \
---header 'Content-Type: application/json' \
---data '{
-    "messages": [
-      {
-        "role": "user",
-        "content": "what do you know?"
-      }
-    ],
-}'
-```
 
 ## tutorial - using with aider
 [Aider](https://github.com/paul-gauthier/aider) is an AI pair programming in your terminal.
 
@@ -182,3 +203,4 @@ $ aider --openai-api-base http://0.0.0.0:8000
 
 
 
+And that's it!
Binary file not shown.
Binary file not shown.
litellm/proxy/__init__.py (new file, 1 addition)
@@ -0,0 +1 @@
+from . import *
@@ -6,9 +6,12 @@ load_dotenv()
 @click.option('--port', default=8000, help='Port to bind the server to.')
 @click.option('--api_base', default=None, help='API base URL.')
 @click.option('--model', required=True, help='The model name to pass to litellm expects')
-def run_server(port, api_base, model):
+@click.option('--debug', is_flag=True, help='To debug the input')
+@click.option('--temperature', default=None, type=float, help='Set temperature for the model')
+@click.option('--max_tokens', default=None, help='Set max tokens for the model')
+def run_server(port, api_base, model, debug, temperature, max_tokens):
     from .proxy_server import app, initialize
-    initialize(model, api_base)
+    initialize(model, api_base, debug, temperature, max_tokens)
     try:
         import uvicorn
     except:
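For reference, a rough programmatic sketch of what the updated CLI wires together when the new flags are passed. It assumes the `litellm.proxy.proxy_server` module path implied by the new `litellm/proxy/__init__.py` and the relative import above; the flag values are illustrative, not taken from the commit.

```python
# Roughly equivalent to:
#   litellm --model huggingface/bigcode/starcoder --temperature 0.2 --max_tokens 250 --debug --port 8000
import uvicorn
from litellm.proxy.proxy_server import app, initialize

# New signature from this commit: (model, api_base, debug, temperature, max_tokens).
initialize("huggingface/bigcode/starcoder", None, True, 0.2, 250)
uvicorn.run(app, host="0.0.0.0", port=8000)
```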
@@ -10,18 +10,36 @@ from fastapi.responses import StreamingResponse
 import json
 
 app = FastAPI()
 
 user_api_base = None
 user_model = None
+user_debug = False
+user_max_tokens = None
+user_temperature = None
 
-def initialize(model, api_base):
-    global user_model, user_api_base
+def print_verbose(print_statement):
+    global user_debug
+    print(f"user_debug: {user_debug}")
+    if user_debug:
+        print(print_statement)
+
+def initialize(model, api_base, debug, temperature, max_tokens):
+    global user_model, user_api_base, user_debug, user_max_tokens, user_temperature
     user_model = model
     user_api_base = api_base
+    user_debug = debug
+    user_max_tokens = max_tokens
+    user_temperature = temperature
 
+    # if debug:
+    #     litellm.set_verbose = True
 
 # for streaming
 def data_generator(response):
+    print("inside generator")
     for chunk in response:
+        print(f"chunk: {chunk}")
+        print_verbose(f"returned chunk: {chunk}")
        yield f"data: {json.dumps(chunk)}\n\n"
 
 @app.get("/models") # if project requires model list
@@ -34,6 +52,7 @@ def model_list():
 @app.post("/completions")
 async def completion(request: Request):
     data = await request.json()
+    print_verbose(f"data passed in: {data}")
     if (user_model is None):
         raise ValueError("Proxy model needs to be set")
     data["model"] = user_model
@@ -47,12 +66,23 @@ async def completion(request: Request):
 @app.post("/chat/completions")
 async def chat_completion(request: Request):
     data = await request.json()
+    print_verbose(f"data passed in: {data}")
     if (user_model is None):
         raise ValueError("Proxy model needs to be set")
     data["model"] = user_model
+
+    # override with user settings
+    if user_temperature:
+        data["temperature"] = user_temperature
+    if user_max_tokens:
+        data["max_tokens"] = user_max_tokens
     if user_api_base:
         data["api_base"] = user_api_base
+
+
     response = litellm.completion(**data)
     if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
+        print("reaches stream")
         return StreamingResponse(data_generator(response), media_type='text/event-stream')
+    print_verbose(f"response: {response}")
     return response
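To show what the override and streaming branches above mean on the wire, here is a hedged sketch of a raw client request, assuming the proxy from this commit is running on port 8000 and the `requests` package is available; the payload mirrors the curl example in the docs.

```python
import json
import requests

# "stream": True triggers the StreamingResponse(data_generator(...)) branch above.
# Any "temperature" or "max_tokens" sent here is overridden server-side when
# --temperature / --max_tokens were passed at proxy startup.
resp = requests.post(
    "http://0.0.0.0:8000/chat/completions",
    json={
        "messages": [{"role": "user", "content": "what do you know?"}],
        "stream": True,
    },
    stream=True,
)

# data_generator frames each chunk as an SSE-style "data: <json>\n\n" line.
for line in resp.iter_lines():
    if line.startswith(b"data: "):
        print(json.loads(line[len(b"data: "):]))
```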
@@ -2876,9 +2876,10 @@ class CustomStreamWrapper:
         text = ""
         is_finished = False
         finish_reason = ""
+        print_verbose(f"chunk: {chunk}")
         if chunk.startswith("data:"):
             data_json = json.loads(chunk[5:])
-            print(f"data json: {data_json}")
+            print_verbose(f"data json: {data_json}")
             if "token" in data_json and "text" in data_json["token"]:
                 text = data_json["token"]["text"]
                 if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text
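The hunk cuts off right after the Llama-2 check, so as a standalone illustration only, here is a hedged sketch of the Huggingface chunk handling shown above; the helper name and the `</s>` replacement are assumptions, not litellm API.

```python
import json

def parse_hf_stream_chunk(chunk: str, model: str) -> str:
    """Hypothetical helper mirroring the CustomStreamWrapper logic above."""
    text = ""
    if chunk.startswith("data:"):
        data_json = json.loads(chunk[5:])
        if "token" in data_json and "text" in data_json["token"]:
            text = data_json["token"]["text"]
            if "meta-llama/Llama-2" in model:
                # assumption: strip eos tokens like </s>, per the truncated comment above
                text = text.replace("</s>", "")
    return text

print(parse_hf_stream_chunk('data: {"token": {"text": "Hello"}}', "meta-llama/Llama-2-7b-hf"))
```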