update proxy cli

Krrish Dholakia 2023-09-28 16:24:41 -07:00
parent c5bed0c9fd
commit 09b8c08cad
7 changed files with 92 additions and 35 deletions

View file

@@ -10,21 +10,55 @@ This works for async + streaming as well.
Works with **ALL MODELS** supported by LiteLLM. To see supported providers check out this list - [Provider List](https://docs.litellm.ai/docs/providers).
**Requirements** Make sure the relevant API keys are set in your local `.env` file.
[**Jump to tutorial**](#tutorial---using-with-aider)
## quick start
Call Huggingface models through your OpenAI proxy.
**Start Proxy**
Run this in your CLI.
```shell
$ pip install litellm
```
```shell
$ export HUGGINGFACE_API_KEY=your-api-key # [OPTIONAL]

# pick a Huggingface model to serve, e.g.
$ litellm --model huggingface/bigcode/starcoder
$ litellm --model huggingface/stabilityai/stablecode-instruct-alpha-3b

#INFO: Uvicorn running on http://0.0.0.0:8000
```
This will host a local proxy API at: **http://0.0.0.0:8000**
**Test it**
<Tabs>
<TabItem value="openai" label="OpenAI">
```python
import openai
openai.api_base = "http://0.0.0.0:8000"
# the model name here is a placeholder - the proxy always routes to the model it was started with
print(openai.ChatCompletion.create(model="test", messages=[{"role":"user", "content":"Hey!"}]))
```
</TabItem>
<TabItem value="curl" label="curl">
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "messages": [
        {
            "role": "user",
            "content": "what do you know?"
        }
    ]
}'
```
</TabItem>
</Tabs>
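Streaming works through the same client. A minimal sketch, assuming the quick-start proxy is still running on `http://0.0.0.0:8000` and using the pre-1.0 `openai` client shown above (the dummy key is an assumption; the local proxy shown here does not validate it):

```python
import openai

openai.api_base = "http://0.0.0.0:8000"
openai.api_key = "anything"  # placeholder - not checked by the local proxy

# stream=True makes the client yield chunks as the proxy relays them via SSE
response = openai.ChatCompletion.create(
    model="test",  # overridden by the proxy's --model setting
    messages=[{"role": "user", "content": "Hey!"}],
    stream=True,
)
for chunk in response:
    print(chunk)
```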
Other supported models:
<Tabs>
<TabItem value="anthropic" label="Anthropic">
@@ -36,6 +70,15 @@ $ litellm --model claude-instant-1
</TabItem>
<TabItem value="huggingface" label="Huggingface">
```shell
$ export HUGGINGFACE_API_KEY=my-api-key #[OPTIONAL]
$ litellm --model huggingface/bigcode/starcoder
```
</TabItem>
<TabItem value="together_ai" label="TogetherAI">
```shell
@@ -115,34 +158,12 @@ Pass in the api_base as well
litellm --model huggingface/meta-llama/llama2 --api_base https://my-endpoint.huggingface.cloud
```
Other examples
<Tabs>
<TabItem value="ollama_2" label="Ollama">
**Ollama example**
```shell
$ litellm --model ollama/llama2 --api_base http://localhost:11434
```
</TabItem>
</Tabs>
## test it
```shell
curl --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "messages": [
        {
            "role": "user",
            "content": "what do you know?"
        }
    ]
}'
```
## tutorial - using with aider
[Aider](https://github.com/paul-gauthier/aider) is an AI pair-programming tool that runs in your terminal.
@@ -182,3 +203,4 @@ $ aider --openai-api-base http://0.0.0.0:8000
And that's it!

View file

@@ -0,0 +1 @@
from . import *

View file

@@ -6,9 +6,12 @@ load_dotenv()
@click.option('--port', default=8000, help='Port to bind the server to.')
@click.option('--api_base', default=None, help='API base URL.')
@click.option('--model', required=True, help='The model name to pass to litellm')
def run_server(port, api_base, model):
@click.option('--debug', is_flag=True, help='To debug the input')
@click.option('--temperature', default=None, type=float, help='Set temperature for the model')
@click.option('--max_tokens', default=None, help='Set max tokens for the model')
def run_server(port, api_base, model, debug, temperature, max_tokens):
from .proxy_server import app, initialize
initialize(model, api_base)
initialize(model, api_base, debug, temperature, max_tokens)
try:
import uvicorn
except:
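For reference, the new options map onto an invocation like the following (the flags come from the decorators above; the model name, endpoint, and values are illustrative):

```shell
# sketch: exercising the new --debug / --temperature / --max_tokens flags
$ litellm --model huggingface/bigcode/starcoder \
    --api_base https://my-endpoint.huggingface.cloud \
    --port 8000 \
    --debug \
    --temperature 0.2 \
    --max_tokens 256
```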

View file

@@ -10,18 +10,36 @@ from fastapi.responses import StreamingResponse
import json
app = FastAPI()
user_api_base = None
user_model = None
user_debug = False
user_max_tokens = None
user_temperature = None
def initialize(model, api_base):
global user_model, user_api_base
def print_verbose(print_statement):
global user_debug
print(f"user_debug: {user_debug}")
if user_debug:
print(print_statement)
def initialize(model, api_base, debug, temperature, max_tokens):
global user_model, user_api_base, user_debug, user_max_tokens, user_temperature
user_model = model
user_api_base = api_base
user_debug = debug
user_max_tokens = max_tokens
user_temperature = temperature
# if debug:
# litellm.set_verbose = True
# for streaming
def data_generator(response):
print("inside generator")
for chunk in response:
print(f"chunk: {chunk}")
print_verbose(f"returned chunk: {chunk}")
yield f"data: {json.dumps(chunk)}\n\n"
@app.get("/models") # if project requires model list
@@ -34,6 +52,7 @@ def model_list():
@app.post("/completions")
async def completion(request: Request):
data = await request.json()
print_verbose(f"data passed in: {data}")
if (user_model is None):
raise ValueError("Proxy model needs to be set")
data["model"] = user_model
@@ -47,12 +66,23 @@ async def completion(request: Request):
@app.post("/chat/completions")
async def chat_completion(request: Request):
data = await request.json()
print_verbose(f"data passed in: {data}")
if (user_model is None):
raise ValueError("Proxy model needs to be set")
data["model"] = user_model
# override with user settings
if user_temperature:
data["temperature"] = user_temperature
if user_max_tokens:
data["max_tokens"] = user_max_tokens
if user_api_base:
data["api_base"] = user_api_base
response = litellm.completion(**data)
if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
print("reaches stream")
return StreamingResponse(data_generator(response), media_type='text/event-stream')
print_verbose(f"response: {response}")
return response
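To hit the streaming branch above, a hedged request sketch (`-N` just disables curl's output buffering so the `data: ...` events emitted by `data_generator` print as they arrive):

```shell
curl -N --location 'http://0.0.0.0:8000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
    "messages": [{"role": "user", "content": "what do you know?"}],
    "stream": true
}'
```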

View file

@@ -2876,9 +2876,10 @@ class CustomStreamWrapper:
text = ""
is_finished = False
finish_reason = ""
print_verbose(f"chunk: {chunk}")
if chunk.startswith("data:"):
data_json = json.loads(chunk[5:])
print(f"data json: {data_json}")
print_verbose(f"data json: {data_json}")
if "token" in data_json and "text" in data_json["token"]:
text = data_json["token"]["text"]
if "meta-llama/Llama-2" in self.model: #clean eos tokens like </s> from the returned output text