feat(proxy_server.py): enable infinite retries on rate limited requests

2023-12-15 20:03:41 -08:00 · 2023-12-15 20:03:41 -08:00 · 4791dda66f
commit 4791dda66f
parent e5268fa6bc
7 changed files with 255 additions and 186 deletions
--- a/.gitignore
+++ b/.gitignore
@ -24,3 +24,4 @@ litellm/tests/test_custom_logger.py
 litellm/tests/langfuse.log
 litellm/tests/dynamo*.log
 .vscode/settings.json
 litellm/proxy/log.txt
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@ -182,17 +182,11 @@ def ollama_completion_stream(url, data):
                    traceback.print_exc()
    session.close()
 async def iter_lines(reader):
    buffer = b""
    async for chunk in reader.iter_any():
        buffer += chunk
        while b'\n' in buffer:
            line, buffer = buffer.split(b'\n', 1)
            yield line
 async def ollama_async_streaming(url, data, model_response, encoding, logging_obj):
    try:
-        with httpx.stream(
+        client = httpx.AsyncClient()
        async with client.stream(
                    url=f"{url}",
                    json=data,
                    method="POST",
@ -201,8 +195,8 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
                    if response.status_code != 200:
                        raise OllamaError(status_code=response.status_code, message=response.text) 
-                    streamwrapper = litellm.CustomStreamWrapper(completion_stream=response.iter_lines(), model=data['model'], custom_llm_provider="ollama",logging_obj=logging_obj)
+                    streamwrapper = litellm.CustomStreamWrapper(completion_stream=response.aiter_lines(), model=data['model'], custom_llm_provider="ollama",logging_obj=logging_obj)
-                    for transformed_chunk in streamwrapper:
+                    async for transformed_chunk in streamwrapper:
                        yield transformed_chunk
    except Exception as e:
        traceback.print_exc()
--- a/litellm/proxy/hooks/parallel_request_limiter.py
+++ b/litellm/proxy/hooks/parallel_request_limiter.py
@ -16,6 +16,7 @@ class MaxParallelRequestsHandler(CustomLogger):
    async def async_pre_call_hook(self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: str): 
        self.print_verbose(f"Inside Max Parallel Request Pre-Call Hook")
        api_key = user_api_key_dict.api_key
        max_parallel_requests = user_api_key_dict.max_parallel_requests
@ -61,6 +62,7 @@ class MaxParallelRequestsHandler(CustomLogger):
    async def async_log_failure_call(self, user_api_key_dict: UserAPIKeyAuth, original_exception: Exception):
        try:
            self.print_verbose(f"Inside Max Parallel Request Failure Hook")
            api_key = user_api_key_dict.api_key
            if api_key is None:
                return
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -195,8 +195,10 @@ prisma_client: Optional[PrismaClient] = None
 user_api_key_cache = DualCache()
 user_custom_auth = None
 use_background_health_checks = None
 use_queue = False
 health_check_interval = None
 health_check_results = {}
 queue: List = []
 ### INITIALIZE GLOBAL LOGGING OBJECT ###
 proxy_logging_obj = ProxyLogging(user_api_key_cache=user_api_key_cache)
 ### REDIS QUEUE ### 
@ -324,17 +326,6 @@ def prisma_setup(database_url: Optional[str]):
        except Exception as e:
            print("Error when initializing prisma, Ensure you run pip install prisma", e)
 def celery_setup(use_queue: bool): 
    global celery_fn, celery_app_conn, async_result
    if use_queue:
        from litellm.proxy.queue.celery_worker import start_worker
        from litellm.proxy.queue.celery_app import celery_app, process_job
        from celery.result import AsyncResult
        start_worker(os.getcwd())
        celery_fn = process_job
        async_result = AsyncResult
        celery_app_conn = celery_app
 def load_from_azure_key_vault(use_azure_key_vault: bool = False): 
    if use_azure_key_vault is False:
        return
@ -450,7 +441,7 @@ async def _run_background_health_check():
        await asyncio.sleep(health_check_interval)
 def load_router_config(router: Optional[litellm.Router], config_file_path: str):
-    global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, use_background_health_checks, health_check_interval
+    global master_key, user_config_file_path, otel_logging, user_custom_auth, user_custom_auth_path, use_background_health_checks, health_check_interval, use_queue
    config = {}
    try: 
        if os.path.exists(config_file_path):
@ -566,7 +557,6 @@ def load_router_config(router: Optional[litellm.Router], config_file_path: str):
        cost_tracking()
        ### START REDIS QUEUE ###
        use_queue = general_settings.get("use_queue", False)
        celery_setup(use_queue=use_queue)
        ### MASTER KEY ###
        master_key = general_settings.get("master_key", None)
        if master_key and master_key.startswith("os.environ/"): 
@ -757,8 +747,6 @@ def initialize(
    if max_budget:  # litellm-specific param
        litellm.max_budget = max_budget
        dynamic_config["general"]["max_budget"] = max_budget
    if use_queue: 
        celery_setup(use_queue=use_queue)
    if experimental: 
        pass
    user_telemetry = telemetry
@ -815,9 +803,11 @@ def get_litellm_model_info(model: dict = {}):
@router.on_event("startup")
 async def startup_event():
-    global prisma_client, master_key, use_background_health_checks
+    global prisma_client, master_key, use_background_health_checks, use_queue
    import json
    print(f"VALUE OF USE_QUEUE: {use_queue}")
    ### LOAD CONFIG ### 
    worker_config = litellm.get_secret("WORKER_CONFIG")
    print_verbose(f"worker_config: {worker_config}")
@ -841,6 +831,7 @@ async def startup_event():
        # add master key to db
        await generate_key_helper_fn(duration=None, models=[], aliases={}, config={}, spend=0, token=master_key)
    print("END OF STARTUP")
 #### API ENDPOINTS ####
@router.get("/v1/models", dependencies=[Depends(user_api_key_auth)])
@ -1361,46 +1352,107 @@ async def delete_model(model_info: ModelInfoDelete):
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")
 #### EXPERIMENTAL QUEUING #### 
-@router.post("/queue/request", dependencies=[Depends(user_api_key_auth)])
+async def _litellm_chat_completions_worker(data, user_api_key_dict):
-async def async_queue_request(request: Request): 
+    """
-    global celery_fn, llm_model_list
+    worker to make litellm completions calls 
-    if celery_fn is not None: 
+    """
-        body = await request.body()
+    while True: 
        body_str = body.decode()
        try: 
-            data = ast.literal_eval(body_str)
+            ### CALL HOOKS ### - modify incoming data before calling the model
-        except: 
+            data = await proxy_logging_obj.pre_call_hook(user_api_key_dict=user_api_key_dict, data=data, call_type="completion")
-            data = json.loads(body_str)
+
            print(f"_litellm_chat_completions_worker started")
            ### ROUTE THE REQUEST ###
            router_model_names = [m["model_name"] for m in llm_model_list] if llm_model_list is not None else []
            if llm_router is not None and data["model"] in router_model_names: # model in router model list 
                    response = await llm_router.acompletion(**data)
            elif llm_router is not None and data["model"] in llm_router.deployment_names: # model in router deployments, calling a specific deployment on the router
                response = await llm_router.acompletion(**data, specific_deployment = True)
            elif llm_router is not None and llm_router.model_group_alias is not None and data["model"] in llm_router.model_group_alias: # model set in model_group_alias
                response = await llm_router.acompletion(**data)
            else: # router is not set
                response = await litellm.acompletion(**data)
            print(f"final response: {response}")
            return response
        except HTTPException as e: 
            print(f"EXCEPTION RAISED IN _litellm_chat_completions_worker - {e.status_code}; {e.detail}")
            if e.status_code == 429 and "Max parallel request limit reached" in e.detail:
                print(f"Max parallel request limit reached!")
                timeout = litellm._calculate_retry_after(remaining_retries=3, max_retries=3, min_timeout=1)
                await asyncio.sleep(timeout)
            else: 
                raise e 
@router.post("/queue/chat/completions", tags=["experimental"], dependencies=[Depends(user_api_key_auth)])
 async def async_queue_request(request: Request, model: Optional[str] = None, user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), background_tasks: BackgroundTasks = BackgroundTasks()):
    global general_settings, user_debug, proxy_logging_obj 
    """
    v2 attempt at a background worker to handle queuing. 
    Just supports /chat/completion calls currently.
    Now using a FastAPI background task + /chat/completions compatible endpoint
    """
    try: 
        data = {}
        data = await request.json() # type: ignore 
        # Include original request and headers in the data
        data["proxy_server_request"] = {
            "url": str(request.url),
            "method": request.method,
            "headers": dict(request.headers),
            "body": copy.copy(data)  # use copy instead of deepcopy
        }
        print_verbose(f"receiving data: {data}")
        data["model"] = (
            general_settings.get("completion_model", None) # server default
            or user_model # model name passed via cli args
            or model # for azure deployments
            or data["model"] # default passed in http request
        )
-        data["llm_model_list"] = llm_model_list
+
-        print(f"data: {data}")
+        # users can pass in 'user' param to /chat/completions. Don't override it
-        job = celery_fn.apply_async(kwargs=data)
+        if data.get("user", None) is None and user_api_key_dict.user_id is not None:
-        return {"id": job.id, "url": f"/queue/response/{job.id}", "eta": 5, "status": "queued"}
+            # if users are using user_api_key_auth, set `user` in `data`
            data["user"] = user_api_key_dict.user_id
        if "metadata" in data:
            print(f'received metadata: {data["metadata"]}')
            data["metadata"]["user_api_key"] = user_api_key_dict.api_key
            data["metadata"]["headers"] = dict(request.headers)
        else:
            data["metadata"] = {"user_api_key": user_api_key_dict.api_key}
            data["metadata"]["headers"] = dict(request.headers)
        global user_temperature, user_request_timeout, user_max_tokens, user_api_base
        # override with user settings, these are params passed via cli
        if user_temperature: 
            data["temperature"] = user_temperature
        if user_request_timeout:
            data["request_timeout"] = user_request_timeout
        if user_max_tokens: 
            data["max_tokens"] = user_max_tokens
        if user_api_base: 
            data["api_base"] = user_api_base
        response = await asyncio.wait_for(_litellm_chat_completions_worker(data=data, user_api_key_dict=user_api_key_dict), timeout=litellm.request_timeout)
        if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses
            return StreamingResponse(async_data_generator(user_api_key_dict=user_api_key_dict, response=response), media_type='text/event-stream')
        background_tasks.add_task(log_input_output, request, response) # background task for logging to OTEL 
        return response
    except Exception as e: 
        await proxy_logging_obj.post_call_failure_hook(user_api_key_dict=user_api_key_dict, original_exception=e) 
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
-            detail={"error": "Queue not initialized"},
+            detail={"error": str(e)},
        )
@router.get("/queue/response/{task_id}", dependencies=[Depends(user_api_key_auth)])
 async def async_queue_response(request: Request, task_id: str): 
    global celery_app_conn, async_result
    try: 
        if celery_app_conn is not None and async_result is not None: 
            job = async_result(task_id, app=celery_app_conn) 
            if job.ready():
                return {"status": "finished", "result": job.result}
            else:
                return {'status': 'queued'}
        else:
            raise Exception()
    except Exception as e: 
        return {"status": "finished", "result": str(e)}
@router.get("/ollama_logs", dependencies=[Depends(user_api_key_auth)])
 async def retrieve_server_log(request: Request):
--- a/litellm/tests/test_ollama_local.py
+++ b/litellm/tests/test_ollama_local.py
@ -1,158 +1,171 @@
-# ##### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
+##### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
-# # https://ollama.ai/
+# https://ollama.ai/
-# import sys, os
+import sys, os
-# import traceback
+import traceback
-# from dotenv import load_dotenv
+from dotenv import load_dotenv
-# load_dotenv()
+load_dotenv()
-# import os
+import os
-# sys.path.insert(0, os.path.abspath('../..'))  # Adds the parent directory to the system path
+sys.path.insert(0, os.path.abspath('../..'))  # Adds the parent directory to the system path
-# import pytest
+import pytest
-# import litellm
+import litellm
-# from litellm import embedding, completion
+from litellm import embedding, completion
-# import asyncio
+import asyncio
-# user_message = "respond in 20 words. who are you?"
+user_message = "respond in 20 words. who are you?"
-# messages = [{ "content": user_message,"role": "user"}]
+messages = [{ "content": user_message,"role": "user"}]
-# def test_completion_ollama():
+async def test_async_ollama_streaming():
-#     try:
+    try: 
-#         response = completion(
+        litellm.set_verbose = True
-#             model="ollama/llama2", 
+        response = await litellm.acompletion(model="ollama/mistral-openorca", 
-#             messages=messages, 
+                                             messages=[{"role": "user", "content": "Hey, how's it going?"}],
-#             max_tokens=200,
+                                             stream=True)
-#             request_timeout = 10,
+        async for chunk in response: 
            print(chunk)
    except Exception as e: 
        print(e)
-#         )
+asyncio.run(test_async_ollama_streaming())
-#         print(response)
+
-#     except Exception as e:
+def test_completion_ollama():
-#         pytest.fail(f"Error occurred: {e}")
+    try:
        response = completion(
            model="ollama/llama2", 
            messages=messages, 
            max_tokens=200,
            request_timeout = 10,
        )
        print(response)
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
 # test_completion_ollama()
-# def test_completion_ollama_with_api_base():
+def test_completion_ollama_with_api_base():
-#     try:
+    try:
-#         response = completion(
+        response = completion(
-#             model="ollama/llama2", 
+            model="ollama/llama2", 
-#             messages=messages, 
+            messages=messages, 
-#             api_base="http://localhost:11434"
+            api_base="http://localhost:11434"
-#         )
+        )
-#         print(response)
+        print(response)
-#     except Exception as e:
+    except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
+        pytest.fail(f"Error occurred: {e}")
 # test_completion_ollama_with_api_base()
-# def test_completion_ollama_custom_prompt_template():
+def test_completion_ollama_custom_prompt_template():
-#     user_message = "what is litellm?"
+    user_message = "what is litellm?"
-#     litellm.register_prompt_template(
+    litellm.register_prompt_template(
-#         model="ollama/llama2",
+        model="ollama/llama2",
-#         roles={
+        roles={
-#             "system": {"pre_message": "System: "},
+            "system": {"pre_message": "System: "},
-#             "user": {"pre_message": "User: "},
+            "user": {"pre_message": "User: "},
-#             "assistant": {"pre_message": "Assistant: "}
+            "assistant": {"pre_message": "Assistant: "}
-#         }
+        }
-#     )
+    )
-#     messages = [{ "content": user_message,"role": "user"}]
+    messages = [{ "content": user_message,"role": "user"}]
-#     litellm.set_verbose = True
+    litellm.set_verbose = True
-#     try:
+    try:
-#         response = completion(
+        response = completion(
-#             model="ollama/llama2", 
+            model="ollama/llama2", 
-#             messages=messages, 
+            messages=messages, 
-#             stream=True
+            stream=True
-#         )
+        )
-#         print(response)
+        print(response)
-#         for chunk in response:
+        for chunk in response:
-#             print(chunk)
+            print(chunk)
-#             # print(chunk['choices'][0]['delta'])
+            # print(chunk['choices'][0]['delta'])
-#     except Exception as e:
+    except Exception as e:
-#         traceback.print_exc()
+        traceback.print_exc()
-#         pytest.fail(f"Error occurred: {e}")
+        pytest.fail(f"Error occurred: {e}")
 # test_completion_ollama_custom_prompt_template()
-# async def test_completion_ollama_async_stream():
+async def test_completion_ollama_async_stream():
-#     user_message = "what is the weather"
+    user_message = "what is the weather"
-#     messages = [{ "content": user_message,"role": "user"}]
+    messages = [{ "content": user_message,"role": "user"}]
-#     try:
+    try:
-#         response = await litellm.acompletion(
+        response = await litellm.acompletion(
-#             model="ollama/llama2", 
+            model="ollama/llama2", 
-#             messages=messages, 
+            messages=messages, 
-#             api_base="http://localhost:11434", 
+            api_base="http://localhost:11434", 
-#             stream=True
+            stream=True
-#         )
+        )
-#         async for chunk in response:
+        async for chunk in response:
-#             print(chunk['choices'][0]['delta'])
+            print(chunk['choices'][0]['delta'])
-#         print("TEST ASYNC NON Stream")
+        print("TEST ASYNC NON Stream")
-#         response = await litellm.acompletion(
+        response = await litellm.acompletion(
-#             model="ollama/llama2", 
+            model="ollama/llama2", 
-#             messages=messages, 
+            messages=messages, 
-#             api_base="http://localhost:11434", 
+            api_base="http://localhost:11434", 
-#         )
+        )
-#         print(response)
+        print(response)
-#     except Exception as e:
+    except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
+        pytest.fail(f"Error occurred: {e}")
 # import asyncio
 # asyncio.run(test_completion_ollama_async_stream())
-# def prepare_messages_for_chat(text: str) -> list:
+def prepare_messages_for_chat(text: str) -> list:
-#     messages = [
+    messages = [
-#         {"role": "user", "content": text},
+        {"role": "user", "content": text},
-#     ]
+    ]
-#     return messages
+    return messages
-# async def ask_question():
+async def ask_question():
-#     params = {
+    params = {
-#         "messages": prepare_messages_for_chat("What is litellm? tell me 10 things about it who is sihaan.write an essay"),
+        "messages": prepare_messages_for_chat("What is litellm? tell me 10 things about it who is sihaan.write an essay"),
-#         "api_base": "http://localhost:11434",
+        "api_base": "http://localhost:11434",
-#         "model": "ollama/llama2",
+        "model": "ollama/llama2",
-#         "stream": True,
+        "stream": True,
-#     }
+    }
-#     response = await litellm.acompletion(**params)
+    response = await litellm.acompletion(**params)
-#     return response
+    return response
-# async def main():
+async def main():
-#     response = await ask_question()
+    response = await ask_question()
-#     async for chunk in response:
+    async for chunk in response:
-#         print(chunk)
+        print(chunk)
-#     print("test async completion without streaming")
+    print("test async completion without streaming")
-#     response = await litellm.acompletion(
+    response = await litellm.acompletion(
-#         model="ollama/llama2",
+        model="ollama/llama2",
-#         messages=prepare_messages_for_chat("What is litellm? respond in 2 words"),
+        messages=prepare_messages_for_chat("What is litellm? respond in 2 words"),
-#     )
+    )
-#     print("response", response)
+    print("response", response)
-# def test_completion_expect_error():
+def test_completion_expect_error():
-#     # this tests if we can exception map correctly for ollama
+    # this tests if we can exception map correctly for ollama
-#     print("making ollama request")
+    print("making ollama request")
-#     # litellm.set_verbose=True
+    # litellm.set_verbose=True
-#     user_message = "what is litellm?"
+    user_message = "what is litellm?"
-#     messages = [{ "content": user_message,"role": "user"}]
+    messages = [{ "content": user_message,"role": "user"}]
-#     try:
+    try:
-#         response = completion(
+        response = completion(
-#             model="ollama/invalid", 
+            model="ollama/invalid", 
-#             messages=messages, 
+            messages=messages, 
-#             stream=True
+            stream=True
-#         )
+        )
-#         print(response)
+        print(response)
-#         for chunk in response:
+        for chunk in response:
-#             print(chunk)
+            print(chunk)
-#             # print(chunk['choices'][0]['delta'])
+            # print(chunk['choices'][0]['delta'])
-#     except Exception as e:
+    except Exception as e:
-#         pass
+        pass
-#         pytest.fail(f"Error occurred: {e}")
+        pytest.fail(f"Error occurred: {e}")
 # test_completion_expect_error()
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -1079,7 +1079,8 @@ class Logging:
                # print_verbose(f"final set of received chunks: {self.streaming_chunks}")
                try: 
                    complete_streaming_response = litellm.stream_chunk_builder(self.streaming_chunks, messages=self.model_call_details.get("messages", None))
-                except: 
+                except Exception as e:
                    print_verbose(f"Error occurred building stream chunk: {traceback.format_exc()}") 
                    complete_streaming_response = None
            else:
                self.streaming_chunks.append(result)
@ -5953,14 +5954,20 @@ class CustomStreamWrapper:
                or self.custom_llm_provider == "custom_openai"
                or self.custom_llm_provider == "text-completion-openai"
                or self.custom_llm_provider == "huggingface"
                or self.custom_llm_provider == "ollama"
                or self.custom_llm_provider == "vertex_ai"):
                print_verbose(f"INSIDE ASYNC STREAMING!!!")
                print_verbose(f"value of async completion stream: {self.completion_stream}")
                async for chunk in self.completion_stream:
                    print_verbose(f"value of async chunk: {chunk}")
                    if chunk == "None" or chunk is None:
                        raise Exception
                    # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. 
                    # __anext__ also calls async_success_handler, which does logging
                    print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}")
                    processed_chunk = self.chunk_creator(chunk=chunk) 
                    print_verbose(f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}")
                    if processed_chunk is None: 
                        continue
                    ## LOGGING
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.14.9"
+version = "1.14.10"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
@ -55,7 +55,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"
 [tool.commitizen]
-version = "1.14.9"
+version = "1.14.10"
 version_files = [
    "pyproject.toml:^version"
 ]