From 541a8b7bc8d3f17dba8ff80374e68c6ebe51981b Mon Sep 17 00:00:00 2001
From: Krrish Dholakia
Date: Mon, 16 Oct 2023 19:42:53 -0700
Subject: [PATCH] fix(proxy_server): improve error handling

---
 litellm/__pycache__/main.cpython-311.pyc  | Bin 53916 -> 53849 bytes
 litellm/__pycache__/utils.cpython-311.pyc | Bin 170437 -> 170871 bytes
 litellm/proxy/llm.py                      | 141 ++++++++++++++++++++++
 litellm/proxy/proxy_server.py             |  59 +--------
 litellm/utils.py                          |  21 +++-
 5 files changed, 166 insertions(+), 55 deletions(-)
 create mode 100644 litellm/proxy/llm.py

diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 0c7d22aa1bc67b650b8d508f90e7daae8ac34355..a153c493047a2ac9ad2786b83f4eff8ea5320a8f 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index ddeb7a4400ded00daa07358bd2042e8a7966a084..6fd96bed601209b6e066679b2bb788f60dbf3d89 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/proxy/llm.py b/litellm/proxy/llm.py
new file mode 100644
index 000000000..878131697
--- /dev/null
+++ b/litellm/proxy/llm.py
@@ -0,0 +1,141 @@
+from typing import Dict, Optional
+from collections import defaultdict
+import threading
+import os, subprocess, traceback, json
+from fastapi import HTTPException
+from fastapi.responses import StreamingResponse
+
+import backoff
+import openai.error + +import litellm +import litellm.exceptions + +cost_dict: Dict[str, Dict[str, float]] = defaultdict(dict) +cost_dict_lock = threading.Lock() + +debug = False +##### HELPER FUNCTIONS ##### +def print_verbose(print_statement): + global debug + if debug: + print(print_statement) + +# for streaming +def data_generator(response): + print_verbose("inside generator") + for chunk in response: + print_verbose(f"returned chunk: {chunk}") + yield f"data: {json.dumps(chunk)}\n\n" + +def run_ollama_serve(): + command = ['ollama', 'serve'] + + with open(os.devnull, 'w') as devnull: + process = subprocess.Popen(command, stdout=devnull, stderr=devnull) + +##### ERROR HANDLING ##### +class RetryConstantError(Exception): + pass + + +class RetryExpoError(Exception): + pass + + +class UnknownLLMError(Exception): + pass + + +def handle_llm_exception(e: Exception, user_api_base: Optional[str]=None): + print(f"\033[1;31mLiteLLM.Exception: {str(e)}\033[0m") + if isinstance(e, openai.error.ServiceUnavailableError) and e.llm_provider == "ollama": + run_ollama_serve() + if isinstance(e, openai.error.InvalidRequestError) and e.llm_provider == "ollama": + completion_call_details = {} + completion_call_details["model"] = e.model + if user_api_base: + completion_call_details["api_base"] = user_api_base + else: + completion_call_details["api_base"] = None + print(f"\033[1;31mLiteLLM.Exception: Invalid API Call. Call details: Model: \033[1;37m{e.model}\033[1;31m; LLM Provider: \033[1;37m{e.llm_provider}\033[1;31m; Custom API Base - \033[1;37m{completion_call_details['api_base']}\033[1;31m\033[0m") + if completion_call_details["api_base"] == "http://localhost:11434": + print() + print("Trying to call ollama? Try `litellm --model ollama/llama2 --api_base http://localhost:11434`") + print() + if isinstance( + e, + ( + openai.error.APIError, + openai.error.TryAgain, + openai.error.Timeout, + openai.error.ServiceUnavailableError, + ), + ): + raise RetryConstantError from e + elif isinstance(e, openai.error.RateLimitError): + raise RetryExpoError from e + elif isinstance( + e, + ( + openai.error.APIConnectionError, + openai.error.InvalidRequestError, + openai.error.AuthenticationError, + openai.error.PermissionError, + openai.error.InvalidAPIType, + openai.error.SignatureVerificationError, + ), + ): + raise e + else: + raise UnknownLLMError from e + + +@backoff.on_exception( + wait_gen=backoff.constant, + exception=RetryConstantError, + max_tries=3, + interval=3, +) +@backoff.on_exception( + wait_gen=backoff.expo, + exception=RetryExpoError, + jitter=backoff.full_jitter, + max_value=100, + factor=1.5, +) + +def litellm_completion(data: Dict, + type: str, + user_model: Optional[str], + user_temperature: Optional[str], + user_max_tokens: Optional[int], + user_api_base: Optional[str], + user_headers: Optional[dict], + user_debug: bool) -> litellm.ModelResponse: + try: + global debug + debug = user_debug + if user_model: + data["model"] = user_model + # override with user settings + if user_temperature: + data["temperature"] = user_temperature + if user_max_tokens: + data["max_tokens"] = user_max_tokens + if user_api_base: + data["api_base"] = user_api_base + if user_headers: + data["headers"] = user_headers + if type == "completion": + response = litellm.text_completion(**data) + elif type == "chat_completion": + response = litellm.completion(**data) + if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses + return StreamingResponse(data_generator(response), 
media_type='text/event-stream') + print_verbose(f"response: {response}") + return response + except Exception as e: + print(e) + handle_llm_exception(e=e, user_api_base=user_api_base) + return {"message": "An error occurred"}, 500 diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 854f27aa3..90a967921 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -23,6 +23,10 @@ except ImportError: import appdirs import tomli_w +try: + from .llm import litellm_completion +except ImportError as e: + from llm import litellm_completion import random list_of_messages = [ @@ -305,14 +309,6 @@ def deploy_proxy(model, api_base, debug, temperature, max_tokens, telemetry, dep return url - -# for streaming -def data_generator(response): - print_verbose("inside generator") - for chunk in response: - print_verbose(f"returned chunk: {chunk}") - yield f"data: {json.dumps(chunk)}\n\n" - def track_cost_callback( kwargs, # kwargs to completion completion_response, # response from completion @@ -433,49 +429,6 @@ litellm.input_callback = [logger] litellm.success_callback = [logger] litellm.failure_callback = [logger] -def litellm_completion(data, type): - try: - if user_model: - data["model"] = user_model - # override with user settings - if user_temperature: - data["temperature"] = user_temperature - if user_max_tokens: - data["max_tokens"] = user_max_tokens - if user_api_base: - data["api_base"] = user_api_base - if user_headers: - data["headers"] = user_headers - if type == "completion": - response = litellm.text_completion(**data) - elif type == "chat_completion": - response = litellm.completion(**data) - if 'stream' in data and data['stream'] == True: # use generate_responses to stream responses - return StreamingResponse(data_generator(response), media_type='text/event-stream') - print_verbose(f"response: {response}") - return response - except Exception as e: - traceback.print_exc() - if "Invalid response object from API" in str(e): - completion_call_details = {} - if user_model: - completion_call_details["model"] = user_model - else: - completion_call_details["model"] = data['model'] - - if user_api_base: - completion_call_details["api_base"] = user_api_base - else: - completion_call_details["api_base"] = None - print(f"\033[1;31mLiteLLM.Exception: Invalid API Call. Call details: Model: \033[1;37m{completion_call_details['model']}\033[1;31m; LLM Provider: \033[1;37m{e.llm_provider}\033[1;31m; Custom API Base - \033[1;37m{completion_call_details['api_base']}\033[1;31m\033[0m") - if completion_call_details["api_base"] == "http://localhost:11434": - print() - print("Trying to call ollama? 
Try `litellm --model ollama/llama2 --api_base http://localhost:11434`") - print() - else: - print(f"\033[1;31mLiteLLM.Exception: {str(e)}\033[0m") - return {"message": "An error occurred"}, 500 - #### API ENDPOINTS #### @router.get("/models") # if project requires model list def model_list(): @@ -494,12 +447,12 @@ def model_list(): @router.post("/completions") async def completion(request: Request): data = await request.json() - return litellm_completion(data=data, type="completion") + return litellm_completion(data=data, type="completion", user_model=user_model, user_temperature=user_temperature, user_max_tokens=user_max_tokens, user_api_base=user_api_base, user_headers=user_headers, user_debug=user_debug) @router.post("/chat/completions") async def chat_completion(request: Request): data = await request.json() - response = litellm_completion(data, type="chat_completion") + response = litellm_completion(data, type="chat_completion", user_model=user_model, user_temperature=user_temperature, user_max_tokens=user_max_tokens, user_api_base=user_api_base, user_headers=user_headers, user_debug=user_debug) return response diff --git a/litellm/utils.py b/litellm/utils.py index 64499381b..478eeda36 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -3092,14 +3092,31 @@ def exception_type( raise original_exception raise original_exception elif custom_llm_provider == "ollama": - error_str = original_exception.get("error", "") + if isinstance(original_exception, dict): + error_str = original_exception.get("error", "") + else: + error_str = str(original_exception) if "no such file or directory" in error_str: exception_mapping_worked = True raise InvalidRequestError( - message=f"Ollama Exception Invalid Model/Model not loaded - {original_exception}", + message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", model=model, llm_provider="ollama" ) + elif "Failed to establish a new connection" in error_str: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model + ) + elif "Invalid response object from API" in error_str: + exception_mapping_worked = True + raise InvalidRequestError( + message=f"OllamaException: {original_exception}", + llm_provider="ollama", + model=model + ) elif custom_llm_provider == "vllm": if hasattr(original_exception, "status_code"): if original_exception.status_code == 0:
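
Note on the retry pattern introduced in litellm/proxy/llm.py: transient upstream failures (APIError, TryAgain, Timeout, ServiceUnavailableError) are re-raised as RetryConstantError and retried on a fixed interval, rate limits are re-raised as RetryExpoError and retried with exponential backoff, and anything else either propagates or becomes UnknownLLMError. Below is a minimal, self-contained sketch of that decorator stack, outside the patch, with a hypothetical flaky_call() standing in for the litellm completion call:

import backoff

class RetryConstantError(Exception):
    pass

class RetryExpoError(Exception):
    pass

@backoff.on_exception(
    wait_gen=backoff.constant,    # retry every `interval` seconds
    exception=RetryConstantError,
    max_tries=3,
    interval=3,
)
@backoff.on_exception(
    wait_gen=backoff.expo,        # exponential backoff (used for rate limits)
    exception=RetryExpoError,
    jitter=backoff.full_jitter,
    max_value=100,
    factor=1.5,
)
def flaky_call():
    # stand-in for litellm.completion(**data); handle_llm_exception() re-raises
    # the provider error as RetryConstantError or RetryExpoError to trigger a retry
    raise RetryConstantError("service unavailable")

Calling flaky_call() here would attempt the request three times, three seconds apart, before re-raising; this is the same behavior the decorators in llm.py give litellm_completion().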