(fix) hf calculating usage non blocking

2025-04-25 18:54:30 +00:00 · 2023-11-03 17:59:34 -07:00 · 2023-11-03 17:59:34 -07:00 · df57e9247a
commit df57e9247a
parent 4e1885734a
1 changed files with 16 additions and 6 deletions
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -333,15 +333,25 @@ def completion(
                        "content"
                    ] = completion_response[0]["generated_text"]   
        ## CALCULATING USAGE
-        prompt_tokens = len(
-            encoding.encode(input_text)
-        )  ##[TODO] use the llama2 tokenizer here
+        prompt_tokens = 0
+        try:
+            prompt_tokens = len(
+                encoding.encode(input_text)
+            )  ##[TODO] use the llama2 tokenizer here
+        except:
+            # This must remain non-blocking: a failure while calculating usage should not prevent the response from being returned.
+            pass
        print_verbose(f'output: {model_response["choices"][0]["message"]}')
        output_text = model_response["choices"][0]["message"].get("content", "")
        if output_text is not None and len(output_text) > 0:
-            completion_tokens = len(
-                encoding.encode(model_response["choices"][0]["message"].get("content", ""))
-            )  ##[TODO] use the llama2 tokenizer here
+            completion_tokens = 0
+            try:
+                completion_tokens = len(
+                    encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+                )  ##[TODO] use the llama2 tokenizer here
+            except:
+                # This must remain non-blocking: a failure while calculating usage should not prevent the response from being returned.
+                pass
        else: 
            completion_tokens = 0