(fix) hf calculating usage non blocking

2025-04-26 03:04:13 +00:00 · 2023-11-03 17:59:34 -07:00 · 2023-11-03 17:59:34 -07:00 · df57e9247a
commit df57e9247a
parent 4e1885734a
1 changed file with 16 additions and 6 deletions
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -333,15 +333,25 @@ def completion(
                        "content"
                    ] = completion_response[0]["generated_text"]   
        ## CALCULATING USAGE
-        prompt_tokens = len(
-            encoding.encode(input_text)
-        )  ##[TODO] use the llama2 tokenizer here
+        prompt_tokens = 0
+        try:
+            prompt_tokens = len(
+                encoding.encode(input_text)
+            )  ##[TODO] use the llama2 tokenizer here
+        except:
+            # this should remain non blocking we should not block a response returning if calculating usage fails
+            pass
        print_verbose(f'output: {model_response["choices"][0]["message"]}')
        output_text = model_response["choices"][0]["message"].get("content", "")
        if output_text is not None and len(output_text) > 0:
-            completion_tokens = len(
-                encoding.encode(model_response["choices"][0]["message"].get("content", ""))
-            )  ##[TODO] use the llama2 tokenizer here
+            completion_tokens = 0
+            try:
+                completion_tokens = len(
+                    encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+                )  ##[TODO] use the llama2 tokenizer here
+            except:
+                # this should remain non blocking we should not block a response returning if calculating usage fails
+                pass
        else: 
            completion_tokens = 0