## LiteLLM Hugging Face

LiteLLM docs for the Hugging Face provider: https://docs.litellm.ai/docs/providers/huggingface


In [None]:
!pip install litellm

## Serverless Inference Providers

Read more about Inference Providers here: https://huggingface.co/blog/inference-providers.

In order to use litellm with Hugging Face Inference Providers, you need to set `model=huggingface/<provider>/<model_id>`.

Example: `huggingface/together/deepseek-ai/DeepSeek-R1` to run DeepSeek-R1 (https://huggingface.co/deepseek-ai/DeepSeek-R1) through Together AI.


In [None]:
import os

from litellm import completion

# Replace the placeholder with your own token; one can be created at
# https://huggingface.co/settings/tokens
os.environ["HF_TOKEN"] = "hf_xxxxxx"

# Single-turn conversation sent to the model.
messages = [{"content": "How many r's are in the word `strawberry`?", "role": "user"}]

# The model id names the provider (together) followed by the model repo
# (deepseek-ai/DeepSeek-R1), i.e. DeepSeek-R1 served through Together AI.
response = completion(
    model="huggingface/together/deepseek-ai/DeepSeek-R1",
    messages=messages,
)
print(response)

## Streaming


In [None]:
import os

from litellm import completion

# Replace the placeholder with your own Hugging Face token.
os.environ["HF_TOKEN"] = "hf_xxxxxx"

messages = [{"role": "user", "content": "How many r's are in the word `strawberry`?"}]

# Same request as a regular completion, but with stream=True; the result is
# then iterated and printed one chunk at a time.
response = completion(
    model="huggingface/together/deepseek-ai/DeepSeek-R1",
    messages=messages,
    stream=True,
)

for chunk in response:
    print(chunk)

## With images as input


In [None]:
# `os` is imported here so the cell runs on its own (it sets HF_TOKEN below)
# instead of depending on an earlier cell having imported it.
import os

from litellm import completion

# Set your Hugging Face Token
os.environ["HF_TOKEN"] = "hf_xxxxxx"

# One user turn whose content combines a text part with an image part
# referenced by URL.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
                },
            },
        ],
    }
]

# NOTE(review): the selected provider/model pair must accept image input —
# confirm that this model supports vision before relying on the example.
response = completion(
    model="huggingface/sambanova/meta-llama/Llama-3.3-70B-Instruct",
    messages=messages,
)

# Show only the first choice rather than the whole response object.
print(response.choices[0])

## Tools - Function Calling


In [None]:
import os

from litellm import completion

# Set your Hugging Face Token
os.environ["HF_TOKEN"] = "hf_xxxxxx"

# JSON-schema style description of the single function offered to the model.
# `location` is the only required argument; `unit` is restricted to two values.
get_current_weather_spec = {
    "name": "get_current_weather",
    "description": "Get the current weather in a given location",
    "parameters": {
        "type": "object",
        "properties": {
            "location": {
                "type": "string",
                "description": "The city and state, e.g. San Francisco, CA",
            },
            "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
        },
        "required": ["location"],
    },
}

tools = [{"type": "function", "function": get_current_weather_spec}]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]

# The tools are passed alongside the conversation, with tool_choice="auto".
response = completion(
    model="huggingface/sambanova/meta-llama/Llama-3.1-8B-Instruct",
    messages=messages,
    tools=tools,
    tool_choice="auto",
)
print(response)

## Hugging Face Dedicated Inference Endpoints

Steps to use

- Create your own Hugging Face dedicated endpoint here: https://ui.endpoints.huggingface.co/
- Set `api_base` to the base URL of your deployed endpoint.
- Set the model to `huggingface/tgi` so that litellm knows it's a Hugging Face dedicated Inference Endpoint.


In [None]:
import os

import litellm

# Base URL of the dedicated endpoint; replace with your own deployment's URL.
endpoint_url = "https://my-endpoint.endpoints.huggingface.cloud/v1/"

# NOTE(review): unlike the other cells, this one does not set HF_TOKEN itself —
# presumably it relies on the environment or an earlier cell; confirm.
response = litellm.completion(
    # "huggingface/tgi" marks the target as a Hugging Face dedicated endpoint.
    model="huggingface/tgi",
    messages=[{"content": "Hello, how are you?", "role": "user"}],
    api_base=endpoint_url,
)
print(response)