## LiteLLM HuggingFace
Docs for huggingface: https://docs.litellm.ai/docs/providers/huggingface

In [None]:
!pip install litellm

## HuggingFace TGI Model - Deployed Inference Endpoints
Steps to use:
* Set `api_base` to the URL of your deployed Inference Endpoint
* Add the `huggingface/` prefix to your model name so litellm knows it's a Hugging Face Deployed Inference Endpoint

In [9]:
import os
import litellm

# API key placeholder: replace the empty string with your own Hugging Face key before running.
os.environ["HUGGINGFACE_API_KEY"] = ""

# TGI example, calling https://huggingface.co/glaiveai/glaive-coder-7b
#   * the 'huggingface/' prefix on the model name selects Hugging Face as the provider
#   * api_base points at your own deployed Hugging Face endpoint
chat_messages = [{"content": "Hello, how are you?", "role": "user"}]

response = litellm.completion(
    api_base="https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud",
    model="huggingface/glaiveai/glaive-coder-7b",
    messages=chat_messages,
)
print(response)

{
  "object": "chat.completion",
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "message": {
        "content": "\n\nI am doing well, thank you for asking. How about you?\nI am doing",
        "role": "assistant",
        "logprobs": -8.9481967812
      }
    }
  ],
  "id": "chatcmpl-74dc9d89-3916-47ce-9bea-b80e66660f77",
  "created": 1695871068.8413374,
  "model": "glaiveai/glaive-coder-7b",
  "usage": {
    "prompt_tokens": 6,
    "completion_tokens": 18,
    "total_tokens": 24
  }
}


## HuggingFace Non TGI/Non Conversational Model - Deployed Inference Endpoints
* set `api_base` to your deployed api base
* Add the `huggingface/` prefix to your model so litellm knows it's a huggingface Deployed Inference Endpoint

In [6]:
import os
import litellm

# API key placeholder: replace the empty string with your own Hugging Face key before running.
os.environ["HUGGINGFACE_API_KEY"] = ""

# Model used here: https://huggingface.co/roneneldan/TinyStories-3M
#   * the 'huggingface/' prefix on the model name selects Hugging Face as the provider
#   * api_base points at your own deployed Hugging Face endpoint
request_kwargs = {
    "model": "huggingface/roneneldan/TinyStories-3M",
    "messages": [{"content": "Hello, how are you?", "role": "user"}],
    "api_base": "https://p69xlsj6rpno5drq.us-east-1.aws.endpoints.huggingface.cloud",
}
response = litellm.completion(**request_kwargs)
print(response)


{
  "object": "chat.completion",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Hello, how are you? I have a surprise for you. I have a surprise for you.",
        "role": "assistant",
        "logprobs": null
      }
    }
  ],
  "id": "chatcmpl-6035abd6-7753-4a7d-ba0a-8193522e23cf",
  "created": 1695871015.0468287,
  "model": "roneneldan/TinyStories-3M",
  "usage": {
    "prompt_tokens": 6,
    "completion_tokens": 20,
    "total_tokens": 26
  }
}


## Hugging Face Free Inference API
When `api_base` is not set, litellm defaults to sending requests to https://api-inference.huggingface.co/models/

To call Hugging Face Inference API LLMs with litellm:
* Copy the model name from Hugging Face
* Set `model = "huggingface/<model-name>"`

Example: set `model="huggingface/bigcode/starcoder"` to call `bigcode/starcoder`.

https://huggingface.co/bigcode/starcoder

In [None]:
import os
import litellm

# API key placeholder: replace the empty string with your own Hugging Face key before running.
os.environ["HUGGINGFACE_API_KEY"] = ""

# No api_base is passed in these calls. The 'huggingface/' prefix on each
# model name sets Hugging Face as the provider.
# Models called, in order:
#   https://huggingface.co/bigcode/starcoder
#   https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf
for model_name in (
    "huggingface/bigcode/starcoder",
    "huggingface/codellama/CodeLlama-34b-Instruct-hf",
):
    response = litellm.completion(
        model=model_name,
        messages=[{"content": "Hello, how are you?", "role": "user"}],
    )
    print(response)

{
  "object": "chat.completion",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": " I am fine, thank you. And you?')\nprint(result)\n\n# 2",
        "role": "assistant",
        "logprobs": null
      }
    }
  ],
  "id": "chatcmpl-982e4cd0-9779-4108-9f7e-d6cbf9b71516",
  "created": 1695835548.2239568,
  "model": "bigcode/starcoder",
  "usage": {
    "prompt_tokens": 6,
    "completion_tokens": 17,
    "total_tokens": 23
  }
}
{
  "object": "chat.completion",
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Hello! I'm doing well, thank you for asking. It's nice to meet you",
        "role": "assistant",
        "logprobs": null
      }
    }
  ],
  "id": "chatcmpl-6622d64d-e9fc-4a46-9ca7-b2d011f6968c",
  "created": 1695835549.2932954,
  "model": "codellama/CodeLlama-34b-Instruct-hf",
  "usage": {
    "prompt_tokens": 12,
    "completion_tokens": 18,
    "total_tokens":

## HuggingFace - Deployed Inference Endpoints + Streaming
Set `stream=True` in the `completion` call, then iterate over the response to receive it chunk by chunk.

In [None]:
import os
import litellm

# API key placeholder: replace the empty string with your own Hugging Face key before running.
os.environ["HUGGINGFACE_API_KEY"] = ""

# Streaming example, calling https://huggingface.co/glaiveai/glaive-coder-7b
#   * the 'huggingface/' prefix on the model name selects Hugging Face as the provider
#   * api_base points at your own deployed Hugging Face endpoint
#   * stream=True requests a streamed response
stream_kwargs = {
    "model": "huggingface/aws-glaive-coder-7b-0998",
    "messages": [{"content": "Hello, how are you?", "role": "user"}],
    "api_base": "https://wjiegasee9bmqke2.us-east-1.aws.endpoints.huggingface.cloud",
    "stream": True,
}
response = litellm.completion(**stream_kwargs)

# Show the returned object itself first, then each chunk as it is iterated.
print(response)
for chunk in response:
    print(chunk)

<litellm.utils.CustomStreamWrapper object at 0x7d1364efa650>
data json: {'token': {'id': 13, 'text': '\n', 'logprob': -1.4355469, 'special': False}, 'generated_text': None, 'details': None}
{
  "object": "chat.completion.chunk",
  "choices": [
    {
      "finish_reason": null,
      "index": 0,
      "delta": {
        "content": "\n",
        "role": "assistant"
      }
    }
  ],
  "id": "chatcmpl-b581bf7e-e20d-46fd-9ca0-b38870db3f3c",
  "created": 1695837652,
  "model": "aws-glaive-coder-7b-0998",
  "usage": {
    "prompt_tokens": null,
    "completion_tokens": null,
    "total_tokens": null
  }
}
data json: {'token': {'id': 13, 'text': '\n', 'logprob': -1.9277344, 'special': False}, 'generated_text': None, 'details': None}
{
  "object": "chat.completion.chunk",
  "choices": [
    {
      "finish_reason": null,
      "index": 0,
      "delta": {
        "content": "\n"
      }
    }
  ],
  "id": "chatcmpl-49c7b630-ec07-4390-ae22-bbb068ac66aa",
  "created": 1695837653,
  "model": "a