diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md
index 6e399e6ce..e845c3c48 100644
--- a/docs/source/distributions/self_hosted_distro/nvidia.md
+++ b/docs/source/distributions/self_hosted_distro/nvidia.md
@@ -157,7 +157,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
diff --git a/llama_stack/distributions/nvidia/doc_template.md b/llama_stack/distributions/nvidia/doc_template.md
index 3884e6b51..56e99e523 100644
--- a/llama_stack/distributions/nvidia/doc_template.md
+++ b/llama_stack/distributions/nvidia/doc_template.md
@@ -129,7 +129,7 @@ docker run \
 If you've set up your local development environment, you can also build the image using your local virtual environment.
 
 ```bash
-INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 llama stack build --distro nvidia --image-type venv
 llama stack run ./run.yaml \
   --port 8321 \
diff --git a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
index 2505718e0..4a072215c 100644
--- a/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
+++ b/llama_stack/providers/remote/inference/nvidia/NVIDIA.md
@@ -42,8 +42,8 @@ client.initialize()
 ### Create Completion
 
 ```python
-response = client.completion(
-    model_id="meta-llama/Llama-3.1-8b-Instruct",
+response = client.inference.completion(
+    model_id="meta-llama/Llama-3.1-8B-Instruct",
     content="Complete the sentence using one word: Roses are red, violets are :",
     stream=False,
     sampling_params={
@@ -56,8 +56,8 @@ print(f"Response: {response.content}")
 ### Create Chat Completion
 
 ```python
-response = client.chat_completion(
-    model_id="meta-llama/Llama-3.1-8b-Instruct",
+response = client.inference.chat_completion(
+    model_id="meta-llama/Llama-3.1-8B-Instruct",
     messages=[
         {
             "role": "system",
@@ -78,8 +78,10 @@ print(f"Response: {response.completion_message.content}")
 ### Create Embeddings
 
 ```python
-response = client.embeddings(
-    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
+response = client.inference.embeddings(
+    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
+    contents=["What is the capital of France?"],
+    task_type="query",
 )
 print(f"Embeddings: {response.embeddings}")
-```
+```
\ No newline at end of file
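
To sanity-check the patched snippets end to end, here is a minimal sketch that strings the corrected calls together. The method names, model ids, and arguments are taken verbatim from the hunks above; the `LlamaStackClient` constructor and the `localhost:8321` base URL are illustrative assumptions (they mirror the `--port 8321` run command), and a stack with the NVIDIA distribution must already be serving at that address.

```python
# Minimal sketch of the post-patch API, assuming llama_stack_client is
# installed and a Llama Stack server is running on localhost:8321.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed endpoint

# Chat completion via the namespaced client.inference API, using the
# corrected model id (capital "B" in 8B).
response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You must respond in one word."},
        {"role": "user", "content": "What color are roses?"},
    ],
    stream=False,
)
print(f"Response: {response.completion_message.content}")

# Embeddings now target a dedicated embedding model rather than the
# instruct model, and pass a task_type hint for retrieval queries.
embeddings = client.inference.embeddings(
    model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",
    contents=["What is the capital of France?"],
    task_type="query",
)
print(f"Embeddings: {embeddings.embeddings}")
```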