chore: unpublish /inference/chat-completion (#3609)

# What does this PR do?

BREAKING CHANGE: removes /inference/chat-completion route and updates
relevant documentation

## Test Plan

🤷
This commit is contained in:
Matthew Farrellee 2025-09-30 14:00:42 -04:00 committed by GitHub
parent 62e302613f
commit cb33f45c11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 1448 additions and 2137 deletions

View file

@@ -687,23 +687,17 @@
"metadata": {},
"outputs": [],
"source": [
"completion = client.inference.chat_completion(\n",
" model_id=CUSTOMIZED_MODEL,\n",
"completion = client.chat.completions.create(\n",
" model=CUSTOMIZED_MODEL,\n",
" messages=test_sample[\"messages\"],\n",
" tools=test_sample[\"tools\"],\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
" sampling_params={\n",
" \"max_tokens\": 512,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"temperature\": 0.1,\n",
" \"top_p\": 0.7,\n",
" }\n",
" },\n",
" max_tokens=512,\n",
" temperature=0.1,\n",
")\n",
"\n",
"completion.completion_message.tool_calls"
"completion.choices[0].message.tool_calls"
]
},
{

View file

@@ -423,42 +423,30 @@
" violation = self.check_guardrails(user_message.get(\"content\"))\n",
" \n",
" if violation is None:\n",
" completion = client.inference.chat_completion(\n",
" model_id=self.customized_model,\n",
" completion = client.chat.completions.create(\n",
" model=self.customized_model,\n",
" messages=[user_message],\n",
" tools=tools,\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
" sampling_params={\n",
" \"max_tokens\": 1024,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"top_p\": 0.7,\n",
" \"temperature\": 0.2\n",
" }\n",
" }\n",
" max_tokens=1024,\n",
" temperature=0.2,\n",
" )\n",
" return completion.completion_message\n",
" return completion.choices[0].message.content\n",
" else:\n",
" return f\"Not a safe input, the guardrails has resulted in a violation: {violation}. Tool-calling shall not happen\"\n",
" \n",
" elif self.guardrails == \"OFF\":\n",
" completion = client.inference.chat_completion(\n",
" model_id=self.customized_model,\n",
" completion = client.chat.completions.create(\n",
" model=self.customized_model,\n",
" messages=[user_message],\n",
" tools=tools,\n",
" tool_choice=\"auto\",\n",
" stream=False,\n",
" sampling_params={\n",
" \"max_tokens\": 1024,\n",
" \"strategy\": {\n",
" \"type\": \"top_p\",\n",
" \"top_p\": 0.7,\n",
" \"temperature\": 0.2\n",
" }\n",
" }\n",
" max_tokens=1024,\n",
" temperature=0.2,\n",
" )\n",
" return completion.completion_message"
" return completion.choices[0].message.content"
]
},
{