Files
PowerCLI-Example-Scripts/Scripts/PAIF-N/02-install-nvidia-driver-vlcm.ps1
lyuboasenov 062749868d Add PAIF-N automation example (#629)
* Adding PAIF-N demo scripts

* Removing Confidential from headers

* Addressing review comments

---------

Co-authored-by: Lyuboslav Asenov <lasenov@vmware.com>
2024-03-28 15:50:45 +02:00

195 lines
6.8 KiB
PowerShell

<#
# © 2024 Broadcom. All Rights Reserved. Broadcom. The term "Broadcom" refers to
# Broadcom Inc. and/or its subsidiaries.
#>
<#
.SYNOPSIS
This script configures the ESXi host for AI workloads
.DESCRIPTION
This script configures the ESXi host for AI workloads which includes installing the
NVIDIA AI Enterprise vGPU driver and NVIDIA GPU Management Daemon on the ESXi hosts.
vLCM is used for that purpose.
The script changes the default graphics type of the GPU devices to Shared Direct. The Xorg
service is then restarted. Finally, the vLCM is used to install the NVIDIA GPU driver and
management daemon.
.NOTES
Prerequisites:
- VI workload domain (vCenter server instance)
- ESXi hosts with GPUs
Before running the script, update the "Global parameters", "Workload domain parameters" and
"GPU parameters" sections to match the target environment. This may require altering the spec creation script.
#>
# Treat every error as terminating so the script stops at the first failed step
$ErrorActionPreference = 'Stop'
# --------------------------------------------------------------------------------------------------------------------------
# Global parameters
# --------------------------------------------------------------------------------------------------------------------------
# Name of the workload domain - used as a prefix for nested inventory items
$domainName = 'sfo-w01'
# DNS domain suffix; appended to the vCenter host name when the vCenter DnsName is built below
$domain = 'vrack.vsphere.local'
# --------------------------------------------------------------------------------------------------------------------------
# Workload domain parameters - stripped down version of $domainSpec from 01-deploy-vcf-workload-domain.ps1
$domainSpec = @{
    # vCenter server of the workload domain
    VCenterSpec = @{
        # Used further down as the password of administrator@vsphere.local when connecting
        RootPassword       = "VMware123!"
        NetworkDetailsSpec = @{ DnsName = "$DomainName-vc01.$domain" }
    }

    # Clusters whose hosts get the GPU configuration and the vLCM image
    ComputeSpec = @{
        ClusterSpecs = @(
            @{ Name = "$DomainName-cl01" }
        )
    }
}
# --------------------------------------------------------------------------------------------------------------------------
# GPU parameters
# Base URL of the server hosting the NVIDIA offline depot bundles (placeholder - replace with the real one)
$nvidiaDriverLocation = "http://NVIDIA-VGPU-DRIVER-LOCATION/"
# The bundle locations below are built as "<base>/<file>". Drop any trailing slash from the base
# so the resulting URL never contains "//", regardless of how the base URL was written.
$nvidiaDriverLocation = $nvidiaDriverLocation.TrimEnd('/')

$gpuParameters = @{
    # Version of the ESXi base image looked up in vLCM
    EsxiImageName = "8.0 U2b - 23305546"

    # One entry per offline depot to upload:
    #   Location / Description - used when the offline depot is created in vLCM
    #   Name / Version         - used to find the matching vLCM component after the upload
    NVIDIA = @(
        @{
            Location    = "$nvidiaDriverLocation/NVD-AIE-800_550.54.16-1OEM.800.1.0.20613240_23471877.zip"
            Name        = "NVIDIA AI Enterprise vGPU driver for VMWare ESX-8.0.0"
            Version     = "550.54.16"
            Description = 'NVIDIA AI Enterprise vGPU driver for VMWare ESX-8.0.0'
        },
        @{
            Location    = "$nvidiaDriverLocation/nvd-gpu-mgmt-daemon_550.54.16-0.0.0000_23475823.zip"
            Name        = "NVIDIA GPU monitoring and management daemon"
            Version     = "550.54.16 - Build 0000"
            Description = "NVIDIA GPU monitoring and management daemon"
        }
    )

    # Values written into the host graphics configuration of every ESXi host
    GraphicsType                   = 'sharedDirect'
    HostDefaultGraphicsType        = 'sharedDirect'
    SharedPassthruAssignmentPolicy = 'performance'
}
# --------------------------------------------------------------------------------------------------------------------------
# Connect to the VC of the workload domain
# Open the session to the workload domain vCenter; the connection object is kept for the final disconnect
$vcConnectArguments = @{
    Server   = $domainSpec.VCenterSpec.NetworkDetailsSpec.DnsName
    User     = 'administrator@vsphere.local'
    Password = $domainSpec.VCenterSpec.RootPassword
}
$vcConn = Connect-VIServer @vcConnectArguments

# Collect the ESXi hosts of every cluster listed in the domain spec
$esxHosts = $domainSpec.ComputeSpec.ClusterSpecs |
    ForEach-Object {
        Get-VMHost -Location $_.Name
    }
# Prepare the GPU devices of each ESXi host for the vGPU driver
$esxHosts | ForEach-Object {
    $esxHost = $_
    $graphicsManager = Get-View -Id $esxHost.ExtensionData.ConfigManager.GraphicsManager

    # Reconfigure every graphics device whose type differs from the requested one
    $esxHost.ExtensionData.Config.GraphicsInfo |
        Where-Object { $_.GraphicsType -ne $gpuParameters.GraphicsType } |
        ForEach-Object {
            # Per-device entry: which PCI device gets which graphics type
            $deviceEntry = New-Object VMware.Vim.HostGraphicsConfigDeviceType
            $deviceEntry.DeviceId = $_.PciId
            $deviceEntry.GraphicsType = $gpuParameters.GraphicsType

            # Host graphics configuration carrying the single device entry plus the host-wide settings
            $graphicsConfig = New-Object VMware.Vim.HostGraphicsConfig
            $graphicsConfig.DeviceType = [VMware.Vim.HostGraphicsConfigDeviceType[]]@($deviceEntry)
            $graphicsConfig.HostDefaultGraphicsType = $gpuParameters.HostDefaultGraphicsType
            $graphicsConfig.SharedPassthruAssignmentPolicy = $gpuParameters.SharedPassthruAssignmentPolicy

            $graphicsManager.UpdateGraphicsConfig($graphicsConfig)
        }

    # Restart the Xorg service of the host
    $serviceSystem = Get-View -Id $esxHost.ExtensionData.ConfigManager.ServiceSystem
    $serviceSystem.RestartService('xorg')
}
# Start one asynchronous vLCM offline depot import per NVIDIA bundle and keep the returned task ids
$uploadTasksId = foreach ($nvidiaBundle in $gpuParameters.NVIDIA) {
    $depotSpecArguments = @{
        SourceType  = "PULL"
        Location    = $nvidiaBundle.Location
        Description = $nvidiaBundle.Description
    }
    $depotSpec = Initialize-SettingsDepotsOfflineCreateSpec @depotSpecArguments

    Invoke-CreateDepotsOfflineAsync -SettingsDepotsOfflineCreateSpec $depotSpec
}

# Initial snapshot of the task states, consumed by the wait loop that follows
$uploadTasks = $uploadTasksId |
    ForEach-Object { Invoke-GetTask -task $_ }
# Wait until every depot upload task reaches a terminal state (SUCCEEDED or FAILED), showing the
# aggregated progress of the tasks still running, then fail the script if any upload failed.
$uploadActivity = "Uploading NVIDIA vGPU driver into vLCM"
Write-Progress -Id 0 $uploadActivity
while ($true) {
    Write-Verbose "Waiting for NVIDIA driver upload into vLCM"
    $uploadTasks | ConvertTo-Json -Depth 5 | Write-Verbose

    # Tasks that have not reached a terminal state yet
    $pendingTasks = @($uploadTasks | Where-Object {
            $_ -and $_.status -ne 'SUCCEEDED' -and $_.status -ne 'FAILED'
        })
    if ($pendingTasks.Count -eq 0) {
        # Everything finished - no need for another sleep/refresh cycle
        break
    }

    # Aggregate the progress and the status messages of the running tasks
    $subprocess = ''
    $completed = 0
    $total = 0
    foreach ($t in $pendingTasks) {
        if ($t.progress) {
            if ($t.progress.message -and $t.progress.message.default_message) {
                if ($subprocess.Length -gt 0) {
                    $subprocess += ','
                }
                $subprocess += $t.progress.message.default_message
            }
            $completed += $t.progress.completed
            $total += $t.progress.total
        }
    }
    if ($total -eq 0) { $total = 100 }

    # Write-Progress accepts at most 100 for -PercentComplete
    $percent = [int](($completed * 100) / $total)
    if ($percent -gt 100) { $percent = 100 }
    if ($percent -lt 0) { $percent = 0 }

    $progressArguments = @{
        Id              = 0
        Activity        = $uploadActivity
        PercentComplete = $percent
    }
    # Write-Progress rejects an empty -Status, so pass it only when a task reported a message
    if ($subprocess) {
        $progressArguments.Status = $subprocess
    }
    Write-Progress @progressArguments

    Start-Sleep -Seconds 1
    $uploadTasks = $uploadTasksId | ForEach-Object { Invoke-GetTask -task $_ }
}
Write-Progress -Id 0 $uploadActivity -Completed

# A failed upload must stop the script here, with the task details, instead of surfacing later
# as a missing component
$failedUploads = @($uploadTasks | Where-Object { $_ -and $_.status -eq 'FAILED' })
if ($failedUploads.Count -gt 0) {
    $failureDetails = $failedUploads | ConvertTo-Json -Depth 5
    throw "Uploading NVIDIA driver into vLCM failed for $($failedUploads.Count) depot(s): $failureDetails"
}
# 3 vSphere LifeCycle Management Configuration
# Look up the ESXi base image by the configured version
$esxiBaseImage = Get-LcmImage `
    -Type BaseImage `
    -Version $gpuParameters.EsxiImageName
if (-not $esxiBaseImage) {
    throw "ESXi base image '$($gpuParameters.EsxiImageName)' not found in vLCM"
}

# Find the vLCM component matching the name and version of every configured NVIDIA entry.
# @(...) guarantees an array even when a single component matches, so a configuration with
# exactly one NVIDIA entry is validated correctly.
$allComponents = Get-LcmImage -Type Component
$components = @(
    $gpuParameters.NVIDIA | ForEach-Object {
        $nvd = $_
        $allComponents | Where-Object {
            $_.Name -eq $nvd.Name -and $_.Version -eq $nvd.Version
        }
    }
)

# Every configured entry must have resolved to a component
if ($components.Count -ne @($gpuParameters.NVIDIA).Count) {
    throw "Not all Nvidia components found"
}
# For every cluster in the domain spec: set the desired image (base image + NVIDIA components),
# then accept the EULA and remediate the cluster against it.
foreach ($clusterSpec in $domainSpec.ComputeSpec.ClusterSpecs) {
    $cluster = Get-Cluster -Name $clusterSpec.Name

    $desiredImage = @{
        BaseImage = $esxiBaseImage
        Component = $components
        Confirm   = $false
    }
    $cluster = Set-Cluster -Cluster $cluster @desiredImage

    $cluster = Set-Cluster -Cluster $cluster -AcceptEULA -Remediate -Confirm:$false
}
# Close the vCenter session; -Confirm:$false suppresses the confirmation prompt so the script can run unattended
Disconnect-VIServer -Server $vcConn -Confirm:$false