* Adding PAIF-N demo scripts * Removing Confidential from headers * Addressing review comments --------- Co-authored-by: Lyuboslav Asenov <lasenov@vmware.com>
195 lines
6.8 KiB
PowerShell
195 lines
6.8 KiB
PowerShell
<#
|
|
# © 2024 Broadcom. All Rights Reserved. Broadcom. The term "Broadcom" refers to
|
|
# Broadcom Inc. and/or its subsidiaries.
|
|
#>
|
|
|
|
<#
.SYNOPSIS

This script configures the ESXi hosts for AI workloads.

.DESCRIPTION

This script configures the ESXi hosts for AI workloads, which includes installing the
NVIDIA AI Enterprise vGPU driver and the NVIDIA GPU Management Daemon on the ESXi hosts.
vLCM is used for that purpose.

The script changes the default graphics type of the GPU devices to Shared Direct. The Xorg
service is then restarted. Finally, vLCM is used to install the NVIDIA GPU driver and
management daemon.

.NOTES

Prerequisites:
- VI workload domain (vCenter server instance)
- ESXi hosts with GPUs

"Global parameters", "Workload domain parameters", "GPU parameters" should be updated to
reflect the environment the script is run in. This may require altering the spec creation script.

#>
|
|
|
|
# Stop at the first error instead of continuing with a partially configured environment.
$ErrorActionPreference = 'Stop'

# --------------------------------------------------------------------------------------------------------------------------
# Global parameters
# --------------------------------------------------------------------------------------------------------------------------

# Name of the workload domain - used as a prefix for nested inventory items
$domainName = 'sfo-w01'

# DNS suffix appended to the inventory item names
$domain = 'vrack.vsphere.local'

# --------------------------------------------------------------------------------------------------------------------------
# Workload domain parameters - stripped down version of $domainSpec from 01-deploy-vcf-workload-domain.ps1

# NOTE(review): the credential below is hard-coded in the script. Although the key is called
# RootPassword, this script passes it to Connect-VIServer as the password of
# 'administrator@vsphere.local'.
$domainSpec = @{
    VCenterSpec = @{
        RootPassword       = 'VMware123!'
        NetworkDetailsSpec = @{ DnsName = "${domainName}-vc01.${domain}" }
    }
    ComputeSpec = @{
        # One entry per cluster of the workload domain
        ClusterSpecs = @(
            @{ Name = "${domainName}-cl01" }
        )
    }
}
|
|
# --------------------------------------------------------------------------------------------------------------------------
# GPU parameters

# Base URL under which the NVIDIA offline depot archives are published.
$nvidiaDriverLocation = "http://NVIDIA-VGPU-DRIVER-LOCATION/"

# The archive names below are appended with a '/' separator, so strip any trailing slash
# from the base URL first; otherwise the resulting depot URL contains '//'.
$nvidiaDriverBaseUrl = $nvidiaDriverLocation.TrimEnd('/')

$gpuParameters = @{
    # Version string used to look up the ESXi base image in vLCM
    EsxiImageName = "8.0 U2b - 23305546"

    # Offline depots to import into vLCM. Name and Version are compared against the
    # vLCM component list later in the script, so they must match it exactly.
    NVIDIA = @(
        @{
            Location    = "$nvidiaDriverBaseUrl/NVD-AIE-800_550.54.16-1OEM.800.1.0.20613240_23471877.zip"
            Name        = "NVIDIA AI Enterprise vGPU driver for VMWare ESX-8.0.0"
            Version     = "550.54.16"
            Description = 'NVIDIA AI Enterprise vGPU driver for VMWare ESX-8.0.0'
        },
        @{
            Location    = "$nvidiaDriverBaseUrl/nvd-gpu-mgmt-daemon_550.54.16-0.0.0000_23475823.zip"
            Name        = "NVIDIA GPU monitoring and management daemon"
            Version     = "550.54.16 - Build 0000"
            Description = "NVIDIA GPU monitoring and management daemon"
        }
    )

    # Values written into the host graphics configuration
    GraphicsType                   = 'sharedDirect'
    HostDefaultGraphicsType        = 'sharedDirect'
    SharedPassthruAssignmentPolicy = 'performance'
}
|
|
# --------------------------------------------------------------------------------------------------------------------------
|
|
|
|
# Connect to the VC of the workload domain
$viServerArgs = @{
    Server   = $domainSpec.VCenterSpec.NetworkDetailsSpec.DnsName
    User     = 'administrator@vsphere.local'
    Password = $domainSpec.VCenterSpec.RootPassword
}
$vcConn = Connect-VIServer @viServerArgs

# Collect the ESXi hosts located in each cluster listed in the domain spec
$esxHosts = $domainSpec.ComputeSpec.ClusterSpecs |
    ForEach-Object { Get-VMHost -Location $_.Name }
|
|
|
|
# Preparing the GPU devices for the vGPU driver, one ESXi host at a time
$esxHosts | ForEach-Object {
    $vmHost = $_
    $graphicsManager = Get-View -Id $vmHost.ExtensionData.ConfigManager.GraphicsManager

    # Change the graphics type of every device that is not yet of the requested type
    $vmHost.ExtensionData.Config.GraphicsInfo |
        Where-Object { $_.GraphicsType -ne $gpuParameters.GraphicsType } |
        ForEach-Object {
            # Per-device entry: PCI id of the device plus the graphics type to apply
            $deviceType = New-Object VMware.Vim.HostGraphicsConfigDeviceType
            $deviceType.DeviceId = $_.PciId
            $deviceType.GraphicsType = $gpuParameters.GraphicsType

            # Host-level settings, carrying the single device entry built above
            $graphicsConfig = New-Object VMware.Vim.HostGraphicsConfig
            $graphicsConfig.DeviceType = [VMware.Vim.HostGraphicsConfigDeviceType[]]@($deviceType)
            $graphicsConfig.HostDefaultGraphicsType = $gpuParameters.HostDefaultGraphicsType
            $graphicsConfig.SharedPassthruAssignmentPolicy = $gpuParameters.SharedPassthruAssignmentPolicy

            $graphicsManager.UpdateGraphicsConfig($graphicsConfig)
        }

    # Restart xorg service (done for every host, whether or not a device was reconfigured)
    $serviceSystem = Get-View -Id $vmHost.ExtensionData.ConfigManager.ServiceSystem
    $serviceSystem.RestartService('xorg')
}
|
|
|
|
# Upload the drivers to vLCM: start one asynchronous offline-depot import per configured
# NVIDIA package and keep what each call returns, which is later passed to Invoke-GetTask.
$uploadTasksId = foreach ($nvidiaPackage in $gpuParameters.NVIDIA) {
    $depotSpecArgs = @{
        SourceType  = "PULL"
        Location    = $nvidiaPackage.Location
        Description = $nvidiaPackage.Description
    }
    $depotSpec = Initialize-SettingsDepotsOfflineCreateSpec @depotSpecArgs

    Invoke-CreateDepotsOfflineAsync -SettingsDepotsOfflineCreateSpec $depotSpec
}
|
|
|
|
# Wait until every depot upload task has reached a final state ('SUCCEEDED' or 'FAILED'),
# reporting the combined progress of the tasks that are still running.
$progressActivity = "Uploading NVIDIA vGPU driver into vLCM"
Write-Progress -Id 0 -Activity $progressActivity

while ($true) {
    # Fetch the current state of every upload task
    $uploadTasks = $uploadTasksId | ForEach-Object { Invoke-GetTask -task $_ }

    Write-Verbose "Waiting for NVIDIA driver upload into vLCM"
    $uploadTasks | ConvertTo-Json -Depth 5 | Write-Verbose

    $messages = @()
    $completed = 0
    $total = 0
    $inProgress = $false

    foreach ($t in $uploadTasks) {
        # Missing tasks and tasks in a final state no longer contribute to the progress
        if (-not $t -or $t.status -eq 'SUCCEEDED' -or $t.status -eq 'FAILED') {
            continue
        }

        $inProgress = $true

        if ($t.progress) {
            if ($t.progress.message -and $t.progress.message.default_message) {
                $messages += $t.progress.message.default_message
            }
            $completed += $t.progress.completed
            $total += $t.progress.total
        }
    }

    # Leave before sleeping again once nothing is running any more
    if (-not $inProgress) {
        break
    }

    # No task reported a total yet - avoid dividing by zero
    if ($total -eq 0) { $total = 100 }

    $progressArgs = @{
        Id              = 0
        Activity        = $progressActivity
        # Keep the value inside the 0..100 range accepted by Write-Progress
        PercentComplete = [int][Math]::Min(100, [Math]::Max(0, ($completed * 100) / $total))
    }
    # Write-Progress rejects an empty -Status, so only pass it when a task reported a message
    if ($messages.Count -gt 0) {
        $progressArgs.Status = $messages -join ','
    }
    Write-Progress @progressArgs

    Start-Sleep -Seconds 1
}
Write-Progress -Id 0 -Activity $progressActivity -Completed

# Surface failed uploads instead of ignoring them. This is a warning rather than an error:
# the script continues and throws "Not all Nvidia components found" later on if the
# components really are missing from vLCM.
foreach ($t in $uploadTasks) {
    if ($t -and $t.status -eq 'FAILED') {
        Write-Warning ("NVIDIA driver upload into vLCM failed: " + ($t | ConvertTo-Json -Depth 5))
    }
}
|
|
|
|
# 3 vSphere LifeCycle Management Configuration

# ESXi base image to set on the clusters, looked up by its version string
$esxiBaseImage = Get-LcmImage -Type BaseImage -Version $gpuParameters.EsxiImageName

# Find the vLCM component matching the name and version of every configured NVIDIA package.
# The result is wrapped in @() so that it is an array even when only one component matches;
# without it a single configured package yields a scalar and the check below would fail.
$allComponents = Get-LcmImage -Type Component
$components = @(
    $gpuParameters.NVIDIA | ForEach-Object {
        $nvd = $_
        $allComponents | Where-Object { $_.Name -eq $nvd.Name -and $_.Version -eq $nvd.Version }
    }
)

# Exactly one component is expected per configured package
if ($components.Count -ne @($gpuParameters.NVIDIA).Count) {
    throw "Not all Nvidia components found"
}
|
|
|
|
# Set the base image and the NVIDIA components on every cluster of the domain spec,
# then remediate the cluster against that image.
$domainSpec.ComputeSpec.ClusterSpecs | ForEach-Object {
    $cluster = Get-Cluster -Name $_.Name
    $cluster = $cluster | Set-Cluster -BaseImage $esxiBaseImage -Component $components -Confirm:$false
    $cluster = $cluster | Set-Cluster -AcceptEULA -Remediate -Confirm:$false
}

# Close the vCenter connection. -Confirm:$false suppresses the confirmation prompt,
# consistent with the Set-Cluster calls above, so the script can run unattended.
Disconnect-VIServer -Server $vcConn -Confirm:$false