GeForce RTX 2070を導入し、サーバーを新調(HP->Lenovo)したので、
nvidia-dockerのインストール手順を再度メモ。
※注意!! TensorFlow・CUDA・cuDNNのバージョン間で不整合が発生中
# lspci | grep VGA
02:00.0 VGA compatible controller: Matrox Electronics Systems Ltd. MGA G200e [Pilot] ServerEngines (SEP1) (rev 42)
04:00.0 VGA compatible controller: NVIDIA Corporation TU106 [GeForce RTX 2070] (rev a1)
# lsmod | grep nouveau
nouveau 1898794 0
mxm_wmi 13021 1 nouveau
wmi 21636 2 mxm_wmi,nouveau
video 24538 1 nouveau
i2c_algo_bit 13413 2 mgag200,nouveau
drm_kms_helper 186531 2 mgag200,nouveau
ttm 96673 2 mgag200,nouveau
drm 456166 5 ttm,drm_kms_helper,mgag200,nouveau
# vi /etc/modprobe.d/blacklist-nouveau.conf
[root@st250 ~]# cat /etc/modprobe.d/blacklist-nouveau.conf
blacklist nouveau
options nouveau modeset=0
# dracut --force
# reboot
# yum install -y gcc make kernel-devel
# curl -O http://jp.download.nvidia.com/XFree86/Linux-x86_64/440.82/NVIDIA-Linux-x86_64-440.82.run
# chmod 755 NVIDIA-Linux-x86_64-440.82.run
# ./NVIDIA-Linux-x86_64-440.82.run
稼働中のkernelがkernel-develよりバージョンが古かったので、kernelをアップデートし、再起動後にインストーラを再実行
# yum update -y kernel
# reboot
# ./NVIDIA-Linux-x86_64-440.82.run
# nvidia-smi
Sat May 16 15:26:25 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82 Driver Version: 440.82 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce RTX 2070 Off | 00000000:04:00.0 Off | N/A |
| 29% 48C P0 26W / 175W | 0MiB / 7982MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
# curl -O http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
# chmod 755 cuda_10.2.89_440.33.01_linux.run
# ./cuda_10.2.89_440.33.01_linux.run
Driverはインストール済みなので、インストーラではDriverを選択せず、CUDA(Toolkit)だけをインストール
# yum install -y yum-utils
# yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
# yum install -y yum-utils device-mapper-persistent-data lvm2
# yum install -y docker-ce
# yum-config-manager --add-repo https://nvidia.github.io/nvidia-docker/centos7/nvidia-docker.repo
# yum install -y nvidia-docker2
# systemctl enable docker
# systemctl start docker
# vi /etc/docker/daemon.json
# cat /etc/docker/daemon.json
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
# systemctl stop docker
# systemctl start docker
# curl -L https://github.com/docker/compose/releases/download/1.21.2/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
# chmod +x /usr/local/bin/docker-compose