Purpose
Maybe my searching was just bad, but the non-official code I found either didn't run at all or failed because of typos, so here is the smallest amount of code I could manage that works reliably just by copy-pasting.
Update: I've also added MNIST code.
Environment
- Windows 11 Pro
- Anaconda 2.3.2
- Python: 3.10.9
- PyTorch: 1.13.1
- CUDA: 11.7
- NVIDIA driver: 516.01
GPU configuration
- GTX 1660 SUPER x2
- RTX 3060 Ti x3
- RTX 3070 Ti x1
Code
The code is taken almost verbatim from the official page. Note that it uses the gloo backend, since the NCCL backend is not available on Windows.
In the environment described above, I confirmed that the program below runs on multiple GPUs.
ddp.py
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    run_demo(demo_basic, 6)
The 6 passed to run_demo at the bottom is the world size, i.e. the number of processes spawned (one per GPU), so change it to match your machine. This code does not let you pick which specific GPUs are used; that is possible with a little extra code (see the last section, and the sketch below).
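As an alternative to editing the script itself, a common approach (not part of the original code; the indices here are only an example) is to restrict which GPUs the processes can see via the CUDA_VISIBLE_DEVICES environment variable before anything touches CUDA, e.g. at the very top of ddp.py:

import os

# Sketch, not from the original code: make only three of the six GPUs visible.
# Inside the script they then appear as cuda:0, cuda:1, cuda:2.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2,4"  # example indices; adjust to your machine

With that setting you would also pass world_size=3, one process per visible GPU.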
Execution results
> python ddp.py
Running basic DDP example on rank 1.
Running basic DDP example on rank 5.
Running basic DDP example on rank 3.
Running basic DDP example on rank 0.
Running basic DDP example on rank 2.
Running basic DDP example on rank 4.
nvidia-smi
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.01 Driver Version: 516.01 CUDA Version: 11.7 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... WDDM | 00000000:05:00.0 Off | N/A |
| 41% 20C P2 30W / 125W | 89MiB / 6144MiB | 10% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... WDDM | 00000000:06:00.0 Off | N/A |
| 0% 36C P2 33W / 200W | 175MiB / 8192MiB | 16% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce ... WDDM | 00000000:07:00.0 Off | N/A |
| 0% 32C P2 42W / 200W | 193MiB / 8192MiB | 18% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce ... WDDM | 00000000:08:00.0 Off | N/A |
| 0% 27C P2 37W / 200W | 231MiB / 8192MiB | 23% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 4 NVIDIA GeForce ... WDDM | 00000000:0D:00.0 Off | N/A |
| 43% 18C P2 21W / 125W | 159MiB / 6144MiB | 17% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 5 NVIDIA GeForce ... WDDM | 00000000:0E:00.0 Off | N/A |
| 0% 30C P2 69W / 290W | 282MiB / 8192MiB | 19% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 10156 C ...3\envs\pytorch\python.exe N/A |
| 1 N/A N/A 9732 C ...3\envs\pytorch\python.exe N/A |
| 2 N/A N/A 4976 C ...3\envs\pytorch\python.exe N/A |
| 3 N/A N/A 8088 C ...3\envs\pytorch\python.exe N/A |
| 4 N/A N/A 10024 C ...3\envs\pytorch\python.exe N/A |
| 5 N/A N/A 2516 C ...3\envs\pytorch\python.exe N/A |
+-----------------------------------------------------------------------------+
MNIST-DDP
This is the official MNIST example converted to DDP. Changing world_size changes how many GPUs are used (probably).
ddp_mnist.py
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.multiprocessing as mp
import torch.distributed as dist
import os
from torch.nn.parallel import DistributedDataParallel as DDP
import time
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(ddp_model, rank, world_size, train_loader, optimizer, epoch):
    ddp_model.train()
    for batch_idx, (data, label) in enumerate(train_loader):
        data, label = data.to(rank), label.to(rank)
        optimizer.zero_grad()
        output = ddp_model(data)
        loss = F.nll_loss(output, label)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('rank: {} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                rank, epoch + 1, batch_idx * len(data) * world_size, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(ddp_model, rank, test_loader):
    ddp_model.eval()
    test_loss = 0
    correct = 0
    data_num = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(rank), target.to(rank)
            output = ddp_model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            data_num += len(data)

    test_loss /= data_num

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%) [rank: {}]\n'.format(
        test_loss, correct, data_num,
        100. * correct / data_num, rank))


def main(rank, world_size, batch_size, epochs):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = Net().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset = datasets.MNIST('../data', train=True, download=True,
                             transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)

    optimizer = optim.Adam(ddp_model.parameters(), lr=0.001)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank, shuffle=False
    )
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=train_sampler is None, sampler=train_sampler
    )
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset2, num_replicas=world_size, rank=rank, shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        dataset2, batch_size=batch_size, shuffle=test_sampler is None, sampler=test_sampler
    )

    for epoch in range(epochs):
        train(ddp_model=ddp_model, rank=rank, world_size=world_size,
              train_loader=train_loader, optimizer=optimizer, epoch=epoch)
        test(ddp_model=ddp_model, rank=rank, test_loader=test_loader)

    cleanup()


def run(demo_fn, world_size=1, batch_size=2, epochs=1):
    mp.spawn(demo_fn,
             args=(world_size, batch_size, epochs),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    start = time.time()
    run(main, world_size=1, batch_size=128, epochs=1)
    print("Process time: ", time.time() - start)
Specifying which GPUs to use
If you want to specify which GPUs are used, pass the device name of the GPU you want (cuda:0, cuda:1, and so on) wherever the code currently calls to(rank).
cuda.py
devices = ['cuda:1', 'cuda:2', 'cuda:0']
...
model = Net().to(devices[rank])
Something like this. The mapping between the installed GPUs and cuda:n can be checked with the nvidia-smi command (strictly speaking, CUDA may number the devices in a different order than nvidia-smi unless CUDA_DEVICE_ORDER=PCI_BUS_ID is set, so it is worth verifying).
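If you prefer to check the mapping from Python, a small sketch (not from the article) that prints what PyTorch itself sees:

import torch

# Print every CUDA device index together with the GPU model name.
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")

If you pin devices this way, keep the rest of the script consistent: the DDP wrapper and every other .to(rank) should point at the same device, e.g. DDP(model, device_ids=[torch.device(devices[rank])]) and data.to(devices[rank]); otherwise the model parameters and the inputs end up on different GPUs.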