Purpose
Maybe my searching was just bad, but the non-official code I found either didn't run at all or failed because of typos, so here is the smallest amount of code I could manage that works reliably just by copy-pasting.
Update: I've also added MNIST code.
Environment
- Windows 11 Pro
- Anaconda 2.3.2
- Python: 3.10.9
- PyTorch: 1.13.1
- CUDA: 11.7
- NVIDIA driver: 516.01
GPU configuration
- GTX 1660 SUPER x2
- RTX 3060 Ti x3
- RTX 3070 Ti x1
Code
The code is taken almost verbatim from the official page. Note that it uses the gloo backend, since the NCCL backend is not available on Windows.
In the environment described above, I confirmed that the program below runs on multiple GPUs.
ddp.py
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    run_demo(demo_basic, 6)
The 6 passed to run_demo at the bottom is the world size, i.e. the number of processes spawned (one per GPU), so change it to match your machine. This code does not let you pick which specific GPUs are used; that is possible with a little extra code (see the last section, and the sketch below).
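As an alternative to editing the script itself, a common approach (not part of the original code; the indices here are only an example) is to restrict which GPUs the processes can see via the CUDA_VISIBLE_DEVICES environment variable before anything touches CUDA, e.g. at the very top of ddp.py:

import os

# Sketch, not from the original code: make only three of the six GPUs visible.
# Inside the script they then appear as cuda:0, cuda:1, cuda:2.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,2,4"  # example indices; adjust to your machine

With that setting you would also pass world_size=3, one process per visible GPU.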
Execution results
> python ddp.py
Running basic DDP example on rank 1.
Running basic DDP example on rank 5.
Running basic DDP example on rank 3.
Running basic DDP example on rank 0.
Running basic DDP example on rank 2.
Running basic DDP example on rank 4.
nvidia-smi
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.01 Driver Version: 516.01 CUDA Version: 11.7 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... WDDM | 00000000:05:00.0 Off | N/A |
| 41% 20C P2 30W / 125W | 89MiB / 6144MiB | 10% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... WDDM | 00000000:06:00.0 Off | N/A |
| 0% 36C P2 33W / 200W | 175MiB / 8192MiB | 16% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce ... WDDM | 00000000:07:00.0 Off | N/A |
| 0% 32C P2 42W / 200W | 193MiB / 8192MiB | 18% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce ... WDDM | 00000000:08:00.0 Off | N/A |
| 0% 27C P2 37W / 200W | 231MiB / 8192MiB | 23% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 4 NVIDIA GeForce ... WDDM | 00000000:0D:00.0 Off | N/A |
| 43% 18C P2 21W / 125W | 159MiB / 6144MiB | 17% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 5 NVIDIA GeForce ... WDDM | 00000000:0E:00.0 Off | N/A |
| 0% 30C P2 69W / 290W | 282MiB / 8192MiB | 19% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 10156 C ...3\envs\pytorch\python.exe N/A |
| 1 N/A N/A 9732 C ...3\envs\pytorch\python.exe N/A |
| 2 N/A N/A 4976 C ...3\envs\pytorch\python.exe N/A |
| 3 N/A N/A 8088 C ...3\envs\pytorch\python.exe N/A |
| 4 N/A N/A 10024 C ...3\envs\pytorch\python.exe N/A |
| 5 N/A N/A 2516 C ...3\envs\pytorch\python.exe N/A |
+-----------------------------------------------------------------------------+
MNIST-DDP
This is the official MNIST example converted to DDP. Changing world_size changes how many GPUs are used (probably).
ddp_mnist.py
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.multiprocessing as mp
import torch.distributed as dist
import os
from torch.nn.parallel import DistributedDataParallel as DDP
import time
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(ddp_model, rank, world_size, train_loader, optimizer, epoch):
    ddp_model.train()
    for batch_idx, (data, label) in enumerate(train_loader):
        data, label = data.to(rank), label.to(rank)
        optimizer.zero_grad()
        output = ddp_model(data)
        loss = F.nll_loss(output, label)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('rank: {} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                rank, epoch + 1, batch_idx * len(data) * world_size, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(ddp_model, rank, test_loader):
    ddp_model.eval()
    test_loss = 0
    correct = 0
    data_num = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(rank), target.to(rank)
            output = ddp_model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            data_num += len(data)

    test_loss /= data_num

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%) [rank: {}]\n'.format(
        test_loss, correct, data_num,
        100. * correct / data_num, rank))


def main(rank, world_size, batch_size, epochs):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = Net().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset = datasets.MNIST('../data', train=True, download=True,
                             transform=transform)
    dataset2 = datasets.MNIST('../data', train=False,
                              transform=transform)

    optimizer = optim.Adam(ddp_model.parameters(), lr=0.001)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank, shuffle=False
    )
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=train_sampler is None, sampler=train_sampler
    )
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset2, num_replicas=world_size, rank=rank, shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        dataset2, batch_size=batch_size, shuffle=test_sampler is None, sampler=test_sampler
    )

    for epoch in range(epochs):
        train(ddp_model=ddp_model, rank=rank, world_size=world_size,
              train_loader=train_loader, optimizer=optimizer, epoch=epoch)
        test(ddp_model=ddp_model, rank=rank, test_loader=test_loader)

    cleanup()


def run(demo_fn, world_size=1, batch_size=2, epochs=1):
    mp.spawn(demo_fn,
             args=(world_size, batch_size, epochs),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    start = time.time()
    run(main, world_size=1, batch_size=128, epochs=1)
    print("Process time: ", time.time() - start)
Specifying which GPUs to use
If you want to specify which GPUs are used, pass the device name of the GPU you want (cuda:0, cuda:1, and so on) wherever the code currently calls to(rank).
cuda.py
devices = ['cuda:1', 'cuda:2', 'cuda:0']
...
model = Net().to(devices[rank])
Something like this. The mapping between the installed GPUs and cuda:n can be checked with the nvidia-smi command (strictly speaking, CUDA may number the devices in a different order than nvidia-smi unless CUDA_DEVICE_ORDER=PCI_BUS_ID is set, so it is worth verifying).
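If you prefer to check the mapping from Python, a small sketch (not from the article) that prints what PyTorch itself sees:

import torch

# Print every CUDA device index together with the GPU model name.
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")

If you pin devices this way, keep the rest of the script consistent: the DDP wrapper and every other .to(rank) should point at the same device, e.g. DDP(model, device_ids=[torch.device(devices[rank])]) and data.to(devices[rank]); otherwise the model parameters and the inputs end up on different GPUs.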