http://zh.d2l.ai/chapter_computational-performance/auto-parallelism.html
Is there nobody ???
1 Like
I don’t have two GPUs in my computer.
2 Likes
import torch
import time
class Timer:
    """Context manager that measures and prints wall-clock elapsed time.

    On exit it stores the elapsed seconds in ``self.time`` and prints it.
    """

    def __enter__(self):
        self.begin = time.time()
        # Returning self lets callers write `with Timer() as t:` and read
        # t.time afterwards; the original returned None, so `as t` bound None.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.time()
        self.time = self.end - self.begin
        print(self.time)
# NOTE(review): hard-coded device indices — this assumes a machine with at
# least four GPUs (cuda:2 and cuda:3 must exist); confirm before running.
device1 = torch.device("cuda:2")
device2 = torch.device("cuda:3")
def run(x):
    """Multiply the matrix ``x`` by itself 50 times; return the products as a list."""
    return [torch.mm(x, x) for _ in range(50)]
# Allocate one 4000x4000 matrix on each GPU, then do an untimed warm-up run
# so one-time costs (CUDA context creation, kernel caching) don't pollute
# the timings below.
x_gpu1 = torch.rand((4000, 4000), device=device1)
x_gpu2 = torch.rand((4000, 4000), device=device2)
run(x_gpu1)
run(x_gpu2)
# Kernel launches are asynchronous — block until the warm-up work finishes.
torch.cuda.synchronize(device1)
torch.cuda.synchronize(device2)
# Time each GPU on its own, then both together. The paste lost all
# indentation, which made this a SyntaxError; reconstructed so each
# synchronize() sits INSIDE its timed region — CUDA kernels launch
# asynchronously, so without the synchronize the Timer would only measure
# launch overhead, not execution. (The label is printed before Timer's
# __exit__ runs, matching the captured output order below.)
with Timer():
    run(x_gpu1)
    print("gpu1:")
    torch.cuda.synchronize(device1)

with Timer():
    run(x_gpu2)
    print("gpu2:")
    torch.cuda.synchronize(device2)

with Timer():
    run(x_gpu1)
    run(x_gpu2)
    print("together")
    torch.cuda.synchronize()  # no device argument: waits on all devices
gpu1:
0.9710698127746582
gpu2:
0.6902849674224854
together
3.9346890449523926
反而是并行的远远慢了,有点疑惑。
def copy_to_cpu(x, non_blocking=False):
    """Return host-memory (CPU) copies of every tensor in ``x``."""
    copies = []
    for tensor in x:
        copies.append(tensor.to('cpu', non_blocking=non_blocking))
    return copies
# Time (1) computation on GPU1, (2) copying the results to the CPU, and
# (3) compute + copy in a single timed region. The pasted snippet used `y1`
# without ever defining it — the "gpu1:" block that produced it (visible in
# the captured output below) was lost — so it is recomputed here. The paste
# also stripped indentation; `with` bodies are reconstructed.
with Timer():
    y1 = run(x_gpu1)
    print("gpu1:")
    torch.cuda.synchronize(device1)

with Timer():
    print("复制到cpu")
    y_cpu = copy_to_cpu(y1)
    torch.cuda.synchronize()

with Timer():
    y2 = run(x_gpu1)
    y2_cpu = copy_to_cpu(y2)
    print("计算+复制")
    torch.cuda.synchronize(device1)
gpu1:
1.045579195022583
复制到cpu
87.37183403968811
计算+复制
3.7805612087249756