Background
Start a training container pinned to GPU 0:

nvidia-docker run --gpus '"device=0"' --name train_container_1 7055fe2b9719
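Each GPU can get its own container in the same way; a sketch, assuming the same image and a hypothetical second container name:

nvidia-docker run --gpus '"device=1"' --name train_container_2 7055fe2b9719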
A minimal zerorpc server:

import zerorpc

class HelloRPC(object):
    def hello(self, name):
        return "Hello, %s" % name

# Bind the service on all interfaces and serve; run() blocks.
s = zerorpc.Server(HelloRPC())
s.bind("tcp://0.0.0.0:4242")
s.run()
And the matching client, which makes a synchronous call:

import zerorpc

c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")
print(c.hello("RPC"))  # prints "Hello, RPC"
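If the server and client live in different containers, 127.0.0.1 will not reach the server; the client has to target the server container's address instead. A sketch, assuming a hypothetical Docker bridge IP (check the real one with docker inspect):

c.connect("tcp://172.17.0.2:4242")  # hypothetical container IP on the default bridge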
Initialize the process group with the NCCL backend and an explicit timeout, so a stuck rank fails fast instead of hanging indefinitely:

import torch.distributed as dist
from datetime import timedelta

dist.init_process_group(
    ...,
    backend="nccl",
    timeout=timedelta(seconds=60)
)
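A minimal sketch of how the elided arguments are typically filled in, assuming the standard env:// rendezvous where the launcher (or the user) exports MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE:

import os
from datetime import timedelta

import torch.distributed as dist

# MASTER_ADDR/MASTER_PORT must point at rank 0's host; RANK and
# WORLD_SIZE identify this process within the whole job.
dist.init_process_group(
    backend="nccl",
    init_method="env://",
    rank=int(os.environ["RANK"]),
    world_size=int(os.environ["WORLD_SIZE"]),
    timeout=timedelta(seconds=60),
)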
A context manager that lets the local master (rank 0) perform a task, such as a download, before the other ranks proceed:

from contextlib import contextmanager

import torch.distributed as dist

@contextmanager
def torch_distributed_zero_first(local_rank: int):
    # Make all processes in distributed training wait for the local master to finish first
    if local_rank not in [-1, 0]:
        dist.barrier(device_ids=[local_rank])
    yield
    if local_rank == 0:
        dist.barrier(device_ids=[0])
# Use the context manager to download resources: rank 0 downloads, the others wait
with torch_distributed_zero_first(LOCAL_RANK):
    weights = attempt_download(weights)  # download if not found locally
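The flow: ranks other than 0 hit the first dist.barrier and block; rank 0 skips it, downloads the weights inside the with block, then reaches the second barrier, releasing everyone once the file is already on disk. As for where LOCAL_RANK comes from, a sketch assuming a torchrun-style launcher that exports it as an environment variable:

import os

# torchrun exports LOCAL_RANK for each process; -1 means non-distributed mode
LOCAL_RANK = int(os.getenv("LOCAL_RANK", -1))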