diff --git a/pytorch_job_wganSingleGen_ncc.yaml b/pytorch_job_wganSingleGen_ncc.yaml index b980cec96d582d2027f729e672cf0caaded7cfa3..e1c5db6b7d00832a3f84a35985121568ca41ec18 100644 --- a/pytorch_job_wganSingleGen_ncc.yaml +++ b/pytorch_job_wganSingleGen_ncc.yaml @@ -40,7 +40,7 @@ spec: limits: nvidia.com/gpu: 1 Worker: - replicas: 4 + replicas: 2 restartPolicy: OnFailure template: metadata: diff --git a/wganSingleGen.py b/wganSingleGen.py index 1ece4d0cce35c0e75c722d2c632008b2b2aa8e47..4a92e3a5d503c488cac692ee8762cc0cb1dce63d 100644 --- a/wganSingleGen.py +++ b/wganSingleGen.py @@ -274,9 +274,8 @@ def run(args): print('Critic trainable params:', sum(p.numel() for p in Crit_E_H.parameters() if p.requires_grad)) print('Generator trainable params:', sum(p.numel() for p in Gen_E_H.parameters() if p.requires_grad)) - if args.world_size > 1: - Distributor = nn.parallel.DistributedDataParallel if use_cuda \ - else nn.parallel.DistributedDataParallelCPU + if args.world_size > 1: + Distributor = nn.parallel.DistributedDataParallel if use_cuda else nn.parallel.DistributedDataParallelCPU Crit_E_H = Distributor(Crit_E_H, device_ids=[args.local_rank], output_device=args.local_rank ) Gen_E_H = Distributor(Gen_E_H, device_ids=[args.local_rank], output_device=args.local_rank )