diff --git a/pytorch_job_wganSingleGen_ncc.yaml b/pytorch_job_wganSingleGen_ncc.yaml index d857eeead94bd3e2e9a363c63e851c4dbd38ecb7..9e52b51213f2132999b9f7db0be0eaf52c1f484f 100644 --- a/pytorch_job_wganSingleGen_ncc.yaml +++ b/pytorch_job_wganSingleGen_ncc.yaml @@ -35,12 +35,12 @@ spec: value: "1" command: [sh, -c] args: - - python -u wganSingleGen.py --backend nccl --epochs 50 --exp wganSingleGenV1 --lrGen_E_H 0.00001 --chpt --chpt_eph 50 --batch-size 64 --ncrit 4 + - python -u wganSingleGen.py --backend nccl --epochs 150 --exp wganSingleGenV1 --lrGen_E_H 0.00001 --chpt --chpt_eph 50 --batch-size 64 --ncrit 4 resources: limits: nvidia.com/gpu: 1 Worker: - replicas: 3 + replicas: 4 restartPolicy: OnFailure template: metadata: @@ -70,7 +70,7 @@ spec: value: "1" command: [sh, -c] args: - - python -u wganSingleGen.py --backend nccl --epochs 50 --exp wganSingleGenV1 --lrGen_E_H 0.00001 --chpt --chpt_eph 50 --batch-size 64 --ncrit 4 + - python -u wganSingleGen.py --backend nccl --epochs 150 --exp wganSingleGenV1 --lrGen_E_H 0.00001 --chpt --chpt_eph 50 --batch-size 64 --ncrit 4 resources: limits: nvidia.com/gpu: 1 diff --git a/wganSingleGen.py b/wganSingleGen.py index 03840f98dfb12b94e917d5822c0defb4c95a2239..8509436c04c28a757bb31234b92b5d65f17bc6f3 100644 --- a/wganSingleGen.py +++ b/wganSingleGen.py @@ -299,7 +299,7 @@ def run(args): optimizerD_E_H.load_state_dict(critic_E_H_checkpoint['optimizer_state_dict']) Gen_E_H.load_state_dict(gen_E_H_checkpoint['model_state_dict']) - optimizerG_E_H.load_state_dict(gen_H_E_checkpoint['optimizer_state_dict']) + optimizerG_E_H.load_state_dict(gen_E_H_checkpoint['optimizer_state_dict']) eph = gen_E_H_checkpoint['epoch']