Add jsrun launcher based on neox 2.0 · EleutherAI/gpt-neox@3782c7a (original) (raw)
3 files changed
lines changed
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1809,7 +1809,7 @@ Args for deepspeed runner (deepspeed.launcher.runner). | ||
1809 | 1809 | |
1810 | 1810 | |
1811 | 1811 | |
1812 | -- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm'] | |
1812 | +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm', 'jsrun'] | |
1813 | 1813 | |
1814 | 1814 | Default = pdsh |
1815 | 1815 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -192,7 +192,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate): | ||
192 | 192 | IP address of node 0, will be inferred via 'hostname -I' if not specified. |
193 | 193 | """ |
194 | 194 | |
195 | -launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh" | |
195 | +launcher: Literal["pdsh", "openmpi", "mvapich", "slurm", "jsrun"] = "pdsh" | |
196 | 196 | """ |
197 | 197 | Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, and JSRUN. |
198 | 198 | """ |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -612,6 +612,11 @@ class NeoXArgsOther(NeoXArgsTemplate): | ||
612 | 612 | Run via SLURM. This will attempt to discover the variables needed to initialize torch distributed from the SLURM environment. |
613 | 613 | """ |
614 | 614 | |
615 | +deepspeed_jsrun: bool = False | |
616 | +""" | |
617 | + Run via JSRUN. This will attempt to discover the variables needed to initialize torch distributed from the IBM LSF environment. | |
618 | + """ | |
619 | + | |
615 | 620 | user_script: str = None |
616 | 621 | """ |
617 | 622 | user script to be run |