Add jsrun launcher based on neox 2.0 · EleutherAI/gpt-neox@3782c7a (original) (raw)

3 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -1809,7 +1809,7 @@ Args for deepspeed runner (deepspeed.launcher.runner).
1809 1809
1810 1810
1811 1811
1812 -- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm']
1812 +- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm', 'jsrun']
1813 1813
1814 1814 Default = pdsh
1815 1815
Original file line number Diff line number Diff line change
@@ -192,7 +192,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
192 192 IP address of node 0, will be inferred via 'hostname -I' if not specified.
193 193 """
194 194
195 -launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh"
195 +launcher: Literal["pdsh", "openmpi", "mvapich", "slurm", "jsrun"] = "pdsh"
196 196 """
197 197 Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, and JSRUN.
198 198 """
Original file line number Diff line number Diff line change
@@ -612,6 +612,11 @@ class NeoXArgsOther(NeoXArgsTemplate):
612 612 Run via SLURM; this will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment.
613 613 """
614 614
615 +deepspeed_jsrun: bool = False
616 +"""
617 + Run via JSRUN; this will attempt to discover the necessary variables to initialize torch distributed from the IBM LSF environment.
618 + """
619 +
615 620 user_script: str = None
616 621 """
617 622 user script to be run