== Multiple Nodes ==
In the example that follows, we submit a job that spawns a two-node Ray cluster with 6 CPUs and 1 GPU per node.
{{File
|name=ray-example.sh
|lang="bash"
|contents=
#!/bin/bash
#SBATCH --nodes 2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=6
#SBATCH --mem=32000M
#SBATCH --time=0-00:10
#SBATCH --output=%N-%j.out

## Create a virtualenv and install Ray on all nodes ##
srun -N $SLURM_NNODES -n $SLURM_NNODES config_env.sh

export HEAD_NODE=$(hostname) # store head node's address
export RAY_PORT=34567 # choose a port to start Ray on the head node

source $SLURM_TMPDIR/ENV/bin/activate

## Start Ray cluster head node ##
ray start --head --node-ip-address=$HEAD_NODE --port=$RAY_PORT --num-cpus=$SLURM_CPUS_PER_TASK --block &
sleep 10

## Launch worker nodes on all the other nodes allocated by the job ##
srun launch_ray.sh &
ray_cluster_pid=$!

python test_ray.py

kill $ray_cluster_pid
}}
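The job is submitted as usual with <code>sbatch ray-example.sh</code>. Make sure that <code>config_env.sh</code> and <code>launch_ray.sh</code> (shown below) sit in the submission directory and are executable (<code>chmod +x</code>), since the job script calls them directly with <code>srun</code>.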
Where the script <code>config_env.sh</code> is:
{{File
|name=config_env.sh
|lang="bash"
|contents=
#!/bin/bash
module load python
virtualenv --no-download $SLURM_TMPDIR/ENV
source $SLURM_TMPDIR/ENV/bin/activate
pip install --upgrade pip --no-index
pip install ray pandas --no-index
}}
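Because this script is launched with <code>srun</code> on every node in the job, each node builds its own copy of the virtual environment in its node-local <code>$SLURM_TMPDIR</code>. The <code>--no-index</code> flag tells pip to install from the cluster's prebuilt Python wheels rather than from PyPI, so the environment can be created on compute nodes without internet access.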
And the script <code>launch_ray.sh</code> is:
{{File
|name=launch_ray.sh
|lang="bash"
|contents=
#!/bin/bash
source $SLURM_TMPDIR/ENV/bin/activate
module load gcc/9.3.0 arrow

if [[ "$SLURM_PROCID" -eq "0" ]]; then
    echo "Ray head node already started..."
    sleep 10
else
    ray start --address "${HEAD_NODE}:${RAY_PORT}" --num-cpus="${SLURM_CPUS_PER_TASK}" --block
    sleep 5
    echo "ray worker started!"
fi
}}
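Note that <code>srun</code> runs one copy of <code>launch_ray.sh</code> per task: the task with <code>SLURM_PROCID</code> 0 lands on the head node, where Ray is already running, so it only prints a message, while every other task starts a Ray worker and connects it to the head node at <code>${HEAD_NODE}:${RAY_PORT}</code>.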
In this simple example, we connect to the two-node Ray cluster launched in the job submission script, then we check that Ray sees the resources allocated to the job.
{{File
|name=test_ray.py
|lang="python"
|contents=
import ray
import os

# Connect to Ray cluster
ray.init(address=f"{os.environ['HEAD_NODE']}:{os.environ['RAY_PORT']}", _node_ip_address=os.environ['HEAD_NODE'])

# Check that Ray sees two nodes and their status is 'Alive'
print("Nodes in the Ray cluster:")
print(ray.nodes())

# Check that Ray sees 12 CPUs and 2 GPUs over 2 nodes
print(ray.available_resources())
}}
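Once connected, work can be distributed across both nodes with Ray's task API. The following is a minimal sketch, not part of the original example: the file name <code>ray_tasks.py</code> and the <code>square</code> task are ours, and it assumes the same cluster and environment variables as <code>test_ray.py</code>.
{{File
|name=ray_tasks.py
|lang="python"
|contents=
import os
import ray

# Connect to the running Ray cluster, as in test_ray.py
ray.init(address=f"{os.environ['HEAD_NODE']}:{os.environ['RAY_PORT']}")

# A trivial task; Ray schedules each call on any node with a free CPU
@ray.remote(num_cpus=1)
def square(x):
    return x * x

# Launch 12 tasks, one per CPU available across the two nodes
futures = [square.remote(i) for i in range(12)]
print(ray.get(futures))
}}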