Automate node bisection by launching small-scale baseline runs
Created by: suchenzang
Turn this script:
# Bisect a suspect node list: split it into fixed-size groups of hosts and
# launch one small baseline training job per group.
#
# Inputs (edit before running):
#   SUSPECTS        Slurm hostlist expression of the nodes under suspicion.
#   NUM_CHUNKS      Number of hosts per group; also passed as -n to the launcher.
#   CHECKPOINT_DIR  Directory handed to the launcher via --checkpoints-dir.
# Outputs (written to the current directory):
#   machine_list.txt          one hostname per line
#   chunked_machine_list.txt  one compressed hostlist per group
export SUSPECTS="nodelist[1024-1035,1037-1045,1049-1068,1079-1101,1109-1121,1123-1129,1131-1151,1153-1156,1172-1179,1200-1202,1204-1211,1213-1217,1219-1226,1233-1237,1241-1252,1254-1275,1291-1298,1301-1322,1324-1326,1335-1337,1339-1341,1345-1347,1349-1355,1357-1373,1375-1388,1391-1420,1423-1432,1435-1437,1443-1450,1452-1457,1459-1462,1466-1469,1475-1483,1485-1489,1493-1515,1527-1530,1538-1544,1546-1559,1574-1579,1581-1586,1591-1596,1610-1613,1617-1626,1638-1652,1654-1660,1663-1670,1672,1675-1690,1694,1696,1698-1704,1706-1709]"
export NUM_CHUNKS=4
# Placeholder: replace with the real checkpoint directory before running.
export CHECKPOINT_DIR="<redacted>"

# Quote the expression: its [...] would otherwise be treated as a glob pattern.
scontrol show hostnames "$SUSPECTS" > machine_list.txt

# Join every NUM_CHUNKS hostnames with commas. The END rule flushes a final,
# shorter group so it is newline-terminated and carries no trailing comma.
# NOTE(review): such a short group is still launched with -n NUM_CHUNKS below;
# confirm the launcher copes with fewer included hosts than requested nodes.
awk -v chunks="$NUM_CHUNKS" '
  { group = (group == "") ? $0 : group "," $0 }
  NR % chunks == 0 { print group; group = "" }
  END { if (group != "") print group }
' machine_list.txt \
  | xargs -n1 scontrol show hostlist > chunked_machine_list.txt

# -p: do not fail when the directory is left over from a previous run.
mkdir -p -- "$CHECKPOINT_DIR"
cat chunked_machine_list.txt

# The list is read on fd 3 so the launcher cannot consume it through stdin.
# "./<baselines script>" is a placeholder for the launcher's path.
while IFS= read -r machines <&3; do
  # Skip blank lines so no job is launched with an empty host list.
  [[ -n "$machines" ]] || continue
  INCLUDED_HOSTS="$machines" \
  PYTHONPATH=. "./<baselines script>" \
    -n "$NUM_CHUNKS" -g 8 -t 1 \
    -p "bisect_${machines}" \
    --model-size 125m \
    --no-save-dir \
    --checkpoints-dir "$CHECKPOINT_DIR"
done 3< chunked_machine_list.txt
----------
< wait for the jobs to start training, then grab the list of job IDs that were launched >
----------
# Report, for each job id in the range, whether the train.log that sits next
# to the job's train.stderr.<job_id> file contains the string "gnorm_layer2".
#
# Inputs:
#   CHECKPOINT_DIR  Directory tree searched for train.stderr.<job_id> (required).
#   FIRST_JOB_ID    First job id to inspect (default 17168).
#   LAST_JOB_ID     Last job id to inspect, inclusive (default 17286).
# Outputs:
#   One "found"/"not found" line per job on stdout; jobs whose stderr file
#   cannot be located are reported on stderr and skipped.
: "${CHECKPOINT_DIR:?CHECKPOINT_DIR must be set}"
FIRST_JOB_ID="${FIRST_JOB_ID:-17168}"
LAST_JOB_ID="${LAST_JOB_ID:-17286}"

for ((job_id = FIRST_JOB_ID; job_id <= LAST_JOB_ID; job_id++)); do
  # -quit stops at the first match, so stderr_path holds at most one path.
  stderr_path=$(find "$CHECKPOINT_DIR" -name "train.stderr.${job_id}" -print -quit)
  if [[ -z "$stderr_path" ]]; then
    # Without this guard the log path would collapse to "/train.log".
    echo "no train.stderr.${job_id} under ${CHECKPOINT_DIR}, skipping job ${job_id}" >&2
    continue
  fi

  # Swap the stderr file name for train.log, keeping the same directory.
  train_log_path="${stderr_path%/*}/train.log"
  if grep -qF -- "gnorm_layer2" "$train_log_path"; then
    echo "found in job ${job_id}, path: ${train_log_path}"
  else
    echo "not found in job ${job_id}, path: ${train_log_path}"
  fi
done
----------
# Cancel every Slurm job whose id lies in the inclusive range
# FIRST_JOB_ID..LAST_JOB_ID (defaults: 17168..17286).
FIRST_JOB_ID="${FIRST_JOB_ID:-17168}"
LAST_JOB_ID="${LAST_JOB_ID:-17286}"

for ((job_id = FIRST_JOB_ID; job_id <= LAST_JOB_ID; job_id++)); do
  scancel "$job_id"
done
into something respectable