Skip to content

Commit

Permalink
Fix CI errors
Browse files Browse the repository at this point in the history
  • Loading branch information
gordicaleksa committed Jun 24, 2024
1 parent 3abdf9a commit a259eea
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 8 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -207,14 +207,14 @@ else
endif
endif

ifeq ($(USE_MPI), 1)
ifeq ($(NO_USE_MPI), 1)
$(info → MPI is manually disabled)
else
$(info → MPI is manually enabled)
NVCC_INCLUDES = -I/usr/lib/x86_64-linux-gnu/openmpi/include
NVCC_LDFLAGS = -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
NVCC_FLAGS = -DUSE_MPI
NVCC_LDLIBS = -lmpi
else
$(info → MPI is manually disabled)
endif

# Precision settings, default to bf16 but ability to override
Expand Down
8 changes: 7 additions & 1 deletion profile_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@ the profile.ncu-rep from a cloud box to local to pretty view.
#include "train_gpt2.cu"

int main(int argc, char *argv[]) {
multi_gpu_config = multi_gpu_config_init(&argc, &argv);
char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi"
int num_processes = -1; // doesn't matter when using MPI
int process_rank = -1; // doesn't matter when using MPI
int gpus_per_node = -1; // doesn't matter when using MPI
char server_ip[256] = ""; // doesn't matter when using MPI
char fs_path[256] = ""; // doesn't matter when using MPI
multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, nccl_init_method, server_ip, fs_path);
common_start(true, true);

// build the GPT-2 model from a checkpoint
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_fs.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# NOTE: change the above slurm arguments to match your system!
# Run with `sbatch <path_to_this_script.sh>`

make train_gpt2cu USE_CUDNN=1 USE_MPI=0
make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_mpi.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

make train_gpt2cu USE_CUDNN=1 USE_MPI=1
make train_gpt2cu USE_CUDNN=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_tcp.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# NOTE: change the above slurm arguments to match your system!
# Run with `sbatch <path_to_this_script.sh>`

make train_gpt2cu USE_CUDNN=1 USE_MPI=0
make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
8 changes: 7 additions & 1 deletion test_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,13 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size
}

int main(int argc, char *argv[]) {
multi_gpu_config = multi_gpu_config_init(&argc, &argv);
char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi"
int num_processes = -1; // doesn't matter when using MPI
int process_rank = -1; // doesn't matter when using MPI
int gpus_per_node = -1; // doesn't matter when using MPI
char server_ip[256] = ""; // doesn't matter when using MPI
char fs_path[256] = ""; // doesn't matter when using MPI
multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, nccl_init_method, server_ip, fs_path);
common_start(false, true);

// set the right paths
Expand Down

0 comments on commit a259eea

Please sign in to comment.