Skip to content

Commit

Permalink
Fix CI errors
Browse files Browse the repository at this point in the history
  • Loading branch information
gordicaleksa committed Jun 24, 2024
1 parent 3abdf9a commit a259eea
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 8 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -207,14 +207,14 @@ else
endif
endif

ifeq ($(USE_MPI), 1)
ifeq ($(NO_USE_MPI), 1)
$(info → MPI is manually disabled)
else
$(info → MPI is manually enabled)
NVCC_INCLUDES = -I/usr/lib/x86_64-linux-gnu/openmpi/include
NVCC_LDFLAGS = -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
NVCC_FLAGS = -DUSE_MPI
NVCC_LDLIBS = -lmpi
else
$(info → MPI is manually disabled)
endif

# Precision settings, default to bf16 but ability to override
Expand Down
8 changes: 7 additions & 1 deletion profile_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,13 @@ the profile.ncu-rep from a cloud box to local to pretty view.
#include "train_gpt2.cu"

int main(int argc, char *argv[]) {
multi_gpu_config = multi_gpu_config_init(&argc, &argv);
char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi"
int num_processes = -1; // doesn't matter when using MPI
int process_rank = -1; // doesn't matter when using MPI
int gpus_per_node = -1; // doesn't matter when using MPI
char server_ip[256] = ""; // doesn't matter when using MPI
char fs_path[256] = ""; // doesn't matter when using MPI
multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, nccl_init_method, server_ip, fs_path);
common_start(true, true);

// build the GPT-2 model from a checkpoint
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_fs.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# NOTE: change the above slurm arguments to match your system!
# Run with `sbatch <path_to_this_script.sh>`

make train_gpt2cu USE_CUDNN=1 USE_MPI=0
make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_mpi.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

make train_gpt2cu USE_CUDNN=1 USE_MPI=1
make train_gpt2cu USE_CUDNN=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
2 changes: 1 addition & 1 deletion scripts/multi_node/run_gpt2_124M_tcp.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# NOTE: change the above slurm arguments to match your system!
# Run with `sbatch <path_to_this_script.sh>`

make train_gpt2cu USE_CUDNN=1 USE_MPI=0
make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1

# NOTE: change the following to match your system
binary_path="/home/ubuntu/llm.c/train_gpt2cu"
Expand Down
8 changes: 7 additions & 1 deletion test_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,13 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size
}

int main(int argc, char *argv[]) {
multi_gpu_config = multi_gpu_config_init(&argc, &argv);
char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi"
int num_processes = -1; // doesn't matter when using MPI
int process_rank = -1; // doesn't matter when using MPI
int gpus_per_node = -1; // doesn't matter when using MPI
char server_ip[256] = ""; // doesn't matter when using MPI
char fs_path[256] = ""; // doesn't matter when using MPI
multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, nccl_init_method, server_ip, fs_path);
common_start(false, true);

// set the right paths
Expand Down

0 comments on commit a259eea

Please sign in to comment.