diff --git a/cuda-global/gol b/cuda-global/gol index 85af8de..83f9de6 100755 Binary files a/cuda-global/gol and b/cuda-global/gol differ diff --git a/cuda-global/src/main.cu b/cuda-global/src/main.cu index 41c2abf..03b66f2 100644 --- a/cuda-global/src/main.cu +++ b/cuda-global/src/main.cu @@ -36,7 +36,10 @@ true) { // Do the simulation void simulate(int argc, char** argv) { srand(SEED); - clock_t global_start = clock(); + cudaEvent_t global_start, global_end; + cudaEventCreate(&global_start); + cudaEventCreate(&global_end); + cudaEventRecord(global_start); char* filename; struct GAME game; game.padding = PADDING; @@ -135,8 +138,12 @@ void simulate(int argc, char** argv) { game.grid = temp; } } + cudaEventRecord(global_end); + cudaEventSynchronize(global_end); + float global_time; + cudaEventElapsedTime(&global_time, global_start, global_end); - printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, ((double)clock() - (double)global_start)/CLOCKS_PER_SEC); + printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, global_time/(double)1000); } int main(int argc, char** argv) { diff --git a/cuda-global/timing-study/output--1000-1000.txt b/cuda-global/timing-study/output--1000-1000.txt index 3792efc..e133bb0 100644 --- a/cuda-global/timing-study/output--1000-1000.txt +++ b/cuda-global/timing-study/output--1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.169687 -Clock time: 1.560000 +Time computing life: 0.169470 +Clock time: 5.175729 diff --git a/cuda-global/timing-study/output--1000-1250.txt b/cuda-global/timing-study/output--1000-1250.txt index 9081eb5..af6171c 100644 --- a/cuda-global/timing-study/output--1000-1250.txt +++ b/cuda-global/timing-study/output--1000-1250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.254989 -Clock time: 2.240000 +Time computing life: 0.255659 +Clock time: 5.620605 diff --git a/cuda-global/timing-study/output--1000-1500.txt 
b/cuda-global/timing-study/output--1000-1500.txt index c2fafe8..ba345e4 100644 --- a/cuda-global/timing-study/output--1000-1500.txt +++ b/cuda-global/timing-study/output--1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.354361 -Clock time: 3.050000 +Time computing life: 0.354065 +Clock time: 8.177913 diff --git a/cuda-global/timing-study/output--1000-1750.txt b/cuda-global/timing-study/output--1000-1750.txt index 557165e..e13e8b2 100644 --- a/cuda-global/timing-study/output--1000-1750.txt +++ b/cuda-global/timing-study/output--1000-1750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.480174 -Clock time: 4.070000 +Time computing life: 0.480989 +Clock time: 9.626799 diff --git a/cuda-global/timing-study/output--1000-2000.txt b/cuda-global/timing-study/output--1000-2000.txt index 91a9d45..27091f7 100644 --- a/cuda-global/timing-study/output--1000-2000.txt +++ b/cuda-global/timing-study/output--1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.619636 -Clock time: 5.220000 +Time computing life: 0.618807 +Clock time: 10.948197 diff --git a/cuda-global/timing-study/output--1000-250.txt b/cuda-global/timing-study/output--1000-250.txt index 0808a20..3f8aaa0 100644 --- a/cuda-global/timing-study/output--1000-250.txt +++ b/cuda-global/timing-study/output--1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.029867 -Clock time: 0.330000 +Time computing life: 0.029682 +Clock time: 2.946978 diff --git a/cuda-global/timing-study/output--1000-500.txt b/cuda-global/timing-study/output--1000-500.txt index 913b15e..a78f7af 100644 --- a/cuda-global/timing-study/output--1000-500.txt +++ b/cuda-global/timing-study/output--1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.059907 -Clock time: 0.540000 +Time computing life: 0.059339 +Clock time: 3.249037 diff --git a/cuda-global/timing-study/output--1000-750.txt b/cuda-global/timing-study/output--1000-750.txt index e3b8c5d..41908a8 100644 --- 
a/cuda-global/timing-study/output--1000-750.txt +++ b/cuda-global/timing-study/output--1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.110954 -Clock time: 1.000000 +Time computing life: 0.110569 +Clock time: 4.294806 diff --git a/cuda-global/timing-study/slurm-3610476.err-notch081 b/cuda-global/timing-study/slurm-3617105.err-notch081 similarity index 100% rename from cuda-global/timing-study/slurm-3610476.err-notch081 rename to cuda-global/timing-study/slurm-3617105.err-notch081 diff --git a/cuda-global/timing-study/slurm-3610476.out-notch081 b/cuda-global/timing-study/slurm-3617105.out-notch081 similarity index 100% rename from cuda-global/timing-study/slurm-3610476.out-notch081 rename to cuda-global/timing-study/slurm-3617105.out-notch081 diff --git a/cuda-global/timing-study/slurm-3611549.err-notch081 b/cuda-global/timing-study/slurm-3617127.err-notch081 similarity index 100% rename from cuda-global/timing-study/slurm-3611549.err-notch081 rename to cuda-global/timing-study/slurm-3617127.err-notch081 diff --git a/cuda-global/timing-study/slurm-3611549.out-notch081 b/cuda-global/timing-study/slurm-3617127.out-notch081 similarity index 100% rename from cuda-global/timing-study/slurm-3611549.out-notch081 rename to cuda-global/timing-study/slurm-3617127.out-notch081 diff --git a/mpi/gol b/mpi/gol index f9518e9..4e09964 100755 Binary files a/mpi/gol and b/mpi/gol differ diff --git a/mpi/src/game.c b/mpi/src/game.c index de73f67..8f464a0 100644 --- a/mpi/src/game.c +++ b/mpi/src/game.c @@ -5,7 +5,7 @@ int neighbors(struct GAME* game, int x, int y, unsigned char* halo_above, unsign for (int dy = -1; dy <= 1; dy++) { for (int dx = -1; dx <= 1; dx++) { - if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2)) { + if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2) && (y+dy) < game->height) { if (y+dy == -1 && halo_above != NULL) { if (halo_above[x+dx]) { n++; diff --git a/mpi/src/main.c 
b/mpi/src/main.c index 02e2bb6..ea6c4d7 100644 --- a/mpi/src/main.c +++ b/mpi/src/main.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "file.h" #include "game.h" diff --git a/mpi/timing-study/output-1-1000-1000.txt b/mpi/timing-study/output-1-1000-1000.txt index f500f74..bc81ea9 100644 --- a/mpi/timing-study/output-1-1000-1000.txt +++ b/mpi/timing-study/output-1-1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 33.832562 -Clock time: 37.939663 +Time computing life: 68.237100 +Clock time: 73.897736 diff --git a/mpi/timing-study/output-1-1000-1250.txt b/mpi/timing-study/output-1-1000-1250.txt index f928063..87aeae3 100644 --- a/mpi/timing-study/output-1-1000-1250.txt +++ b/mpi/timing-study/output-1-1000-1250.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 21716 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. 
-Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 105.934486 +Clock time: 112.662907 diff --git a/mpi/timing-study/output-1-1000-1500.txt b/mpi/timing-study/output-1-1000-1500.txt index d31db96..4cec827 100644 --- a/mpi/timing-study/output-1-1000-1500.txt +++ b/mpi/timing-study/output-1-1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 75.141736 -Clock time: 83.149478 +Time computing life: 155.201482 +Clock time: 165.129865 diff --git a/mpi/timing-study/output-1-1000-1750.txt b/mpi/timing-study/output-1-1000-1750.txt index 6bbaf1f..91fe574 100644 --- a/mpi/timing-study/output-1-1000-1750.txt +++ b/mpi/timing-study/output-1-1000-1750.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 21837 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. 
-Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 207.332667 +Clock time: 219.494609 diff --git a/mpi/timing-study/output-1-1000-2000.txt b/mpi/timing-study/output-1-1000-2000.txt index 3ba37f2..3386c29 100644 --- a/mpi/timing-study/output-1-1000-2000.txt +++ b/mpi/timing-study/output-1-1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 132.636661 -Clock time: 145.001708 +Time computing life: 269.160186 +Clock time: 284.025931 diff --git a/mpi/timing-study/output-1-1000-250.txt b/mpi/timing-study/output-1-1000-250.txt index 544de8e..f68dc4a 100644 --- a/mpi/timing-study/output-1-1000-250.txt +++ b/mpi/timing-study/output-1-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.383001 -Clock time: 4.113476 +Time computing life: 5.132488 +Clock time: 6.490781 diff --git a/mpi/timing-study/output-1-1000-500.txt b/mpi/timing-study/output-1-1000-500.txt index dfa5abb..9db1058 100644 --- a/mpi/timing-study/output-1-1000-500.txt +++ b/mpi/timing-study/output-1-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 8.793952 -Clock time: 9.832794 +Time computing life: 18.976428 +Clock time: 20.433132 diff --git a/mpi/timing-study/output-1-1000-750.txt b/mpi/timing-study/output-1-1000-750.txt index e1437a3..2a0075a 100644 --- a/mpi/timing-study/output-1-1000-750.txt +++ b/mpi/timing-study/output-1-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 19.270078 -Clock time: 21.813069 +Time computing life: 41.754895 +Clock time: 44.502337 diff --git a/mpi/timing-study/output-12-1000-1000.txt b/mpi/timing-study/output-12-1000-1000.txt index 2d1c3b3..0eee327 100644 --- a/mpi/timing-study/output-12-1000-1000.txt +++ b/mpi/timing-study/output-12-1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.833550 -Clock time: 6.323680 +Time computing life: 6.001465 +Clock time: 9.559285 diff --git a/mpi/timing-study/output-12-1000-1250.txt b/mpi/timing-study/output-12-1000-1250.txt index 
5e8cbfb..d657a43 100644 --- a/mpi/timing-study/output-12-1000-1250.txt +++ b/mpi/timing-study/output-12-1000-1250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 4.347700 -Clock time: 9.178630 +Time computing life: 9.368616 +Clock time: 14.966020 diff --git a/mpi/timing-study/output-12-1000-1500.txt b/mpi/timing-study/output-12-1000-1500.txt index 206bf6c..1ae5997 100644 --- a/mpi/timing-study/output-12-1000-1500.txt +++ b/mpi/timing-study/output-12-1000-1500.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 23209 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 13.390212 +Clock time: 20.945775 diff --git a/mpi/timing-study/output-12-1000-1750.txt b/mpi/timing-study/output-12-1000-1750.txt index f2798ae..807c4a2 100644 --- a/mpi/timing-study/output-12-1000-1750.txt +++ b/mpi/timing-study/output-12-1000-1750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 8.483342 -Clock time: 17.330302 +Time computing life: 18.167763 +Clock time: 28.215494 diff --git a/mpi/timing-study/output-12-1000-2000.txt b/mpi/timing-study/output-12-1000-2000.txt index 165b598..f8d09ad 100644 --- a/mpi/timing-study/output-12-1000-2000.txt +++ b/mpi/timing-study/output-12-1000-2000.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 23290 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES 
-=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 23.784948 +Clock time: 36.657344 diff --git a/mpi/timing-study/output-12-1000-250.txt b/mpi/timing-study/output-12-1000-250.txt index e4be53e..fef0749 100644 --- a/mpi/timing-study/output-12-1000-250.txt +++ b/mpi/timing-study/output-12-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.198089 -Clock time: 2.217166 +Time computing life: 0.621095 +Clock time: 2.379479 diff --git a/mpi/timing-study/output-12-1000-500.txt b/mpi/timing-study/output-12-1000-500.txt index 51bc78b..925b30d 100644 --- a/mpi/timing-study/output-12-1000-500.txt +++ b/mpi/timing-study/output-12-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.735509 -Clock time: 2.513034 +Time computing life: 1.541923 +Clock time: 3.193527 diff --git a/mpi/timing-study/output-12-1000-750.txt b/mpi/timing-study/output-12-1000-750.txt index c9351f2..d5731ae 100644 --- a/mpi/timing-study/output-12-1000-750.txt +++ b/mpi/timing-study/output-12-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 1.617002 -Clock time: 4.091923 +Time computing life: 3.622372 +Clock time: 5.586008 diff --git a/mpi/timing-study/output-16-1000-1000.txt b/mpi/timing-study/output-16-1000-1000.txt index 4b98fae..d8ee38c 100644 --- a/mpi/timing-study/output-16-1000-1000.txt +++ b/mpi/timing-study/output-16-1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.106571 -Clock time: 7.500836 +Time computing life: 4.442463 +Clock time: 10.520606 diff --git a/mpi/timing-study/output-16-1000-1250.txt b/mpi/timing-study/output-16-1000-1250.txt index 183314c..e6fea7b 100644 --- a/mpi/timing-study/output-16-1000-1250.txt +++ b/mpi/timing-study/output-16-1000-1250.txt @@ -1,4 +1,4 @@ 
===Timing=== -Time computing life: 3.445883 -Clock time: 11.167682 +Time computing life: 7.085545 +Clock time: 15.203315 diff --git a/mpi/timing-study/output-16-1000-1500.txt b/mpi/timing-study/output-16-1000-1500.txt index a08be6f..d0dffc7 100644 --- a/mpi/timing-study/output-16-1000-1500.txt +++ b/mpi/timing-study/output-16-1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 4.741983 -Clock time: 16.777514 +Time computing life: 10.130384 +Clock time: 22.828620 diff --git a/mpi/timing-study/output-16-1000-1750.txt b/mpi/timing-study/output-16-1000-1750.txt index cd6757e..95aff4c 100644 --- a/mpi/timing-study/output-16-1000-1750.txt +++ b/mpi/timing-study/output-16-1000-1750.txt @@ -1,8 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 34784 RUNNING AT kp160 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== +===Timing=== +Time computing life: 13.581483 +Clock time: 30.474959 diff --git a/mpi/timing-study/output-16-1000-2000.txt b/mpi/timing-study/output-16-1000-2000.txt index 4bfa78c..40746c4 100644 --- a/mpi/timing-study/output-16-1000-2000.txt +++ b/mpi/timing-study/output-16-1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 8.301682 -Clock time: 28.791425 +Time computing life: 17.827682 +Clock time: 38.839103 diff --git a/mpi/timing-study/output-16-1000-250.txt b/mpi/timing-study/output-16-1000-250.txt index 2f97b52..4e5dad0 100644 --- a/mpi/timing-study/output-16-1000-250.txt +++ b/mpi/timing-study/output-16-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.145483 -Clock time: 2.572587 +Time computing life: 0.295248 +Clock time: 2.424203 diff --git a/mpi/timing-study/output-16-1000-500.txt b/mpi/timing-study/output-16-1000-500.txt index adc146c..06e5a5e 100644 --- 
a/mpi/timing-study/output-16-1000-500.txt +++ b/mpi/timing-study/output-16-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.570992 -Clock time: 3.899400 +Time computing life: 1.134216 +Clock time: 3.333061 diff --git a/mpi/timing-study/output-16-1000-750.txt b/mpi/timing-study/output-16-1000-750.txt index 961b2a9..4480ddb 100644 --- a/mpi/timing-study/output-16-1000-750.txt +++ b/mpi/timing-study/output-16-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 1.215016 -Clock time: 5.047125 +Time computing life: 2.724965 +Clock time: 6.326008 diff --git a/mpi/timing-study/output-20-1000-1000.txt b/mpi/timing-study/output-20-1000-1000.txt new file mode 100644 index 0000000..57e8bcf --- /dev/null +++ b/mpi/timing-study/output-20-1000-1000.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 3.626076 +Clock time: 10.578861 diff --git a/mpi/timing-study/output-20-1000-1250.txt b/mpi/timing-study/output-20-1000-1250.txt new file mode 100644 index 0000000..b38fa4b --- /dev/null +++ b/mpi/timing-study/output-20-1000-1250.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 5.589568 +Clock time: 15.829982 diff --git a/mpi/timing-study/output-20-1000-1500.txt b/mpi/timing-study/output-20-1000-1500.txt new file mode 100644 index 0000000..c88765b --- /dev/null +++ b/mpi/timing-study/output-20-1000-1500.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 8.249432 +Clock time: 22.901060 diff --git a/mpi/timing-study/output-20-1000-1750.txt b/mpi/timing-study/output-20-1000-1750.txt new file mode 100644 index 0000000..dfa7155 --- /dev/null +++ b/mpi/timing-study/output-20-1000-1750.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 10.833047 +Clock time: 31.488105 diff --git a/mpi/timing-study/output-20-1000-2000.txt b/mpi/timing-study/output-20-1000-2000.txt new file mode 100644 index 0000000..aa4639b --- /dev/null +++ b/mpi/timing-study/output-20-1000-2000.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 14.341513 
+Clock time: 41.437950 diff --git a/mpi/timing-study/output-20-1000-250.txt b/mpi/timing-study/output-20-1000-250.txt index e69de29..cbbb4d4 100644 --- a/mpi/timing-study/output-20-1000-250.txt +++ b/mpi/timing-study/output-20-1000-250.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 0.290254 +Clock time: 2.288621 diff --git a/mpi/timing-study/output-20-1000-500.txt b/mpi/timing-study/output-20-1000-500.txt new file mode 100644 index 0000000..35978a6 --- /dev/null +++ b/mpi/timing-study/output-20-1000-500.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 0.922726 +Clock time: 3.252760 diff --git a/mpi/timing-study/output-20-1000-750.txt b/mpi/timing-study/output-20-1000-750.txt new file mode 100644 index 0000000..1a09df2 --- /dev/null +++ b/mpi/timing-study/output-20-1000-750.txt @@ -0,0 +1,4 @@ + +===Timing=== +Time computing life: 2.058615 +Clock time: 6.415593 diff --git a/mpi/timing-study/output-24-1000-1000.txt b/mpi/timing-study/output-24-1000-1000.txt index b1fd01d..e8d4816 100644 --- a/mpi/timing-study/output-24-1000-1000.txt +++ b/mpi/timing-study/output-24-1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 1.414322 -Clock time: 9.439315 +Time computing life: 2.948477 +Clock time: 11.073549 diff --git a/mpi/timing-study/output-24-1000-1250.txt b/mpi/timing-study/output-24-1000-1250.txt index 08acf8e..abd985a 100644 --- a/mpi/timing-study/output-24-1000-1250.txt +++ b/mpi/timing-study/output-24-1000-1250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.171989 -Clock time: 13.927639 +Time computing life: 4.599746 +Clock time: 16.289204 diff --git a/mpi/timing-study/output-24-1000-1500.txt b/mpi/timing-study/output-24-1000-1500.txt index e8452d5..6f49e70 100644 --- a/mpi/timing-study/output-24-1000-1500.txt +++ b/mpi/timing-study/output-24-1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 3.133675 -Clock time: 19.271850 +Time computing life: 6.653323 +Clock time: 23.581825 diff --git 
a/mpi/timing-study/output-24-1000-1750.txt b/mpi/timing-study/output-24-1000-1750.txt index 9757c78..3839b94 100644 --- a/mpi/timing-study/output-24-1000-1750.txt +++ b/mpi/timing-study/output-24-1000-1750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 4.398371 -Clock time: 25.650748 +Time computing life: 9.023902 +Clock time: 32.654144 diff --git a/mpi/timing-study/output-24-1000-2000.txt b/mpi/timing-study/output-24-1000-2000.txt index 8fd3c60..4b52342 100644 --- a/mpi/timing-study/output-24-1000-2000.txt +++ b/mpi/timing-study/output-24-1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 5.639865 -Clock time: 33.529967 +Time computing life: 11.813565 +Clock time: 42.361473 diff --git a/mpi/timing-study/output-24-1000-250.txt b/mpi/timing-study/output-24-1000-250.txt index e6ddcb7..16adb19 100644 --- a/mpi/timing-study/output-24-1000-250.txt +++ b/mpi/timing-study/output-24-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.100765 -Clock time: 2.412458 +Time computing life: 0.194711 +Clock time: 2.336872 diff --git a/mpi/timing-study/output-24-1000-500.txt b/mpi/timing-study/output-24-1000-500.txt index 8f6af46..c98f559 100644 --- a/mpi/timing-study/output-24-1000-500.txt +++ b/mpi/timing-study/output-24-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.465147 -Clock time: 3.942927 +Time computing life: 0.751125 +Clock time: 3.215283 diff --git a/mpi/timing-study/output-24-1000-750.txt b/mpi/timing-study/output-24-1000-750.txt index 1329b1b..311c711 100644 --- a/mpi/timing-study/output-24-1000-750.txt +++ b/mpi/timing-study/output-24-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.815429 -Clock time: 5.642879 +Time computing life: 1.749681 +Clock time: 6.566280 diff --git a/mpi/timing-study/output-4-1000-1000.txt b/mpi/timing-study/output-4-1000-1000.txt index 14dc1e9..f5920d0 100644 --- a/mpi/timing-study/output-4-1000-1000.txt +++ b/mpi/timing-study/output-4-1000-1000.txt @@ -1,4 +1,4 @@ 
===Timing=== -Time computing life: 8.467197 -Clock time: 11.707533 +Time computing life: 18.303801 +Clock time: 22.160403 diff --git a/mpi/timing-study/output-4-1000-1250.txt b/mpi/timing-study/output-4-1000-1250.txt index 408cfeb..e7d7340 100644 --- a/mpi/timing-study/output-4-1000-1250.txt +++ b/mpi/timing-study/output-4-1000-1250.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 22126 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 28.577705 +Clock time: 33.967832 diff --git a/mpi/timing-study/output-4-1000-1500.txt b/mpi/timing-study/output-4-1000-1500.txt index d304a5d..d84625f 100644 --- a/mpi/timing-study/output-4-1000-1500.txt +++ b/mpi/timing-study/output-4-1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 18.823087 -Clock time: 26.449810 +Time computing life: 40.818054 +Clock time: 49.034747 diff --git a/mpi/timing-study/output-4-1000-1750.txt b/mpi/timing-study/output-4-1000-1750.txt index ab98c94..4db0764 100644 --- a/mpi/timing-study/output-4-1000-1750.txt +++ b/mpi/timing-study/output-4-1000-1750.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 22197 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) 
-This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 55.473812 +Clock time: 66.402986 diff --git a/mpi/timing-study/output-4-1000-2000.txt b/mpi/timing-study/output-4-1000-2000.txt index 2c85e0c..348d043 100644 --- a/mpi/timing-study/output-4-1000-2000.txt +++ b/mpi/timing-study/output-4-1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 33.274214 -Clock time: 45.841294 +Time computing life: 72.029655 +Clock time: 84.802906 diff --git a/mpi/timing-study/output-4-1000-250.txt b/mpi/timing-study/output-4-1000-250.txt index 8b1fa3c..e83b01c 100644 --- a/mpi/timing-study/output-4-1000-250.txt +++ b/mpi/timing-study/output-4-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.599813 -Clock time: 2.807879 +Time computing life: 1.263181 +Clock time: 2.244229 diff --git a/mpi/timing-study/output-4-1000-500.txt b/mpi/timing-study/output-4-1000-500.txt index b3ce6ae..52c0a86 100644 --- a/mpi/timing-study/output-4-1000-500.txt +++ b/mpi/timing-study/output-4-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.212790 -Clock time: 4.133439 +Time computing life: 4.748224 +Clock time: 6.104621 diff --git a/mpi/timing-study/output-4-1000-750.txt b/mpi/timing-study/output-4-1000-750.txt index 59aa17f..81c3d88 100644 --- a/mpi/timing-study/output-4-1000-750.txt +++ b/mpi/timing-study/output-4-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 4.830949 -Clock time: 6.854574 +Time computing life: 10.420523 +Clock time: 12.881742 diff --git a/mpi/timing-study/output-8-1000-1000.txt b/mpi/timing-study/output-8-1000-1000.txt index c063ee2..4e49826 100644 --- a/mpi/timing-study/output-8-1000-1000.txt +++ b/mpi/timing-study/output-8-1000-1000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 4.226861 -Clock time: 7.517444 +Time computing life: 9.155404 +Clock time: 12.658901 diff --git a/mpi/timing-study/output-8-1000-1250.txt 
b/mpi/timing-study/output-8-1000-1250.txt index 4be7ca8..886b53b 100644 --- a/mpi/timing-study/output-8-1000-1250.txt +++ b/mpi/timing-study/output-8-1000-1250.txt @@ -1,11 +1,4 @@ -=================================================================================== -= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES -= PID 22852 RUNNING AT kp013 -= EXIT CODE: 11 -= CLEANING UP REMAINING PROCESSES -= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES -=================================================================================== -YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11) -This typically refers to a problem with your application. -Please see the FAQ page for debugging suggestions +===Timing=== +Time computing life: 14.082438 +Clock time: 19.224195 diff --git a/mpi/timing-study/output-8-1000-1500.txt b/mpi/timing-study/output-8-1000-1500.txt index 957fc99..f6a18e4 100644 --- a/mpi/timing-study/output-8-1000-1500.txt +++ b/mpi/timing-study/output-8-1000-1500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 9.416485 -Clock time: 16.706325 +Time computing life: 20.413675 +Clock time: 27.885011 diff --git a/mpi/timing-study/output-8-1000-1750.txt b/mpi/timing-study/output-8-1000-1750.txt index 8dbd945..25a9db6 100644 --- a/mpi/timing-study/output-8-1000-1750.txt +++ b/mpi/timing-study/output-8-1000-1750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 12.741221 -Clock time: 22.281683 +Time computing life: 27.722141 +Clock time: 38.768550 diff --git a/mpi/timing-study/output-8-1000-2000.txt b/mpi/timing-study/output-8-1000-2000.txt index 9610e3f..0f34654 100644 --- a/mpi/timing-study/output-8-1000-2000.txt +++ b/mpi/timing-study/output-8-1000-2000.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 16.578412 -Clock time: 26.921717 +Time computing life: 35.856221 +Clock time: 48.674318 diff --git a/mpi/timing-study/output-8-1000-250.txt b/mpi/timing-study/output-8-1000-250.txt index ca01ca3..431677d 100644 --- 
a/mpi/timing-study/output-8-1000-250.txt +++ b/mpi/timing-study/output-8-1000-250.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 0.296146 -Clock time: 2.211905 +Time computing life: 0.617449 +Clock time: 2.370964 diff --git a/mpi/timing-study/output-8-1000-500.txt b/mpi/timing-study/output-8-1000-500.txt index 3e3b83c..296d56a 100644 --- a/mpi/timing-study/output-8-1000-500.txt +++ b/mpi/timing-study/output-8-1000-500.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 1.111486 -Clock time: 2.710176 +Time computing life: 2.396797 +Clock time: 3.529909 diff --git a/mpi/timing-study/output-8-1000-750.txt b/mpi/timing-study/output-8-1000-750.txt index 4a98753..1791244 100644 --- a/mpi/timing-study/output-8-1000-750.txt +++ b/mpi/timing-study/output-8-1000-750.txt @@ -1,4 +1,4 @@ ===Timing=== -Time computing life: 2.419305 -Clock time: 4.675962 +Time computing life: 5.226469 +Clock time: 7.317886 diff --git a/mpi/timing-study/slurm-10870703.err-kp013 b/mpi/timing-study/slurm-10870703.err-kp013 deleted file mode 100644 index 4bdaa5d..0000000 --- a/mpi/timing-study/slurm-10870703.err-kp013 +++ /dev/null @@ -1,11 +0,0 @@ -mkdir: cannot create directory ‘timing-study’: File exists -[proxy:0:0@kp013] HYD_pmcd_pmip_control_cmd_cb (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip_cb.c:887): assert (!closed) failed -[proxy:0:0@kp013] HYDT_dmxu_poll_wait_for_event (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status -[proxy:0:0@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip.c:202): demux engine error waiting for event -srun: error: kp013: task 0: Exited with exit code 7 -[mpiexec@kp013] HYDT_bscu_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting -[mpiexec@kp013] HYDT_bsci_wait_for_completion 
(../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion -[mpiexec@kp013] HYD_pmci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion -[mpiexec@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/ui/mpich/mpiexec.c:340): process manager error waiting for completion -srun: error: Unable to create step for job 10870703: Job/step already completing or completed -slurmstepd: error: *** JOB 10870703 ON kp013 CANCELLED AT 2021-12-08T01:29:02 DUE TO TIME LIMIT *** diff --git a/mpi/timing-study/slurm-10870708.err-kp018 b/mpi/timing-study/slurm-10870708.err-kp018 new file mode 100644 index 0000000..a4f6f22 --- /dev/null +++ b/mpi/timing-study/slurm-10870708.err-kp018 @@ -0,0 +1,11 @@ +mkdir: cannot create directory ‘timing-study’: File exists + +Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) mpich/3.2.1 + +srun: Job step aborted: Waiting up to 62 seconds for job step to finish. +slurmstepd: error: *** STEP 10870708.3 ON kp018 CANCELLED AT 2021-12-08T03:00:27 *** +slurmstepd: error: *** JOB 10870708 ON kp018 CANCELLED AT 2021-12-08T03:00:27 *** diff --git a/mpi/timing-study/slurm-10870703.out-kp013 b/mpi/timing-study/slurm-10870708.out-kp018 similarity index 100% rename from mpi/timing-study/slurm-10870703.out-kp013 rename to mpi/timing-study/slurm-10870708.out-kp018 diff --git a/mpi/timing-study/slurm-10870709.err-kp018 b/mpi/timing-study/slurm-10870709.err-kp018 new file mode 100644 index 0000000..61401da --- /dev/null +++ b/mpi/timing-study/slurm-10870709.err-kp018 @@ -0,0 +1,8 @@ +mkdir: cannot create directory ‘timing-study’: File exists + +Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163". 
+ + +Due to MODULEPATH changes, the following have been reloaded: + 1) mpich/3.2.1 + diff --git a/mpi/timing-study/slurm-10870709.out-kp018 b/mpi/timing-study/slurm-10870709.out-kp018 new file mode 100644 index 0000000..e69de29 diff --git a/mpi/timing-study/slurm-10870714.err-kp007 b/mpi/timing-study/slurm-10870714.err-kp007 new file mode 100644 index 0000000..61401da --- /dev/null +++ b/mpi/timing-study/slurm-10870714.err-kp007 @@ -0,0 +1,8 @@ +mkdir: cannot create directory ‘timing-study’: File exists + +Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163". + + +Due to MODULEPATH changes, the following have been reloaded: + 1) mpich/3.2.1 + diff --git a/mpi/timing-study/slurm-10870714.out-kp007 b/mpi/timing-study/slurm-10870714.out-kp007 new file mode 100644 index 0000000..e69de29 diff --git a/mpi/timing-study/timing_study.sh b/mpi/timing-study/timing_study.sh index 04b64ff..4076257 100644 --- a/mpi/timing-study/timing_study.sh +++ b/mpi/timing-study/timing_study.sh @@ -1,9 +1,9 @@ #!/bin/bash -#SBATCH --time=0:10:00 # walltime, abbreviated by -t -#SBATCH --nodes=2 # number of cluster nodes, abbreviated by -N +#SBATCH --time=0:20:00 # walltime, abbreviated by -t +#SBATCH --nodes=1 # number of cluster nodes, abbreviated by -N #SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N) #SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values -#SBATCH --ntasks=24 # number of MPI tasks, abbreviated by -n +#SBATCH --ntasks=1 # number of MPI tasks, abbreviated by -n # additional information for allocated clusters #SBATCH --account=usucs5030 # account - abbreviated by -A #SBATCH --partition=kingspeak # partition, abbreviated by -p @@ -15,9 +15,9 @@ module load intel mpich iterations=1000 -for cores in 1 4 8 12 16 20 #24 +for cores in 1 #12 16 20 24 do - for size in 250 500 750 1000 1250 1500 1750 2000 + for size in 1000 1250 1500 1750 2000 #250 500 750 1000 1250 1500 1750 2000 do mpirun -np 
$cores ./gol simulate random $size $size $iterations 1 > timing-study/output-$cores-$iterations-$size.txt done diff --git a/report/.DS_Store b/report/.DS_Store new file mode 100644 index 0000000..5f134d3 Binary files /dev/null and b/report/.DS_Store differ diff --git a/report/Game of Life.xlsx b/report/Game of Life.xlsx new file mode 100644 index 0000000..52d58a9 Binary files /dev/null and b/report/Game of Life.xlsx differ diff --git a/report/cores-vs-runtimes.png b/report/cores-vs-runtimes.png new file mode 100644 index 0000000..66a864f Binary files /dev/null and b/report/cores-vs-runtimes.png differ diff --git a/report/cuda-speedup.png b/report/cuda-speedup.png new file mode 100644 index 0000000..4aa2649 Binary files /dev/null and b/report/cuda-speedup.png differ diff --git a/report/cuda-times.png b/report/cuda-times.png new file mode 100644 index 0000000..9237cc8 Binary files /dev/null and b/report/cuda-times.png differ diff --git a/report/report.org b/report/report.org new file mode 100644 index 0000000..68c451d --- /dev/null +++ b/report/report.org @@ -0,0 +1,76 @@ +#+TITLE: Final Project: Game of Life +#+STARTUP: fold inlineimages +#+OPTIONS: toc:nil +#+AUTHOR: Logan Hunt +#+LATEX_HEADER: \usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek } + +* Description +From [[https://mathworld.wolfram.com/CellularAutomaton.html][Wolfram MathWorld]]: + +#+BEGIN_QUOTE +A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired. +#+END_QUOTE + +Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from [[https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life][Wikipedia]]): + +#+BEGIN_QUOTE +1. 
Any live cell with fewer than two live neighbours dies, as if by underpopulation. +2. Any live cell with two or three live neighbours lives on to the next generation. +3. Any live cell with more than three live neighbours dies, as if by overpopulation. +4. Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction. +#+END_QUOTE + +To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations [[https://www.youtube.com/watch?v=N_aUWYNqpeY][to YouTube]]. Each cell that is white is alive and each black cell is dead. + +There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA). + +A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured. + +* Performance analysis (of Game of Life iteration time) +Results can be found on a [[https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing][Google Sheet]]. +** Shared memory vs distributed memory +*** Runtime +In runtime, both implementations have the same property of decreasing over an increasing number of cores in all problem sizes (as one would certainly hope). As the problem size increases, the overall differences in the runtimes of each implementation also decrease, meaning they follow the same trends.
This can be shown in the runtimes for both implementations running on a small grid and a large grid: + +#+ATTR_LATEX: :width 8cm +[[./cores-vs-runtimes.png]] + +Both seem to converge to some rational function. Using an online regression calculator it was found that the MPI Life Computation (iteration computation time only) runtime follows the function $t(p)=\frac{274.449}{p^{0.985}}$ with a correlation coefficient of $r=-0.999892441$. Since $t$ is very close to being a rational function of $p$, we know that the runtime fits what could be expected: $T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}$. + +*** Speedup +In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor. + +#+ATTR_LATEX: :width 10cm +[[./speedups-vs-cores.png]] + + +*** Efficiency +Efficiency is the ratio of speedup to $p$ processors ($E = \frac{S}{p}$), so it can be thought of as the derivative of the speedup. Thus efficiency can be measured without plotting it explicitly. + +By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable. + +However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off. + +Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of overhead is thread scheduling.
+ +** CUDA Implementation +*** Runtime +For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study. + +#+ATTR_LATEX: :width 8cm +[[./cuda-times.png]] + +Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by $t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151$ with a correlation coefficient $r = 0.9999278678$. + +Since the number of cores is constant, we would expect to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with $(\text{input size})^2$. + +Indeed, this is what we see. + +*** Speedup +The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around $n=1000$. While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling. + +#+ATTR_LATEX: :width 8cm +[[./cuda-speedup.png]] + +*** Efficiency +Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by $E = \frac{S}{4992}$. As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
diff --git a/report/report.pdf b/report/report.pdf new file mode 100644 index 0000000..036f608 Binary files /dev/null and b/report/report.pdf differ diff --git a/report/report.tex b/report/report.tex new file mode 100644 index 0000000..8fdc2d7 --- /dev/null +++ b/report/report.tex @@ -0,0 +1,118 @@ +% Created 2021-12-08 Wed 18:34 +% Intended LaTeX compiler: pdflatex +\documentclass[11pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{graphicx} +\usepackage{grffile} +\usepackage{longtable} +\usepackage{wrapfig} +\usepackage{rotating} +\usepackage[normalem]{ulem} +\usepackage{amsmath} +\usepackage{textcomp} +\usepackage{amssymb} +\usepackage{capt-of} +\usepackage{hyperref} +\usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek } +\author{Logan Hunt} +\date{\today} +\title{Final Project: Game of Life} +\hypersetup{ + pdfauthor={Logan Hunt}, + pdftitle={Final Project: Game of Life}, + pdfkeywords={}, + pdfsubject={}, + pdfcreator={Emacs 27.2 (Org mode 9.4.4)}, + pdflang={English}} +\begin{document} + +\maketitle + +\section{Description} +\label{sec:orgee5348a} +From \href{https://mathworld.wolfram.com/CellularAutomaton.html}{Wolfram MathWorld}: + +\begin{quote} +A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired. +\end{quote} + +Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from \href{https://en.wikipedia.org/wiki/Conway\%27s\_Game\_of\_Life}{Wikipedia}): + +\begin{quote} +\begin{enumerate} +\item Any live cell with fewer than two live neighbours dies, as if by underpopulation. +\item Any live cell with two or three live neighbours lives on to the next generation. 
+\item Any live cell with more than three live neighbours dies, as if by overpopulation. +\item Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction. +\end{enumerate} +\end{quote} + +To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations \href{https://www.youtube.com/watch?v=N\_aUWYNqpeY}{to YouTube}. Each cell that is white is alive and each black cell is dead. + +There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA). + +A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured. + +\section{Performance analysis (of Game of Life iteration time)} +\label{sec:org1f7f2b6} +Results can be found on a \href{https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing}{Google Sheet}. +\subsection{Shared memory vs distributed memory} +\label{sec:org9f5d8c9} +\subsubsection{Runtime} +\label{sec:org9209384} +In runtime, both implementations have the same property of decreasing over an increasing number of cores in all problem sizes (as one would certainly hope). As the problem size increases, the overall differences in the runtimes of each implementation also decrease, meaning they follow the same trends.
This can be shown in the runtimes for both implementations running on a small grid and a large grid: + +\begin{center} +\includegraphics[width=8cm]{./cores-vs-runtimes.png} +\end{center} + +Both seem to converge to some rational function. Using an online regression calculator it was found that the MPI Life Computation (iteration computation time only) runtime follows the function \(t(p)=\frac{274.449}{p^{0.985}}\) with a correlation coefficient of \(r=-0.999892441\). Since \(t\) is very close to being a rational function of \(p\), we know that the runtime fits what could be expected: \(T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}\). + +\subsubsection{Speedup} +\label{sec:org3812f4f} +In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor. + +\begin{center} +\includegraphics[width=10cm]{./speedups-vs-cores.png} +\end{center} + + +\subsubsection{Efficiency} +\label{sec:org477fbb5} +Efficiency is the ratio of speedup to \(p\) processors (\(E = \frac{S}{p}\)), so it can be thought of as the derivative of the speedup. Thus efficiency can be measured without plotting it explicitly. + +By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable. + +However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off.
+ +Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of overhead is thread scheduling. + +\subsection{CUDA Implementation} +\label{sec:org31daf97} +\subsubsection{Runtime} +\label{sec:orgb6b22ab} +For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study. + +\begin{center} +\includegraphics[width=8cm]{./cuda-times.png} +\end{center} + +Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by \(t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151\) with a correlation coefficient \(r = 0.9999278678\). + +Since the number of cores is constant, we would expect to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with \((\text{input size})^2\). + +Indeed, this is what we see. + +\subsubsection{Speedup} +\label{sec:org888d520} +The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around \(n=1000\). While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling. + +\begin{center} +\includegraphics[width=8cm]{./cuda-speedup.png} +\end{center} + +\subsubsection{Efficiency} +\label{sec:org530b4aa} +Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by \(E = \frac{S}{4992}\). As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
+\end{document} diff --git a/report/speedups-vs-cores.png b/report/speedups-vs-cores.png new file mode 100644 index 0000000..06dc40a Binary files /dev/null and b/report/speedups-vs-cores.png differ