Finished

2021-12-08 18:38:46 -07:00 · 2021-12-08 18:38:46 -07:00 · 9362bc1591
commit 9362bc1591
parent c846568cf2
90 changed files with 389 additions and 192 deletions
--- a/cuda-global/gol
+++ b/cuda-global/gol
--- a/cuda-global/src/main.cu
+++ b/cuda-global/src/main.cu
@ -36,7 +36,10 @@ true) {
 // Do the simulation
 void simulate(int argc, char** argv) {
  srand(SEED);
-  clock_t global_start = clock();
+  cudaEvent_t global_start, global_end;
  cudaEventCreate(&global_start);
  cudaEventCreate(&global_end);
  cudaEventRecord(global_start);
  char* filename;
  struct GAME game;
  game.padding = PADDING;
@ -135,8 +138,12 @@ void simulate(int argc, char** argv) {
      game.grid = temp;
    }
  }
  cudaEventRecord(global_end);
  cudaEventSynchronize(global_end);
  float global_time;
  cudaEventElapsedTime(&global_time, global_start, global_end);
-  printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, ((double)clock() - (double)global_start)/CLOCKS_PER_SEC);
+  printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, global_time/(double)1000);
 }
 int main(int argc, char** argv) {
--- a/cuda-global/timing-study/output--1000-1000.txt
+++ b/cuda-global/timing-study/output--1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.169687
+Time computing life: 0.169470
-Clock time: 1.560000
+Clock time: 5.175729
--- a/cuda-global/timing-study/output--1000-1250.txt
+++ b/cuda-global/timing-study/output--1000-1250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.254989
+Time computing life: 0.255659
-Clock time: 2.240000
+Clock time: 5.620605
--- a/cuda-global/timing-study/output--1000-1500.txt
+++ b/cuda-global/timing-study/output--1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.354361
+Time computing life: 0.354065
-Clock time: 3.050000
+Clock time: 8.177913
--- a/cuda-global/timing-study/output--1000-1750.txt
+++ b/cuda-global/timing-study/output--1000-1750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.480174
+Time computing life: 0.480989
-Clock time: 4.070000
+Clock time: 9.626799
--- a/cuda-global/timing-study/output--1000-2000.txt
+++ b/cuda-global/timing-study/output--1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.619636
+Time computing life: 0.618807
-Clock time: 5.220000
+Clock time: 10.948197
--- a/cuda-global/timing-study/output--1000-250.txt
+++ b/cuda-global/timing-study/output--1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.029867
+Time computing life: 0.029682
-Clock time: 0.330000
+Clock time: 2.946978
--- a/cuda-global/timing-study/output--1000-500.txt
+++ b/cuda-global/timing-study/output--1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.059907
+Time computing life: 0.059339
-Clock time: 0.540000
+Clock time: 3.249037
--- a/cuda-global/timing-study/output--1000-750.txt
+++ b/cuda-global/timing-study/output--1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.110954
+Time computing life: 0.110569
-Clock time: 1.000000
+Clock time: 4.294806
--- a/cuda-global/timing-study/slurm-3617105.err-notch081
+++ b/cuda-global/timing-study/slurm-3617105.err-notch081
--- a/cuda-global/timing-study/slurm-3617105.out-notch081
+++ b/cuda-global/timing-study/slurm-3617105.out-notch081
--- a/cuda-global/timing-study/slurm-3617127.err-notch081
+++ b/cuda-global/timing-study/slurm-3617127.err-notch081
--- a/cuda-global/timing-study/slurm-3617127.out-notch081
+++ b/cuda-global/timing-study/slurm-3617127.out-notch081
--- a/mpi/gol
+++ b/mpi/gol
--- a/mpi/src/game.c
+++ b/mpi/src/game.c
@ -5,7 +5,7 @@ int neighbors(struct GAME* game, int x, int y, unsigned char* halo_above, unsign
  for (int dy = -1; dy <= 1; dy++) {
    for (int dx = -1; dx <= 1; dx++) {
-      if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2)) {
+      if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2) && (y+dy) < game->height) {
        if (y+dy == -1 && halo_above != NULL) {
          if (halo_above[x+dx]) {
            n++;
--- a/mpi/src/main.c
+++ b/mpi/src/main.c
@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <mpi.h>
 #include <stddef.h>
 #include "file.h"
 #include "game.h"
--- a/mpi/timing-study/output-1-1000-1000.txt
+++ b/mpi/timing-study/output-1-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 33.832562
+Time computing life: 68.237100
-Clock time: 37.939663
+Clock time: 73.897736
--- a/mpi/timing-study/output-1-1000-1250.txt
+++ b/mpi/timing-study/output-1-1000-1250.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 105.934486
-=   PID 21716 RUNNING AT kp013
+Clock time: 112.662907
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-1-1000-1500.txt
+++ b/mpi/timing-study/output-1-1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 75.141736
+Time computing life: 155.201482
-Clock time: 83.149478
+Clock time: 165.129865
--- a/mpi/timing-study/output-1-1000-1750.txt
+++ b/mpi/timing-study/output-1-1000-1750.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 207.332667
-=   PID 21837 RUNNING AT kp013
+Clock time: 219.494609
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-1-1000-2000.txt
+++ b/mpi/timing-study/output-1-1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 132.636661
+Time computing life: 269.160186
-Clock time: 145.001708
+Clock time: 284.025931
--- a/mpi/timing-study/output-1-1000-250.txt
+++ b/mpi/timing-study/output-1-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.383001
+Time computing life: 5.132488
-Clock time: 4.113476
+Clock time: 6.490781
--- a/mpi/timing-study/output-1-1000-500.txt
+++ b/mpi/timing-study/output-1-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 8.793952
+Time computing life: 18.976428
-Clock time: 9.832794
+Clock time: 20.433132
--- a/mpi/timing-study/output-1-1000-750.txt
+++ b/mpi/timing-study/output-1-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 19.270078
+Time computing life: 41.754895
-Clock time: 21.813069
+Clock time: 44.502337
--- a/mpi/timing-study/output-12-1000-1000.txt
+++ b/mpi/timing-study/output-12-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.833550
+Time computing life: 6.001465
-Clock time: 6.323680
+Clock time: 9.559285
--- a/mpi/timing-study/output-12-1000-1250.txt
+++ b/mpi/timing-study/output-12-1000-1250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 4.347700
+Time computing life: 9.368616
-Clock time: 9.178630
+Clock time: 14.966020
--- a/mpi/timing-study/output-12-1000-1500.txt
+++ b/mpi/timing-study/output-12-1000-1500.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 13.390212
-=   PID 23209 RUNNING AT kp013
+Clock time: 20.945775
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-12-1000-1750.txt
+++ b/mpi/timing-study/output-12-1000-1750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 8.483342
+Time computing life: 18.167763
-Clock time: 17.330302
+Clock time: 28.215494
--- a/mpi/timing-study/output-12-1000-2000.txt
+++ b/mpi/timing-study/output-12-1000-2000.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 23.784948
-=   PID 23290 RUNNING AT kp013
+Clock time: 36.657344
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-12-1000-250.txt
+++ b/mpi/timing-study/output-12-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.198089
+Time computing life: 0.621095
-Clock time: 2.217166
+Clock time: 2.379479
--- a/mpi/timing-study/output-12-1000-500.txt
+++ b/mpi/timing-study/output-12-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.735509
+Time computing life: 1.541923
-Clock time: 2.513034
+Clock time: 3.193527
--- a/mpi/timing-study/output-12-1000-750.txt
+++ b/mpi/timing-study/output-12-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 1.617002
+Time computing life: 3.622372
-Clock time: 4.091923
+Clock time: 5.586008
--- a/mpi/timing-study/output-16-1000-1000.txt
+++ b/mpi/timing-study/output-16-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.106571
+Time computing life: 4.442463
-Clock time: 7.500836
+Clock time: 10.520606
--- a/mpi/timing-study/output-16-1000-1250.txt
+++ b/mpi/timing-study/output-16-1000-1250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 3.445883
+Time computing life: 7.085545
-Clock time: 11.167682
+Clock time: 15.203315
--- a/mpi/timing-study/output-16-1000-1500.txt
+++ b/mpi/timing-study/output-16-1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 4.741983
+Time computing life: 10.130384
-Clock time: 16.777514
+Clock time: 22.828620
--- a/mpi/timing-study/output-16-1000-1750.txt
+++ b/mpi/timing-study/output-16-1000-1750.txt
@ -1,8 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 13.581483
-=   PID 34784 RUNNING AT kp160
+Clock time: 30.474959
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
--- a/mpi/timing-study/output-16-1000-2000.txt
+++ b/mpi/timing-study/output-16-1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 8.301682
+Time computing life: 17.827682
-Clock time: 28.791425
+Clock time: 38.839103
--- a/mpi/timing-study/output-16-1000-250.txt
+++ b/mpi/timing-study/output-16-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.145483
+Time computing life: 0.295248
-Clock time: 2.572587
+Clock time: 2.424203
--- a/mpi/timing-study/output-16-1000-500.txt
+++ b/mpi/timing-study/output-16-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.570992
+Time computing life: 1.134216
-Clock time: 3.899400
+Clock time: 3.333061
--- a/mpi/timing-study/output-16-1000-750.txt
+++ b/mpi/timing-study/output-16-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 1.215016
+Time computing life: 2.724965
-Clock time: 5.047125
+Clock time: 6.326008
--- a/mpi/timing-study/output-20-1000-1000.txt
+++ b/mpi/timing-study/output-20-1000-1000.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 3.626076
 Clock time: 10.578861
--- a/mpi/timing-study/output-20-1000-1250.txt
+++ b/mpi/timing-study/output-20-1000-1250.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 5.589568
 Clock time: 15.829982
--- a/mpi/timing-study/output-20-1000-1500.txt
+++ b/mpi/timing-study/output-20-1000-1500.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 8.249432
 Clock time: 22.901060
--- a/mpi/timing-study/output-20-1000-1750.txt
+++ b/mpi/timing-study/output-20-1000-1750.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 10.833047
 Clock time: 31.488105
--- a/mpi/timing-study/output-20-1000-2000.txt
+++ b/mpi/timing-study/output-20-1000-2000.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 14.341513
 Clock time: 41.437950
--- a/mpi/timing-study/output-20-1000-250.txt
+++ b/mpi/timing-study/output-20-1000-250.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 0.290254
 Clock time: 2.288621
--- a/mpi/timing-study/output-20-1000-500.txt
+++ b/mpi/timing-study/output-20-1000-500.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 0.922726
 Clock time: 3.252760
--- a/mpi/timing-study/output-20-1000-750.txt
+++ b/mpi/timing-study/output-20-1000-750.txt
@ -0,0 +1,4 @@
 ===Timing===
 Time computing life: 2.058615
 Clock time: 6.415593
--- a/mpi/timing-study/output-24-1000-1000.txt
+++ b/mpi/timing-study/output-24-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 1.414322
+Time computing life: 2.948477
-Clock time: 9.439315
+Clock time: 11.073549
--- a/mpi/timing-study/output-24-1000-1250.txt
+++ b/mpi/timing-study/output-24-1000-1250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.171989
+Time computing life: 4.599746
-Clock time: 13.927639
+Clock time: 16.289204
--- a/mpi/timing-study/output-24-1000-1500.txt
+++ b/mpi/timing-study/output-24-1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 3.133675
+Time computing life: 6.653323
-Clock time: 19.271850
+Clock time: 23.581825
--- a/mpi/timing-study/output-24-1000-1750.txt
+++ b/mpi/timing-study/output-24-1000-1750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 4.398371
+Time computing life: 9.023902
-Clock time: 25.650748
+Clock time: 32.654144
--- a/mpi/timing-study/output-24-1000-2000.txt
+++ b/mpi/timing-study/output-24-1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 5.639865
+Time computing life: 11.813565
-Clock time: 33.529967
+Clock time: 42.361473
--- a/mpi/timing-study/output-24-1000-250.txt
+++ b/mpi/timing-study/output-24-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.100765
+Time computing life: 0.194711
-Clock time: 2.412458
+Clock time: 2.336872
--- a/mpi/timing-study/output-24-1000-500.txt
+++ b/mpi/timing-study/output-24-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.465147
+Time computing life: 0.751125
-Clock time: 3.942927
+Clock time: 3.215283
--- a/mpi/timing-study/output-24-1000-750.txt
+++ b/mpi/timing-study/output-24-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.815429
+Time computing life: 1.749681
-Clock time: 5.642879
+Clock time: 6.566280
--- a/mpi/timing-study/output-4-1000-1000.txt
+++ b/mpi/timing-study/output-4-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 8.467197
+Time computing life: 18.303801
-Clock time: 11.707533
+Clock time: 22.160403
--- a/mpi/timing-study/output-4-1000-1250.txt
+++ b/mpi/timing-study/output-4-1000-1250.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 28.577705
-=   PID 22126 RUNNING AT kp013
+Clock time: 33.967832
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-4-1000-1500.txt
+++ b/mpi/timing-study/output-4-1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 18.823087
+Time computing life: 40.818054
-Clock time: 26.449810
+Clock time: 49.034747
--- a/mpi/timing-study/output-4-1000-1750.txt
+++ b/mpi/timing-study/output-4-1000-1750.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 55.473812
-=   PID 22197 RUNNING AT kp013
+Clock time: 66.402986
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-4-1000-2000.txt
+++ b/mpi/timing-study/output-4-1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 33.274214
+Time computing life: 72.029655
-Clock time: 45.841294
+Clock time: 84.802906
--- a/mpi/timing-study/output-4-1000-250.txt
+++ b/mpi/timing-study/output-4-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.599813
+Time computing life: 1.263181
-Clock time: 2.807879
+Clock time: 2.244229
--- a/mpi/timing-study/output-4-1000-500.txt
+++ b/mpi/timing-study/output-4-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.212790
+Time computing life: 4.748224
-Clock time: 4.133439
+Clock time: 6.104621
--- a/mpi/timing-study/output-4-1000-750.txt
+++ b/mpi/timing-study/output-4-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 4.830949
+Time computing life: 10.420523
-Clock time: 6.854574
+Clock time: 12.881742
--- a/mpi/timing-study/output-8-1000-1000.txt
+++ b/mpi/timing-study/output-8-1000-1000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 4.226861
+Time computing life: 9.155404
-Clock time: 7.517444
+Clock time: 12.658901
--- a/mpi/timing-study/output-8-1000-1250.txt
+++ b/mpi/timing-study/output-8-1000-1250.txt
@ -1,11 +1,4 @@
-===================================================================================
+===Timing===
-=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
+Time computing life: 14.082438
-=   PID 22852 RUNNING AT kp013
+Clock time: 19.224195
 =   EXIT CODE: 11
 =   CLEANING UP REMAINING PROCESSES
 =   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
 ===================================================================================
 YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
 This typically refers to a problem with your application.
 Please see the FAQ page for debugging suggestions
--- a/mpi/timing-study/output-8-1000-1500.txt
+++ b/mpi/timing-study/output-8-1000-1500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 9.416485
+Time computing life: 20.413675
-Clock time: 16.706325
+Clock time: 27.885011
--- a/mpi/timing-study/output-8-1000-1750.txt
+++ b/mpi/timing-study/output-8-1000-1750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 12.741221
+Time computing life: 27.722141
-Clock time: 22.281683
+Clock time: 38.768550
--- a/mpi/timing-study/output-8-1000-2000.txt
+++ b/mpi/timing-study/output-8-1000-2000.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 16.578412
+Time computing life: 35.856221
-Clock time: 26.921717
+Clock time: 48.674318
--- a/mpi/timing-study/output-8-1000-250.txt
+++ b/mpi/timing-study/output-8-1000-250.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 0.296146
+Time computing life: 0.617449
-Clock time: 2.211905
+Clock time: 2.370964
--- a/mpi/timing-study/output-8-1000-500.txt
+++ b/mpi/timing-study/output-8-1000-500.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 1.111486
+Time computing life: 2.396797
-Clock time: 2.710176
+Clock time: 3.529909
--- a/mpi/timing-study/output-8-1000-750.txt
+++ b/mpi/timing-study/output-8-1000-750.txt
@ -1,4 +1,4 @@
 ===Timing===
-Time computing life: 2.419305
+Time computing life: 5.226469
-Clock time: 4.675962
+Clock time: 7.317886
--- a/mpi/timing-study/slurm-10870703.err-kp013
+++ b/mpi/timing-study/slurm-10870703.err-kp013
@ -1,11 +0,0 @@
 mkdir: cannot create directory ‘timing-study’: File exists
 [proxy:0:0@kp013] HYD_pmcd_pmip_control_cmd_cb (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip_cb.c:887): assert (!closed) failed
 [proxy:0:0@kp013] HYDT_dmxu_poll_wait_for_event (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
 [proxy:0:0@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip.c:202): demux engine error waiting for event
 srun: error: kp013: task 0: Exited with exit code 7
 [mpiexec@kp013] HYDT_bscu_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
 [mpiexec@kp013] HYDT_bsci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
 [mpiexec@kp013] HYD_pmci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
 [mpiexec@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/ui/mpich/mpiexec.c:340): process manager error waiting for completion
 srun: error: Unable to create step for job 10870703: Job/step already completing or completed
 slurmstepd: error: *** JOB 10870703 ON kp013 CANCELLED AT 2021-12-08T01:29:02 DUE TO TIME LIMIT ***
--- a/mpi/timing-study/slurm-10870708.err-kp018
+++ b/mpi/timing-study/slurm-10870708.err-kp018
@ -0,0 +1,11 @@
 mkdir: cannot create directory ‘timing-study’: File exists
 Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
 Due to MODULEPATH changes, the following have been reloaded:
  1) mpich/3.2.1
 srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
 slurmstepd: error: *** STEP 10870708.3 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***
 slurmstepd: error: *** JOB 10870708 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***
--- a/mpi/timing-study/slurm-10870708.out-kp018
+++ b/mpi/timing-study/slurm-10870708.out-kp018
--- a/mpi/timing-study/slurm-10870709.err-kp018
+++ b/mpi/timing-study/slurm-10870709.err-kp018
@ -0,0 +1,8 @@
 mkdir: cannot create directory ‘timing-study’: File exists
 Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
 Due to MODULEPATH changes, the following have been reloaded:
  1) mpich/3.2.1
--- a/mpi/timing-study/slurm-10870709.out-kp018
+++ b/mpi/timing-study/slurm-10870709.out-kp018
--- a/mpi/timing-study/slurm-10870714.err-kp007
+++ b/mpi/timing-study/slurm-10870714.err-kp007
@ -0,0 +1,8 @@
 mkdir: cannot create directory ‘timing-study’: File exists
 Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
 Due to MODULEPATH changes, the following have been reloaded:
  1) mpich/3.2.1
--- a/mpi/timing-study/slurm-10870714.out-kp007
+++ b/mpi/timing-study/slurm-10870714.out-kp007
--- a/mpi/timing-study/timing_study.sh
+++ b/mpi/timing-study/timing_study.sh
@ -1,9 +1,9 @@
 #!/bin/bash
-#SBATCH --time=0:10:00 # walltime, abbreviated by -t
+#SBATCH --time=0:20:00 # walltime, abbreviated by -t
-#SBATCH --nodes=2     # number of cluster nodes, abbreviated by -N
+#SBATCH --nodes=1     # number of cluster nodes, abbreviated by -N
 #SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N)
 #SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values
-#SBATCH --ntasks=24   # number of MPI tasks, abbreviated by -n
+#SBATCH --ntasks=1   # number of MPI tasks, abbreviated by -n
 # additional information for allocated clusters
 #SBATCH --account=usucs5030     # account - abbreviated by -A
 #SBATCH --partition=kingspeak  # partition, abbreviated by -p
@ -15,9 +15,9 @@ module load intel mpich
 iterations=1000
-for cores in 1 4 8 12 16 20 #24
+for cores in 1 #12 16 20 24
 do
-  for size in 250 500 750 1000 1250 1500 1750 2000
+  for size in 1000 1250 1500 1750 2000 #250 500 750 1000 1250 1500 1750 2000
  do
    mpirun -np $cores ./gol simulate random $size $size $iterations 1 > timing-study/output-$cores-$iterations-$size.txt
  done
--- a/report/.DS_Store
+++ b/report/.DS_Store
--- a/report/Game
+++ b/report/Game
--- a/report/cores-vs-runtimes.png
+++ b/report/cores-vs-runtimes.png
--- a/report/cuda-speedup.png
+++ b/report/cuda-speedup.png
--- a/report/cuda-times.png
+++ b/report/cuda-times.png
--- a/report/report.org
+++ b/report/report.org
@ -0,0 +1,76 @@
 #+TITLE: Final Project: Game of Life
 #+STARTUP: fold inlineimages
 #+OPTIONS: toc:nil
 #+AUTHOR: Logan Hunt
 #+LATEX_HEADER: \usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek }
 * Description
 From [[https://mathworld.wolfram.com/CellularAutomaton.html][Wolfram MathWorld]]:
 #+BEGIN_QUOTE
 A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired. 
 #+END_QUOTE
 Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from [[https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life][Wikipedia]]):
 #+BEGIN_QUOTE
 1. Any live cell with fewer than two live neighbours dies, as if by underpopulation.
 2. Any live cell with two or three live neighbours lives on to the next generation.
 3. Any live cell with more than three live neighbours dies, as if by overpopulation.
 4. Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
 #+END_QUOTE
 To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations [[https://www.youtube.com/watch?v=N_aUWYNqpeY][to YouTube]]. Each cell that is white is alive and each black cell is dead.
 There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA).
 A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured.
 * Performance analysis (of Game of Life iteration time)
 Results can be found on a [[https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing][Google Sheet]]
 ** Shared memory vs distributed memory
 *** Runtime
 In both implementations, the runtime decreases as the number of cores increases, for all problem sizes (as one would certainly hope). As the problem size increases, the overall difference between the runtimes of the two implementations also decreases, meaning they follow the same trends. This can be seen in the runtimes for both implementations running on a small grid and a large grid:
 #+ATTR_LATEX: :width 8cm
 [[./cores-vs-runtimes.png]]
 Both seem to converge to some rational function. Using an online regression calculator it was found that the MPI Life Computation (iteration computation time only) runtime follows the function $t(p)=\frac{274.449}{p^{0.985}}$ with a correlation coefficient of $r=-0.999892441$. Since the exponent $0.985$ is very close to $1$, $t$ is very nearly proportional to $\frac{1}{p}$, so the runtime fits what could be expected: $T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}$.
 *** Speedup
 In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor. 
 #+ATTR_LATEX: :width 10cm
 [[./speedups-vs-cores.png]]
 *** Efficiency
 Efficiency is the ratio of speedup to the number of processors $p$ ($E = \frac{S}{p}$), so when the speedup grows linearly with $p$ it can be thought of as the slope (derivative) of the speedup line. Thus efficiency can be measured without plotting it explicitly.
 By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable.
 However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off.
 Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling.
 ** CUDA Implementation
 *** Runtime
 For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
 #+ATTR_LATEX: :width 8cm
 [[./cuda-times.png]]
 Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by $t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151$ with a correlation coefficient $r = 0.9999278678$.
 Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with $(\text{input size})^2$.
 Indeed, this is what we see.
 *** Speedup
 The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around $n=1000$. While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling.
 #+ATTR_LATEX: :width 8cm
 [[./cuda-speedup.png]]
 *** Efficiency
 Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by $E = \frac{S}{4992}$. As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
--- a/report/report.pdf
+++ b/report/report.pdf
--- a/report/report.tex
+++ b/report/report.tex
@ -0,0 +1,118 @@
 % Created 2021-12-08 Wed 18:34
 % Intended LaTeX compiler: pdflatex
 \documentclass[11pt]{article}
 \usepackage[utf8]{inputenc}
 \usepackage[T1]{fontenc}
 \usepackage{graphicx}
 \usepackage{grffile}
 \usepackage{longtable}
 \usepackage{wrapfig}
 \usepackage{rotating}
 \usepackage[normalem]{ulem}
 \usepackage{amsmath}
 \usepackage{textcomp}
 \usepackage{amssymb}
 \usepackage{capt-of}
 \usepackage{hyperref}
 \usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek }
 \author{Logan Hunt}
 \date{\today}
 \title{Final Project: Game of Life}
 \hypersetup{
 pdfauthor={Logan Hunt},
 pdftitle={Final Project: Game of Life},
 pdfkeywords={},
 pdfsubject={},
 pdfcreator={Emacs 27.2 (Org mode 9.4.4)}, 
 pdflang={English}}
 \begin{document}
 \maketitle
 \section{Description}
 \label{sec:orgee5348a}
 From \href{https://mathworld.wolfram.com/CellularAutomaton.html}{Wolfram MathWorld}:
 \begin{quote}
 A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired. 
 \end{quote}
 Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from \href{https://en.wikipedia.org/wiki/Conway\%27s\_Game\_of\_Life}{Wikipedia}):
 \begin{quote}
 \begin{enumerate}
 \item Any live cell with fewer than two live neighbours dies, as if by underpopulation.
 \item Any live cell with two or three live neighbours lives on to the next generation.
 \item Any live cell with more than three live neighbours dies, as if by overpopulation.
 \item Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
 \end{enumerate}
 \end{quote}
 To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations \href{https://www.youtube.com/watch?v=N\_aUWYNqpeY}{to YouTube}. Each cell that is white is alive and each black cell is dead.
 There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in OpenMPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA). 
 A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured.
 \section{Performance analysis (of Game of Life iteration time)}
 \label{sec:org1f7f2b6}
 Results can be found on a \href{https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing}{Google Sheet}
 \subsection{Shared memory vs distributed memory}
 \label{sec:org9f5d8c9}
 \subsubsection{Runtime}
 \label{sec:org9209384}
 In runtime, both implementations have the same property of decreasing over an increasing number of cores in all problem sizes (as one would certainly hope). As the problem size increases, the overall differences in the runtimes of each implementation also decrease, meaning they follow the same trends. This can be shown in the runtimes for both implementations running on a small grid and a large grid:
 \begin{center}
 \includegraphics[width=8cm]{./cores-vs-runtimes.png}
 \end{center}
 Both seem to converge to some rational function. Using an online regression calculator it was found that the MPI Life Computation (iteration computation time only) runtime follows the function \(t(p)=\frac{274.449}{p^{0.985}}\) with a correlation coefficient of \(r=-0.999892441\). Since \(t\) is very close to being a rational function of \(p\), we know that the runtime fits to what could be expected: \(T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}\).
 \subsubsection{Speedup}
 \label{sec:org3812f4f}
 In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor. 
 \begin{center}
 \includegraphics[width=10cm]{./speedups-vs-cores.png}
 \end{center}
 \subsubsection{Efficiency}
 \label{sec:org477fbb5}
 Efficiency is the ratio of speedup to \(p\) processors (\(E = \frac{S}{p}\)), so it can be thought of as the derivative of the speedup. Thus efficiency can be measured without plotting it explicitly.
 By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable.
 However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off. 
 Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling. 
 \subsection{CUDA Implementation}
 \label{sec:org31daf97}
 \subsubsection{Runtime}
 \label{sec:orgb6b22ab}
 For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
 \begin{center}
 \includegraphics[width=8cm]{./cuda-times.png}
 \end{center}
 Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by \(t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151\) with a correlation coefficient \(r = 0.9999278678\).
 Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with \((\text{input size})^2\).
 Indeed, this is what we see.
 \subsubsection{Speedup}
 \label{sec:org888d520}
 The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around \(n=1000\). While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling.
 \begin{center}
 \includegraphics[width=8cm]{./cuda-speedup.png}
 \end{center}
 \subsubsection{Efficiency}
 \label{sec:org530b4aa}
 Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by \(E = \frac{S}{4992}\). As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
 \end{document}
--- a/report/speedups-vs-cores.png
+++ b/report/speedups-vs-cores.png