This commit is contained in:
Logan Hunt 2021-12-08 18:38:46 -07:00
parent c846568cf2
commit 9362bc1591
90 changed files with 389 additions and 192 deletions

Binary file not shown.

View File

@ -36,7 +36,10 @@ true) {
// Do the simulation // Do the simulation
void simulate(int argc, char** argv) { void simulate(int argc, char** argv) {
srand(SEED); srand(SEED);
clock_t global_start = clock(); cudaEvent_t global_start, global_end;
cudaEventCreate(&global_start);
cudaEventCreate(&global_end);
cudaEventRecord(global_start);
char* filename; char* filename;
struct GAME game; struct GAME game;
game.padding = PADDING; game.padding = PADDING;
@ -135,8 +138,12 @@ void simulate(int argc, char** argv) {
game.grid = temp; game.grid = temp;
} }
} }
cudaEventRecord(global_end);
cudaEventSynchronize(global_end);
float global_time;
cudaEventElapsedTime(&global_time, global_start, global_end);
printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, ((double)clock() - (double)global_start)/CLOCKS_PER_SEC); printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, global_time/(double)1000);
} }
int main(int argc, char** argv) { int main(int argc, char** argv) {

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.169687 Time computing life: 0.169470
Clock time: 1.560000 Clock time: 5.175729

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.254989 Time computing life: 0.255659
Clock time: 2.240000 Clock time: 5.620605

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.354361 Time computing life: 0.354065
Clock time: 3.050000 Clock time: 8.177913

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.480174 Time computing life: 0.480989
Clock time: 4.070000 Clock time: 9.626799

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.619636 Time computing life: 0.618807
Clock time: 5.220000 Clock time: 10.948197

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.029867 Time computing life: 0.029682
Clock time: 0.330000 Clock time: 2.946978

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.059907 Time computing life: 0.059339
Clock time: 0.540000 Clock time: 3.249037

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.110954 Time computing life: 0.110569
Clock time: 1.000000 Clock time: 4.294806

BIN
mpi/gol

Binary file not shown.

View File

@ -5,7 +5,7 @@ int neighbors(struct GAME* game, int x, int y, unsigned char* halo_above, unsign
for (int dy = -1; dy <= 1; dy++) { for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) { for (int dx = -1; dx <= 1; dx++) {
if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2)) { if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2) && (y+dy) < game->height) {
if (y+dy == -1 && halo_above != NULL) { if (y+dy == -1 && halo_above != NULL) {
if (halo_above[x+dx]) { if (halo_above[x+dx]) {
n++; n++;

View File

@ -3,6 +3,7 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <mpi.h> #include <mpi.h>
#include <stddef.h>
#include "file.h" #include "file.h"
#include "game.h" #include "game.h"

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 33.832562 Time computing life: 68.237100
Clock time: 37.939663 Clock time: 73.897736

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 105.934486
= PID 21716 RUNNING AT kp013 Clock time: 112.662907
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 75.141736 Time computing life: 155.201482
Clock time: 83.149478 Clock time: 165.129865

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 207.332667
= PID 21837 RUNNING AT kp013 Clock time: 219.494609
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 132.636661 Time computing life: 269.160186
Clock time: 145.001708 Clock time: 284.025931

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.383001 Time computing life: 5.132488
Clock time: 4.113476 Clock time: 6.490781

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 8.793952 Time computing life: 18.976428
Clock time: 9.832794 Clock time: 20.433132

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 19.270078 Time computing life: 41.754895
Clock time: 21.813069 Clock time: 44.502337

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.833550 Time computing life: 6.001465
Clock time: 6.323680 Clock time: 9.559285

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 4.347700 Time computing life: 9.368616
Clock time: 9.178630 Clock time: 14.966020

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 13.390212
= PID 23209 RUNNING AT kp013 Clock time: 20.945775
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 8.483342 Time computing life: 18.167763
Clock time: 17.330302 Clock time: 28.215494

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 23.784948
= PID 23290 RUNNING AT kp013 Clock time: 36.657344
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.198089 Time computing life: 0.621095
Clock time: 2.217166 Clock time: 2.379479

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.735509 Time computing life: 1.541923
Clock time: 2.513034 Clock time: 3.193527

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 1.617002 Time computing life: 3.622372
Clock time: 4.091923 Clock time: 5.586008

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.106571 Time computing life: 4.442463
Clock time: 7.500836 Clock time: 10.520606

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 3.445883 Time computing life: 7.085545
Clock time: 11.167682 Clock time: 15.203315

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 4.741983 Time computing life: 10.130384
Clock time: 16.777514 Clock time: 22.828620

View File

@ -1,8 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 13.581483
= PID 34784 RUNNING AT kp160 Clock time: 30.474959
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 8.301682 Time computing life: 17.827682
Clock time: 28.791425 Clock time: 38.839103

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.145483 Time computing life: 0.295248
Clock time: 2.572587 Clock time: 2.424203

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.570992 Time computing life: 1.134216
Clock time: 3.899400 Clock time: 3.333061

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 1.215016 Time computing life: 2.724965
Clock time: 5.047125 Clock time: 6.326008

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 3.626076
Clock time: 10.578861

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 5.589568
Clock time: 15.829982

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 8.249432
Clock time: 22.901060

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 10.833047
Clock time: 31.488105

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 14.341513
Clock time: 41.437950

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 0.290254
Clock time: 2.288621

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 0.922726
Clock time: 3.252760

View File

@ -0,0 +1,4 @@
===Timing===
Time computing life: 2.058615
Clock time: 6.415593

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 1.414322 Time computing life: 2.948477
Clock time: 9.439315 Clock time: 11.073549

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.171989 Time computing life: 4.599746
Clock time: 13.927639 Clock time: 16.289204

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 3.133675 Time computing life: 6.653323
Clock time: 19.271850 Clock time: 23.581825

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 4.398371 Time computing life: 9.023902
Clock time: 25.650748 Clock time: 32.654144

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 5.639865 Time computing life: 11.813565
Clock time: 33.529967 Clock time: 42.361473

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.100765 Time computing life: 0.194711
Clock time: 2.412458 Clock time: 2.336872

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.465147 Time computing life: 0.751125
Clock time: 3.942927 Clock time: 3.215283

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.815429 Time computing life: 1.749681
Clock time: 5.642879 Clock time: 6.566280

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 8.467197 Time computing life: 18.303801
Clock time: 11.707533 Clock time: 22.160403

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 28.577705
= PID 22126 RUNNING AT kp013 Clock time: 33.967832
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 18.823087 Time computing life: 40.818054
Clock time: 26.449810 Clock time: 49.034747

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 55.473812
= PID 22197 RUNNING AT kp013 Clock time: 66.402986
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 33.274214 Time computing life: 72.029655
Clock time: 45.841294 Clock time: 84.802906

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.599813 Time computing life: 1.263181
Clock time: 2.807879 Clock time: 2.244229

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.212790 Time computing life: 4.748224
Clock time: 4.133439 Clock time: 6.104621

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 4.830949 Time computing life: 10.420523
Clock time: 6.854574 Clock time: 12.881742

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 4.226861 Time computing life: 9.155404
Clock time: 7.517444 Clock time: 12.658901

View File

@ -1,11 +1,4 @@
=================================================================================== ===Timing===
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES Time computing life: 14.082438
= PID 22852 RUNNING AT kp013 Clock time: 19.224195
= EXIT CODE: 11
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 9.416485 Time computing life: 20.413675
Clock time: 16.706325 Clock time: 27.885011

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 12.741221 Time computing life: 27.722141
Clock time: 22.281683 Clock time: 38.768550

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 16.578412 Time computing life: 35.856221
Clock time: 26.921717 Clock time: 48.674318

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 0.296146 Time computing life: 0.617449
Clock time: 2.211905 Clock time: 2.370964

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 1.111486 Time computing life: 2.396797
Clock time: 2.710176 Clock time: 3.529909

View File

@ -1,4 +1,4 @@
===Timing=== ===Timing===
Time computing life: 2.419305 Time computing life: 5.226469
Clock time: 4.675962 Clock time: 7.317886

View File

@ -1,11 +0,0 @@
mkdir: cannot create directory timing-study: File exists
[proxy:0:0@kp013] HYD_pmcd_pmip_control_cmd_cb (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip_cb.c:887): assert (!closed) failed
[proxy:0:0@kp013] HYDT_dmxu_poll_wait_for_event (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:0@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip.c:202): demux engine error waiting for event
srun: error: kp013: task 0: Exited with exit code 7
[mpiexec@kp013] HYDT_bscu_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
[mpiexec@kp013] HYDT_bsci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
[mpiexec@kp013] HYD_pmci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
[mpiexec@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/ui/mpich/mpiexec.c:340): process manager error waiting for completion
srun: error: Unable to create step for job 10870703: Job/step already completing or completed
slurmstepd: error: *** JOB 10870703 ON kp013 CANCELLED AT 2021-12-08T01:29:02 DUE TO TIME LIMIT ***

View File

@ -0,0 +1,11 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1
srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
slurmstepd: error: *** STEP 10870708.3 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***
slurmstepd: error: *** JOB 10870708 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***

View File

@ -0,0 +1,8 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1

View File

@ -0,0 +1,8 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1

View File

@ -1,9 +1,9 @@
#!/bin/bash #!/bin/bash
#SBATCH --time=0:10:00 # walltime, abbreviated by -t #SBATCH --time=0:20:00 # walltime, abbreviated by -t
#SBATCH --nodes=2 # number of cluster nodes, abbreviated by -N #SBATCH --nodes=1 # number of cluster nodes, abbreviated by -N
#SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N) #SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N)
#SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values #SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values
#SBATCH --ntasks=24 # number of MPI tasks, abbreviated by -n #SBATCH --ntasks=1 # number of MPI tasks, abbreviated by -n
# additional information for allocated clusters # additional information for allocated clusters
#SBATCH --account=usucs5030 # account - abbreviated by -A #SBATCH --account=usucs5030 # account - abbreviated by -A
#SBATCH --partition=kingspeak # partition, abbreviated by -p #SBATCH --partition=kingspeak # partition, abbreviated by -p
@ -15,9 +15,9 @@ module load intel mpich
iterations=1000 iterations=1000
for cores in 1 4 8 12 16 20 #24 for cores in 1 #12 16 20 24
do do
for size in 250 500 750 1000 1250 1500 1750 2000 for size in 1000 1250 1500 1750 2000 #250 500 750 1000 1250 1500 1750 2000
do do
mpirun -np $cores ./gol simulate random $size $size $iterations 1 > timing-study/output-$cores-$iterations-$size.txt mpirun -np $cores ./gol simulate random $size $size $iterations 1 > timing-study/output-$cores-$iterations-$size.txt
done done

BIN
report/.DS_Store vendored Normal file

Binary file not shown.

BIN
report/Game of Life.xlsx Normal file

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 160 KiB

BIN
report/cuda-speedup.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 175 KiB

BIN
report/cuda-times.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 95 KiB

76
report/report.org Normal file
View File

@ -0,0 +1,76 @@
#+TITLE: Final Project: Game of Life
#+STARTUP: fold inlineimages
#+OPTIONS: toc:nil
#+AUTHOR: Logan Hunt
#+LATEX_HEADER: \usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek }
* Description
From [[https://mathworld.wolfram.com/CellularAutomaton.html][Wolfram MathWorld]]:
#+BEGIN_QUOTE
A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired.
#+END_QUOTE
Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from [[https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life][Wikipedia]]):
#+BEGIN_QUOTE
1. Any live cell with fewer than two live neighbours dies, as if by underpopulation.
2. Any live cell with two or three live neighbours lives on to the next generation.
3. Any live cell with more than three live neighbours dies, as if by overpopulation.
4. Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
#+END_QUOTE
To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations [[https://www.youtube.com/watch?v=N_aUWYNqpeY][to YouTube]]. Each cell that is white is alive and each black cell is dead.
There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA).
A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured.
* Performance analysis (of Game of Life iteration time)
Results can be found on a [[https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing][Google Sheet]]
** Shared memory vs distributed memory
*** Runtime
The runtime of both implementations decreases as the number of cores increases, for all problem sizes (as one would certainly hope). As the problem size increases, the overall difference between the runtimes of the two implementations also decreases, meaning they follow the same trends. This can be seen in the runtimes of both implementations running on a small grid and a large grid:
#+ATTR_LATEX: :width 8cm
[[./cores-vs-runtimes.png]]
Both seem to converge to some rational function. Using an online regression calculator, it was found that the MPI Life Computation runtime (iteration computation time only) follows the function $t(p)=\frac{274.449}{p^{0.985}}$ with a correlation coefficient of $r=-0.999892441$. Since the exponent $0.985$ is very close to $1$, $t$ is very nearly inversely proportional to $p$, so the runtime matches what would be expected: $T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}$.
*** Speedup
In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor.
#+ATTR_LATEX: :width 10cm
[[./speedups-vs-cores.png]]
*** Efficiency
Efficiency is the ratio of speedup to $p$ processors ($E = \frac{S}{p}$), so it can be thought of as the derivative of the speedup. Thus efficiency can be measured without plotting it explicitly.
By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable.
However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off.
Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling.
** CUDA Implementation
*** Runtime
For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
#+ATTR_LATEX: :width 8cm
[[./cuda-times.png]]
Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by $t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151$ with a correlation coefficient $r = 0.9999278678$.
Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with $(\text{input size})^2$.
Indeed, this is what we see.
*** Speedup
The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around $n=1000$. While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling.
#+ATTR_LATEX: :width 8cm
[[./cuda-speedup.png]]
*** Efficiency
Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by $E = \frac{S}{4992}$. As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.

BIN
report/report.pdf Normal file

Binary file not shown.

118
report/report.tex Normal file
View File

@ -0,0 +1,118 @@
% Created 2021-12-08 Wed 18:34
% Intended LaTeX compiler: pdflatex
\documentclass[11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{grffile}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{ upgreek }
\author{Logan Hunt}
\date{\today}
\title{Final Project: Game of Life}
\hypersetup{
pdfauthor={Logan Hunt},
pdftitle={Final Project: Game of Life},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 27.2 (Org mode 9.4.4)},
pdflang={English}}
\begin{document}
\maketitle
\section{Description}
\label{sec:orgee5348a}
From \href{https://mathworld.wolfram.com/CellularAutomaton.html}{Wolfram MathWorld}:
\begin{quote}
A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired.
\end{quote}
Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from \href{https://en.wikipedia.org/wiki/Conway\%27s\_Game\_of\_Life}{Wikipedia}):
\begin{quote}
\begin{enumerate}
\item Any live cell with fewer than two live neighbours dies, as if by underpopulation.
\item Any live cell with two or three live neighbours lives on to the next generation.
\item Any live cell with more than three live neighbours dies, as if by overpopulation.
\item Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
\end{enumerate}
\end{quote}
To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations \href{https://www.youtube.com/watch?v=N\_aUWYNqpeY}{to YouTube}. Each cell that is white is alive and each black cell is dead.
There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA).
A timing study is performed on each implementation by calculating the elapsed time of the program given varying sizes of initial grids and, in the shared and distributed memory versions, a different number of cores. In each, both the time it takes to compute the next iteration and the total wall clock time are measured.
\section{Performance analysis (of Game of Life iteration time)}
\label{sec:org1f7f2b6}
Results can be found on a \href{https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing}{Google Sheet}
\subsection{Shared memory vs distributed memory}
\label{sec:org9f5d8c9}
\subsubsection{Runtime}
\label{sec:org9209384}
The runtime of both implementations decreases as the number of cores increases, for all problem sizes (as one would certainly hope). As the problem size increases, the overall difference between the runtimes of the two implementations also decreases, meaning they follow the same trends. This can be seen in the runtimes of both implementations running on a small grid and a large grid:
\begin{center}
\includegraphics[width=8cm]{./cores-vs-runtimes.png}
\end{center}
Both seem to converge to some rational function. Using an online regression calculator, it was found that the MPI Life Computation runtime (iteration computation time only) follows the function \(t(p)=\frac{274.449}{p^{0.985}}\) with a correlation coefficient of \(r=-0.999892441\). Since the exponent \(0.985\) is very close to \(1\), \(t\) is very nearly inversely proportional to \(p\), so the runtime matches what would be expected: \(T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}\).
\subsubsection{Speedup}
\label{sec:org3812f4f}
In speedup, both implementations tend to increase over an increasing number of cores in all problem sizes. However, it doesn't strictly increase. With some numbers of cores in the shared memory implementation, the speedup actually decreases from its predecessor.
\begin{center}
\includegraphics[width=10cm]{./speedups-vs-cores.png}
\end{center}
\subsubsection{Efficiency}
\label{sec:org477fbb5}
Efficiency is the ratio of speedup to \(p\) processors (\(E = \frac{S}{p}\)), so it can be thought of as the derivative of the speedup. Thus efficiency can be measured without plotting it explicitly.
By definition, a program is "strongly scalable" if it can keep its efficiency constant over a varying input size. In the results, it can be seen that the slope of the Distributed Memory Life Computation Time line tends to be constant, meaning that the efficiency is also constant. Thus, the MPI version is strongly scalable.
However, the shared memory (OpenMP) implementation does not seem to be perfectly strongly scalable. As the problem size varies, the speedup does not follow a constant slope. Instead, it tends to match the efficiency of the MPI implementation until some point where the slope drops off.
Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling.
\subsection{CUDA Implementation}
\label{sec:org31daf97}
\subsubsection{Runtime}
\label{sec:orgb6b22ab}
For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
\begin{center}
\includegraphics[width=8cm]{./cuda-times.png}
\end{center}
Using an online regression calculator again, it was found that the runtime as a function of input size can be expressed by \(t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151\) with a correlation coefficient \(r = 0.9999278678\).
Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with \((\text{input size})^2\).
Indeed, this is what we see.
\subsubsection{Speedup}
\label{sec:org888d520}
The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around \(n=1000\). While I am not entirely sure why it follows this trend, I guess it might have to do with the warp scheduling.
\begin{center}
\includegraphics[width=8cm]{./cuda-speedup.png}
\end{center}
\subsubsection{Efficiency}
\label{sec:org530b4aa}
Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by \(E = \frac{S}{4992}\). As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
\end{document}

Binary file not shown.

After

Width:  |  Height:  |  Size: 399 KiB