Logan Hunt 2021-12-08 18:38:46 -07:00
parent c846568cf2
commit 9362bc1591
90 changed files with 389 additions and 192 deletions

Binary file not shown.

View File

@@ -36,7 +36,10 @@ true) {
// Do the simulation
void simulate(int argc, char** argv) {
srand(SEED);
-clock_t global_start = clock();
+cudaEvent_t global_start, global_end;
+cudaEventCreate(&global_start);
+cudaEventCreate(&global_end);
+cudaEventRecord(global_start);
char* filename;
struct GAME game;
game.padding = PADDING;
@@ -135,8 +138,12 @@ void simulate(int argc, char** argv) {
game.grid = temp;
}
}
+cudaEventRecord(global_end);
+cudaEventSynchronize(global_end);
+float global_time;
+cudaEventElapsedTime(&global_time, global_start, global_end);
-printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, ((double)clock() - (double)global_start)/CLOCKS_PER_SEC);
+printf("\n===Timing===\nTime computing life: %f\nClock time: %f\n", time_computing_life, global_time/(double)1000);
}
int main(int argc, char** argv) {
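Note: the change above swaps host-side clock(), which measures CPU time, for CUDA events that bracket the GPU work on the stream. A minimal standalone sketch of the same pattern (life_kernel and its launch configuration are placeholders, not the project's actual kernel):

#include <cstdio>

__global__ void life_kernel() { /* placeholder for the simulation kernel */ }

int main() {
    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);                   // marker enqueued before the work
    life_kernel<<<1, 1>>>();                  // the GPU work being timed
    cudaEventRecord(end);                     // marker enqueued after the work
    cudaEventSynchronize(end);                // block until the GPU passes 'end'
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, end);    // elapsed time in milliseconds
    printf("Clock time: %f\n", ms / 1000.0);  // same ms -> s division as above
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    return 0;
}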

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.169687
-Clock time: 1.560000
+Time computing life: 0.169470
+Clock time: 5.175729

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.254989
-Clock time: 2.240000
+Time computing life: 0.255659
+Clock time: 5.620605

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.354361
-Clock time: 3.050000
+Time computing life: 0.354065
+Clock time: 8.177913

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.480174
-Clock time: 4.070000
+Time computing life: 0.480989
+Clock time: 9.626799

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.619636
-Clock time: 5.220000
+Time computing life: 0.618807
+Clock time: 10.948197

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.029867
-Clock time: 0.330000
+Time computing life: 0.029682
+Clock time: 2.946978

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.059907
-Clock time: 0.540000
+Time computing life: 0.059339
+Clock time: 3.249037

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.110954
-Clock time: 1.000000
+Time computing life: 0.110569
+Clock time: 4.294806

BIN
mpi/gol

Binary file not shown.

View File

@@ -5,7 +5,7 @@ int neighbors(struct GAME* game, int x, int y, unsigned char* halo_above, unsign
for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) {
-if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2)) {
+if (!(dx == 0 && dy == 0) && (x+dx) > 0 && (x+dx) < game->width+(game->padding*2) && (y+dy) < game->height) {
if (y+dy == -1 && halo_above != NULL) {
if (halo_above[x+dx]) {
n++;
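Note: the fix above adds the missing (y+dy) < game->height bound. For context, a self-contained sketch of the halo-aware neighbor count; count_neighbors, the row-major indexing, and the single halo row are simplifying assumptions (the real signature also takes a row from the rank below):

#include <stddef.h>

struct GAME { int width; int height; int padding; unsigned char* grid; };

/* Count the live neighbors of cell (x, y). Cells in the row above the
 * local slab (y+dy == -1) live on the neighboring rank and are read from
 * the halo row received over MPI. Row-major layout is assumed here. */
int count_neighbors(struct GAME* game, int x, int y, unsigned char* halo_above) {
    int row = game->width + game->padding * 2;
    int n = 0;
    for (int dy = -1; dy <= 1; dy++) {
        for (int dx = -1; dx <= 1; dx++) {
            if (dx == 0 && dy == 0) continue;          /* skip the cell itself */
            if (x + dx < 0 || x + dx >= row) continue; /* off the row ends */
            if (y + dy == -1) {                        /* row owned by rank above */
                if (halo_above != NULL && halo_above[x + dx]) n++;
            } else if (y + dy >= 0 && y + dy < game->height) { /* local rows */
                if (game->grid[(y + dy) * row + (x + dx)]) n++;
            }
        }
    }
    return n;
}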

View File

@@ -3,6 +3,7 @@
#include <stdio.h>
#include <string.h>
#include <mpi.h>
+#include <stddef.h>
#include "file.h"
#include "game.h"

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 33.832562
-Clock time: 37.939663
+Time computing life: 68.237100
+Clock time: 73.897736

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 21716 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 105.934486
+Clock time: 112.662907

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 75.141736
-Clock time: 83.149478
+Time computing life: 155.201482
+Clock time: 165.129865

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 21837 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 207.332667
+Clock time: 219.494609

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 132.636661
-Clock time: 145.001708
+Time computing life: 269.160186
+Clock time: 284.025931

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.383001
-Clock time: 4.113476
+Time computing life: 5.132488
+Clock time: 6.490781

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 8.793952
-Clock time: 9.832794
+Time computing life: 18.976428
+Clock time: 20.433132

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 19.270078
-Clock time: 21.813069
+Time computing life: 41.754895
+Clock time: 44.502337

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.833550
-Clock time: 6.323680
+Time computing life: 6.001465
+Clock time: 9.559285

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 4.347700
-Clock time: 9.178630
+Time computing life: 9.368616
+Clock time: 14.966020

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 23209 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 13.390212
+Clock time: 20.945775

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 8.483342
-Clock time: 17.330302
+Time computing life: 18.167763
+Clock time: 28.215494

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 23290 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 23.784948
+Clock time: 36.657344

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.198089
-Clock time: 2.217166
+Time computing life: 0.621095
+Clock time: 2.379479

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.735509
-Clock time: 2.513034
+Time computing life: 1.541923
+Clock time: 3.193527

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 1.617002
-Clock time: 4.091923
+Time computing life: 3.622372
+Clock time: 5.586008

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.106571
-Clock time: 7.500836
+Time computing life: 4.442463
+Clock time: 10.520606

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 3.445883
-Clock time: 11.167682
+Time computing life: 7.085545
+Clock time: 15.203315

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 4.741983
-Clock time: 16.777514
+Time computing life: 10.130384
+Clock time: 22.828620

View File

@@ -1,8 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 34784 RUNNING AT kp160
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
+===Timing===
+Time computing life: 13.581483
+Clock time: 30.474959

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 8.301682
-Clock time: 28.791425
+Time computing life: 17.827682
+Clock time: 38.839103

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.145483
-Clock time: 2.572587
+Time computing life: 0.295248
+Clock time: 2.424203

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.570992
-Clock time: 3.899400
+Time computing life: 1.134216
+Clock time: 3.333061

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 1.215016
-Clock time: 5.047125
+Time computing life: 2.724965
+Clock time: 6.326008

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 3.626076
Clock time: 10.578861

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 5.589568
Clock time: 15.829982

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 8.249432
Clock time: 22.901060

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 10.833047
Clock time: 31.488105

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 14.341513
Clock time: 41.437950

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 0.290254
Clock time: 2.288621

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 0.922726
Clock time: 3.252760

View File

@@ -0,0 +1,4 @@
===Timing===
Time computing life: 2.058615
Clock time: 6.415593

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 1.414322
-Clock time: 9.439315
+Time computing life: 2.948477
+Clock time: 11.073549

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.171989
-Clock time: 13.927639
+Time computing life: 4.599746
+Clock time: 16.289204

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 3.133675
-Clock time: 19.271850
+Time computing life: 6.653323
+Clock time: 23.581825

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 4.398371
-Clock time: 25.650748
+Time computing life: 9.023902
+Clock time: 32.654144

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 5.639865
-Clock time: 33.529967
+Time computing life: 11.813565
+Clock time: 42.361473

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.100765
-Clock time: 2.412458
+Time computing life: 0.194711
+Clock time: 2.336872

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.465147
-Clock time: 3.942927
+Time computing life: 0.751125
+Clock time: 3.215283

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.815429
-Clock time: 5.642879
+Time computing life: 1.749681
+Clock time: 6.566280

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 8.467197
-Clock time: 11.707533
+Time computing life: 18.303801
+Clock time: 22.160403

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 22126 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 28.577705
+Clock time: 33.967832

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 18.823087
-Clock time: 26.449810
+Time computing life: 40.818054
+Clock time: 49.034747

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 22197 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 55.473812
+Clock time: 66.402986

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 33.274214
-Clock time: 45.841294
+Time computing life: 72.029655
+Clock time: 84.802906

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.599813
-Clock time: 2.807879
+Time computing life: 1.263181
+Clock time: 2.244229

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.212790
-Clock time: 4.133439
+Time computing life: 4.748224
+Clock time: 6.104621

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 4.830949
-Clock time: 6.854574
+Time computing life: 10.420523
+Clock time: 12.881742

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 4.226861
-Clock time: 7.517444
+Time computing life: 9.155404
+Clock time: 12.658901

View File

@@ -1,11 +1,4 @@
-===================================================================================
-= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
-= PID 22852 RUNNING AT kp013
-= EXIT CODE: 11
-= CLEANING UP REMAINING PROCESSES
-= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
-===================================================================================
-YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
-This typically refers to a problem with your application.
-Please see the FAQ page for debugging suggestions
+===Timing===
+Time computing life: 14.082438
+Clock time: 19.224195

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 9.416485
-Clock time: 16.706325
+Time computing life: 20.413675
+Clock time: 27.885011

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 12.741221
-Clock time: 22.281683
+Time computing life: 27.722141
+Clock time: 38.768550

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 16.578412
-Clock time: 26.921717
+Time computing life: 35.856221
+Clock time: 48.674318

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 0.296146
-Clock time: 2.211905
+Time computing life: 0.617449
+Clock time: 2.370964

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 1.111486
-Clock time: 2.710176
+Time computing life: 2.396797
+Clock time: 3.529909

View File

@@ -1,4 +1,4 @@
===Timing===
-Time computing life: 2.419305
-Clock time: 4.675962
+Time computing life: 5.226469
+Clock time: 7.317886

View File

@@ -1,11 +0,0 @@
mkdir: cannot create directory timing-study: File exists
[proxy:0:0@kp013] HYD_pmcd_pmip_control_cmd_cb (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip_cb.c:887): assert (!closed) failed
[proxy:0:0@kp013] HYDT_dmxu_poll_wait_for_event (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status
[proxy:0:0@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmip.c:202): demux engine error waiting for event
srun: error: kp013: task 0: Exited with exit code 7
[mpiexec@kp013] HYDT_bscu_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting
[mpiexec@kp013] HYDT_bsci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion
[mpiexec@kp013] HYD_pmci_wait_for_completion (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion
[mpiexec@kp013] main (../../../../../../srcdir/mpich/3.2.1/src/pm/hydra/ui/mpich/mpiexec.c:340): process manager error waiting for completion
srun: error: Unable to create step for job 10870703: Job/step already completing or completed
slurmstepd: error: *** JOB 10870703 ON kp013 CANCELLED AT 2021-12-08T01:29:02 DUE TO TIME LIMIT ***

View File

@@ -0,0 +1,11 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1
srun: Job step aborted: Waiting up to 62 seconds for job step to finish.
slurmstepd: error: *** STEP 10870708.3 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***
slurmstepd: error: *** JOB 10870708 ON kp018 CANCELLED AT 2021-12-08T03:00:27 ***

View File

@@ -0,0 +1,8 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1

View File

@@ -0,0 +1,8 @@
mkdir: cannot create directory timing-study: File exists
Lmod is automatically replacing "gcc/4.8.5" with "intel/2018.1.163".
Due to MODULEPATH changes, the following have been reloaded:
1) mpich/3.2.1

View File

@@ -1,9 +1,9 @@
#!/bin/bash
-#SBATCH --time=0:10:00 # walltime, abbreviated by -t
-#SBATCH --nodes=2 # number of cluster nodes, abbreviated by -N
+#SBATCH --time=0:20:00 # walltime, abbreviated by -t
+#SBATCH --nodes=1 # number of cluster nodes, abbreviated by -N
#SBATCH -o slurm-%j.out-%N # name of the stdout, using the job number (%j) and the first node (%N)
#SBATCH -e slurm-%j.err-%N # name of the stderr, using job and first node values
-#SBATCH --ntasks=24 # number of MPI tasks, abbreviated by -n
+#SBATCH --ntasks=1 # number of MPI tasks, abbreviated by -n
# additional information for allocated clusters
#SBATCH --account=usucs5030 # account - abbreviated by -A
#SBATCH --partition=kingspeak # partition, abbreviated by -p
@@ -15,9 +15,9 @@ module load intel mpich
iterations=1000
-for cores in 1 4 8 12 16 20 #24
+for cores in 1 #12 16 20 24
do
-for size in 250 500 750 1000 1250 1500 1750 2000
+for size in 1000 1250 1500 1750 2000 #250 500 750 1000 1250 1500 1750 2000
do
mpirun -np $cores ./gol simulate random $size $size $iterations 1 > timing-study/output-$cores-$iterations-$size.txt
done

BIN
report/.DS_Store vendored Normal file

Binary file not shown.

BIN
report/Game of Life.xlsx Normal file

Binary file not shown.

Binary file not shown.


BIN
report/cuda-speedup.png Normal file

Binary file not shown.


BIN
report/cuda-times.png Normal file

Binary file not shown.


76
report/report.org Normal file
View File

@@ -0,0 +1,76 @@
#+TITLE: Final Project: Game of Life
#+STARTUP: fold inlineimages
#+OPTIONS: toc:nil
#+AUTHOR: Logan Hunt
#+LATEX_HEADER: \usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{upgreek}
* Description
From [[https://mathworld.wolfram.com/CellularAutomaton.html][Wolfram MathWorld]]:
#+BEGIN_QUOTE
A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired.
#+END_QUOTE
Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from [[https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life][Wikipedia]]):
#+BEGIN_QUOTE
1. Any live cell with fewer than two live neighbours dies, as if by underpopulation.
2. Any live cell with two or three live neighbours lives on to the next generation.
3. Any live cell with more than three live neighbours dies, as if by overpopulation.
4. Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
#+END_QUOTE
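Taken together, these rules reduce to a small function of a cell's current state and its live-neighbor count. A minimal sketch in C (standalone; the project's actual implementations operate on a padded =struct GAME= grid):
#+BEGIN_SRC c
// Next state of one cell under Conway's rules.
// alive is 0 or 1; n is the live-neighbor count (0-8).
int next_state(int alive, int n) {
  if (alive) {
    // Rules 1-3: survive with two or three live neighbours,
    // die of under- or overpopulation otherwise.
    return n == 2 || n == 3;
  }
  // Rule 4: a dead cell with exactly three live neighbours is born.
  return n == 3;
}
#+END_SRC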
To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations [[https://www.youtube.com/watch?v=N_aUWYNqpeY][to YouTube]]. Each cell that is white is alive and each black cell is dead.
There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA).
A timing study is performed on each implementation by measuring the elapsed time of the program over varying initial grid sizes and, in the shared and distributed memory versions, varying numbers of cores. In each case, both the time spent computing iterations and the total wall clock time are measured.
* Performance analysis (of Game of Life iteration time)
Results can be found on a [[https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing][Google Sheet]]
** Shared memory vs distributed memory
*** Runtime
For every problem size, the runtime of both implementations decreases as the number of cores increases (as one would certainly hope). As the problem size increases, the overall difference between the two implementations' runtimes also decreases, meaning they follow the same trend. This can be seen in the runtimes of both implementations on a small grid and a large grid:
#+ATTR_LATEX: :width 8cm
[[./cores-vs-runtimes.png]]
Both runtimes appear to follow a power law in the number of cores. Using an online regression calculator, the MPI life computation (iteration computation time only) runtime was fit to $t(p)=\frac{274.449}{p^{0.985}}$ with a correlation coefficient of $r=-0.999892441$. Since the exponent is very close to 1, the runtime fits what one would expect: $T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}$.
*** Speedup
In speedup, both implementations tend to improve as the number of cores increases, for all problem sizes. The increase is not strictly monotonic, however: at some core counts the shared memory implementation's speedup actually drops below that of the previous core count.
#+ATTR_LATEX: :width 10cm
[[./speedups-vs-cores.png]]
*** Efficiency
Efficiency is the ratio of speedup to the number of processors ($E = \frac{S}{p}$), i.e. the speedup per processor, so it can be read off a speedup-versus-cores plot as the slope of the line from the origin to each point. Thus efficiency can be assessed without plotting it explicitly.
By definition, a program is "strongly scalable" if it keeps its efficiency constant as the number of processors increases while the problem size is held fixed. In the results, the distributed memory life computation speedup line has a roughly constant slope, meaning the efficiency is roughly constant. Thus, the MPI version is strongly scalable.
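As a sanity check, plugging the fitted runtime model from above into these definitions (taking $t(1)$ as the serial baseline) gives
\[ S(p) = \frac{t(1)}{t(p)} = p^{0.985}, \qquad E(p) = \frac{S(p)}{p} = p^{-0.015}, \]
so even at $p = 20$ the predicted efficiency is $20^{-0.015} \approx 0.96$, i.e. nearly constant, consistent with the strong-scaling claim.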
However, the shared memory (OpenMP) implementation does not appear to be perfectly strongly scalable. Across problem sizes, its speedup does not maintain a constant slope; it tends to match the MPI implementation's efficiency up to some core count, after which the slope drops off.
Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling.
** CUDA Implementation
*** Runtime
For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
#+ATTR_LATEX: :width 8cm
[[./cuda-times.png]]
Using an online regression calculator again, the runtime as a function of input size can be expressed by $t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151$ with a correlation coefficient $r = 0.9999278678$.
Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with $(\text{input size})^2$.
Indeed, this is what we see.
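Concretely, evaluating the fitted curve at a doubling of the grid edge gives
\[ \frac{t(2000)}{t(1000)} \approx \frac{0.6186}{0.1714} \approx 3.6, \]
close to the factor of 4 a pure $n^2$ term would predict; the shortfall comes from the constant $0.02151$ overhead term.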
*** Speedup
The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around $n=1000$. While I am not entirely sure why it follows this trend, it may be related to warp scheduling.
#+ATTR_LATEX: :width 8cm
[[./cuda-speedup.png]]
*** Efficiency
Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by $E = \frac{S}{4992}$. As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.

BIN
report/report.pdf Normal file

Binary file not shown.

118
report/report.tex Normal file
View File

@@ -0,0 +1,118 @@
% Created 2021-12-08 Wed 18:34
% Intended LaTeX compiler: pdflatex
\documentclass[11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{grffile}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\usepackage{amsfonts} \usepackage{amssymb} \usepackage{mathtools} \usepackage{upgreek}
\author{Logan Hunt}
\date{\today}
\title{Final Project: Game of Life}
\hypersetup{
pdfauthor={Logan Hunt},
pdftitle={Final Project: Game of Life},
pdfkeywords={},
pdfsubject={},
pdfcreator={Emacs 27.2 (Org mode 9.4.4)},
pdflang={English}}
\begin{document}
\maketitle
\section{Description}
\label{sec:orgee5348a}
From \href{https://mathworld.wolfram.com/CellularAutomaton.html}{Wolfram MathWorld}:
\begin{quote}
A cellular automaton is a collection of "colored" cells on a grid of specified shape that evolves through a number of discrete time steps according to a set of rules based on the states of neighboring cells. The rules are then applied iteratively for as many time steps as desired.
\end{quote}
Conway's Game of Life is one such automaton. In the Game of Life, the rules for each cell are as follows (from \href{https://en.wikipedia.org/wiki/Conway\%27s\_Game\_of\_Life}{Wikipedia}):
\begin{quote}
\begin{enumerate}
\item Any live cell with fewer than two live neighbours dies, as if by underpopulation.
\item Any live cell with two or three live neighbours lives on to the next generation.
\item Any live cell with more than three live neighbours dies, as if by overpopulation.
\item Any dead cell with exactly three live neighbours becomes a live cell, as if by reproduction.
\end{enumerate}
\end{quote}
To help visualize this automaton I created a script to go through the output of my Game of Life simulation and compile a video with ffmpeg. As an example, I've uploaded the output of a simulation with a 1920x1080 grid of cells with 1000 iterations \href{https://www.youtube.com/watch?v=N\_aUWYNqpeY}{to YouTube}. Each cell that is white is alive and each black cell is dead.
There are four implementations of Conway's Game of Life in this project: a serial implementation, a distributed memory implementation (in MPI), a shared memory implementation (in OpenMP), and a GPU implementation (in CUDA).
A timing study is performed on each implementation by measuring the elapsed time of the program over varying initial grid sizes and, in the shared and distributed memory versions, varying numbers of cores. In each case, both the time spent computing iterations and the total wall clock time are measured.
\section{Performance analysis (of Game of Life iteration time)}
\label{sec:org1f7f2b6}
Results can be found on a \href{https://docs.google.com/spreadsheets/d/1QxCsyMFzk67Qpuv-xZ-tRny4jHMebXTrdq5ncc7C4Tw/edit?usp=sharing}{Google Sheet}
\subsection{Shared memory vs distributed memory}
\label{sec:org9f5d8c9}
\subsubsection{Runtime}
\label{sec:org9209384}
For every problem size, the runtime of both implementations decreases as the number of cores increases (as one would certainly hope). As the problem size increases, the overall difference between the two implementations' runtimes also decreases, meaning they follow the same trend. This can be seen in the runtimes of both implementations on a small grid and a large grid:
\begin{center}
\includegraphics[width=8cm]{./cores-vs-runtimes.png}
\end{center}
Both runtimes appear to follow a power law in the number of cores. Using an online regression calculator, the MPI life computation (iteration computation time only) runtime was fit to \(t(p)=\frac{274.449}{p^{0.985}}\) with a correlation coefficient of \(r=-0.999892441\). Since the exponent is very close to 1, the runtime fits what one would expect: \(T_{\text{parallel}} = \frac{T_{\text{serial}}}{p}\).
\subsubsection{Speedup}
\label{sec:org3812f4f}
In speedup, both implementations tend to improve as the number of cores increases, for all problem sizes. The increase is not strictly monotonic, however: at some core counts the shared memory implementation's speedup actually drops below that of the previous core count.
\begin{center}
\includegraphics[width=10cm]{./speedups-vs-cores.png}
\end{center}
\subsubsection{Efficiency}
\label{sec:org477fbb5}
Efficiency is the ratio of speedup to the number of processors (\(E = \frac{S}{p}\)), i.e. the speedup per processor, so it can be read off a speedup-versus-cores plot as the slope of the line from the origin to each point. Thus efficiency can be assessed without plotting it explicitly.
By definition, a program is "strongly scalable" if it keeps its efficiency constant as the number of processors increases while the problem size is held fixed. In the results, the distributed memory life computation speedup line has a roughly constant slope, meaning the efficiency is roughly constant. Thus, the MPI version is strongly scalable.
However, the shared memory (OpenMP) implementation does not appear to be perfectly strongly scalable. Across problem sizes, its speedup does not maintain a constant slope; it tends to match the MPI implementation's efficiency up to some core count, after which the slope drops off.
Theoretically, the OpenMP implementation should be just as strongly scalable as the MPI implementation. One possible source of the extra overhead is thread scheduling.
\subsection{CUDA Implementation}
\label{sec:org31daf97}
\subsubsection{Runtime}
\label{sec:orgb6b22ab}
For the CUDA implementation, different grid sizes are used to measure the iteration time as well as the wall time. Again, 1000 iterations are used for the timing study.
\begin{center}
\includegraphics[width=8cm]{./cuda-times.png}
\end{center}
Using an online regression calculator again, the runtime as a function of input size can be expressed by \(t(n) = (1.486)(10^{-7})n^2 + (1.328)(10^{-6})n + 0.02151\) with a correlation coefficient \(r = 0.9999278678\).
Since the number of cores is constant, we would hope to see a quadratic increase in the runtime as the input size grows. This is because the number of cells increases with \((\text{input size})^2\).
Indeed, this is what we see.
\subsubsection{Speedup}
\label{sec:org888d520}
The speedup of the CUDA implementation as input size increases tends to follow a logarithmic curve, plateauing after around \(n=1000\). While I am not entirely sure why it follows this trend, it may be related to warp scheduling.
\begin{center}
\includegraphics[width=8cm]{./cuda-speedup.png}
\end{center}
\subsubsection{Efficiency}
\label{sec:org530b4aa}
Since the core count on the K80 is constant (4992 CUDA cores), the efficiency can be calculated by \(E = \frac{S}{4992}\). As the efficiency is just a constant multiplied by the speedup, the efficiency graph will just be a scaled version of the speedup graph. As such the efficiency will not be constant over different input sizes since the speedup isn't, and thus the CUDA implementation is not strongly scalable.
\end{document}

Binary file not shown.
