% Created 2023-12-11 Mon 19:24
% Intended LaTeX compiler: pdflatex
\documentclass[11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{longtable}
\usepackage{wrapfig}
\usepackage{rotating}
\usepackage[normalem]{ulem}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{capt-of}
\usepackage{hyperref}
\notindent \notag  \usepackage{amsmath} \usepackage[a4paper,margin=1in,portrait]{geometry}
\author{Elizabeth Hunt}
\date{\today}
\title{Homework 9}
\hypersetup{
 pdfauthor={Elizabeth Hunt},
 pdftitle={Homework 9},
 pdfkeywords={},
 pdfsubject={},
 pdfcreator={Emacs 28.2 (Org mode 9.7-pre)}, 
 pdflang={English}}
\begin{document}

\maketitle
\setlength\parindent{0pt}

\section{Question One}
\label{sec:org69bed2d}

With a \texttt{matrix\_dimension} set to 700, I consistently see about a 3x improvement in performance on my
10-thread machine. The serial implementation gives an average \texttt{0.189s} total runtime, while the below
parallel implementation runs in about \texttt{0.066s} after the cpu cache has filled on the first run.

\begin{verbatim}
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define matrix_dimension 700

int n = matrix_dimension;
float sum;

int main() {
  float A[n][n];
  float x0[n];
  float b[n];
  float x1[n];
  float res[n];

  srand((unsigned int)(time(NULL)));

  // not worth parallellization - rand() is not thread-safe
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      A[i][j] = ((float)rand() / (float)(RAND_MAX) * 5.0);
    }
    x0[i] = ((float)rand() / (float)(RAND_MAX) * 5.0);
  }

#pragma omp parallel for private(sum)
  for (int i = 0; i < n; i++) {
    sum = 0.0;
    for (int j = 0; j < n; j++) {
      sum += fabs(A[i][j]);
    }
    A[i][i] += sum;
  }

#pragma omp parallel for private(sum)
  for (int i = 0; i < n; i++) {
    sum = 0.0;
    for (int j = 0; j < n; j++) {
      sum += A[i][j];
    }
    b[i] = sum;
  }

  float tol = 0.0001;
  float error = 10.0 * tol;
  int maxiter = 100;
  int iter = 0;

  while (error > tol && iter < maxiter) {
#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      float temp_sum = b[i];
      for (int j = 0; j < n; j++) {
        temp_sum -= A[i][j] * x0[j];
      }
      res[i] = temp_sum;
      x1[i] = x0[i] + res[i] / A[i][i];
    }

    sum = 0.0;
#pragma omp parallel for reduction(+ : sum)
    for (int i = 0; i < n; i++) {
      float val = x1[i] - x0[i];
      sum += val * val;
    }
    error = sqrt(sum);

#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      x0[i] = x1[i];
    }

    iter++;
  }

  for (int i = 0; i < n; i++)
    printf("x[%d] = %6f \t res[%d] = %6f\n", i, x1[i], i, res[i]);

  return 0;
}

\end{verbatim}

\section{Question Two}
\label{sec:orgbeace25}

I only see lowerings in performance (likely due to overhead) on my machine using OpenMP until
\texttt{matrix\_dimension} becomes quite large, about \texttt{300} in testing. At \texttt{matrix\_dimension=1000}, I see another
about 3x improvement in total runtime (including initialization \& I/O which was untouched, so, even further
improvements could be made) on my 10-thread machine; from around \texttt{0.174} seconds to \texttt{.052}.

\begin{verbatim}
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 0
#define omp_set_num_threads(int) 0
#define omp_get_thread_num() 0
#endif

#define matrix_dimension 1000

int n = matrix_dimension;
float ynrm;

int main() {
  float A[n][n];
  float v0[n];
  float v1[n];
  float y[n];
  //
  // create a matrix
  //
  // not worth parallellization - rand() is not thread-safe
  srand((unsigned int)(time(NULL)));
  float a = 5.0;
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      A[i][j] = ((float)rand() / (float)(RAND_MAX)*a);
    }
    v0[i] = ((float)rand() / (float)(RAND_MAX)*a);
  }
  //
  // modify the diagonal entries for diagonal dominance
  // --------------------------------------------------
  //
  for (int i = 0; i < n; i++) {
    float sum = 0.0;
    for (int j = 0; j < n; j++) {
      sum = sum + fabs(A[i][j]);
    }
    A[i][i] = A[i][i] + sum;
  }
  //
  // generate a vector of ones
  // -------------------------
  //
  for (int j = 0; j < n; j++) {
    v0[j] = 1.0;
  }
  //
  // power iteration test
  // --------------------
  //
  float tol = 0.0000001;
  float error = 10.0 * tol;
  float lam1, lam0;
  int maxiter = 100;
  int iter = 0;

  while (error > tol && iter < maxiter) {
#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      y[i] = 0;
      for (int j = 0; j < n; j++) {
        y[i] = y[i] + A[i][j] * v0[j];
      }
    }

    ynrm = 0.0;
#pragma omp parallel for reduction(+ : ynrm)
    for (int i = 0; i < n; i++) {
      ynrm += y[i] * y[i];
    }
    ynrm = sqrt(ynrm);

#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      v1[i] = y[i] / ynrm;
    }

#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      y[i] = 0.0;
      for (int j = 0; j < n; j++) {
        y[i] += A[i][j] * v1[j];
      }
    }

    lam1 = 0.0;
#pragma omp parallel for reduction(+ : lam1)
    for (int i = 0; i < n; i++) {
      lam1 += v1[i] * y[i];
    }

    error = fabs(lam1 - lam0);
    lam0 = lam1;

#pragma omp parallel for
    for (int i = 0; i < n; i++) {
      v0[i] = v1[i];
    }

    iter++;
  }

  printf("in %d iterations, eigenvalue = %f\n", iter, lam1);
}
\end{verbatim}

\section{Question Three}
\label{sec:org33439e0}
\url{https://static.simponic.xyz/lizfcm.pdf}
\end{document}