Computer Architecture and Organization

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 20

COMPUTER ARCHITECTURE AND ORGANIZATION

NAME : T. RAGHAVENDRA

REG NO. : 18BCE0698

SLOT : G1+TG1

COURSERA : FUNDAMENTALS OF PARALLELISM ON INTEL ARCHITECTURE

CODE:

#include <cstdio>

// Prints a greeting that names the compiler used to build the program.
//
// The Intel macro is tested first, so the GNU branch is only compiled when
// __INTEL_COMPILER is not defined. When neither macro is defined the program
// prints nothing.
//
// Returns 0.
int main() {
#if defined(__INTEL_COMPILER)
    // Only compiled with Intel Compiler
    printf("Hello world from Intel compiler");
#elif defined(__GNUC__)
    // Only compiled with GNU Compiler
    printf("Hello world from GNU compiler");
#endif
    return 0;
}

QUIZ : 2

CODE :
#include <cstdio>

#include <cstdlib>

#include <cmath>

#include <omp.h>

#include <mkl.h>

#include "distribution.h"

// Declaration of the diffusion routine that the driver below times and checks.
// No definition appears in this listing; a comment in main() says it is
// defined in worker.cc. The parameter list is identical to ref_diffusion(),
// and main() compares the returned count against ref_diffusion()'s result.
int diffusion(const int n_particles,

const int n_steps,

const float x_threshold,

const float alpha,

VSLStreamStatePtr rnStream);

// DO NOT MODIFY THIS FUNCTION //

//unoptimized reference function

// Unoptimized reference implementation; the arithmetic is kept exactly as
// given, only the block structure lost in the listing is restored.
//
// For each of n_particles particles the position x starts at 0 and receives
// n_steps increments. Each increment draws a single uniform random number rn
// from rnStream (bounds -1.0 and 1.0) and adds
// delta_max*sinf(alpha*rn)*expf(-rn*rn) to x. After the last step the
// particle is counted as escaped when x is strictly greater than x_threshold.
//
// delta_max is not declared in this listing; presumably it is provided by
// "distribution.h" -- TODO confirm.
//
// Returns the number of escaped particles.
int ref_diffusion(const int n_particles,
                  const int n_steps,
                  const float x_threshold,
                  const float alpha,
                  VSLStreamStatePtr rnStream) {
    int n_escaped=0;
    for (int i = 0; i < n_particles; i++) {
        float x = 0.0f;
        for (int j = 0; j < n_steps; j++) {
            float rn;
            // One random number per call: this is what makes the reference slow.
            vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 1, &rn, -1.0, 1.0);
            x += delta_max*sinf(alpha*rn)*expf(-rn*rn);
        }
        // Only the final position decides whether the particle escaped.
        if (x > x_threshold) n_escaped++;
    }
    return n_escaped;
}

int main(int argc, char** argv) {

float alpha = 1.0f;

float x_threshold = 3.0f;

if(argc>1) {

alpha = atof(argv[1]);

if(argc>2) {

x_threshold = atof(argv[2]);

const int n_particles = 1<<17;

const int n_steps = 500;

VSLStreamStatePtr rnStream;

//initialize random stream

vslNewStream( &rnStream, VSL_BRNG_MT19937, 0);

//compute refernce data

const int ref_escaped = ref_diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const int n_trials = 10;


const int skip_trials = 2;

double tsum = 0.0;

bool err = false;

//compute diffusion data using function defined in worker.cc and get the timing

const double t0 = omp_get_wtime();

int n_escaped = diffusion(n_particles, n_steps, x_threshold, alpha, rnStream);

const double t1 = omp_get_wtime();

//verify the filter data with refernce data

if(n_escaped - ref_escaped > 5*sqrt(ref_escaped)) {

printf("Error: n_escaped %d, while reference is %d\n", n_escaped, ref_escaped);

} else {

// Printing verification and performance

printf("%d\t(ref: %d)\t%f\n", n_escaped, ref_escaped, t1-t0);

QUIZ : 3
#include <cstdlib>

#include <cstdio>

#include <omp.h>

#include <mkl.h>

#include <vector>

#include <algorithm>

// Declaration of the filter routine that the driver below times and checks.
// No definition appears in this listing; a comment in main() says it is
// defined in worker.cc. The parameter list is identical to filter_ref(), and
// main() compares result_row_ind element by element with filter_ref()'s
// sorted output.
void filter(const long n, const long m, float *data, const float threshold, std::vector<long>
&result_row_ind);

//reference function to verify data

// Reference row filter used to verify filter().
//
// data is read as n rows of m floats, with element (i, j) at data[i*m + j].
// For every row whose sum is strictly greater than threshold, the row index
// is appended to result_row_ind. After all rows have been visited the whole
// vector (including anything it held on entry) is sorted in ascending order.
//
//   n, m            number of rows / columns
//   data            input values, n*m floats; not written to
//   threshold       a row is selected only when its sum exceeds this value
//   result_row_ind  output; selected row indices are appended, then sorted
void filter_ref(const long n, const long m, float *data, const float threshold,
                std::vector<long> &result_row_ind) {
    for (long i = 0; i < n; i++) {
        float sum = 0.0f;
        for (long j = 0; j < m; j++) {
            sum += data[i*m+j];
        }
        if (sum > threshold) {
            result_row_ind.push_back(i);
        }
    }
    std::sort(result_row_ind.begin(), result_row_ind.end());
}

int main(int argc, char** argv) {

float threshold = 0.5;

if(argc < 2) {

threshold = 0.5;

} else {

threshold = atof(argv[1]);

const long n = 1<<15; //rows

const long m = 1<<18; //columns

float *data = (float *) malloc((long)sizeof(float)*n*m);

long random_seed = (long)(omp_get_wtime()*1000.0) % 1000L;

VSLStreamStatePtr rnStream;

vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);

//initialize 2D data

#pragma omp parallel for

for(long i =0; i < n; i++)


vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, m, &data[m*i], -1.0, 1.0);

std::vector<long> ref_result_row_ind;

//compute the refernce data using unoptimized refernce function defined above

filter_ref(n, m, data, threshold, ref_result_row_ind);

//compute actual data using the function defined in worker.cc and get the timing

std::vector<long> result_row_ind;

const double t0 = omp_get_wtime();

filter(n, m, data, threshold, result_row_ind);

const double t1 = omp_get_wtime();

//verify the actual data and the refernce data

if(ref_result_row_ind.size() != result_row_ind.size()) {

// Result sizes did not match

printf("Error: The reference and result vectors have different sizes: %ld
%ld",ref_result_row_ind.size(), result_row_ind.size());

} else {

bool passed = true;

for(long i = 0; i < ref_result_row_ind.size(); i++) {

passed &= (ref_result_row_ind[i] == result_row_ind[i]);

if(passed) {

// Printing perf

printf("Time: %f\n", t1-t0);

} else {

// Results did not match

printf("Error: The reference and result vectors did not match");


}

QUIZ : 4

CODE :

#include <cstdio>
#include <cstdlib>

#include <mkl.h>

#include <omp.h>

#include <hbwmalloc.h>

// Declaration of the FFT routine that the driver below times and checks.
// No definition appears in this listing; a comment in main() says it is
// defined in worker.cc. The parameter list is identical to runFFTs_ref(), and
// main() compares the contents of data afterwards with the buffer processed
// by runFFTs_ref().
void runFFTs( const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
DFTI_DESCRIPTOR_HANDLE *fftHandle);

// Do not modify.

//reference function

// Reference implementation: calls DftiComputeForward once per transform, one
// after the other, each time on the sub-array starting at data[i*fft_size].
// Only the call is visible here; that it transforms in place is inferred from
// main() comparing the buffers afterwards -- TODO confirm against the
// descriptor's placement setting.
//
//   fft_size   number of MKL_Complex8 elements per transform
//   num_fft    number of consecutive transforms stored in data
//   data       num_fft*fft_size elements
//   fftHandle  pointer to the descriptor handle passed to DftiComputeForward;
//              main() creates and commits it before calling this function
void runFFTs_ref(const size_t fft_size, const size_t num_fft, MKL_Complex8 *data,
                 DFTI_DESCRIPTOR_HANDLE *fftHandle) {
    for (size_t i = 0; i < num_fft; i++) {
        DftiComputeForward(*fftHandle, &data[i*fft_size]);
    }
}

int main() {

const size_t fft_size = 1L<<27;

const size_t num_fft = 32L;

MKL_Complex8 *data = (MKL_Complex8 *) _mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size,


4096);

MKL_Complex8 *ref_data = (MKL_Complex8 *)


_mm_malloc(sizeof(MKL_Complex8)*num_fft*fft_size, 4096);

//iniitialize data array and copy it to ref_data array

#pragma omp parallel

long random_seed = (long)(omp_get_wtime()*1000.0*omp_get_thread_num()) % 1000L;


VSLStreamStatePtr rnStream;

//initialize random stream

vslNewStream( &rnStream, VSL_BRNG_MT19937, random_seed);

#pragma omp for

for(size_t i = 0; i < num_fft; i++) {

//Intel MKL Rnadom stream generation function

vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, rnStream, 2*fft_size, (float *)


&data[i*fft_size], -1.0, 1.0);

//copy data to ref_data

#pragma omp for

for(long i = 0; i < (fft_size+2)*num_fft; i++) {

ref_data[i].real = data[i].real;

ref_data[i].imag = data[i].imag;

DFTI_DESCRIPTOR_HANDLE* fftHandle = new DFTI_DESCRIPTOR_HANDLE;

DftiCreateDescriptor(fftHandle, DFTI_SINGLE, DFTI_COMPLEX, 1, (MKL_LONG) fft_size);

DftiCommitDescriptor (*fftHandle);

//compute FFT using refernce function

runFFTs_ref(fft_size, num_fft, ref_data, fftHandle);

//compute and time FFT using function defined in worker.cc

const double t0 = omp_get_wtime();

runFFTs(fft_size, num_fft, data, fftHandle);

const double t1 = omp_get_wtime();


//verify the comuted FFT data with the reference FFT data

bool within_tolerance = true;

#pragma omp parallel for reduction(&: within_tolerance)

for(long i = 0; i < num_fft; i++) {

for(long j = 0; j < fft_size; j++) {

within_tolerance &= ((data[i*fft_size+j].real-ref_data[i*fft_size+j].real)

*(data[i*fft_size+j].real-ref_data[i*fft_size+j].real)

+(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag)

*(data[i*fft_size+j].imag-ref_data[i*fft_size+j].imag))

< 1.0e-6;

if(within_tolerance) {

// Printing performance

printf("Time: %f\n", t1-t0);

} else {

// Verification failed

printf("Error: Verification failed\n");

DftiFreeDescriptor (fftHandle);

_mm_free(ref_data);

_mm_free(data);

}
QUIZ : 5

CODE :

#include <cstdlib>

#include <cstdio>

#include <math.h>

#include <mpi.h>
#include <omp.h>

#include <assert.h>

#include "L.h"

// Finite difference method for strings

// d_(x, t+1) = L(x)*(d_(x+dx, t) + d_(x-dx, t))

// + 2.0f*(1.0f-L(x))*(d_(x,t))

// - d_(x, t-1)

// Declaration of the simulation routine that the driver below times and
// checks. No definition appears in this listing; a comment in main() says it
// is defined in worker.cc. The parameter list is identical to simulate_ref(),
// and main() compares the n_segments floats behind the returned pointer with
// the reference result on rank 0.
float * simulate(const float alpha, const long n_segments, const int n_steps, float *d_buf1, float
*d_buf2, const int rank, const int world_size, const long segments_per_process);

// Do not modify

//reference simulate function to verify data

// Reference time stepping; statements are kept exactly as given, only the
// block structure lost in the listing is restored.
//
// Each rank updates the segments with indices start_segment .. last_segment-1,
// i.e. segments_per_process consecutive entries beginning at
// segments_per_process*rank + 1. Entries 0 and n_segments-1 are never written
// here. After every step MPI_Allgather (in-place form, starting at &d_t1[1],
// segments_per_process floats per rank) is called on the new buffer, and the
// two buffer pointers are then swapped.
//
//   alpha                 forwarded to L()
//   n_segments            used only to compute dx = 1/n_segments
//   n_steps               number of time steps
//   d_buf1                on entry d(*, t)
//   d_buf2                on entry holds the values used as d(*, t-1) in the
//                         first step; it is overwritten with d(*, t+1)
//   rank                  this process's rank; selects its index range
//   world_size            not used by this function
//   segments_per_process  number of entries each rank updates
//
// L() is not declared in this listing; presumably it comes from "L.h" --
// TODO confirm.
//
// Returns whichever of d_buf1 / d_buf2 holds the result of the last step.
float * simulate_ref(const float alpha, const long n_segments, const int n_steps,
                     float *d_buf1, float *d_buf2, const int rank, const int world_size,
                     const long segments_per_process) {
    float* d_t = d_buf1; // buffer for d(*, t)
    float* d_t1 = d_buf2; // buffer for d(*, t+1)

    const int start_segment = segments_per_process*((long)rank) +1L;
    const int last_segment = segments_per_process*((long)rank+1L)+1L;

    const float dx = 1.0f/(float)n_segments;
    const float phase = 0.5f;

    for (int t = 0; t < n_steps; t++) {
        #pragma omp parallel for simd
        for (long i = start_segment; i < last_segment; i++) {
            const float L_x = L(alpha,phase,i*dx);
            d_t1[i] = L_x*(d_t[i+1] + d_t[i-1])
                      +2.0f*(1.0f-L_x)*(d_t[i])
                      - d_t1[i]; // The algorithm calls for d(i, t-1) here, but that is currently contained in d_t1
        }

        // Collect every rank's freshly computed range into the new buffer.
        MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, &d_t1[1], segments_per_process,
                      MPI_FLOAT, MPI_COMM_WORLD);

        float* temp = d_t1; d_t1 = d_t; d_t=temp; // swap buffers
    }

    return d_t;
}

// Fills the two buffers with the starting state of the simulation.
//
// d_buf1[i] is set to 100*sin(3.14159*i*dx) with dx = 1/n_segments. The first
// and last entries of both buffers are then set to 0. For the entries in
// between, d_buf2[i] is computed from d_buf1 (see the formula in the loop).
//
//   alpha       forwarded to L()
//   n_segments  number of entries in each buffer; the code indexes
//               [n_segments-1], so it must be at least 1
//   d_buf1      output, n_segments floats
//   d_buf2      output, n_segments floats
//
// NOTE(review): phase is n_segments/2 here, whereas simulate_ref() passes 0.5
// to L(). Left as given -- confirm which value is intended.
void initialize_buffers(const float alpha, const long n_segments, float *d_buf1, float *d_buf2) {
    const float dx = 1.0f/(float)n_segments;
    const float phase = (float)n_segments/2.0f;

    #pragma omp parallel for
    for (long i = 0; i < n_segments; i++) {
        d_buf1[i] = 100.0*sinf(3.14159*(float)i*dx);
    }

    // Both ends are set to zero in both buffers.
    d_buf1[0] = d_buf1[n_segments-1] = d_buf2[0] = d_buf2[n_segments-1] = 0.0f;

    for (long i = 1; i < n_segments-1; i++) {
        // d_1 = d_0 + v_0*dt + 0.5*a*dt^2
        d_buf2[i] = L(alpha,phase,i*dx)/2.0f*(d_buf1[i+1] + d_buf1[i-1])
                    + (1.0f-L(alpha,phase,i*dx))*(d_buf1[i]);
    }
}

int main(int argc, char** argv) {

int ret = MPI_Init(&argc,&argv);

if (ret != MPI_SUCCESS) {
printf("error: could not initialize MPI\n");

MPI_Abort(MPI_COMM_WORLD, ret);

float alpha;

if (argc < 2) {

alpha = 0.2;

} else {

alpha = atof(argv[1]);

int world_size, rank;

MPI_Status stat;

MPI_Comm_size(MPI_COMM_WORLD, &world_size);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);

const int n_steps = 1<<6;

const long n_segments = (1L<<25)+2L;

assert((n_segments-2L)%world_size == 0); // This will make MPI gather much easier to work with

const long segments_per_process = (n_segments-2)/(long)world_size;

//two buffers to store current and next position

float *d_buf1 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

float *d_buf2 = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

// Getting verificatiobn data

float *d_ref = (float *) _mm_malloc(sizeof(float)*n_segments, 4096);

if(rank == 0) {
initialize_buffers(alpha, n_segments, d_buf1, d_buf2);

MPI_Bcast(d_buf1, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(d_buf2, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

//compute reference data

float *d_ref_temp = simulate_ref(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,


segments_per_process);

if(rank == 0) {

#pragma omp parallel for

for(long i = 0; i < n_segments; i++)

d_ref[i] = d_ref_temp[i];

//initialize buffers in rank0 and broadcast them to all the processes

if(rank == 0) {

initialize_buffers(alpha, n_segments, d_buf1, d_buf2);

MPI_Bcast(d_buf1, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

MPI_Bcast(d_buf2, n_segments, MPI_FLOAT, 0, MPI_COMM_WORLD);

//compute using the function in worker.cc and get the timing

const double t0 = omp_get_wtime();

float *d_final = simulate(alpha, n_segments, n_steps, d_buf1, d_buf2, rank, world_size,


segments_per_process);
const double t1 = omp_get_wtime();

//verify computed data with the reference data in rank 0

if(rank == 0) {

bool within_tolerance = true;

#pragma omp parallel for reduction(&: within_tolerance)

for(long i = 0; i < n_segments; i++)

within_tolerance &= ((d_ref[i] - d_final[i])*(d_ref[i] - d_final[i])) < 1.0e-6;;

if(within_tolerance) {

// Printing performance as measured on node 1

printf("Time: %f\n", t1-t0);

} else {

// Verification failed

printf("Error: verification failed %f\n", t1-t0);

MPI_Finalize();

You might also like

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy