5-computation

The document discusses CUDA programming with a focus on launch configurations for large data sets, matrix squaring, and GPU computation hierarchy. It highlights issues such as out-of-bounds access and the importance of memory coalescing, as well as the impact of thread divergence on execution efficiency. Additionally, it provides examples of kernel functions and performance comparisons between CPU and GPU implementations.


CUDA Programming

Launch Configuration for Huge Data


// Find two issues with this code.
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(unsigned *vector) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    vector[id] = id;                            // Access out-of-bounds
}
#define BLOCKSIZE 1024
int main(int nn, char *str[]) {
    unsigned N = atoi(str[1]);
    unsigned *vector, *hvector;
    cudaMalloc(&vector, N * sizeof(unsigned));
    hvector = (unsigned *)malloc(N * sizeof(unsigned));

    unsigned nblocks = ceil(N / BLOCKSIZE);     // Needs floating-point division
    printf("nblocks = %d\n", nblocks);

    dkernel<<<nblocks, BLOCKSIZE>>>(vector);
    cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4d ", hvector[ii]);
    }
    return 0;
}
Launch Configuration for Large Size
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < vectorsize) vector[id] = id;
}
#define BLOCKSIZE 1024
int main(int nn, char *str[]) {
    unsigned N = atoi(str[1]);
    unsigned *vector, *hvector;
    cudaMalloc(&vector, N * sizeof(unsigned));
    hvector = (unsigned *)malloc(N * sizeof(unsigned));

    unsigned nblocks = ceil((float)N / BLOCKSIZE);
    printf("nblocks = %d\n", nblocks);

    dkernel<<<nblocks, BLOCKSIZE>>>(vector, N);
    cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4d ", hvector[ii]);
    }
    return 0;
}
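A side note not on the slides: the same rounding up can also be done purely with integer arithmetic, which avoids the float cast (a minimal sketch, assuming N > 0 and that N + BLOCKSIZE - 1 does not overflow unsigned):

// Integer ceiling division: equivalent to ceil((float)N / BLOCKSIZE) for the
// values of N used here, with no floating-point conversion involved.
unsigned nblocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;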
Matrix Squaring
void squarecpu(unsigned *matrix, unsigned *result,
               unsigned matrixsize /* == 64 */) {
    for (unsigned ii = 0; ii < matrixsize; ++ii) {
        for (unsigned jj = 0; jj < matrixsize; ++jj) {
            for (unsigned kk = 0; kk < matrixsize; ++kk) {
                result[ii * matrixsize + jj] +=
                    matrix[ii * matrixsize + kk] * matrix[kk * matrixsize + jj];
            }
        }
    }
}

CPU time = 1.527 ms
Matrix Squaring (version 1)
square<<<1, N>>>(matrix, result, N);    // N == 64

__global__ void square(unsigned *matrix, unsigned *result,
                       unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    for (unsigned jj = 0; jj < matrixsize; ++jj) {
        for (unsigned kk = 0; kk < matrixsize; ++kk) {
            result[id * matrixsize + jj] +=
                matrix[id * matrixsize + kk] * matrix[kk * matrixsize + jj];
        }
    }
}

CPU time = 1.527 ms, GPU v1 time = 6.391 ms
(Version 1 launches a single block of only 64 threads, each computing an entire row, which leaves the GPU heavily under-utilized.)
Matrix Squaring (version 2)
square<<<N, N>>>(matrix, result, N);    // N == 64

__global__ void square(unsigned *matrix, unsigned *result,
                       unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned ii = id / matrixsize;      // Homework: What if you interchange ii and jj?
    unsigned jj = id % matrixsize;
    for (unsigned kk = 0; kk < matrixsize; ++kk) {
        result[ii * matrixsize + jj] +=
            matrix[ii * matrixsize + kk] * matrix[kk * matrixsize + jj];
    }
}

CPU time = 1.527 ms, GPU v1 time = 6.391 ms, GPU v2 time = 0.1 ms
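Regarding the homework question, a hedged sketch (not from the slides) of what interchanging ii and jj would look like; the expectation is that it runs slower, because consecutive threads of a warp would then access memory with a stride of matrixsize, hurting coalescing (see the slides on warps and coalescing below):

unsigned ii = id % matrixsize;   // interchanged: consecutive threads now get consecutive rows
unsigned jj = id / matrixsize;   // ...and share the same column within a warp
// result[ii * matrixsize + jj] and matrix[ii * matrixsize + kk] are now strided
// by matrixsize across a warp, so those accesses are no longer coalesced.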
GPU Computation Hierarchy
The hierarchy, from largest to smallest (number of threads at each level):

GPU: hundreds of thousands of threads
Multi-processor: tens of thousands of threads
Block: 1024 threads
Warp: 32 threads
Thread: 1
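As a side note (not from the slides), the per-device limits in this hierarchy can be queried at runtime through the CUDA runtime API; a minimal sketch:

#include <stdio.h>
#include <cuda.h>
int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // query device 0
    printf("multiprocessors (SMs) : %d\n", prop.multiProcessorCount);
    printf("max threads per SM    : %d\n", prop.maxThreadsPerMultiProcessor);
    printf("max threads per block : %d\n", prop.maxThreadsPerBlock);
    printf("warp size             : %d\n", prop.warpSize);
    return 0;
}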
Warp

A set of consecutive threads (currently 32) that execute in SIMD fashion.

SIMD == Single Instruction Multiple Data

Warp-threads are fully synchronized: there is an implicit barrier after each step / instruction.

Memory coalescing is closely related to warps.
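To illustrate the connection (this example is not from the slides; the kernel name and parameters are made up): when the 32 threads of a warp access 32 consecutive words, the hardware can merge them into a few wide memory transactions; a strided pattern cannot be merged and issues many more transactions.

__global__ void copy(unsigned *dst, const unsigned *src, unsigned n) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
        dst[id] = src[id];   // warp-threads touch consecutive words: coalesced
    // An access such as dst[id * 32] would instead send each warp-thread to a
    // different cache line: uncoalesced, and therefore much slower.
}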
Takeaway

It is a misconception that all threads in a GPU execute in lock-step. Lock-step execution holds only for threads within a warp.
Warp with Conditions
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;     // S0
    if (id % 2) vector[id] = id;                             // S1
    else vector[id] = vectorsize * vectorsize;               // S2
    vector[id]++;                                            // S3
}

Execution timeline for threads 0..7 (time flows downward; NOP = thread idles
while the other branch executes):

Thread:  0    1    2    3    4    5    6    7
         S0   S0   S0   S0   S0   S0   S0   S0
         NOP  S1   NOP  S1   NOP  S1   NOP  S1
         S2   NOP  S2   NOP  S2   NOP  S2   NOP
         S3   S3   S3   S3   S3   S3   S3   S3
Warp with Conditions

When different warp-threads execute different instructions, the threads are said to diverge.

The hardware executes the threads that satisfy the same condition together, while the remaining threads of the warp execute a no-op.

This adds sequentiality to the execution.

This problem is termed thread-divergence.

(The execution timeline is the same as on the previous slide: every thread executes S0 and S3, while S1 and S2 are serialized, each half of the warp idling while the other half runs.)
Classwork
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    for (unsigned ii = 0; ii < id; ++ii)
        vector[id] += ii;
}
Does this code diverge?

__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id % 2) vector[id] = id;
    else if (vector[id] % 2) vector[id] = id / 2;
    else vector[id] = id * 2;
}
Does this code diverge further?
vector is initialized to {0, 1, 2, 3, …}.
Thread-Divergence

Since thread-divergence makes execution sequential, are conditions in kernel code evil?

if (vectorsize < N) S1; else S2;        Condition, but no divergence

Then, are conditions that evaluate to different truth-values evil?

if (id / 32) S1; else S2;               Different truth-values, but no divergence

Takeaway

Conditions are not bad;
conditions evaluating to different truth-values are not bad either;
conditions evaluating to different truth-values for threads of the same warp are bad.
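To make the takeaway concrete, a small sketch (not from the slides; kernel names are illustrative, and it assumes blockDim.x is a multiple of 32 so warps are aligned):

__global__ void diverging(unsigned *v) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id % 2) v[id] = 1;            // odd and even lanes of the same warp take
    else        v[id] = 2;            // different branches: thread-divergence
}

__global__ void uniform(unsigned *v) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if ((id / 32) % 2) v[id] = 1;     // all 32 threads of a warp evaluate the
    else               v[id] = 2;     // condition identically: no divergence
}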
Classwork

Rewrite the following program fragment to remove thread-divergence.

assert(x == y || x == z);
if (x == y) x = z;
else x = y;

assert(x == y || x == z);
x = y + z - x;
