5-computation
5-computation
for
for(unsigned
(unsignedkk kk==0;
0;kk
kk<<matrixsize;
matrixsize;++kk)
++kk){{
result[ii
result[ii**matrixsize
matrixsize++jj]jj]+=
+=
matrix[ii
matrix[ii**matrixsize
matrixsize++kk] kk]**matrix[kk
matrix[kk**matrixsize
matrixsize++jj];
jj];
}}
}}
}}
}}
4
CPU time = 1.527 ms
Matrix Squaring (version 1)
// Matrix squaring, version 1: each thread computes one full row of the result.
//
// Launch used on the slide:
//     square<<<1, N>>>(matrix, result, N);   // N = 64
//
// Parameters:
//   matrix     - input, matrixsize x matrixsize values stored row-major
//                (element (r, c) lives at index r * matrixsize + c).
//   result     - output, same layout. The kernel accumulates with +=, so the
//                caller must zero it beforehand to obtain matrix * matrix.
//   matrixsize - number of rows (and columns).
//
// Both pointers are dereferenced inside the kernel, so they must be
// device-accessible.
__global__ void square(unsigned *matrix, unsigned *result, unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;

    // The launch may provide more threads than there are rows; surplus
    // threads must not read or write past the end of the arrays.
    if (id >= matrixsize) return;

    // Thread `id` owns row `id`: walk every column jj of that row and
    // accumulate the dot product of row id with column jj.
    for (unsigned jj = 0; jj < matrixsize; ++jj) {
        for (unsigned kk = 0; kk < matrixsize; ++kk) {
            result[id * matrixsize + jj] +=
                matrix[id * matrixsize + kk] * matrix[kk * matrixsize + jj];
        }
    }
}
5
CPU time = 1.527 ms, GPU v1 time = 6.391 ms
Matrix Squaring (version 2)
// Matrix squaring, version 2: each thread computes one element of the result.
//
// Launch used on the slide:
//     square<<<N, N>>>(matrix, result, N);   // N = 64
//
// Parameters:
//   matrix     - input, matrixsize x matrixsize values stored row-major
//                (element (r, c) lives at index r * matrixsize + c).
//   result     - output, same layout. The kernel accumulates with +=, so the
//                caller must zero it beforehand to obtain matrix * matrix.
//   matrixsize - number of rows (and columns).
//
// Both pointers are dereferenced inside the kernel, so they must be
// device-accessible.
__global__ void square(unsigned *matrix, unsigned *result, unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;

    // An empty matrix has nothing to compute, and dividing by zero below
    // would be undefined.
    if (matrixsize == 0) return;

    // Map the flat thread id onto a (row, column) pair.
    unsigned ii = id / matrixsize;
    // Homework: What if you interchange ii and jj?
    unsigned jj = id % matrixsize;

    // Threads beyond the last element must not touch memory. Checking the
    // row index avoids computing matrixsize * matrixsize, which could wrap.
    if (ii >= matrixsize) return;

    // Dot product of row ii with column jj.
    for (unsigned kk = 0; kk < matrixsize; ++kk) {
        result[ii * matrixsize + jj] +=
            matrix[ii * matrixsize + kk] * matrix[kk * matrixsize + jj];
    }
}
6
CPU time = 1.527 ms, GPU v1 time = 6.391 ms, GPU v2 time = 0.1 ms
GPU Computation Hierarchy
Level              Number of threads
GPU                Hundreds of thousands
Multi-processor    Tens of thousands
Block              1024
Warp               32
Thread             1
7
Warp
●
A set of consecutive threads (currently 32) that
execute in SIMD fashion.
●
SIMD == Single Instruction Multiple Data
●
Warp-threads are fully synchronized. There is
an implicit barrier after each step / instruction.
●
Memory coalescing is closely related to warps.
Takeaway
0 1 2 3 4 5 6 7
S0 S0 S0 S0 S0 S0 S0 S0 NOP
S1 S1 S1 S1
Time
S2 S2 S2 S2
S3 S3 S3 S3 S3 S3 S3 S3 9
Warp with Conditions
●
When different warp-threads execute different
instructions, threads are said to diverge.
●
Hardware executes threads satisfying same condition
together, ensuring that other threads execute a no-op.
●
This adds sequentiality to the execution.
●
This problem is called thread-divergence.
0 1 2 3 4 5 6 7
S0 S0 S0 S0 S0 S0 S0 S0
S1 S1 S1 S1
Time
S2 S2 S2 S2
S3 S3 S3 S3 S3 S3 S3 S3 10
Classwork
// Classwork kernel (loop variant): thread `id` adds 0 + 1 + ... + (id - 1)
// to vector[id], so the loop trip count differs from thread to thread.
//
// Parameters:
//   vector     - array of at least vectorsize elements; dereferenced inside
//                the kernel, so it must be device-accessible.
//   vectorsize - number of valid elements in vector.
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array must not touch memory.
    if (id >= vectorsize) return;

    for (unsigned ii = 0; ii < id; ++ii)
        vector[id] += ii;

    // Does this code diverge?
}
// Classwork kernel (branch variant): each thread overwrites vector[id] with
// a value chosen by a three-way condition on id and on the current contents
// of vector[id].
//
// Parameters:
//   vector     - array of at least vectorsize elements; dereferenced inside
//                the kernel, so it must be device-accessible.
//   vectorsize - number of valid elements in vector.
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the array must not touch memory.
    if (id >= vectorsize) return;

    if (id % 2)
        vector[id] = id;            // odd thread id
    else if (vector[id] % 2)
        vector[id] = id / 2;        // even thread id, odd stored value
    else
        vector[id] = id * 2;        // even thread id, even stored value
}
Does this code diverge further?
vector is initialized to {0, 1, 2, 3, …}.
11
Thread-Divergence
●
Since thread-divergence makes execution sequential,
are conditions evil in kernel code?
if (vectorsize < N) S1; else S2;
Condition but no divergence
●
Then, are conditions that evaluate to different truth-values
evil?
if (id / 32) S1; else S2;
Different truth-values but no divergence
Takeaway
assert(x == y || x == z);
x = y + z - x;
13