5-computation

The document discusses CUDA programming with a focus on launch configurations for large data sets, matrix squaring, and GPU computation hierarchy. It highlights issues such as out-of-bounds access and the importance of memory coalescing, as well as the impact of thread divergence on execution efficiency. Additionally, it provides examples of kernel functions and performance comparisons between CPU and GPU implementations.


CUDA Programming

Launch Configuration for Huge Data


// Find two issues with this code.
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(unsigned *vector) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    vector[id] = id;                            // Access out-of-bounds
}
#define BLOCKSIZE 1024
int main(int nn, char *str[]) {
    unsigned N = atoi(str[1]);
    unsigned *vector, *hvector;
    cudaMalloc(&vector, N * sizeof(unsigned));
    hvector = (unsigned *)malloc(N * sizeof(unsigned));

    unsigned nblocks = ceil(N / BLOCKSIZE);     // Needs floating-point division
    printf("nblocks = %d\n", nblocks);

    dkernel<<<nblocks, BLOCKSIZE>>>(vector);
    cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4d ", hvector[ii]);
    }
    return 0;
}
Launch Configuration for Large Size
#include <stdio.h>
#include <cuda.h>
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < vectorsize) vector[id] = id;
}
#define BLOCKSIZE 1024
int main(int nn, char *str[]) {
    unsigned N = atoi(str[1]);
    unsigned *vector, *hvector;
    cudaMalloc(&vector, N * sizeof(unsigned));
    hvector = (unsigned *)malloc(N * sizeof(unsigned));

    unsigned nblocks = ceil((float)N / BLOCKSIZE);
    printf("nblocks = %d\n", nblocks);

    dkernel<<<nblocks, BLOCKSIZE>>>(vector, N);
    cudaMemcpy(hvector, vector, N * sizeof(unsigned), cudaMemcpyDeviceToHost);
    for (unsigned ii = 0; ii < N; ++ii) {
        printf("%4d ", hvector[ii]);
    }
    return 0;
}
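A side note not on the slides: the same rounding up can also be done purely with integer arithmetic, which avoids the float cast (a minimal sketch, assuming N > 0 and that N + BLOCKSIZE - 1 does not overflow unsigned):

// Integer ceiling division: equivalent to ceil((float)N / BLOCKSIZE) for the
// values of N used here, with no floating-point conversion involved.
unsigned nblocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;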
Matrix Squaring
void squarecpu(unsigned *matrix, unsigned *result,
               unsigned matrixsize /* == 64 */) {
    for (unsigned ii = 0; ii < matrixsize; ++ii) {
        for (unsigned jj = 0; jj < matrixsize; ++jj) {
            for (unsigned kk = 0; kk < matrixsize; ++kk) {
                result[ii * matrixsize + jj] +=
                    matrix[ii * matrixsize + kk] * matrix[kk * matrixsize + jj];
            }
        }
    }
}

CPU time = 1.527 ms
Matrix Squaring (version 1)
square<<<1, N>>>(matrix, result, N);    // N == 64

__global__ void square(unsigned *matrix, unsigned *result,
                       unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    for (unsigned jj = 0; jj < matrixsize; ++jj) {
        for (unsigned kk = 0; kk < matrixsize; ++kk) {
            result[id * matrixsize + jj] +=
                matrix[id * matrixsize + kk] * matrix[kk * matrixsize + jj];
        }
    }
}

CPU time = 1.527 ms, GPU v1 time = 6.391 ms
(Version 1 launches a single block of only 64 threads, each computing an entire row, which leaves the GPU heavily under-utilized.)
Matrix Squaring (version 2)
square<<<N, N>>>(matrix, result, N);    // N == 64

__global__ void square(unsigned *matrix, unsigned *result,
                       unsigned matrixsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned ii = id / matrixsize;      // Homework: What if you interchange ii and jj?
    unsigned jj = id % matrixsize;
    for (unsigned kk = 0; kk < matrixsize; ++kk) {
        result[ii * matrixsize + jj] +=
            matrix[ii * matrixsize + kk] * matrix[kk * matrixsize + jj];
    }
}

CPU time = 1.527 ms, GPU v1 time = 6.391 ms, GPU v2 time = 0.1 ms
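Regarding the homework question, a hedged sketch (not from the slides) of what interchanging ii and jj would look like; the expectation is that it runs slower, because consecutive threads of a warp would then access memory with a stride of matrixsize, hurting coalescing (see the slides on warps and coalescing below):

unsigned ii = id % matrixsize;   // interchanged: consecutive threads now get consecutive rows
unsigned jj = id / matrixsize;   // ...and share the same column within a warp
// result[ii * matrixsize + jj] and matrix[ii * matrixsize + kk] are now strided
// by matrixsize across a warp, so those accesses are no longer coalesced.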
GPU Computation Hierarchy
The hierarchy, from largest to smallest (number of threads at each level):

GPU: hundreds of thousands of threads
Multi-processor: tens of thousands of threads
Block: 1024 threads
Warp: 32 threads
Thread: 1
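As a side note (not from the slides), the per-device limits in this hierarchy can be queried at runtime through the CUDA runtime API; a minimal sketch:

#include <stdio.h>
#include <cuda.h>
int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // query device 0
    printf("multiprocessors (SMs) : %d\n", prop.multiProcessorCount);
    printf("max threads per SM    : %d\n", prop.maxThreadsPerMultiProcessor);
    printf("max threads per block : %d\n", prop.maxThreadsPerBlock);
    printf("warp size             : %d\n", prop.warpSize);
    return 0;
}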
Warp

A set of consecutive threads (currently 32) that execute in SIMD fashion.

SIMD == Single Instruction Multiple Data

Warp-threads are fully synchronized: there is an implicit barrier after each step / instruction.

Memory coalescing is closely related to warps.
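To illustrate the connection (this example is not from the slides; the kernel name and parameters are made up): when the 32 threads of a warp access 32 consecutive words, the hardware can merge them into a few wide memory transactions; a strided pattern cannot be merged and issues many more transactions.

__global__ void copy(unsigned *dst, const unsigned *src, unsigned n) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
        dst[id] = src[id];   // warp-threads touch consecutive words: coalesced
    // An access such as dst[id * 32] would instead send each warp-thread to a
    // different cache line: uncoalesced, and therefore much slower.
}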
Takeaway

It is a misconception that all threads in a GPU execute in lock-step. Lock-step execution holds only for threads within a warp.
Warp with Conditions
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;     // S0
    if (id % 2) vector[id] = id;                             // S1
    else vector[id] = vectorsize * vectorsize;               // S2
    vector[id]++;                                            // S3
}

Execution timeline for threads 0..7 (time flows downward; NOP = thread idles
while the other branch executes):

Thread:  0    1    2    3    4    5    6    7
         S0   S0   S0   S0   S0   S0   S0   S0
         NOP  S1   NOP  S1   NOP  S1   NOP  S1
         S2   NOP  S2   NOP  S2   NOP  S2   NOP
         S3   S3   S3   S3   S3   S3   S3   S3
Warp with Conditions

When different warp-threads execute different instructions, the threads are said to diverge.

The hardware executes the threads that satisfy the same condition together, while the remaining threads of the warp execute a no-op.

This adds sequentiality to the execution.

This problem is termed thread-divergence.

(The execution timeline is the same as on the previous slide: every thread executes S0 and S3, while S1 and S2 are serialized, each half of the warp idling while the other half runs.)
Classwork
__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    for (unsigned ii = 0; ii < id; ++ii)
        vector[id] += ii;
}
Does this code diverge?

__global__ void dkernel(unsigned *vector, unsigned vectorsize) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id % 2) vector[id] = id;
    else if (vector[id] % 2) vector[id] = id / 2;
    else vector[id] = id * 2;
}
Does this code diverge further?
vector is initialized to {0, 1, 2, 3, …}.
Thread-Divergence

Since thread-divergence makes execution sequential, are conditions in kernel code evil?

if (vectorsize < N) S1; else S2;        Condition, but no divergence

Then, are conditions that evaluate to different truth-values evil?

if (id / 32) S1; else S2;               Different truth-values, but no divergence

Takeaway

Conditions are not bad;
conditions evaluating to different truth-values are not bad either;
conditions evaluating to different truth-values for threads of the same warp are bad.
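To make the takeaway concrete, a small sketch (not from the slides; kernel names are illustrative, and it assumes blockDim.x is a multiple of 32 so warps are aligned):

__global__ void diverging(unsigned *v) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id % 2) v[id] = 1;            // odd and even lanes of the same warp take
    else        v[id] = 2;            // different branches: thread-divergence
}

__global__ void uniform(unsigned *v) {
    unsigned id = blockIdx.x * blockDim.x + threadIdx.x;
    if ((id / 32) % 2) v[id] = 1;     // all 32 threads of a warp evaluate the
    else               v[id] = 2;     // condition identically: no divergence
}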
Classwork

Rewrite the following program fragment to remove thread-divergence.

assert(x == y || x == z);
if (x == y) x = z;
else x = y;

assert(x == y || x == z);
x = y + z - x;
