CUDA tutorial/en: Difference between revisions

CUDA tutorial/en (view source)

Revision as of 20:03, 1 March 2018

23 bytes added , 6 years ago

Updating to match new version of source page

FuzzyBot

Bots

38,760

edits

@@ Line 101: / Line 101: @@
 The following example shows how to add two numbers on the GPU using CUDA. Note that this is just a very simple exercise, so do not expect to see any actual acceleration.
 <syntaxhighlight lang="cpp" line highlight="1,5">
-__global__   void add (int *a, int *b, int *c){
+__global__ void add (int *a, int *b, int *c){  // kernel: adds the two ints pointed to by a and b
+  *c = *a + *b;  // read *a and *b, store their sum through c
+}
-	*c = *a + *b;
-}
 int main(void){  // host driver: copies a and b to the GPU, launches add(), copies the sum back into c
-	int a, b, c;
+  int a, b, c;  // host-side inputs (a, b) and result (c)
-	int *dev_a, *dev_b, *dev_c;
+  int *dev_a, *dev_b, *dev_c;  // pointers filled in by the cudaMalloc calls below
-	int size = sizeof(int);
+  int size = sizeof(int);  // every buffer holds exactly one int
-//  allocate device copies of a,b, c
+  // allocate device copies of a, b, c
-cudaMalloc ( (void**) &dev_a, size);
+  cudaMalloc ( (void**) &dev_a, size);
-cudaMalloc ( (void**) &dev_b, size);
+  cudaMalloc ( (void**) &dev_b, size);
-cudaMalloc ( (void**) &dev_c, size);
+  cudaMalloc ( (void**) &dev_c, size);
-a=2; b=7;
+  a=2; b=7;
-//  copy inputs to device
+  // copy inputs to device
-cudaMemcpy (dev_a, &a, size, cudaMemcpyHostToDevice);
+  cudaMemcpy (dev_a, &a, size, cudaMemcpyHostToDevice);
-cudaMemcpy (dev_b, &b, size, cudaMemcpyHostToDevice);
+  cudaMemcpy (dev_b, &b, size, cudaMemcpyHostToDevice);
-// launch add() kernel on GPU, passing parameters
+  // launch add() kernel on GPU, passing parameters
-add <<< 1, 1 >>> (dev_a, dev_b, dev_c);
+  add <<< 1, 1 >>> (dev_a, dev_b, dev_c);  // launch configuration <<< 1, 1 >>>: one block of one thread
-// copy device result back to host
+  // copy device result back to host
-cudaMemcpy (&c, dev_c, size, cudaMemcpyDeviceToHost);
+  cudaMemcpy (&c, dev_c, size, cudaMemcpyDeviceToHost);  // NOTE(review): none of the CUDA return codes in main are checked
-cudaFree ( dev_a ); cudaFree ( dev_b ); cudaFree ( dev_c );
+  cudaFree ( dev_a ); cudaFree ( dev_b ); cudaFree ( dev_c );  // release the three device buffers; c is not used after the copy
 }
 </syntaxhighlight>
@@ Line 139: / Line 139: @@
 <syntaxhighlight lang="cpp" line highlight="1,5">
 __global__   void add (int *a, int *b, int *c){  // block-indexed variant of add(); NOTE(review): no closing brace is visible before </syntaxhighlight> — confirm against the page
+  c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];  // each block uses its own blockIdx.x to pick the element it adds
-	c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
 </syntaxhighlight>
 where blockIdx.x is the unique number identifying a CUDA block. This way each CUDA block adds one element of a[ ] to the corresponding element of b[ ] and stores the sum in c[ ].
@@ Line 153: / Line 152: @@
 = Advantages of shared memory=
 So far all the memory transfers in the kernel have been done via the regular GPU (global) memory, which is relatively slow. Often there is so much communication between the threads that performance decreases significantly.  To address this issue, there is another type of memory called '''shared memory''', which can be used to speed up memory operations between the threads. The catch, however, is that only threads within the same block can communicate through it.  To demonstrate the use of shared memory, we consider the dot product example, in which two vectors are multiplied together element by element and the products are then summed. Below is the kernel:
- <syntaxhighlight lang="cpp" line highlight="1,5">
+<syntaxhighlight lang="cpp" line highlight="1,5">
 __global__   void dot(int *a, int *b, int *c){  // first step of the dot product: one element-wise product per thread
-        int temp = a[threadIdx.x]*b[threadIdx.x];
+  int temp = a[threadIdx.x]*b[threadIdx.x];  // temp is a local variable of this thread; c is not written in this version
 }
 </syntaxhighlight>
@@ Line 162: / Line 161: @@
 #define N 512
 __global__   void dot(int *a, int *b, int *c){  // dot product of a and b, written to *c; NOTE(review): the sum reads temp[0..N-1], so this presumably expects one block of exactly N threads — confirm
-   __shared__ int temp[N];
+  __shared__ int temp[N];  // per-block scratch array: one product per thread
-   temp[threadIdx.x] = a[threadIdx.x]*b[threadIdx.x];
+  temp[threadIdx.x] = a[threadIdx.x]*b[threadIdx.x];
-   __syncthreads();
+  __syncthreads();  // wait until every thread of the block has stored its product
-   if(threadIdx.x==0){
+  if(threadIdx.x==0){  // thread 0 alone adds up the products
-	int sum; for(int i=0;i<N;i++) sum+= temp[i];
+    int sum = 0; for(int i=0;i<N;i++) sum+= temp[i];  // sum starts at 0; an uninitialized local would make the result indeterminate
-	*c=sum; }
+    *c=sum;
+  }
 }
 </syntaxhighlight>

CUDA tutorial/en: Difference between revisions

CUDA tutorial/en (view source)

Revision as of 20:03, 1 March 2018

Navigation menu

Search