-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathstudent.cuh
104 lines (90 loc) · 2.95 KB
/
student.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#pragma once
#include "assert.h"
#include "stdio.h"
#include <algorithm>
#include <cuda.h>
#include <cuda_runtime.h>
#include "utils/logger.cuh"
#include "utils/rng.cuh"
#include "utils/timer.h"
/*! \brief File containing student code.
Students should only modify this file.
*/
// You can use this kernel definition
/*
__global__ void GMM(float *A, float *B, float *C, size_t Arow, size_t Acol, size_t Bcol){
}
*/
/*! \brief Naive dense matrix multiply C = A * B, one thread per element of C.
 *
 * Launch layout: 2D grid of 2D blocks. The x dimension indexes columns of C
 * and the y dimension indexes rows of C. Any block shape works; threads that
 * fall past the edge of C exit through the bounds check. Uses no shared memory.
 *
 * NOTE(review): assumes all three matrices are dense and row-major
 * (A is Arow x Acol, B is Acol x Bcol, C is Arow x Bcol) -- TODO confirm
 * against the caller / reference implementation.
 *
 * \param A    device pointer to the left operand
 * \param B    device pointer to the right operand
 * \param C    device pointer to the output
 * \param Arow number of rows of A (and of C)
 * \param Acol number of columns of A (and rows of B)
 * \param Bcol number of columns of B (and of C)
 */
__global__ void GMM(const float *__restrict__ A, const float *__restrict__ B, float *__restrict__ C,
                    size_t Arow, size_t Acol, size_t Bcol)
{
    // Widen before multiplying so the global index cannot overflow 32 bits.
    const size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so edge threads have no element.
    if (row >= Arow || col >= Bcol)
    {
        return;
    }

    float acc = 0.0f;
    for (size_t k = 0; k < Acol; ++k)
    {
        acc += A[row * Acol + k] * B[k * Bcol + col];
    }
    C[row * Bcol + col] = acc;
}

// Do NOT modify the function Image
/*! \brief Multiplies hostA by hostB on the GPU and writes the product to hostC.
 *
 * Allocates device buffers, copies both inputs to the device, launches GMM,
 * copies the result back and frees the device buffers. Each phase is timed
 * and reported through Log(debug, ...).
 *
 * If matAcol != matBrow the function logs a critical message and returns
 * without touching hostC.
 *
 * NOTE(review): the host buffers are assumed to be dense row-major arrays of
 * matArow*matAcol, matBrow*matBcol and matArow*matBcol floats -- TODO confirm.
 *
 * \param matArow rows of A
 * \param matAcol columns of A
 * \param matBrow rows of B (must equal matAcol)
 * \param matBcol columns of B
 * \param hostA   host pointer to A (read)
 * \param hostB   host pointer to B (read)
 * \param hostC   host pointer to C (written)
 */
void BasicGMM(size_t matArow, size_t matAcol, size_t matBrow, size_t matBcol, float *hostA, float *hostB, float *hostC)
{
    if (matAcol != matBrow)
    {
        Log(critical, "Incorrect matrix configuration");
        return;
    }

    // device memory pointers
    float *deviceA = nullptr, *deviceB = nullptr, *deviceC = nullptr;

    size_t cSz = matArow * matBcol;
    const size_t aBytes = matArow * matAcol * sizeof(float);
    const size_t bBytes = matBrow * matBcol * sizeof(float);
    const size_t cBytes = cSz * sizeof(float);

    //////////////////////////////////////////
    // Allocate GPU Memory
    //////////////////////////////////////////
    Timer t;
    Log(debug, "Allocating GPU memory");

    CUDA_RUNTIME(cudaMalloc((void **)&deviceA, aBytes));
    CUDA_RUNTIME(cudaMalloc((void **)&deviceB, bBytes));
    CUDA_RUNTIME(cudaMalloc((void **)&deviceC, cBytes));

    double seconds = t.elapsed();
    Log(debug, "done... %f sec\n\n", seconds);

    //////////////////////////////////////////
    // Copy Host Data to GPU
    //////////////////////////////////////////
    Log(debug, "Copying host memory to the GPU");
    t.reset();

    CUDA_RUNTIME(cudaMemcpy(deviceA, hostA, aBytes, cudaMemcpyHostToDevice));
    CUDA_RUNTIME(cudaMemcpy(deviceB, hostB, bBytes, cudaMemcpyHostToDevice));

    seconds = t.elapsed();
    Log(debug, "done... %f sec\n\n", seconds);

    //////////////////////////////////////////
    // GPU M-M multiplication computation
    //////////////////////////////////////////
    Log(debug, "Performing GPU Matrix-Multiplication");
    t.reset();

    // A zero-sized grid is an invalid launch configuration, so skip the
    // launch entirely when C has no elements.
    if (cSz > 0)
    {
        const dim3 block(16, 16);
        // Ceil-divide so partial tiles at the right/bottom edges are covered.
        // Grid dimensions are hardware-limited; an oversized grid is reported
        // by the cudaGetLastError() check below rather than silently ignored.
        const dim3 grid((unsigned int)((matBcol + block.x - 1) / block.x),
                        (unsigned int)((matArow + block.y - 1) / block.y));

        GMM<<<grid, block>>>(deviceA, deviceB, deviceC, matArow, matAcol, matBcol);
        // Kernel launches do not return a status; query it explicitly.
        CUDA_RUNTIME(cudaGetLastError());
    }

    // Wait for the kernel so the timing covers execution and any
    // asynchronous execution error surfaces here.
    CUDA_RUNTIME(cudaDeviceSynchronize());

    seconds = t.elapsed();
    Log(debug, "done... %f sec\n\n", seconds);

    //////////////////////////////////////////
    // Copy GPU Data to Host
    //////////////////////////////////////////
    Log(debug, "Copying GPU memory to the host");
    t.reset();

    CUDA_RUNTIME(cudaMemcpy(hostC, deviceC, cBytes, cudaMemcpyDeviceToHost));

    seconds = t.elapsed();
    Log(debug, "done... %f sec\n\n", seconds);

    //////////////////////////////////////////
    // Delete GPU memory
    //////////////////////////////////////////
    Log(debug, "Deleting GPU memory");
    t.reset();

    CUDA_RUNTIME(cudaFree(deviceA));
    CUDA_RUNTIME(cudaFree(deviceB));
    CUDA_RUNTIME(cudaFree(deviceC));

    seconds = t.elapsed();
    Log(debug, "done... %f sec\n\n", seconds);
}