diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 6cd330079..fdf1d02c0 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -37,60 +37,6 @@ __device__ float atomicMax(float* address, float val) {
     return __int_as_float(old);
 }
 
-__device__ float atomicMin(float* address, float val) {
-    int* address_as_i = reinterpret_cast<int*>(address);
-    int old = *address_as_i, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(
-            reinterpret_cast<int*>(address), assumed,
-            __float_as_int(fminf(val, __int_as_float(assumed))));
-    } while (assumed != old);
-    return __int_as_float(old);
-}
-
-__device__ float dDequantizeFP4(unsigned char val, float absmax)
-{
-    float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
-    if((val & 0b0110) == 0)
-    {
-        // subnormal
-        if((val & 0b0001) == 0)
-            return 0.0f;
-        else
-            return sign*0.0625f*absmax;
-    }
-    else
-    {
-        // normal
-        float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
-        float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
-
-        return sign*exponent*fraction*absmax;
-    }
-}
-
-__device__ float d2DequantizeFP4(unsigned char val)
-{
-    float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
-    if((val & 0b0110) == 0)
-    {
-        // subnormal
-        if((val & 0b0001) == 0)
-            return 0.0f;
-        else
-            return sign*0.0625f;
-    }
-    else
-    {
-        // normal
-        float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
-        float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
-
-        return sign*exponent*fraction;
-    }
-}
-
 __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
 {
     float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
@@ -167,60 +113,6 @@ __device__ unsigned char dQuantizeFP4(float x)
     return 0b0000+sign;
 }
 
-__device__ half dhDequantizeNF4(unsigned char val)
-{
-    // the values for this tree was generated by test_normal_map_tree
-    // in the file tests/test_functional.py
-    if((val & 0b1000) == 8)
-        if((val & 0b0100) == 4) // 1
-            if((val & 0b0010) == 2) // 11
-                if((val & 0b0001) == 1) // 111
-                    return 1.0f;
-                else
-                    return 0.7229568362236023f;
-            else
-                if((val & 0b0001) == 1) // 110
-                    return 0.5626170039176941f;
-                else
-                    return 0.44070982933044434f;
-        else
-            if((val & 0b0010) == 2) //10
-                if((val & 0b0001) == 1) // 101
-                    return 0.33791524171829224f;
-                else
-                    return 0.24611230194568634f;
-            else
-                if((val & 0b0001) == 1) // 100
-                    return 0.16093020141124725f;
-                else
-                    return 0.07958029955625534f;
-
-    else
-        if((val & 0b0100) == 4) // 0
-            if((val & 0b0010) == 2) //01
-                if((val & 0b0001) == 1) // 011
-                    return 0.0f;
-                else
-                    return -0.09105003625154495f;
-            else
-                if((val & 0b0001) == 1) // 010
-                    return -0.18477343022823334f;
-                else
-                    return -0.28444138169288635f;
-        else
-            if((val & 0b0010) == 2) //00
-                if((val & 0b0001) == 1) // 001
-                    return -0.39491748809814453f;
-                else
-                    return -0.5250730514526367f;
-            else
-                if((val & 0b0001) == 1) // 000
-                    return -0.6961928009986877f;
-                else
-                    return -1.0f;
-
-}
-
 __device__ __forceinline__ float dDequantizeNF4(unsigned char val)
 {
@@ -3424,118 +3316,6 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc
 }
 
-
-//#define ROWS 2
-//template <typename T, int THREADS, int ITEMS> __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc)
-//{
-//// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
-//// 1. Load dataB into register
-//// 2. Dequantize B
-//// 3. Fetch data from A and multiply
-//
-//  typedef cub::BlockLoad<T, THREADS, ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
-//  //__shared__ typename LoadA::TempStorage loada;
-//  typedef cub::BlockLoad<T, THREADS, ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
-//  //__shared__ typename LoadB::TempStorage loadb;
-//  typedef cub::BlockReduce<T, THREADS> BlockReduce;
-//  // Allocate shared memory for BlockReduce
-//  //__shared__ typename BlockReduce::TempStorage reduce;
-//
-//  __shared__ union {
-//    typename BlockReduce::TempStorage reduce;
-//    typename LoadB::TempStorage loadb;
-//    typename LoadA::TempStorage loada;
-//  } temp_storage;
-//
-//
-//  T dataA[ITEMS];
-//  T local_B[ITEMS];
-//  T local_accC[ROWS];
-//  int valid_items = 0;
-//  const int col_offset = blockIdx.x * 8;
-//
-//  __shared__ T tileA[ROWS*THREADS*ITEMS];
-//  __shared__ T accumulatorC[ROWS*8];
-//
-//  //#pragma unroll 8
-//  //for(int i = 0; i < 8; i++)
-//  //  tileA[threadIdx.x + (i*256)] = 0.0f;
-//  //__syncthreads();
-//  if(threadIdx.x < 64)
-//    accumulatorC[threadIdx.x] = 0.0f;
-//  __syncthreads();
-//
-//
-//  for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS)
-//  {
-//    valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
-//    int baserow = 0;
-//    for(int row = baserow; row < (baserow+ROWS) && row < N; row++)
-//    {
-//      LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f);
-//
-//      #pragma unroll ITEMS
-//      for(int k = 0; k < ITEMS; k++)
-//        tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k];
-//
-//      __syncthreads();
-//    }
-//    baserow += ROWS;
-//
-//    // load 16 columns from B at a time. B is transposed, so its like loading rows
-//    // each warp loads one row
-//    // each thread loads 128 byte
-//
-//    // col: inner_idx + warp_lane
-//    // row: ldb*(offset + warp_id)
-//    for(int col = 0; col < 8 && (col_offset + col) < M; col++)
-//    {
-//      int colB = col_offset + col;
-//
-//      for(int k = 0; k < ROWS; k++)
-//        local_accC[k] = 0.0f;
-//
-//      int base_idxB = ldb*colB;
-//      valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
-//      LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f);
-//      __syncthreads();
-//
-//      for(int row = 0; row < ROWS && row < N; row++)
-//      {
-//        #pragma unroll ITEMS
-//        for(int k = 0; k < ITEMS; k++)
-//        {
-//          int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k);
-//          local_accC[row] += tileA[idxA]*local_B[k];
-//        }
-//
-//        local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum());
-//        if(threadIdx.x == 0)
-//          atomicAdd(&accumulatorC[row*8 + col], local_accC[row]);
-//      }
-//    }
-//  }
-//
-//  for(int row = 0; row < ROWS && row < N; row++)
-//  {
-//    int out_idx = ldc*row + col_offset;
-//
-//    //if(threadIdx.x < 8)
-//    //  if(accumulatorC[row*8 + threadIdx.x] != 0.0)
-//    //    printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x);
-//
-//    if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M)
-//    {
-//      //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx);
-//      out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x];
-//    }
-//  }
-//
-//
-//
-//}
-
 template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n)
 {
     for(long i = (blockDim.x*blockIdx.x) + threadIdx.x; i < n; i+=(blockDim.x*gridDim.x))
@@ -3756,8 +3536,6 @@ MAKE_optimizerStatic8bit2State(ADAM, float)
 
 template __global__ void kPercentileClipping<float, 2048, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
 template __global__ void kPercentileClipping<half, 2048, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
-// template __global__ void kPercentileClipping<float, 2048, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
-// template __global__ void kPercentileClipping<half, 2048, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
 
 #define MAKE_kQuantizeBlockwise(dtype, blocksize, num_per_thread, stochastic, data_type_name) \
 template __global__ void kQuantizeBlockwise<dtype, blocksize, num_per_thread, stochastic, data_type_name>(float * code, dtype * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); \
diff --git a/csrc/ops.cu b/csrc/ops.cu
index afe1eb275..e6c2bb443 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -674,24 +674,10 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out
 
     int num_blocks = (m+31)/32;
 
-    //cout << num_blocks << endl;
-    //cout << lda << endl;
-    //cout << ldb << endl;
-    //cout << ldc << endl;
-
-    //cout << m << endl;
-    //cout << n << endl;
-    //cout << k << endl;
     if(bits == 32)
-        //gemm_device<T, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
         gemm_device<T, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
     if(bits == 16)
-        //gemm_device<T, 256><<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
         gemm_device<T, 160><<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
-        //gemm_device<T, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
-        //gemm_device<T, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
-        //gemm_device<T, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
-        //gemm_device<T, 64><<< num_blocks, 64, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
 }
 
 template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
@@ -699,18 +685,7 @@ template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsi
 {
     int num_blocks = (m+31)/32;
 
-    //cout << num_blocks << endl;
-    //cout << lda << endl;
-    //cout << ldb << endl;
-    //cout << ldc << endl;
-
-    //cout << m << endl;
-    //cout << n << endl;
-    //cout << k << endl;
     kgemm_4bit_inference<T, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
-    //kgemm_4bit_inference<T, 256><<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
-    //kgemm_4bit_inference<T, 160><<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
-    //kgemm_4bit_inference<T, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
 }
 
 template <typename T> void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, cudaStream_t stream)
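
Note on the FP4 removals above: the decode that the deleted dDequantizeFP4/d2DequantizeFP4 pair computed is still available through the retained dDequantizeFP4Tree, which encodes the same 16-entry codebook as a branch tree. The host-side sketch below is not part of this diff; the helper name dequantize_fp4_ref and the main driver are illustrative only. It reproduces the deleted arithmetic so the removal can be sanity-checked against the codebook it encoded.

#include <cstdio>

// Mirrors the arithmetic of the deleted device function dDequantizeFP4:
// bit 3 = sign, bits 2..1 select the power, bit 0 selects the fraction.
static float dequantize_fp4_ref(unsigned char val, float absmax)
{
    float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
    if ((val & 0b0110) == 0)
    {
        // subnormal: exact zero or +/- 0.0625 * absmax
        return (val & 0b0001) == 0 ? 0.0f : sign * 0.0625f * absmax;
    }
    // normal: the two middle bits pick the power, the low bit picks 1.0 vs 1.5
    float exponent = ((val & 0b0100) == 4 ? 2.0f : 8.0f) + ((val & 0b0010) == 2 ? 0.0f : 2.0f);
    float fraction = (val & 0b0001) == 1 ? 1.5f : 1.0f;
    return sign * exponent * fraction * absmax;
}

int main()
{
    // Dump the 16-entry codebook for absmax = 1.0; comparing this output with
    // dDequantizeFP4Tree shows the two formulations decode identically.
    for (unsigned char v = 0; v < 16; ++v)
        printf("%2u -> % g\n", v, dequantize_fp4_ref(v, 1.0f));
    return 0;
}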