Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleanup: remove unused kernels/C++ code #1458

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 0 additions & 222 deletions csrc/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,60 +37,6 @@ __device__ float atomicMax(float* address, float val) {
return __int_as_float(old);
}

// Atomic minimum for 32-bit floats, emulated with a compare-and-swap loop
// (CUDA has no native float atomicMin). The float bits are reinterpreted as
// int so atomicCAS can operate on them; the loop retries until the CAS sees
// an unchanged value. Returns the value stored at *address before this
// thread's successful update.
// NOTE(review): the retry condition compares raw bit patterns, so like the
// atomicMax variant above it assumes inputs are not NaN — confirm callers.
__device__ float atomicMin(float* address, float val) {
    int* addr_as_int = reinterpret_cast<int*>(address);
    int observed = *addr_as_int;
    int expected;
    do {
        expected = observed;
        const float current = __int_as_float(expected);
        observed = atomicCAS(addr_as_int, expected, __float_as_int(fminf(val, current)));
    } while (observed != expected);
    return __int_as_float(observed);
}

// Decode one 4-bit FP4 code and scale it by the block's absmax.
// Layout of the low nibble: bit 3 = sign, bits 2-1 select the exponent
// contribution, bit 0 selects the mantissa/fraction contribution.
__device__ float dDequantizeFP4(unsigned char val, float absmax)
{
    const float sign = (val & 0b1000) ? -1.0f : 1.0f;

    if ((val & 0b0110) == 0)
    {
        // Subnormal range: code 0 decodes to exact zero (sign ignored),
        // code 1 to the smallest representable magnitude.
        return (val & 0b0001) ? sign * 0.0625f * absmax : 0.0f;
    }

    // Normal range: rebuild the magnitude from the exponent and fraction bits.
    const float exponent = ((val & 0b0100) ? 2.0f : 8.0f) + ((val & 0b0010) ? 0.0f : 2.0f);
    const float fraction = (val & 0b0001) ? 1.5f : 1.0f;
    return sign * exponent * fraction * absmax;
}

// Decode one 4-bit FP4 code to its unscaled float value (identical mapping
// to dDequantizeFP4 but without the absmax scale factor).
// Bit 3 = sign, bits 2-1 = exponent selector, bit 0 = fraction selector.
__device__ float d2DequantizeFP4(unsigned char val)
{
    const float sign = (val & 0b1000) ? -1.0f : 1.0f;

    if ((val & 0b0110) == 0)
    {
        // Subnormal range: code 0 is exact zero, code 1 the minimum magnitude.
        return (val & 0b0001) ? sign * 0.0625f : 0.0f;
    }

    // Normal range: rebuild the magnitude from the exponent and fraction bits.
    const float exponent = ((val & 0b0100) ? 2.0f : 8.0f) + ((val & 0b0010) ? 0.0f : 2.0f);
    const float fraction = (val & 0b0001) ? 1.5f : 1.0f;
    return sign * exponent * fraction;
}

__device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
{
float sign = (val & 0b1000) == 8 ? -1.0f : 1.0f;
Expand Down Expand Up @@ -167,60 +113,6 @@ __device__ unsigned char dQuantizeFP4(float x)
return 0b0000+sign;
}

// Dequantize one 4-bit NF4 code to a half-precision value.
// The 16 constants were generated by test_normal_map_tree in
// tests/test_functional.py (per the original comment); the original
// nested-if bit tree is flattened here into a single switch on the low
// nibble — bit 3 selects the positive/negative half of the table.
__device__ half dhDequantizeNF4(unsigned char val)
{
    switch (val & 0b1111)
    {
        case 0b1111: return 1.0f;
        case 0b1110: return 0.7229568362236023f;
        case 0b1101: return 0.5626170039176941f;
        case 0b1100: return 0.44070982933044434f;
        case 0b1011: return 0.33791524171829224f;
        case 0b1010: return 0.24611230194568634f;
        case 0b1001: return 0.16093020141124725f;
        case 0b1000: return 0.07958029955625534f;
        case 0b0111: return 0.0f;
        case 0b0110: return -0.09105003625154495f;
        case 0b0101: return -0.18477343022823334f;
        case 0b0100: return -0.28444138169288635f;
        case 0b0011: return -0.39491748809814453f;
        case 0b0010: return -0.5250730514526367f;
        case 0b0001: return -0.6961928009986877f;
        case 0b0000:
        default:     return -1.0f; // default unreachable: value is masked to 4 bits
    }
}

__device__ __forceinline__ float dDequantizeNF4(unsigned char val)
{

Expand Down Expand Up @@ -3424,118 +3316,6 @@ template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inferenc

}


//#define ROWS 2
//template <typename T, int ITEMS, int THREADS> __global__ void gemm_device(int M, int N, int K, T const* A, T* B, T * out, int lda, int ldb, int ldc)
//{
//// 0. We want to fill a 8x128 tile for a thread block so we have 8x16 tile for each warp
//// 1. Load dataB into register
//// 2. Dequantize B
//// 3. Fetch data from A and multiply
//
// typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadA;
// //__shared__ typename LoadA::TempStorage loada;
// typedef cub::BlockLoad<T, THREADS , ITEMS, cub::BLOCK_LOAD_WARP_TRANSPOSE> LoadB;
// //__shared__ typename LoadB::TempStorage loadb;
// typedef cub::BlockReduce<T, THREADS> BlockReduce;
// // Allocate shared memory for BlockReduce
// //__shared__ typename BlockReduce::TempStorage reduce;
//
// __shared__ union {
// typename BlockReduce::TempStorage reduce;
// typename LoadB::TempStorage loadb;
// typename LoadA::TempStorage loada;
// } temp_storage;
//
//
// T dataA[ITEMS];
// T local_B[ITEMS];
// T local_accC[ROWS];
// int valid_items = 0;
// const int col_offset = blockIdx.x * 8;
//
// __shared__ T tileA[ROWS*THREADS*ITEMS];
// __shared__ T accumulatorC[ROWS*8];
//
// //#pragma unroll 8
// //for(int i = 0; i < 8; i++)
// // tileA[threadIdx.x + (i*256)] = 0.0f;
// //__syncthreads();
// if(threadIdx.x < 64)
// accumulatorC[threadIdx.x] = 0.0f;
// __syncthreads();
//
//
// for(int inner_idx = 0; inner_idx < K; inner_idx+= THREADS*ITEMS)
// {
// valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
// int baserow = 0;
// for(int row = baserow; row < (baserow+ROWS) && row < N; row++)
// {
// LoadA(temp_storage.loada).Load(&(A[(row*K) + inner_idx]), dataA, valid_items, 0.0f);
//
// #pragma unroll ITEMS
// for(int k = 0; k < ITEMS; k++)
// tileA[row*THREADS*ITEMS + threadIdx.x + (k*THREADS)] = dataA[k];
//
// __syncthreads();
// }
// baserow += ROWS;
//
// // load 16 columns from B at a time. B is transposed, so its like loading rows
// // each warp loads one row
// // each thread loads 128 byte
//
// // col: inner_idx + warp_lane
// // row: ldb*(offset + warp_id)
// for(int col = 0; col < 8 && (col_offset + col) < M; col++)
// {
// int colB = col_offset + col;
//
// for(int k = 0; k < ROWS; k++)
// local_accC[k] = 0.0f;
//
// int base_idxB = ldb*colB;
// valid_items = K - inner_idx > THREADS*ITEMS ? THREADS*ITEMS : K - inner_idx;
// LoadB(temp_storage.loadb).Load(&(B[base_idxB + inner_idx]), local_B, valid_items, 0.0f);
// __syncthreads();
//
// for(int row = 0; row < ROWS && row < N; row++)
// {
// #pragma unroll ITEMS
// for(int k = 0; k < ITEMS; k++)
// {
// int idxA = row*THREADS*ITEMS + threadIdx.x + (THREADS*k);
// local_accC[row] += tileA[idxA]*local_B[k];
// }
//
// local_accC[row] = BlockReduce(temp_storage.reduce).Reduce(local_accC[row], cub::Sum());
// if(threadIdx.x == 0)
// atomicAdd(&accumulatorC[row*8 + col], local_accC[row]);
// }
// }
// }
//
// for(int row = 0; row < ROWS && row < N; row++)
// {
// int out_idx = ldc*row + col_offset;
//
// //if(threadIdx.x < 8)
// // if(accumulatorC[row*8 + threadIdx.x] != 0.0)
// // printf("%i %i %i %i %f idx %i %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx, blockIdx.x);
//
// if(threadIdx.x < 8 && (col_offset + threadIdx.x) < M)
// {
// //printf("%i %i %i %i %f idx %i %i\n", row, col_offset, threadIdx.x, N, accumulatorC[row*8 + threadIdx.x], ldc, out_idx);
// out[out_idx + threadIdx.x] = accumulatorC[row*8 + threadIdx.x];
// }
// }
//
//
//
//}


template <typename T, int FUNC> __global__ void kfunc(T *A, T *B, T value, long n)
{
for(long i = (blockDim.x*blockIdx.x) + threadIdx.x; i < n; i+=(blockDim.x*gridDim.x))
Expand Down Expand Up @@ -3756,8 +3536,6 @@ MAKE_optimizerStatic8bit2State(ADAM, float)

template __global__ void kPercentileClipping<float, 2048, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
template __global__ void kPercentileClipping<half, 2048, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);
// template __global__ void kPercentileClipping<float, 128, 4>(float * __restrict__ g, float *gnorm_vec, int step, const int n);
// template __global__ void kPercentileClipping<half, 128, 4>(half * __restrict__ g, float *gnorm_vec, int step, const int n);

#define MAKE_kQuantizeBlockwise(dtype, blocksize, num_per_thread, stochastic, data_type_name) \
template __global__ void kQuantizeBlockwise<dtype, blocksize, num_per_thread, stochastic, data_type_name>(float * code, dtype * __restrict__ const A, float *absmax, unsigned char *out, float * __restrict__ const rand, const int rand_offset, const int n); \
Expand Down
25 changes: 0 additions & 25 deletions csrc/ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -674,43 +674,18 @@ template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out

int num_blocks = (m+31)/32;

//cout << num_blocks << endl;
//cout << lda << endl;
//cout << ldb << endl;
//cout << ldc << endl;

//cout << m << endl;
//cout << n << endl;
//cout << k << endl;
if(bits == 32)
//gemm_device<T, 32, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
gemm_device<T, 32, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
if(bits == 16)
//gemm_device<T, 16, 256><<< num_blocks, 256, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
gemm_device<T, 16, 160><<< num_blocks, 160, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
//gemm_device<T, 16, 128><<< num_blocks, 128, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
//gemm_device<T, 16, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
//gemm_device<T, 16, 32><<< num_blocks, 32, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
//gemm_device<T, 16, 64><<< num_blocks, 64, 0, 0 >>>(m, n, k, A, B, out, lda, ldb, ldc);
}

// Host-side launcher for the 4-bit-quantized-B GEMM inference kernel.
// A is an m x k activation matrix of type T, B holds 4-bit packed weights
// with per-block scales in absmax (block size = blocksize), and the result
// is written to out. lda/ldb/ldc are the leading dimensions forwarded to
// the kernel. Launches on the default stream with 96 threads per block.
// Removed: commented-out debug prints and alternate launch configurations
// (dead scaffolding from kernel tuning).
template <typename T> void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
{
  // One block per 32 rows of output (ceil-div). NOTE(review): the grid is
  // sized by 32-row tiles while the kernel runs 96 threads per block —
  // confirm this matches kgemm_4bit_inference<T, 96>'s internal tiling.
  int num_blocks = (m+31)/32;

  // NOTE(review): no cudaGetLastError()/cudaPeekAtLastError() after the
  // launch, so configuration errors are silently deferred to the next
  // synchronizing call — consider surfacing them here.
  kgemm_4bit_inference<T, 96><<< num_blocks, 96, 0, 0 >>>(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
}

template <typename T, int BITS> void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, cudaStream_t stream)
Expand Down
Loading