Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Add signing GPU impl
Browse files Browse the repository at this point in the history
  • Loading branch information
sakridge committed Oct 31, 2019
1 parent 78a5a9a commit ec8a282
Show file tree
Hide file tree
Showing 15 changed files with 454 additions and 187 deletions.
56 changes: 45 additions & 11 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#
# Makefile to build libcuda-crypt.a
# Makefile to build libcuda-crypt.so
# From sources in /cuda-crypt and /ed25519-verify
#
# nvcc inserts conflicting driver symbols into a static lib (.a)
# so more than one .a cannot be linked into a single program.
# nvcc inserts conflicting driver symbols into a static lib (.so)
# so more than one .so cannot be linked into a single program.
# Shared objects with device kernels also did not seem to work--
# it can build, but the kernel launch is not successful. (with cuda 9.2)
# Hence, build ed25519 ecdsa verify and chacha encryption device
Expand All @@ -24,12 +24,41 @@ CFLAGS+=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR)
all: $V/$(CHACHA_TEST_BIN) $V/$(ECC_TEST_BIN) $(V)/lib$(LIB).so

ECC_DIR:=cuda-ecc-ed25519
VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.cu sc.cu fe.cu sign.cu keypair.cu common.cu ed25519.h)

SC_SRCS:=$(addprefix $(ECC_DIR)/,sc.cu ed25519.h ge.h)
$V/sc.o: $(SC_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

KEYPAIR_SRCS:=$(addprefix $(ECC_DIR)/,keypair.cu ed25519.h ge.h)
$V/keypair.o: $(KEYPAIR_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

SEED_SRCS:=$(addprefix $(ECC_DIR)/,seed.cu ed25519.h)
$V/seed.o: $(SEED_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

GE_SRCS:=$(addprefix $(ECC_DIR)/,ge.cu ge.h precomp_data.h)
$V/ge.o: $(GE_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../$(CUDA_HEADER_DIR)/gpu_common.h ed25519.h)
$V/sign.o: $(SIGN_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.h sc.cu fe.cu keypair.cu common.cu ed25519.h)
$V/verify.o: $(VERIFY_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

$V/gpu_ctx.o: $(addprefix $(ECC_DIR)/,gpu_ctx.cu gpu_ctx.h)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

CHACHA_DIR:=cuda-crypt
CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu chacha.h common.cu)

Expand All @@ -50,25 +79,30 @@ $V/poh_verify.o: $(POH_SRCS)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

$V/crypt-dlink.o: $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@
CPU_GPU_OBJS=$(addprefix $V/,chacha_cbc.o aes_cbc.o verify.o poh_verify.o gpu_ctx.o sign.o ge.o seed.o keypair.o sc.o)

$V/lib$(LIB).a: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
$(NVCC) -Xcompiler "-fPIC" --lib --output-file $@ $^
$V/crypt-dlink.o: $(CPU_GPU_OBJS)
$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@

$V/lib$(LIB).so: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
$V/lib$(LIB).so: $V/crypt-dlink.o $(CPU_GPU_OBJS)
$(NVCC) -Xcompiler "-fPIC" --shared --output-file $@ $^

$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).a
$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).so
$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@

$V/ecc_main.o: $(addprefix $(ECC_DIR)/,main.cu ed25519.h)
@mkdir -p $(@D)
$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@

$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).a
$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).so
$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@

.PHONY:clean
clean:
rm -rf $V

test: $V/$(ECC_TEST_BIN) $V/$(CHACHA_TEST_BIN)
cd $(V) && ./$(CHACHA_TEST_BIN) 64 \
cd $(V) && ./$(ECC_TEST_BIN) 1 1 1 1 1 1
cd $(V) && ./$(ECC_TEST_BIN) 64 1 1 1 1 0
cd $(V) && ./$(ECC_TEST_BIN) 100201 1 1 4 10 1
28 changes: 26 additions & 2 deletions src/cuda-ecc-ed25519/ed25519.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,35 @@ typedef struct {

void ED25519_DECLSPEC ed25519_create_keypair(unsigned char *public_key, unsigned char *private_key, const unsigned char *seed);
void ED25519_DECLSPEC ed25519_sign(unsigned char *signature, const unsigned char *message, size_t message_len, const unsigned char *public_key, const unsigned char *private_key);

void ED25519_DECLSPEC ed25519_sign_many(const gpu_Elems* elems,
uint32_t num_elems,
uint32_t message_size,
uint32_t total_packets,
uint32_t total_signatures,
const uint32_t* message_lens,
const uint32_t* public_key_offsets,
const uint32_t* private_key_offsets,
const uint32_t* message_start_offsets,
uint8_t* signatures_out,
uint8_t use_non_default_stream);

int ED25519_DECLSPEC ed25519_verify(const unsigned char *signature, const unsigned char *message, uint32_t message_len, const unsigned char *public_key);
void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems, uint32_t num_elems, uint32_t message_size, uint32_t total_packets, uint32_t total_signatures, const uint32_t* message_lens, const uint32_t* public_key_offset, const uint32_t* signature_offset, const uint32_t* message_start_offset, uint8_t* out, uint8_t use_non_default_stream);

void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems,
uint32_t num_elems,
uint32_t message_size,
uint32_t total_packets,
uint32_t total_signatures,
const uint32_t* message_lens,
const uint32_t* public_key_offsets,
const uint32_t* private_key_offsets,
const uint32_t* message_start_offsets,
uint8_t* out,
uint8_t use_non_default_stream);

void ED25519_DECLSPEC ed25519_add_scalar(unsigned char *public_key, unsigned char *private_key, const unsigned char *scalar);
void ED25519_DECLSPEC ed25519_key_exchange(unsigned char *shared_secret, const unsigned char *public_key, const unsigned char *private_key);
void ED25519_DECLSPEC ed25519_free_gpu_mem();
void ED25519_DECLSPEC ed25519_set_verbose(bool val);

const char* ED25519_DECLSPEC ed25519_license();
Expand Down
2 changes: 1 addition & 1 deletion src/cuda-ecc-ed25519/fe.cu
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ void __device__ __host__ fe_add(fe h, const fe f, const fe g) {
Preconditions: b in {0,1}.
*/

void fe_cmov(fe f, const fe g, unsigned int b) {
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b) {
int32_t f0 = f[0];
int32_t f1 = f[1];
int32_t f2 = f[2];
Expand Down
2 changes: 1 addition & 1 deletion src/cuda-ecc-ed25519/fe.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
void __host__ __device__ fe_copy(fe h, const fe f);
int __host__ __device__ fe_isnegative(const fe f);
int __device__ __host__ fe_isnonzero(const fe f);
void fe_cmov(fe f, const fe g, unsigned int b);
void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b);
void fe_cswap(fe f, fe g, unsigned int b);

void __device__ __host__ fe_neg(fe h, const fe f);
Expand Down
12 changes: 6 additions & 6 deletions src/cuda-ecc-ed25519/ge.cu
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
}


void ge_p3_0(ge_p3 *h) {
void __host__ __device__ ge_p3_0(ge_p3 *h) {
fe_0(h->X);
fe_1(h->Y);
fe_1(h->Z);
Expand Down Expand Up @@ -330,7 +330,7 @@ void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
}


static unsigned char equal(signed char b, signed char c) {
static unsigned char __host__ __device__ equal(signed char b, signed char c) {
unsigned char ub = b;
unsigned char uc = c;
unsigned char x = ub ^ uc; /* 0: yes; 1..255: no */
Expand All @@ -340,20 +340,20 @@ static unsigned char equal(signed char b, signed char c) {
return (unsigned char) y;
}

static unsigned char negative(signed char b) {
static unsigned char __host__ __device__ negative(signed char b) {
uint64_t x = b; /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
x >>= 63; /* 1: yes; 0: no */
return (unsigned char) x;
}

static void cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
static void __host__ __device__ cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
fe_cmov(t->yplusx, u->yplusx, b);
fe_cmov(t->yminusx, u->yminusx, b);
fe_cmov(t->xy2d, u->xy2d, b);
}


static void select(ge_precomp *t, int pos, signed char b) {
static void __host__ __device__ select(ge_precomp *t, int pos, signed char b) {
ge_precomp minust;
unsigned char bnegative = negative(b);
unsigned char babs = b - (((-bnegative) & b) << 1);
Expand Down Expand Up @@ -383,7 +383,7 @@ Preconditions:
a[31] <= 127
*/

void ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
void __device__ __host__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
signed char e[64];
signed char carry;
ge_p1p1 r;
Expand Down
6 changes: 3 additions & 3 deletions src/cuda-ecc-ed25519/ge.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,20 @@ typedef struct {

void __host__ __device__ ge_p3_tobytes(unsigned char *s, const ge_p3 *h);
void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h);
int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
int __host__ __device__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);

void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b);
void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
void ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
void __host__ __device__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a);

void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
void __host__ __device__ ge_p2_0(ge_p2 *h);
void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
void ge_p3_0(ge_p3 *h);
void __host__ __device__ ge_p3_0(ge_p3 *h);
void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p);
void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
void __host__ __device__ ge_p3_to_p2(ge_p2 *r, const ge_p3 *p);
Expand Down
161 changes: 161 additions & 0 deletions src/cuda-ecc-ed25519/gpu_ctx.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#include "ed25519.h"
#include "gpu_ctx.h"
#include <pthread.h>
#include "gpu_common.h"

static pthread_mutex_t g_ctx_mutex = PTHREAD_MUTEX_INITIALIZER;

#define MAX_NUM_GPUS 8
#define MAX_QUEUE_SIZE 8

static gpu_ctx_t g_gpu_ctx[MAX_NUM_GPUS][MAX_QUEUE_SIZE] = {0};
static uint32_t g_cur_gpu = 0;
static uint32_t g_cur_queue[MAX_NUM_GPUS] = {0};
static int32_t g_total_gpus = -1;

static bool cuda_crypt_init_locked() {
if (g_total_gpus == -1) {
cudaGetDeviceCount(&g_total_gpus);
g_total_gpus = min(MAX_NUM_GPUS, g_total_gpus);
LOG("total_gpus: %d\n", g_total_gpus);
for (int gpu = 0; gpu < g_total_gpus; gpu++) {
CUDA_CHK(cudaSetDevice(gpu));
for (int queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
int err = pthread_mutex_init(&g_gpu_ctx[gpu][queue].mutex, NULL);
if (err != 0) {
fprintf(stderr, "pthread_mutex_init error %d gpu: %d queue: %d\n",
err, gpu, queue);
g_total_gpus = 0;
return false;
}
CUDA_CHK(cudaStreamCreate(&g_gpu_ctx[gpu][queue].stream));
}
}
}
return g_total_gpus > 0;
}

bool ed25519_init() {
cudaFree(0);
pthread_mutex_lock(&g_ctx_mutex);
bool success = cuda_crypt_init_locked();
pthread_mutex_unlock(&g_ctx_mutex);
return success;
}

gpu_ctx_t* get_gpu_ctx() {
int32_t cur_gpu, cur_queue;

LOG("locking global mutex");
pthread_mutex_lock(&g_ctx_mutex);
if (!cuda_crypt_init_locked()) {
pthread_mutex_unlock(&g_ctx_mutex);
LOG("No GPUs, exiting...\n");
return NULL;
}
cur_gpu = g_cur_gpu;
g_cur_gpu++;
g_cur_gpu %= g_total_gpus;
cur_queue = g_cur_queue[cur_gpu];
g_cur_queue[cur_gpu]++;
g_cur_queue[cur_gpu] %= MAX_QUEUE_SIZE;
pthread_mutex_unlock(&g_ctx_mutex);

gpu_ctx_t* cur_ctx = &g_gpu_ctx[cur_gpu][cur_queue];
LOG("locking contex mutex queue: %d gpu: %d", cur_queue, cur_gpu);
pthread_mutex_lock(&cur_ctx->mutex);

CUDA_CHK(cudaSetDevice(cur_gpu));

LOG("selecting gpu: %d queue: %d\n", cur_gpu, cur_queue);

return cur_ctx;
}

void setup_gpu_ctx(verify_ctx_t* cur_ctx,
const gpu_Elems* elems,
uint32_t num_elems,
uint32_t message_size,
uint32_t total_packets,
uint32_t total_packets_size,
uint32_t total_signatures,
const uint32_t* message_lens,
const uint32_t* public_key_offsets,
const uint32_t* signature_offsets,
const uint32_t* message_start_offsets,
size_t out_size,
cudaStream_t stream
) {
size_t offsets_size = total_signatures * sizeof(uint32_t);

LOG("device allocate. packets: %d out: %d offsets_size: %zu\n",
total_packets_size, (int)out_size, offsets_size);

if (cur_ctx->packets == NULL ||
total_packets_size > cur_ctx->packets_size_bytes) {
CUDA_CHK(cudaFree(cur_ctx->packets));
CUDA_CHK(cudaMalloc(&cur_ctx->packets, total_packets_size));

cur_ctx->packets_size_bytes = total_packets_size;
}

if (cur_ctx->out == NULL || cur_ctx->out_size_bytes < out_size) {
CUDA_CHK(cudaFree(cur_ctx->out));
CUDA_CHK(cudaMalloc(&cur_ctx->out, out_size));

cur_ctx->out_size_bytes = total_signatures;
}

if (cur_ctx->public_key_offsets == NULL || cur_ctx->offsets_len < total_signatures) {
CUDA_CHK(cudaFree(cur_ctx->public_key_offsets));
CUDA_CHK(cudaMalloc(&cur_ctx->public_key_offsets, offsets_size));

CUDA_CHK(cudaFree(cur_ctx->signature_offsets));
CUDA_CHK(cudaMalloc(&cur_ctx->signature_offsets, offsets_size));

CUDA_CHK(cudaFree(cur_ctx->message_start_offsets));
CUDA_CHK(cudaMalloc(&cur_ctx->message_start_offsets, offsets_size));

CUDA_CHK(cudaFree(cur_ctx->message_lens));
CUDA_CHK(cudaMalloc(&cur_ctx->message_lens, offsets_size));

cur_ctx->offsets_len = total_signatures;
}

LOG("Done alloc");

CUDA_CHK(cudaMemcpyAsync(cur_ctx->public_key_offsets, public_key_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
CUDA_CHK(cudaMemcpyAsync(cur_ctx->signature_offsets, signature_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_start_offsets, message_start_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_lens, message_lens, offsets_size, cudaMemcpyHostToDevice, stream));

size_t cur = 0;
for (size_t i = 0; i < num_elems; i++) {
LOG("i: %zu size: %d\n", i, elems[i].num * message_size);
CUDA_CHK(cudaMemcpyAsync(&cur_ctx->packets[cur * message_size], elems[i].elems, elems[i].num * message_size, cudaMemcpyHostToDevice, stream));
cur += elems[i].num;
}
}


void release_gpu_ctx(gpu_ctx_t* cur_ctx) {
pthread_mutex_unlock(&cur_ctx->mutex);
}

void ed25519_free_gpu_mem() {
for (size_t gpu = 0; gpu < MAX_NUM_GPUS; gpu++) {
for (size_t queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
gpu_ctx_t* cur_ctx = &g_gpu_ctx[gpu][queue];
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.packets));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.out));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_lens));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.public_key_offsets));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.private_key_offsets));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.signature_offsets));
CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_start_offsets));
if (cur_ctx->stream != 0) {
CUDA_CHK(cudaStreamDestroy(cur_ctx->stream));
}
}
}
}
Loading

0 comments on commit ec8a282

Please sign in to comment.