Add signing GPU impl

solana-labs · Oct 31, 2019 · ec8a282 · ec8a282
1 parent 78a5a9a
commit ec8a282
Show file tree

Hide file tree

Showing 15 changed files with 454 additions and 187 deletions.
diff --git a/src/Makefile b/src/Makefile
@@ -1,9 +1,9 @@
 #
-#  Makefile to build libcuda-crypt.a
+#  Makefile to build libcuda-crypt.so
 #   From sources in /cuda-crypt and /ed25519-verify
 #
-#   nvcc inserts conflicting driver symbols into a static lib (.a)
-#   so more than one .a cannot be linked into a single program.
+#   nvcc inserts conflicting driver symbols into a static lib (.so)
+#   so more than one .so cannot be linked into a single program.
 #   Shared objects with device kernels also did not seem to work--
 #   it can build, but the kernel launch is not successful. (with cuda 9.2)
 #   Hence, build ed25519 ecdsa verify and chacha encryption device
@@ -24,12 +24,41 @@ CFLAGS+=-DENDIAN_NEUTRAL -DLTC_NO_ASM -I$(CUDA_HEADER_DIR) -I$(CUDA_SHA256_DIR)
 all: $V/$(CHACHA_TEST_BIN) $V/$(ECC_TEST_BIN) $(V)/lib$(LIB).so
 
 ECC_DIR:=cuda-ecc-ed25519
-VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.cu sc.cu fe.cu sign.cu keypair.cu common.cu ed25519.h)
 
+SC_SRCS:=$(addprefix $(ECC_DIR)/,sc.cu ed25519.h ge.h)
+$V/sc.o: $(SC_SRCS)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
+KEYPAIR_SRCS:=$(addprefix $(ECC_DIR)/,keypair.cu ed25519.h ge.h)
+$V/keypair.o: $(KEYPAIR_SRCS)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
+SEED_SRCS:=$(addprefix $(ECC_DIR)/,seed.cu ed25519.h)
+$V/seed.o: $(SEED_SRCS)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
+GE_SRCS:=$(addprefix $(ECC_DIR)/,ge.cu ge.h precomp_data.h)
+$V/ge.o: $(GE_SRCS)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
+SIGN_SRCS:=$(addprefix $(ECC_DIR)/,sign.cu sha512.h ge.h sc.h fe.cu ../$(CUDA_HEADER_DIR)/gpu_common.h ed25519.h)
+$V/sign.o: $(SIGN_SRCS)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
+VERIFY_SRCS:=$(addprefix $(ECC_DIR)/,verify.cu seed.cu sha512.cu ge.h sc.cu fe.cu keypair.cu common.cu ed25519.h)
 $V/verify.o: $(VERIFY_SRCS)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
+$V/gpu_ctx.o: $(addprefix $(ECC_DIR)/,gpu_ctx.cu gpu_ctx.h)
+	@mkdir -p $(@D)
+	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
+
 CHACHA_DIR:=cuda-crypt
 CHACHA_SRCS:=$(addprefix $(CHACHA_DIR)/,chacha_cbc.cu chacha.h common.cu)
 
@@ -50,25 +79,30 @@ $V/poh_verify.o: $(POH_SRCS)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
-$V/crypt-dlink.o: $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
-	$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@
+CPU_GPU_OBJS=$(addprefix $V/,chacha_cbc.o aes_cbc.o verify.o poh_verify.o gpu_ctx.o sign.o ge.o seed.o keypair.o sc.o)
 
-$V/lib$(LIB).a: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
-	$(NVCC) -Xcompiler "-fPIC" --lib --output-file $@ $^
+$V/crypt-dlink.o: $(CPU_GPU_OBJS)
+	$(NVCC) -Xcompiler "-fPIC" --gpu-architecture=compute_61 --device-link $^ --output-file $@
 
-$V/lib$(LIB).so: $V/crypt-dlink.o $V/chacha_cbc.o $V/aes_cbc.o $V/verify.o $V/poh_verify.o
+$V/lib$(LIB).so: $V/crypt-dlink.o $(CPU_GPU_OBJS)
 	$(NVCC) -Xcompiler "-fPIC" --shared --output-file $@ $^
 
-$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).a
+$V/$(CHACHA_TEST_BIN): $(CHACHA_DIR)/test.cu $V/lib$(LIB).so
 	$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@
 
 $V/ecc_main.o: $(addprefix $(ECC_DIR)/,main.cu ed25519.h)
 	@mkdir -p $(@D)
 	$(NVCC) -rdc=true $(CFLAGS) -c $< -o $@
 
-$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).a
+$V/$(ECC_TEST_BIN): $V/ecc_main.o $V/lib$(LIB).so
 	$(NVCC) $(CFLAGS) -L$V -l$(LIB) $< -o $@
 
 .PHONY:clean
 clean:
 	rm -rf $V
+
+test: $V/$(ECC_TEST_BIN) $V/$(CHACHA_TEST_BIN)
+	cd $(V) && ./$(CHACHA_TEST_BIN) 64 \
+	cd $(V) && ./$(ECC_TEST_BIN) 1 1 1 1 1 1
+	cd $(V) && ./$(ECC_TEST_BIN) 64 1 1 1 1 0
+	cd $(V) && ./$(ECC_TEST_BIN) 100201 1 1 4 10 1
diff --git a/src/cuda-ecc-ed25519/ed25519.h b/src/cuda-ecc-ed25519/ed25519.h
@@ -38,11 +38,35 @@ typedef struct {
 
 void ED25519_DECLSPEC ed25519_create_keypair(unsigned char *public_key, unsigned char *private_key, const unsigned char *seed);
 void ED25519_DECLSPEC ed25519_sign(unsigned char *signature, const unsigned char *message, size_t message_len, const unsigned char *public_key, const unsigned char *private_key);
+
+void ED25519_DECLSPEC ed25519_sign_many(const gpu_Elems* elems,
+                                        uint32_t num_elems,
+                                        uint32_t message_size,
+                                        uint32_t total_packets,
+                                        uint32_t total_signatures,
+                                        const uint32_t* message_lens,
+                                        const uint32_t* public_key_offsets,
+                                        const uint32_t* private_key_offsets,
+                                        const uint32_t* message_start_offsets,
+                                        uint8_t* signatures_out,
+                                        uint8_t use_non_default_stream);
+
 int ED25519_DECLSPEC ed25519_verify(const unsigned char *signature, const unsigned char *message, uint32_t message_len, const unsigned char *public_key);
-void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems, uint32_t num_elems, uint32_t message_size, uint32_t total_packets, uint32_t total_signatures, const uint32_t* message_lens, const uint32_t* public_key_offset, const uint32_t* signature_offset, const uint32_t* message_start_offset, uint8_t* out, uint8_t use_non_default_stream);
+
+void ED25519_DECLSPEC ed25519_verify_many(const gpu_Elems* elems,
+                                          uint32_t num_elems,
+                                          uint32_t message_size,
+                                          uint32_t total_packets,
+                                          uint32_t total_signatures,
+                                          const uint32_t* message_lens,
+                                          const uint32_t* public_key_offsets,
+                                          const uint32_t* private_key_offsets,
+                                          const uint32_t* message_start_offsets,
+                                          uint8_t* out,
+                                          uint8_t use_non_default_stream);
+
 void ED25519_DECLSPEC ed25519_add_scalar(unsigned char *public_key, unsigned char *private_key, const unsigned char *scalar);
 void ED25519_DECLSPEC ed25519_key_exchange(unsigned char *shared_secret, const unsigned char *public_key, const unsigned char *private_key);
-void ED25519_DECLSPEC ed25519_free_gpu_mem();
 void ED25519_DECLSPEC ed25519_set_verbose(bool val);
 
 const char* ED25519_DECLSPEC ed25519_license();

diff --git a/src/cuda-ecc-ed25519/fe.cu b/src/cuda-ecc-ed25519/fe.cu
@@ -109,7 +109,7 @@ void __device__ __host__ fe_add(fe h, const fe f, const fe g) {
     Preconditions: b in {0,1}.
 */
 
-void fe_cmov(fe f, const fe g, unsigned int b) {
+void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b) {
     int32_t f0 = f[0];
     int32_t f1 = f[1];
     int32_t f2 = f[2];

diff --git a/src/cuda-ecc-ed25519/fe.h b/src/cuda-ecc-ed25519/fe.h
@@ -25,7 +25,7 @@ void __device__ __host__ fe_tobytes(unsigned char *s, const fe h);
 void __host__ __device__ fe_copy(fe h, const fe f);
 int __host__ __device__ fe_isnegative(const fe f);
 int __device__ __host__ fe_isnonzero(const fe f);
-void fe_cmov(fe f, const fe g, unsigned int b);
+void __host__ __device__ fe_cmov(fe f, const fe g, unsigned int b);
 void fe_cswap(fe f, fe g, unsigned int b);
 
 void __device__ __host__ fe_neg(fe h, const fe f);

diff --git a/src/cuda-ecc-ed25519/ge.cu b/src/cuda-ecc-ed25519/ge.cu
@@ -271,7 +271,7 @@ void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
 }
 
 
-void ge_p3_0(ge_p3 *h) {
+void __host__ __device__ ge_p3_0(ge_p3 *h) {
     fe_0(h->X);
     fe_1(h->Y);
     fe_1(h->Z);
@@ -330,7 +330,7 @@ void ge_p3_tobytes(unsigned char *s, const ge_p3 *h) {
 }
 
 
-static unsigned char equal(signed char b, signed char c) {
+static unsigned char __host__ __device__ equal(signed char b, signed char c) {
     unsigned char ub = b;
     unsigned char uc = c;
     unsigned char x = ub ^ uc; /* 0: yes; 1..255: no */
@@ -340,20 +340,20 @@ static unsigned char equal(signed char b, signed char c) {
     return (unsigned char) y;
 }
 
-static unsigned char negative(signed char b) {
+static unsigned char __host__ __device__ negative(signed char b) {
     uint64_t x = b; /* 18446744073709551361..18446744073709551615: yes; 0..255: no */
     x >>= 63; /* 1: yes; 0: no */
     return (unsigned char) x;
 }
 
-static void cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
+static void __host__ __device__ cmov(ge_precomp *t, const ge_precomp *u, unsigned char b) {
     fe_cmov(t->yplusx, u->yplusx, b);
     fe_cmov(t->yminusx, u->yminusx, b);
     fe_cmov(t->xy2d, u->xy2d, b);
 }
 
 
-static void select(ge_precomp *t, int pos, signed char b) {
+static void __host__ __device__ select(ge_precomp *t, int pos, signed char b) {
     ge_precomp minust;
     unsigned char bnegative = negative(b);
     unsigned char babs = b - (((-bnegative) & b) << 1);
@@ -383,7 +383,7 @@ Preconditions:
   a[31] <= 127
 */
 
-void ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
+void __device__ __host__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a) {
     signed char e[64];
     signed char carry;
     ge_p1p1 r;

diff --git a/src/cuda-ecc-ed25519/ge.h b/src/cuda-ecc-ed25519/ge.h
@@ -53,20 +53,20 @@ typedef struct {
 
 void __host__ __device__ ge_p3_tobytes(unsigned char *s, const ge_p3 *h);
 void __host__ __device__ ge_tobytes(unsigned char *s, const ge_p2 *h);
-int __device__ __host__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
+int  __host__ __device__ ge_frombytes_negate_vartime(ge_p3 *h, const unsigned char *s);
 
 void __host__ __device__ ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
 void __host__ __device__ ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q);
 void __host__ __device__ ge_double_scalarmult_vartime(ge_p2 *r, const unsigned char *a, const ge_p3 *A, const unsigned char *b);
 void __host__ __device__ ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
 void __host__ __device__ ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q);
-void ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
+void __host__ __device__ ge_scalarmult_base(ge_p3 *h, const unsigned char *a);
 
 void __host__ __device__ ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p);
 void __host__ __device__ ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p);
 void __host__ __device__ ge_p2_0(ge_p2 *h);
 void __host__ __device__ ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p);
-void ge_p3_0(ge_p3 *h);
+void __host__ __device__ ge_p3_0(ge_p3 *h);
 void __host__ __device__ ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p);
 void __host__ __device__ ge_p3_to_cached(ge_cached *r, const ge_p3 *p);
 void __host__ __device__ ge_p3_to_p2(ge_p2 *r, const ge_p3 *p);

diff --git a/src/cuda-ecc-ed25519/gpu_ctx.cu b/src/cuda-ecc-ed25519/gpu_ctx.cu
@@ -0,0 +1,161 @@
+#include "ed25519.h"
+#include "gpu_ctx.h"
+#include <pthread.h>
+#include "gpu_common.h"
+
+static pthread_mutex_t g_ctx_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+#define MAX_NUM_GPUS 8
+#define MAX_QUEUE_SIZE 8
+
+static gpu_ctx_t g_gpu_ctx[MAX_NUM_GPUS][MAX_QUEUE_SIZE] = {0};
+static uint32_t g_cur_gpu = 0;
+static uint32_t g_cur_queue[MAX_NUM_GPUS] = {0};
+static int32_t g_total_gpus = -1;
+
+static bool cuda_crypt_init_locked() {
+    if (g_total_gpus == -1) {
+        cudaGetDeviceCount(&g_total_gpus);
+        g_total_gpus = min(MAX_NUM_GPUS, g_total_gpus);
+        LOG("total_gpus: %d\n", g_total_gpus);
+        for (int gpu = 0; gpu < g_total_gpus; gpu++) {
+            CUDA_CHK(cudaSetDevice(gpu));
+            for (int queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
+                int err = pthread_mutex_init(&g_gpu_ctx[gpu][queue].mutex, NULL);
+                if (err != 0) {
+                    fprintf(stderr, "pthread_mutex_init error %d gpu: %d queue: %d\n",
+                            err, gpu, queue);
+                    g_total_gpus = 0;
+                    return false;
+                }
+                CUDA_CHK(cudaStreamCreate(&g_gpu_ctx[gpu][queue].stream));
+            }
+        }
+    }
+    return g_total_gpus > 0;
+}
+
+bool ed25519_init() {
+    cudaFree(0);
+    pthread_mutex_lock(&g_ctx_mutex);
+    bool success = cuda_crypt_init_locked();
+    pthread_mutex_unlock(&g_ctx_mutex);
+    return success;
+}
+
+gpu_ctx_t* get_gpu_ctx() {
+    int32_t cur_gpu, cur_queue;
+
+    LOG("locking global mutex");
+    pthread_mutex_lock(&g_ctx_mutex);
+    if (!cuda_crypt_init_locked()) {
+        pthread_mutex_unlock(&g_ctx_mutex);
+        LOG("No GPUs, exiting...\n");
+        return NULL;
+    }
+    cur_gpu = g_cur_gpu;
+    g_cur_gpu++;
+    g_cur_gpu %= g_total_gpus;
+    cur_queue = g_cur_queue[cur_gpu];
+    g_cur_queue[cur_gpu]++;
+    g_cur_queue[cur_gpu] %= MAX_QUEUE_SIZE;
+    pthread_mutex_unlock(&g_ctx_mutex);
+
+    gpu_ctx_t* cur_ctx = &g_gpu_ctx[cur_gpu][cur_queue];
+    LOG("locking contex mutex queue: %d gpu: %d", cur_queue, cur_gpu);
+    pthread_mutex_lock(&cur_ctx->mutex);
+
+    CUDA_CHK(cudaSetDevice(cur_gpu));
+
+    LOG("selecting gpu: %d queue: %d\n", cur_gpu, cur_queue);
+
+    return cur_ctx;
+}
+
+void setup_gpu_ctx(verify_ctx_t* cur_ctx,
+                   const gpu_Elems* elems,
+                   uint32_t num_elems,
+                   uint32_t message_size,
+                   uint32_t total_packets,
+                   uint32_t total_packets_size,
+                   uint32_t total_signatures,
+                   const uint32_t* message_lens,
+                   const uint32_t* public_key_offsets,
+                   const uint32_t* signature_offsets,
+                   const uint32_t* message_start_offsets,
+                   size_t out_size,
+                   cudaStream_t stream
+                   ) {
+    size_t offsets_size = total_signatures * sizeof(uint32_t);
+
+    LOG("device allocate. packets: %d out: %d offsets_size: %zu\n",
+        total_packets_size, (int)out_size, offsets_size);
+
+    if (cur_ctx->packets == NULL ||
+        total_packets_size > cur_ctx->packets_size_bytes) {
+        CUDA_CHK(cudaFree(cur_ctx->packets));
+        CUDA_CHK(cudaMalloc(&cur_ctx->packets, total_packets_size));
+
+        cur_ctx->packets_size_bytes = total_packets_size;
+    }
+
+    if (cur_ctx->out == NULL || cur_ctx->out_size_bytes < out_size) {
+        CUDA_CHK(cudaFree(cur_ctx->out));
+        CUDA_CHK(cudaMalloc(&cur_ctx->out, out_size));
+
+        cur_ctx->out_size_bytes = total_signatures;
+    }
+
+    if (cur_ctx->public_key_offsets == NULL || cur_ctx->offsets_len < total_signatures) {
+        CUDA_CHK(cudaFree(cur_ctx->public_key_offsets));
+        CUDA_CHK(cudaMalloc(&cur_ctx->public_key_offsets, offsets_size));
+
+        CUDA_CHK(cudaFree(cur_ctx->signature_offsets));
+        CUDA_CHK(cudaMalloc(&cur_ctx->signature_offsets, offsets_size));
+
+        CUDA_CHK(cudaFree(cur_ctx->message_start_offsets));
+        CUDA_CHK(cudaMalloc(&cur_ctx->message_start_offsets, offsets_size));
+
+        CUDA_CHK(cudaFree(cur_ctx->message_lens));
+        CUDA_CHK(cudaMalloc(&cur_ctx->message_lens, offsets_size));
+
+        cur_ctx->offsets_len = total_signatures;
+    }
+
+    LOG("Done alloc");
+
+    CUDA_CHK(cudaMemcpyAsync(cur_ctx->public_key_offsets, public_key_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
+    CUDA_CHK(cudaMemcpyAsync(cur_ctx->signature_offsets, signature_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
+    CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_start_offsets, message_start_offsets, offsets_size, cudaMemcpyHostToDevice, stream));
+    CUDA_CHK(cudaMemcpyAsync(cur_ctx->message_lens, message_lens, offsets_size, cudaMemcpyHostToDevice, stream));
+
+    size_t cur = 0;
+    for (size_t i = 0; i < num_elems; i++) {
+        LOG("i: %zu size: %d\n", i, elems[i].num * message_size);
+        CUDA_CHK(cudaMemcpyAsync(&cur_ctx->packets[cur * message_size], elems[i].elems, elems[i].num * message_size, cudaMemcpyHostToDevice, stream));
+        cur += elems[i].num;
+    }
+}
+
+
+void release_gpu_ctx(gpu_ctx_t* cur_ctx) {
+    pthread_mutex_unlock(&cur_ctx->mutex);
+}
+
+void ed25519_free_gpu_mem() {
+    for (size_t gpu = 0; gpu < MAX_NUM_GPUS; gpu++) {
+        for (size_t queue = 0; queue < MAX_QUEUE_SIZE; queue++) {
+            gpu_ctx_t* cur_ctx = &g_gpu_ctx[gpu][queue];
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.packets));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.out));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_lens));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.public_key_offsets));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.private_key_offsets));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.signature_offsets));
+            CUDA_CHK(cudaFree(cur_ctx->verify_ctx.message_start_offsets));
+            if (cur_ctx->stream != 0) {
+                CUDA_CHK(cudaStreamDestroy(cur_ctx->stream));
+            }
+        }
+    }
+}