diff --git a/.gitignore b/.gitignore index d5a66b7ec114d76e5e5d2d7c52e5652a91797af2..35add7c63bb02379d982f5d37ef1ec380ae17d27 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,20 @@ gpumaths* .cproject .project .settings +*~ +# Ignore vendor files/folders +vendor/ +# Ignore Jetbrains IDE folder +.idea/* +# Ignore vim .swp buffers for open files +.*.swp +.*.swo +# Ignore logs +*.log +# Ignore generated version file +cmd/version_vars.go +*.out +# Ignore obsolete dependency management software +glide.lock +# Ignore emacs #...# files +\#*\# diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..3077020ecd11e2c3226b76b943bccd432708b8b8 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,57 @@ +# From: https://about.gitlab.com/2017/09/21/how-to-create-ci-cd-pipeline-with-autodeploy-to-kubernetes-using-gitlab-and-helm/ + +cache: + untracked: true + key: "$CI_BUILD_REF_NAME" + paths: + - vendor/ + +variables: + REPO_DIR: gitlab.com/elixxir + REPO_NAME: gpumathsnative + DOCKER_IMAGE: elixxirlabs/cuda-go:latest + +before_script: + - echo $CI_BUILD_REF + - echo $CI_PROJECT_DIR + - echo $PWD + - echo $USER + - eval $(ssh-agent -s) + - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null + - mkdir -p ~/.ssh + - chmod 700 ~/.ssh + - ssh-keyscan -t rsa gitlab.com > ~/.ssh/known_hosts + - git config --global url."git@gitlab.com:".insteadOf "https://gitlab.com/" + +stages: + - setup + - trigger_server + - trigger_release_server + +setup: + stage: setup + image: $DOCKER_IMAGE + except: + - tags + script: + - cd cgbnBindings/powm + - make turing + - make devinstall + artifacts: + paths: + - lib/ + +# TODO(?): have server pull in the latest from release/master automatically for this pipeline, if possible +trigger_server: + stage: trigger_server + script: + - "curl -X POST -F token=5be79349e632bcd07f452d04cc0583 -F ref=master https://gitlab.com/api/v4/projects/5014439/trigger/pipeline" + only: + - master + +trigger_release_server: + stage: trigger_release_server + script: + - "curl -X POST -F token=5be79349e632bcd07f452d04cc0583 -F ref=release https://gitlab.com/api/v4/projects/5014439/trigger/pipeline" + only: + - release diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7421e060bd6282da5d817a44567352b3b0be0795 --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +# elixxir/gpumathsnative + +## Building the native gpumaths library + +Before beginning, install the CUDA toolkit, version 10.2 and libgmp-dev. + +``` +$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ +cuda-ubuntu1804.pin +$ sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 +$ wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/ +cuda-repo-ubuntu1804-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb +$ sudo dpkg -i cuda-repo-ubuntu1804-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb +$ sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub +$ sudo apt-get update +$ sudo apt-get -y install cuda +``` + +``` +$ sudo apt install libgmp-dev +``` + +Next, build and install the native gpumaths library. You must have nvcc in your PATH for this to work. + +``` +$ cd cgbnBindings/powm +$ make turing +$ sudo make install +``` + +Then, you should be able to build the server with GPU support. + diff --git a/cgbnBindings/LICENSE b/cgbnBindings/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..3fec355783116d0b82c806b0dab58d2d3ba003b5 --- /dev/null +++ b/cgbnBindings/LICENSE @@ -0,0 +1,21 @@ +/*** +Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +***/ diff --git a/cgbnBindings/powm/Makefile b/cgbnBindings/powm/Makefile index f7c62c527f970acebe247fda30aa03277103e6b7..86576a3f4571fd0a764956c2993889df97a22f35 100644 --- a/cgbnBindings/powm/Makefile +++ b/cgbnBindings/powm/Makefile @@ -23,10 +23,13 @@ clean: devinstall: mkdir -p ../../lib mv libpowmo*.so ../../lib + cp powm_odd_export.h ../../lib install: - mkdir -p /opt/elixxir/lib - mv libpowmo*.so /opt/elixxir/lib + mkdir -p /opt/xxnetwork/lib + mkdir -p /opt/xxnetwork/include + mv libpowmo*.so /opt/xxnetwork/lib + cp powm_odd_export.h /opt/xxnetwork/include kepler: nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_30 powm_odd.cu -o libpowmosm40.so -lgmp diff --git a/cgbnBindings/powm/powm_odd.cu b/cgbnBindings/powm/powm_odd.cu index 9330c763e339bf18824c0ce2d88120b685617cb5..1f4f5fcff1bee64596e79b4a8db61c2b4dd13bc1 100644 --- a/cgbnBindings/powm/powm_odd.cu +++ b/cgbnBindings/powm/powm_odd.cu @@ -112,6 +112,16 @@ class cmixPrecomp { mem_t x; mem_t power; } powm_odd_input_t; + + typedef struct { + mem_t precomputation; + mem_t cypher; + } strip_input_t; + + typedef struct { + mem_t x; + mem_t y; + } mul2_input_t; typedef struct { mem_t privateKey; // Used to calculate both outputs @@ -131,7 +141,11 @@ class cmixPrecomp { mem_t publicCypherKey; } elgamal_constant_t; - + typedef struct { + mem_t prime; + mem_t Z; + } reveal_constant_t; + typedef cgbn_context_t<params::TPI, params> context_t; typedef cgbn_env_t<context_t, params::BITS> env_t; typedef typename env_t::cgbn_t bn_t; @@ -269,6 +283,25 @@ class cmixPrecomp { cgbn_set_ui32(_env, result, 1); } } + + // Find a modular root + // Precondition: Z is coprime to prime - 1 + __device__ __forceinline__ bool root_coprime(bn_t &result, const bn_t &cypher, const bn_t &Z, const bn_t &prime) { + bn_t psub1, cypherMont; + // prime should always be large, so don't check return value + cgbn_sub_ui32(_env, psub1, prime, uint32_t(1)); + bool ok = cgbn_modular_inverse(_env, result, Z, psub1); + if (ok) { + // Found inverse successfully, so do the exponentiation + uint32_t np0 = cgbn_bn2mont(_env, cypherMont, cypher, prime); + fixed_window_powm_odd(cypherMont, cypherMont, result, prime, np0); + cgbn_mont2bn(_env, result, cypherMont, prime, np0); + } else { + // The inversion result was undefined, so we must report an error + _context.report_error(cgbn_inverse_does_not_exist_error); + } + return ok; + } }; // kernel implementation using cgbn @@ -276,7 +309,7 @@ class cmixPrecomp { // Unfortunately, the kernel must be separate from the cmixPrecomp class // kernel_powm_odd<params><<<(instance_count+IPB-1)/IPB, TPB>>>(report, gpuInputs, gpuResults, instance_count); template<class params> -__global__ void kernel_powm_odd(cgbn_error_report_t *report, cgbn_mem_t<params::BITS> *constants, typename cmixPrecomp<params>::powm_odd_input_t *inputs, cgbn_mem_t<params::BITS> *outputs, size_t count) { +__global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecomp<params>::mem_t *constants, typename cmixPrecomp<params>::powm_odd_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) { int32_t instance; // decode an instance number from the blockIdx and threadIdx @@ -351,6 +384,92 @@ __global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp cgbn_store(po._env, &(outputs[instance].cypher), result); } +template<class params> +__global__ void kernel_reveal(cgbn_error_report_t *report, typename cmixPrecomp<params>::reveal_constant_t *constants, typename cmixPrecomp<params>::mem_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) { + int32_t instance; + + // decode an instance number from the blockIdx and threadIdx + instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI; + if(instance>=count) + return; + + cmixPrecomp<params> po(cgbn_report_monitor, report, instance); + typename cmixPrecomp<params>::bn_t cypher, Z, prime, result; + + cgbn_load(po._env, cypher, &(inputs[instance])); + cgbn_load(po._env, Z, &(constants->Z)); + cgbn_load(po._env, prime, &(constants->prime)); + + po.root_coprime(result, cypher, Z, prime); + + cgbn_store(po._env, &(outputs[instance]), result); +} + +template<class params> +__global__ void kernel_strip(cgbn_error_report_t *report, typename cmixPrecomp<params>::reveal_constant_t *constants, typename cmixPrecomp<params>::strip_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) { + int32_t instance; + + // decode an instance number from the blockIdx and threadIdx + instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI; + if(instance>=count) + return; + + cmixPrecomp<params> po(cgbn_report_monitor, report, instance); + typename cmixPrecomp<params>::bn_t cypher, Z, prime, precomputation, result; + + cgbn_load(po._env, cypher, &(inputs[instance].cypher)); + cgbn_load(po._env, Z, &(constants->Z)); + cgbn_load(po._env, prime, &(constants->prime)); + + // Strip runs on the last node only and it begins with a reveal operation + bool ok = po.root_coprime(result, cypher, Z, prime); + + if (ok) { + cgbn_load(po._env, precomputation, &(inputs[instance].precomputation)); + // It should be possible to get a speedup here, because the + // prime is odd + ok = cgbn_modular_inverse(po._env, precomputation, precomputation, prime); + if (ok) { + // It may be possible to do this multiplication faster + // This is just a best guess + uint32_t np0 = cgbn_bn2mont(po._env, precomputation, precomputation, prime); + cgbn_bn2mont(po._env, cypher, cypher, prime); + cgbn_mont_mul(po._env, result, precomputation, cypher, prime, np0); + cgbn_mont2bn(po._env, result, result, prime, np0); + cgbn_store(po._env, &(outputs[instance]), result); + } else { + // The second modular inverse failed + po._context.report_error(cgbn_inverse_does_not_exist_error); + } + } +} + +// Multiply x by y mod prime +template<class params> +__global__ void kernel_mul2(cgbn_error_report_t *report, typename cmixPrecomp<params>::mem_t *constants, typename cmixPrecomp<params>::mul2_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) { + int32_t instance; + + // decode an instance number from the blockIdx and threadIdx + instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI; + if(instance>=count) + return; + + cmixPrecomp<params> po(cgbn_report_monitor, report, instance); + typename cmixPrecomp<params>::bn_t x, y, prime, result; + + cgbn_load(po._env, x, &(inputs[instance].x)); + cgbn_load(po._env, y, &(inputs[instance].y)); + cgbn_load(po._env, prime, constants); + + uint32_t np0 = cgbn_bn2mont(po._env, x, x, prime); + cgbn_bn2mont(po._env, y, y, prime); + cgbn_mont_mul(po._env, result, x, y, prime, np0); + cgbn_mont2bn(po._env, result, result, prime, np0); + + cgbn_store(po._env, &(outputs[instance]), result); +} + + // Run powm kernel // Enqueues kernel on the stream and returns immediately (non-blocking) // The results will be placed in the stream's gpu outputs buffer some time after the kernel launch @@ -365,7 +484,7 @@ const char* run(streamData *stream) { // launch kernel with blocks=ceil(instance_count/IPB) and threads=TPB // TODO We should be able to launch more than just this kernel. // Organize with enumeration? Is it possible to use templates to make this better? - typedef cgbn_mem_t<params::BITS> mem_t; + typedef typename cmixPrecomp<params>::mem_t mem_t; switch (stream->whichToRun) { case KERNEL_POWM_ODD: @@ -390,8 +509,36 @@ const char* run(streamData *stream) { stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length); } break; + case KERNEL_REVEAL: + { + typedef typename cmixPrecomp<params>::reveal_constant_t constant_t; + constant_t* gpuConstants = (constant_t*)stream->gpuMem; + mem_t* gpuInputs = (mem_t*)(gpuConstants+1); + mem_t* gpuOutputs = (mem_t*)(gpuInputs+stream->length); + kernel_reveal<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>( + stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length); + } + break; + case KERNEL_STRIP: + { + typedef typename cmixPrecomp<params>::reveal_constant_t constant_t; + typedef typename cmixPrecomp<params>::strip_input_t input_t; + constant_t* gpuConstants = (constant_t*)stream->gpuMem; + input_t* gpuInputs = (input_t*)(gpuConstants+1); + mem_t* gpuOutputs = (mem_t*)(gpuInputs+stream->length); + kernel_strip<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>( + stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length); + } + break; case KERNEL_MUL2: - return strdup("KERNEL_MUL2 unimplemented"); + { + typedef typename cmixPrecomp<params>::mul2_input_t input_t; + mem_t *gpuConstants = (mem_t*)(stream->gpuMem); + input_t *gpuInputs = (input_t*)(gpuConstants+1); + mem_t *gpuOutputs = (mem_t*)(gpuInputs+stream->length); + kernel_mul2<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>( + stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length); + } break; default: return strdup("Unknown kernel not implemented"); @@ -523,14 +670,27 @@ extern "C" { // This is a mess. We should just be able to pass "elgamal" or, at worst, "elgamal<params4096>" // I'd rather not have to switch all the types and instantiate different classes based on the cryptop but that's a limitation on Cgo. void *cpuOutputs, *gpuOutputs; + typedef typename cmixPrecomp<params4096>::mem_t mem_t; switch (stream->whichToRun) { case KERNEL_ELGAMAL: cpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->cpuMem, stream->length); gpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->gpuMem, stream->length); break; case KERNEL_POWM_ODD: - cpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->cpuMem, stream->length); - gpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->gpuMem, stream->length); + cpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, mem_t>(stream->cpuMem, stream->length); + gpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, mem_t>(stream->gpuMem, stream->length); + break; + case KERNEL_REVEAL: + cpuOutputs = getOutputs<mem_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->cpuMem, stream->length); + gpuOutputs = getOutputs<mem_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->gpuMem, stream->length); + break; + case KERNEL_STRIP: + cpuOutputs = getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->cpuMem, stream->length); + gpuOutputs = getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->gpuMem, stream->length); + break; + case KERNEL_MUL2: + cpuOutputs = getOutputs<cmixPrecomp<params4096>::mul2_input_t, mem_t>(stream->cpuMem, stream->length); + gpuOutputs = getOutputs<cmixPrecomp<params4096>::mul2_input_t, mem_t>(stream->gpuMem, stream->length); break; default: return strdup("Unknown kernel for download; unable to find location of outputs in buffer\n"); @@ -604,9 +764,15 @@ extern "C" { return getInputs<cmixPrecomp<params4096>::elgamal_constant_t>(s->cpuMem); break; case KERNEL_POWM_ODD: + case KERNEL_MUL2: return getInputs<cmixPrecomp<params4096>::mem_t>(s->cpuMem); break; - case KERNEL_MUL2: + case KERNEL_REVEAL: + return getInputs<cmixPrecomp<params4096>::reveal_constant_t>(s->cpuMem); + break; + case KERNEL_STRIP: + return getInputs<cmixPrecomp<params4096>::reveal_constant_t>(s->cpuMem); + break; default: // Unimplemented return NULL; @@ -626,7 +792,17 @@ extern "C" { return getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>( s->cpuMem, s->length); break; + case KERNEL_REVEAL: + return getOutputs<cmixPrecomp<params4096>::mem_t, cmixPrecomp<params4096>::reveal_constant_t>( + s->cpuMem, s->length); + break; + case KERNEL_STRIP: + return getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>( + s->cpuMem, s->length); + break; case KERNEL_MUL2: + return getOutputs<cmixPrecomp<params4096>::mul2_input_t, cmixPrecomp<params4096>::mem_t>( + s->cpuMem, s->length); default: // Unimplemented return NULL; @@ -648,9 +824,13 @@ extern "C" { return sizeof(cmixPrecomp<params4096>::elgamal_constant_t); break; case KERNEL_POWM_ODD: + case KERNEL_MUL2: return sizeof(cmixPrecomp<params4096>::mem_t); break; - case KERNEL_MUL2: + case KERNEL_REVEAL: + case KERNEL_STRIP: + return sizeof(cmixPrecomp<params4096>::reveal_constant_t); + break; default: // Unimplemented return 0; @@ -666,7 +846,15 @@ extern "C" { case KERNEL_POWM_ODD: return sizeof(cmixPrecomp<params4096>::powm_odd_input_t); break; + case KERNEL_REVEAL: + return sizeof(cmixPrecomp<params4096>::mem_t); + break; + case KERNEL_STRIP: + return sizeof(cmixPrecomp<params4096>::strip_input_t); + break; case KERNEL_MUL2: + return sizeof(cmixPrecomp<params4096>::mul2_input_t); + break; default: // Unimplemented return 0; @@ -680,9 +868,12 @@ extern "C" { return sizeof(cmixPrecomp<params4096>::elgamal_output_t); break; case KERNEL_POWM_ODD: + case KERNEL_REVEAL: + case KERNEL_STRIP: + case KERNEL_MUL2: + // Most ops just return one number return sizeof(cmixPrecomp<params4096>::mem_t); break; - case KERNEL_MUL2: default: // Unimplemented return 0; diff --git a/cgbnBindings/powm/powm_odd_export.h b/cgbnBindings/powm/powm_odd_export.h index df4b350db8c331dfc2e99739005548b04dec048b..de8def1412806db8cbf9d1f57a21321d1f5ff0bb 100644 --- a/cgbnBindings/powm/powm_odd_export.h +++ b/cgbnBindings/powm/powm_odd_export.h @@ -29,6 +29,8 @@ struct return_data { enum kernel { KERNEL_POWM_ODD, KERNEL_ELGAMAL, + KERNEL_REVEAL, + KERNEL_STRIP, KERNEL_MUL2, };