diff --git a/.gitignore b/.gitignore
index d5a66b7ec114d76e5e5d2d7c52e5652a91797af2..35add7c63bb02379d982f5d37ef1ec380ae17d27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,20 @@ gpumaths*
 .cproject
 .project
 .settings
+*~
+# Ignore vendor files/folders
+vendor/
+# Ignore Jetbrains IDE folder
+.idea/*
+# Ignore vim .swp buffers for open files
+.*.swp
+.*.swo
+# Ignore logs
+*.log
+# Ignore generated version file
+cmd/version_vars.go
+*.out
+# Ignore obsolete dependency management software
+glide.lock
+# Ignore emacs #...# files
+\#*\#
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..3077020ecd11e2c3226b76b943bccd432708b8b8
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,57 @@
+# From: https://about.gitlab.com/2017/09/21/how-to-create-ci-cd-pipeline-with-autodeploy-to-kubernetes-using-gitlab-and-helm/
+
+cache:
+  untracked: true
+  key: "$CI_BUILD_REF_NAME"
+  paths:
+    - vendor/
+
+variables:
+  REPO_DIR: gitlab.com/elixxir
+  REPO_NAME: gpumathsnative
+  DOCKER_IMAGE: elixxirlabs/cuda-go:latest
+
+before_script:
+  - echo $CI_BUILD_REF
+  - echo $CI_PROJECT_DIR
+  - echo $PWD
+  - echo $USER
+  - eval $(ssh-agent -s)
+  - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null
+  - mkdir -p ~/.ssh
+  - chmod 700 ~/.ssh
+  - ssh-keyscan -t rsa gitlab.com > ~/.ssh/known_hosts
+  - git config --global url."git@gitlab.com:".insteadOf "https://gitlab.com/"
+
+stages:
+  - setup
+  - trigger_server
+  - trigger_release_server
+
+setup:
+  stage: setup
+  image: $DOCKER_IMAGE
+  except:
+    - tags
+  script:
+    - cd cgbnBindings/powm
+    - make turing
+    - make devinstall
+  artifacts:
+    paths:
+     - lib/
+
+# TODO(?): have server pull in the latest from release/master automatically for this pipeline, if possible
+trigger_server:
+  stage: trigger_server
+  script:
+    - "curl -X POST -F token=5be79349e632bcd07f452d04cc0583 -F ref=master https://gitlab.com/api/v4/projects/5014439/trigger/pipeline"
+  only:
+    - master
+
+trigger_release_server:
+  stage: trigger_release_server
+  script:
+    - "curl -X POST -F token=5be79349e632bcd07f452d04cc0583 -F ref=release https://gitlab.com/api/v4/projects/5014439/trigger/pipeline"
+  only:
+    - release
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7421e060bd6282da5d817a44567352b3b0be0795
--- /dev/null
+++ b/README.md
@@ -0,0 +1,32 @@
+# elixxir/gpumathsnative
+
+## Building the native gpumaths library
+
+Before beginning, install the CUDA toolkit, version 10.2 and libgmp-dev.
+
+```
+$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/
+cuda-ubuntu1804.pin
+$ sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
+$ wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/
+cuda-repo-ubuntu1804-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
+$ sudo dpkg -i cuda-repo-ubuntu1804-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb
+$ sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub
+$ sudo apt-get update
+$ sudo apt-get -y install cuda
+```
+
+```
+$ sudo apt install libgmp-dev
+```
+
+Next, build and install the native gpumaths library. You must have nvcc in your PATH for this to work.
+
+```
+$ cd cgbnBindings/powm
+$ make turing
+$ sudo make install
+```
+
+Then, you should be able to build the server with GPU support.
+
diff --git a/cgbnBindings/LICENSE b/cgbnBindings/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..3fec355783116d0b82c806b0dab58d2d3ba003b5
--- /dev/null
+++ b/cgbnBindings/LICENSE
@@ -0,0 +1,21 @@
+/***
+Copyright (c) 2015, NVIDIA CORPORATION.  All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a 
+copy of this software and associated documentation files (the "Software"), 
+to deal in the Software without restriction, including without limitation 
+the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+and/or sell copies of the Software, and to permit persons to whom the 
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in 
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
+IN THE SOFTWARE.
+***/
diff --git a/cgbnBindings/powm/Makefile b/cgbnBindings/powm/Makefile
index f7c62c527f970acebe247fda30aa03277103e6b7..86576a3f4571fd0a764956c2993889df97a22f35 100644
--- a/cgbnBindings/powm/Makefile
+++ b/cgbnBindings/powm/Makefile
@@ -23,10 +23,13 @@ clean:
 devinstall:
 	mkdir -p ../../lib
 	mv libpowmo*.so ../../lib
+	cp powm_odd_export.h ../../lib
 
 install:
-	mkdir -p /opt/elixxir/lib
-	mv libpowmo*.so /opt/elixxir/lib
+	mkdir -p /opt/xxnetwork/lib
+	mkdir -p /opt/xxnetwork/include
+	mv libpowmo*.so /opt/xxnetwork/lib
+	cp powm_odd_export.h /opt/xxnetwork/include
 
 kepler:
 	nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_30 powm_odd.cu -o libpowmosm40.so -lgmp
diff --git a/cgbnBindings/powm/powm_odd.cu b/cgbnBindings/powm/powm_odd.cu
index 9330c763e339bf18824c0ce2d88120b685617cb5..1f4f5fcff1bee64596e79b4a8db61c2b4dd13bc1 100644
--- a/cgbnBindings/powm/powm_odd.cu
+++ b/cgbnBindings/powm/powm_odd.cu
@@ -112,6 +112,16 @@ class cmixPrecomp {
     mem_t x;
     mem_t power;
   } powm_odd_input_t;
+  
+  typedef struct {
+    mem_t precomputation;
+    mem_t cypher;
+  } strip_input_t;
+  
+  typedef struct {
+    mem_t x;
+    mem_t y;
+  } mul2_input_t;
 
   typedef struct {
     mem_t privateKey; // Used to calculate both outputs 
@@ -131,7 +141,11 @@ class cmixPrecomp {
     mem_t publicCypherKey;
   } elgamal_constant_t;
 
-  
+  typedef struct {
+    mem_t prime;
+    mem_t Z;
+  } reveal_constant_t;
+
   typedef cgbn_context_t<params::TPI, params>   context_t;
   typedef cgbn_env_t<context_t, params::BITS>   env_t;
   typedef typename env_t::cgbn_t                bn_t;
@@ -269,6 +283,25 @@ class cmixPrecomp {
       cgbn_set_ui32(_env, result, 1);
     }
   }
+
+  // Find a modular root
+  // Precondition: Z is coprime to prime - 1
+  __device__ __forceinline__ bool root_coprime(bn_t &result, const bn_t &cypher, const bn_t &Z, const bn_t &prime) {
+    bn_t psub1, cypherMont;
+    // prime should always be large, so don't check return value
+    cgbn_sub_ui32(_env, psub1, prime, uint32_t(1));
+    bool ok = cgbn_modular_inverse(_env, result, Z, psub1);
+    if (ok) {
+      // Found inverse successfully, so do the exponentiation
+      uint32_t np0 = cgbn_bn2mont(_env, cypherMont, cypher, prime);
+      fixed_window_powm_odd(cypherMont, cypherMont, result, prime, np0);
+      cgbn_mont2bn(_env, result, cypherMont, prime, np0);
+    } else {
+      // The inversion result was undefined, so we must report an error
+      _context.report_error(cgbn_inverse_does_not_exist_error);
+    }
+    return ok;
+  }
 };
 
 // kernel implementation using cgbn
@@ -276,7 +309,7 @@ class cmixPrecomp {
 // Unfortunately, the kernel must be separate from the cmixPrecomp class
 // kernel_powm_odd<params><<<(instance_count+IPB-1)/IPB, TPB>>>(report, gpuInputs, gpuResults, instance_count);
 template<class params>
-__global__ void kernel_powm_odd(cgbn_error_report_t *report, cgbn_mem_t<params::BITS> *constants, typename cmixPrecomp<params>::powm_odd_input_t *inputs, cgbn_mem_t<params::BITS> *outputs, size_t count) {
+__global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecomp<params>::mem_t *constants, typename cmixPrecomp<params>::powm_odd_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) {
   int32_t instance;
 
   // decode an instance number from the blockIdx and threadIdx
@@ -351,6 +384,92 @@ __global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp
   cgbn_store(po._env, &(outputs[instance].cypher), result);
 }
 
+template<class params>
+__global__ void kernel_reveal(cgbn_error_report_t *report, typename cmixPrecomp<params>::reveal_constant_t *constants, typename cmixPrecomp<params>::mem_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) {
+  int32_t instance;
+
+  // decode an instance number from the blockIdx and threadIdx
+  instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI;
+  if(instance>=count)
+    return;
+
+  cmixPrecomp<params>                 po(cgbn_report_monitor, report, instance);
+  typename cmixPrecomp<params>::bn_t  cypher, Z, prime, result;
+
+  cgbn_load(po._env, cypher, &(inputs[instance]));
+  cgbn_load(po._env, Z, &(constants->Z));
+  cgbn_load(po._env, prime, &(constants->prime));
+
+  po.root_coprime(result, cypher, Z, prime);
+
+  cgbn_store(po._env, &(outputs[instance]), result);
+}
+
+template<class params>
+__global__ void kernel_strip(cgbn_error_report_t *report, typename cmixPrecomp<params>::reveal_constant_t *constants, typename cmixPrecomp<params>::strip_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) {
+  int32_t instance;
+
+  // decode an instance number from the blockIdx and threadIdx
+  instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI;
+  if(instance>=count)
+    return;
+
+  cmixPrecomp<params>                 po(cgbn_report_monitor, report, instance);
+  typename cmixPrecomp<params>::bn_t  cypher, Z, prime, precomputation, result;
+
+  cgbn_load(po._env, cypher, &(inputs[instance].cypher));
+  cgbn_load(po._env, Z, &(constants->Z));
+  cgbn_load(po._env, prime, &(constants->prime));
+
+  // Strip runs on the last node only and it begins with a reveal operation
+  bool ok = po.root_coprime(result, cypher, Z, prime);
+  
+  if (ok) {
+    cgbn_load(po._env, precomputation, &(inputs[instance].precomputation));
+    // It should be possible to get a speedup here, because the
+    // prime is odd
+    ok = cgbn_modular_inverse(po._env, precomputation, precomputation, prime);
+    if (ok) {
+      // It may be possible to do this multiplication faster
+      // This is just a best guess
+      uint32_t np0 = cgbn_bn2mont(po._env, precomputation, precomputation, prime);
+      cgbn_bn2mont(po._env, cypher, cypher, prime);
+      cgbn_mont_mul(po._env, result, precomputation, cypher, prime, np0);
+      cgbn_mont2bn(po._env, result, result, prime, np0);
+      cgbn_store(po._env, &(outputs[instance]), result);
+    } else {
+      // The second modular inverse failed
+      po._context.report_error(cgbn_inverse_does_not_exist_error);
+    }
+  }
+}
+
+// Multiply x by y mod prime
+template<class params>
+__global__ void kernel_mul2(cgbn_error_report_t *report, typename cmixPrecomp<params>::mem_t *constants, typename cmixPrecomp<params>::mul2_input_t *inputs, typename cmixPrecomp<params>::mem_t *outputs, size_t count) {
+  int32_t instance;
+
+  // decode an instance number from the blockIdx and threadIdx
+  instance=(blockIdx.x*blockDim.x + threadIdx.x)/params::TPI;
+  if(instance>=count)
+    return;
+
+  cmixPrecomp<params>                 po(cgbn_report_monitor, report, instance);
+  typename cmixPrecomp<params>::bn_t  x, y, prime, result;
+
+  cgbn_load(po._env, x, &(inputs[instance].x));
+  cgbn_load(po._env, y, &(inputs[instance].y));
+  cgbn_load(po._env, prime, constants);
+
+  uint32_t np0 = cgbn_bn2mont(po._env, x, x, prime);
+  cgbn_bn2mont(po._env, y, y, prime);
+  cgbn_mont_mul(po._env, result, x, y, prime, np0);
+  cgbn_mont2bn(po._env, result, result, prime, np0);
+
+  cgbn_store(po._env, &(outputs[instance]), result);
+}
+
+
 // Run powm kernel
 // Enqueues kernel on the stream and returns immediately (non-blocking)
 // The results will be placed in the stream's gpu outputs buffer some time after the kernel launch
@@ -365,7 +484,7 @@ const char* run(streamData *stream) {
   // launch kernel with blocks=ceil(instance_count/IPB) and threads=TPB
   // TODO We should be able to launch more than just this kernel.
   //  Organize with enumeration? Is it possible to use templates to make this better?
-  typedef cgbn_mem_t<params::BITS> mem_t;
+  typedef typename cmixPrecomp<params>::mem_t mem_t;
 
   switch (stream->whichToRun) {
   case KERNEL_POWM_ODD:
@@ -390,8 +509,36 @@ const char* run(streamData *stream) {
         stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
     }
     break;
+  case KERNEL_REVEAL:
+    {
+      typedef typename cmixPrecomp<params>::reveal_constant_t constant_t;
+      constant_t* gpuConstants = (constant_t*)stream->gpuMem;
+      mem_t* gpuInputs = (mem_t*)(gpuConstants+1);
+      mem_t* gpuOutputs = (mem_t*)(gpuInputs+stream->length);
+      kernel_reveal<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>(
+          stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
+    }
+    break;
+  case KERNEL_STRIP:
+    {
+      typedef typename cmixPrecomp<params>::reveal_constant_t constant_t;
+      typedef typename cmixPrecomp<params>::strip_input_t input_t;
+      constant_t* gpuConstants = (constant_t*)stream->gpuMem;
+      input_t* gpuInputs = (input_t*)(gpuConstants+1);
+      mem_t* gpuOutputs = (mem_t*)(gpuInputs+stream->length);
+      kernel_strip<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>(
+          stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
+    }
+    break;
   case KERNEL_MUL2:
-    return strdup("KERNEL_MUL2 unimplemented");
+    {
+      typedef typename cmixPrecomp<params>::mul2_input_t input_t;
+      mem_t *gpuConstants = (mem_t*)(stream->gpuMem);
+      input_t *gpuInputs = (input_t*)(gpuConstants+1);
+      mem_t *gpuOutputs = (mem_t*)(gpuInputs+stream->length);
+      kernel_mul2<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>(
+          stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
+    }
     break;
   default:
     return strdup("Unknown kernel not implemented");
@@ -523,14 +670,27 @@ extern "C" {
     // This is a mess. We should just be able to pass "elgamal" or, at worst, "elgamal<params4096>" 
     // I'd rather not have to switch all the types and instantiate different classes based on the cryptop but that's a limitation on Cgo.
     void *cpuOutputs, *gpuOutputs;
+    typedef typename cmixPrecomp<params4096>::mem_t mem_t;
     switch (stream->whichToRun) {
     case KERNEL_ELGAMAL:
       cpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->cpuMem, stream->length);
       gpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->gpuMem, stream->length);
       break;
     case KERNEL_POWM_ODD:
-      cpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->cpuMem, stream->length);
-      gpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->gpuMem, stream->length);
+      cpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, mem_t>(stream->cpuMem, stream->length);
+      gpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, mem_t>(stream->gpuMem, stream->length);
+      break;
+    case KERNEL_REVEAL:
+      cpuOutputs = getOutputs<mem_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->cpuMem, stream->length);
+      gpuOutputs = getOutputs<mem_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->gpuMem, stream->length);
+      break;
+    case KERNEL_STRIP:
+      cpuOutputs = getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->cpuMem, stream->length);
+      gpuOutputs = getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>(stream->gpuMem, stream->length);
+      break;
+    case KERNEL_MUL2:
+      cpuOutputs = getOutputs<cmixPrecomp<params4096>::mul2_input_t, mem_t>(stream->cpuMem, stream->length);
+      gpuOutputs = getOutputs<cmixPrecomp<params4096>::mul2_input_t, mem_t>(stream->gpuMem, stream->length);
       break;
     default:
       return strdup("Unknown kernel for download; unable to find location of outputs in buffer\n");
@@ -604,9 +764,15 @@ extern "C" {
         return getInputs<cmixPrecomp<params4096>::elgamal_constant_t>(s->cpuMem);
         break;
       case KERNEL_POWM_ODD:
+      case KERNEL_MUL2:
         return getInputs<cmixPrecomp<params4096>::mem_t>(s->cpuMem);
         break;
-      case KERNEL_MUL2:
+      case KERNEL_REVEAL:
+        return getInputs<cmixPrecomp<params4096>::reveal_constant_t>(s->cpuMem);
+        break;
+      case KERNEL_STRIP:
+        return getInputs<cmixPrecomp<params4096>::reveal_constant_t>(s->cpuMem);
+        break;
       default:
         // Unimplemented
         return NULL;
@@ -626,7 +792,17 @@ extern "C" {
         return getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(
             s->cpuMem, s->length);
         break;
+      case KERNEL_REVEAL:
+        return getOutputs<cmixPrecomp<params4096>::mem_t, cmixPrecomp<params4096>::reveal_constant_t>(
+            s->cpuMem, s->length);
+        break;
+      case KERNEL_STRIP:
+        return getOutputs<cmixPrecomp<params4096>::strip_input_t, cmixPrecomp<params4096>::reveal_constant_t>(
+            s->cpuMem, s->length);
+        break;
       case KERNEL_MUL2:
+        return getOutputs<cmixPrecomp<params4096>::mul2_input_t, cmixPrecomp<params4096>::mem_t>(
+            s->cpuMem, s->length);
       default:
         // Unimplemented
         return NULL;
@@ -648,9 +824,13 @@ extern "C" {
         return sizeof(cmixPrecomp<params4096>::elgamal_constant_t);
         break;
       case KERNEL_POWM_ODD:
+      case KERNEL_MUL2:
         return sizeof(cmixPrecomp<params4096>::mem_t);
         break;
-      case KERNEL_MUL2:
+      case KERNEL_REVEAL:
+      case KERNEL_STRIP:
+        return sizeof(cmixPrecomp<params4096>::reveal_constant_t);
+        break;
       default:
         // Unimplemented
         return 0;
@@ -666,7 +846,15 @@ extern "C" {
       case KERNEL_POWM_ODD:
         return sizeof(cmixPrecomp<params4096>::powm_odd_input_t);
         break;
+      case KERNEL_REVEAL:
+        return sizeof(cmixPrecomp<params4096>::mem_t);
+        break;
+      case KERNEL_STRIP:
+        return sizeof(cmixPrecomp<params4096>::strip_input_t);
+        break;
       case KERNEL_MUL2:
+        return sizeof(cmixPrecomp<params4096>::mul2_input_t);
+        break;
       default:
         // Unimplemented
         return 0;
@@ -680,9 +868,12 @@ extern "C" {
         return sizeof(cmixPrecomp<params4096>::elgamal_output_t);
         break;
       case KERNEL_POWM_ODD:
+      case KERNEL_REVEAL:
+      case KERNEL_STRIP:
+      case KERNEL_MUL2:
+        // Most ops just return one number
         return sizeof(cmixPrecomp<params4096>::mem_t);
         break;
-      case KERNEL_MUL2:
       default:
         // Unimplemented
         return 0;
diff --git a/cgbnBindings/powm/powm_odd_export.h b/cgbnBindings/powm/powm_odd_export.h
index df4b350db8c331dfc2e99739005548b04dec048b..de8def1412806db8cbf9d1f57a21321d1f5ff0bb 100644
--- a/cgbnBindings/powm/powm_odd_export.h
+++ b/cgbnBindings/powm/powm_odd_export.h
@@ -29,6 +29,8 @@ struct return_data {
 enum kernel {
   KERNEL_POWM_ODD,
   KERNEL_ELGAMAL,
+  KERNEL_REVEAL,
+  KERNEL_STRIP,
   KERNEL_MUL2,
 };