Skip to content
Snippets Groups Projects
Commit b2d0ff49 authored by Niamh Nikali's avatar Niamh Nikali
Browse files

Create external API for elgamal and exp cryptops

parent 7168c60a
No related branches found
No related tags found
No related merge requests found
...@@ -18,10 +18,15 @@ pick: ...@@ -18,10 +18,15 @@ pick:
@echo @echo
clean: clean:
rm -f libpowmosm*.so rm -f libpowmo*.so
devinstall:
mkdir -p ../../lib
mv libpowmo*.so ../../lib
install: install:
mv libpowmosm*.so ../../lib mkdir -p /opt/elixxir/lib
mv libpowmo*.so /opt/elixxir/lib
kepler: kepler:
nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_30 powm_odd.cu -o libpowmosm40.so -lgmp nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_30 powm_odd.cu -o libpowmosm40.so -lgmp
...@@ -36,6 +41,8 @@ volta: ...@@ -36,6 +41,8 @@ volta:
nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_70 powm_odd.cu -o libpowmosm70.so -lgmp nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_70 powm_odd.cu -o libpowmosm70.so -lgmp
turing: turing:
nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_75 powm_odd.cu -o libpowmosm75.so -lgmp -g nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_75 powm_odd.cu -o libpowmosm75.so -lgmp
turingdebug:
nvcc --compiler-options '-fPIC' --shared $(INC) $(LIB) -I../../cgbn-master/include -arch=sm_75 powm_odd.cu -o libpowmosm75.so -lgmp -g -DTRACE
all: kepler maxwell pascal volta turing
...@@ -32,8 +32,6 @@ IN THE SOFTWARE. ...@@ -32,8 +32,6 @@ IN THE SOFTWARE.
#include "../utility/support.h" #include "../utility/support.h"
#include "powm_odd_export.h" #include "powm_odd_export.h"
//#define TRACE
// Stream object and associated data for a stream // Stream object and associated data for a stream
// This name could perhaps be better... // This name could perhaps be better...
struct streamData { struct streamData {
...@@ -41,36 +39,24 @@ struct streamData { ...@@ -41,36 +39,24 @@ struct streamData {
// this one // this one
cudaStream_t stream; cudaStream_t stream;
// Device buffer uploaded to before execution // Area of device memory that this stream can use
// Contains multiple items void *gpuMem;
void *gpuInputs;
// Device buffer downloaded from after execution // Area of host memory that this stream can use
// Contains multiple items // This buffer is pinned memory
void *gpuOutputs; void *cpuMem;
// Device data that's the same for all items, uploaded to before execution
// For instance, can contain the modulus. // Total size of buffers at max capacity (set at creation time)
// Note: This currently uses global memory, not constant memory. size_t memCapacity;
// It isn't a constant buffer.
void *gpuConstants; // Size of input and output regions for this launch (set at upload time)
// Input region will only have host to device transfer, output region will only have device to host transfer
// Host buffer downloaded to after execution // Input region always comes first, and output region always comes right afterwards
// This is allocated with pinned memory that can be transferred with the DMA
void *cpuOutputs;
void *cpuInputs;
void *cpuConstants;
// Size of buffers at max capacity (set at creation time)
size_t inputsCapacity;
size_t outputsCapacity;
size_t constantsCapacity;
// Size of uploads and downloads for this launch (set at upload time)
size_t inputsLength; size_t inputsLength;
size_t outputsLength; size_t outputsLength;
size_t constantsLength;
// Number of items that can be held in the buffers associated with this stream enum kernel whichToRun;
size_t capacity;
// Number of items to be processed with this part of the stream // Number of items to be processed with this part of the stream
size_t length; size_t length;
...@@ -114,6 +100,7 @@ class powm_params_t { ...@@ -114,6 +100,7 @@ class powm_params_t {
static const uint32_t WINDOW_BITS=window_bits; // window size static const uint32_t WINDOW_BITS=window_bits; // window size
}; };
// Really I'd like this class to have very few responsibilities overall, and be able to invoke an exponentiation in the same way I'd invoke any of the native cgbn methods
template<class params> template<class params>
class cmixPrecomp { class cmixPrecomp {
public: public:
...@@ -289,7 +276,7 @@ class cmixPrecomp { ...@@ -289,7 +276,7 @@ class cmixPrecomp {
// Unfortunately, the kernel must be separate from the cmixPrecomp class // Unfortunately, the kernel must be separate from the cmixPrecomp class
// kernel_powm_odd<params><<<(instance_count+IPB-1)/IPB, TPB>>>(report, gpuInputs, gpuResults, instance_count); // kernel_powm_odd<params><<<(instance_count+IPB-1)/IPB, TPB>>>(report, gpuInputs, gpuResults, instance_count);
template<class params> template<class params>
__global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecomp<params>::powm_odd_input_t *inputs, cgbn_mem_t<params::BITS> *modulus, cgbn_mem_t<params::BITS> *outputs, size_t count) { __global__ void kernel_powm_odd(cgbn_error_report_t *report, cgbn_mem_t<params::BITS> *constants, typename cmixPrecomp<params>::powm_odd_input_t *inputs, cgbn_mem_t<params::BITS> *outputs, size_t count) {
int32_t instance; int32_t instance;
// decode an instance number from the blockIdx and threadIdx // decode an instance number from the blockIdx and threadIdx
...@@ -304,7 +291,7 @@ __global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecom ...@@ -304,7 +291,7 @@ __global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecom
// here and to pass in and out bignums // here and to pass in and out bignums
cgbn_load(po._env, x, &(inputs[instance].x)); cgbn_load(po._env, x, &(inputs[instance].x));
cgbn_load(po._env, p, &(inputs[instance].power)); cgbn_load(po._env, p, &(inputs[instance].power));
cgbn_load(po._env, m, modulus); cgbn_load(po._env, m, constants);
// this can be either fixed_window_powm_odd or sliding_window_powm_odd. // this can be either fixed_window_powm_odd or sliding_window_powm_odd.
// when TPI<32, fixed window runs much faster because it is less divergent, so we use it here // when TPI<32, fixed window runs much faster because it is less divergent, so we use it here
...@@ -318,7 +305,7 @@ __global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecom ...@@ -318,7 +305,7 @@ __global__ void kernel_powm_odd(cgbn_error_report_t *report, typename cmixPrecom
} }
template<class params> template<class params>
__global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp<params>::elgamal_input_t *inputs, typename cmixPrecomp<params>::elgamal_constant_t *constants, typename cmixPrecomp<params>::elgamal_output_t *outputs, size_t count) { __global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp<params>::elgamal_constant_t *constants, typename cmixPrecomp<params>::elgamal_input_t *inputs, typename cmixPrecomp<params>::elgamal_output_t *outputs, size_t count) {
int32_t instance; int32_t instance;
// decode an instance number from the blockIdx and threadIdx // decode an instance number from the blockIdx and threadIdx
...@@ -369,10 +356,8 @@ __global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp ...@@ -369,10 +356,8 @@ __global__ void kernel_elgamal(cgbn_error_report_t *report, typename cmixPrecomp
// The results will be placed in the stream's gpu outputs buffer some time after the kernel launch // The results will be placed in the stream's gpu outputs buffer some time after the kernel launch
// Precondition: stream should have had upload called on it // Precondition: stream should have had upload called on it
template<class params> template<class params>
const char* run(streamData *stream, kernel whichToRun) { const char* run(streamData *stream) {
#ifdef TRACE debugPrint("run (streamData, kernel)");
printf("run (streamData, kernel)\n");
#endif
const int32_t TPB=(params::TPB==0) ? 128 : params::TPB; // default threads per block to 128 const int32_t TPB=(params::TPB==0) ? 128 : params::TPB; // default threads per block to 128
const int32_t TPI=params::TPI, IPB=TPB/TPI; // IPB is instances per block const int32_t TPI=params::TPI, IPB=TPB/TPI; // IPB is instances per block
...@@ -382,16 +367,15 @@ const char* run(streamData *stream, kernel whichToRun) { ...@@ -382,16 +367,15 @@ const char* run(streamData *stream, kernel whichToRun) {
// Organize with enumeration? Is it possible to use templates to make this better? // Organize with enumeration? Is it possible to use templates to make this better?
typedef cgbn_mem_t<params::BITS> mem_t; typedef cgbn_mem_t<params::BITS> mem_t;
switch (whichToRun) { switch (stream->whichToRun) {
case KERNEL_POWM_ODD: case KERNEL_POWM_ODD:
{ {
typedef typename cmixPrecomp<params>::powm_odd_input_t input_t; typedef typename cmixPrecomp<params>::powm_odd_input_t input_t;
mem_t* gpuConstants = (mem_t*)stream->gpuMem;
input_t* gpuInputs = (input_t*)(gpuConstants+1);
mem_t* gpuOutputs = (mem_t*)(gpuInputs+stream->length);
kernel_powm_odd<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>( kernel_powm_odd<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>(
stream->report, stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
(input_t*)stream->gpuInputs,
(mem_t*)stream->gpuConstants,
(mem_t*)stream->gpuOutputs,
stream->length);
} }
break; break;
case KERNEL_ELGAMAL: case KERNEL_ELGAMAL:
...@@ -399,12 +383,11 @@ const char* run(streamData *stream, kernel whichToRun) { ...@@ -399,12 +383,11 @@ const char* run(streamData *stream, kernel whichToRun) {
typedef typename cmixPrecomp<params>::elgamal_input_t input_t; typedef typename cmixPrecomp<params>::elgamal_input_t input_t;
typedef typename cmixPrecomp<params>::elgamal_output_t output_t; typedef typename cmixPrecomp<params>::elgamal_output_t output_t;
typedef typename cmixPrecomp<params>::elgamal_constant_t constant_t; typedef typename cmixPrecomp<params>::elgamal_constant_t constant_t;
constant_t* gpuConstants = (constant_t*)stream->gpuMem;
input_t* gpuInputs = (input_t*)(gpuConstants+1);
output_t* gpuOutputs = (output_t*)(gpuInputs+stream->length);
kernel_elgamal<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>( kernel_elgamal<params><<<(stream->length+IPB-1)/IPB, TPB, 0, stream->stream>>>(
stream->report, stream->report, gpuConstants, gpuInputs, gpuOutputs, stream->length);
(input_t*)stream->gpuInputs,
(constant_t*)stream->gpuConstants,
(output_t*)stream->gpuOutputs,
stream->length);
} }
break; break;
case KERNEL_MUL2: case KERNEL_MUL2:
...@@ -421,9 +404,7 @@ const char* run(streamData *stream, kernel whichToRun) { ...@@ -421,9 +404,7 @@ const char* run(streamData *stream, kernel whichToRun) {
} }
const char* getResults(streamData *stream) { const char* getResults(streamData *stream) {
#ifdef TRACE debugPrint("getResults (streamData)");
printf("getResults (streamData)\n");
#endif
// Wait for download to complete // Wait for download to complete
CUDA_CHECK_RETURN(cudaEventSynchronize(stream->deviceToHost)); CUDA_CHECK_RETURN(cudaEventSynchronize(stream->deviceToHost));
// Not sure if we can check the error report before this (e.g. in download function) // Not sure if we can check the error report before this (e.g. in download function)
...@@ -431,31 +412,43 @@ const char* getResults(streamData *stream) { ...@@ -431,31 +412,43 @@ const char* getResults(streamData *stream) {
return NULL; return NULL;
} }
// Get the location where the outputs start for uploading
// TODO Should this be located somewhere else? Could I reduce redundancy if I make different classes for each op? (for instance)
// Basically this design seems suboptimal in some way
template <class Input, class Constant>
void* getOutputs(void* mem, size_t numItems) {
debugPrint("getOutputs (void*, size_t)");
// We want to get the location after the inputs and constants
// Order doesn't matter and we assume padding doesn't exist
// (which should be true if the big numbers are big enough)
Constant* constants = (Constant*)mem;
// Not sure if this is undefined behaviour? Do I have to reinterpret_cast or cast to void to make this work?
Input* inputs = (Input*)(constants+1);
return (void*) (inputs+numItems);
}
// Get the memory address of the beginning of the inputs in a buffer
// Inputs always come right after constants
template <class Constant>
void* getInputs(void* mem) {
debugPrint("getInputs (void*, size_t)");
Constant* constants = (Constant*)mem;
return (void*)(constants+1);
}
typedef powm_params_t<32, 4096, 5> params4096; typedef powm_params_t<32, 4096, 5> params4096;
// create a bunch of streams and buffers suitable for running a particular kernel // create a bunch of streams and buffers suitable for running a particular kernel
inline const char* createStream(streamCreateInfo createInfo, streamData* stream) { inline const char* createStream(streamCreateInfo createInfo, streamData* stream) {
#ifdef TRACE debugPrint("createStream (streamData)");
printf("createStream (streamData)\n"); stream->memCapacity = createInfo.capacity;
#endif
stream->capacity = createInfo.capacity;
stream->constantsCapacity = createInfo.constantsCapacity;
stream->outputsCapacity = createInfo.outputsCapacity;
stream->inputsCapacity = createInfo.inputsCapacity;
stream->length = 0; stream->length = 0;
stream->constantsLength = 0;
stream->outputsLength = 0; stream->outputsLength = 0;
stream->inputsLength = 0; stream->inputsLength = 0;
CUDA_CHECK_RETURN(cudaStreamCreate(&(stream->stream))); CUDA_CHECK_RETURN(cudaStreamCreate(&(stream->stream)));
CUDA_CHECK_RETURN(cudaMalloc(&(stream->gpuInputs), createInfo.inputsCapacity)); CUDA_CHECK_RETURN(cudaMalloc(&(stream->gpuMem), createInfo.capacity));
CUDA_CHECK_RETURN(cudaMalloc(&(stream->gpuOutputs), createInfo.outputsCapacity));
CUDA_CHECK_RETURN(cudaMalloc(&(stream->gpuConstants), createInfo.constantsCapacity));
CUDA_CHECK_RETURN(cgbn_error_report_alloc(&stream->report)); CUDA_CHECK_RETURN(cgbn_error_report_alloc(&stream->report));
CUDA_CHECK_RETURN(cudaHostAlloc(&(stream->cpuOutputs), createInfo.outputsCapacity, cudaHostAllocDefault)); CUDA_CHECK_RETURN(cudaHostAlloc(&(stream->cpuMem), createInfo.capacity, cudaHostAllocDefault));
// Both of these next buffers should only be written on the host, so they can be allocated write-combined
CUDA_CHECK_RETURN(cudaHostAlloc(&(stream->cpuInputs), createInfo.inputsCapacity, cudaHostAllocWriteCombined));
CUDA_CHECK_RETURN(cudaHostAlloc(&(stream->cpuConstants), createInfo.constantsCapacity, cudaHostAllocWriteCombined));
// These events are created with timing disabled because it takes time // These events are created with timing disabled because it takes time
// to get the timing data, and we don't need it. // to get the timing data, and we don't need it.
...@@ -471,46 +464,32 @@ inline const char* createStream(streamCreateInfo createInfo, streamData* stream) ...@@ -471,46 +464,32 @@ inline const char* createStream(streamCreateInfo createInfo, streamData* stream)
// implementation-specific name mangling // implementation-specific name mangling
// This makes them more straightforward to load from the shared object // This makes them more straightforward to load from the shared object
extern "C" { extern "C" {
// Enqueue upload data for a powm kernel run for 4K bits // Enqueue upload for a specific kernel
// Stage input data by copying to the stream's constants and inputs memory before calling // Stage input data by copying to the stream's constants and inputs memory before calling
const char* upload(const uint32_t instance_count, void *stream, size_t inputsUploadSize, size_t constantsUploadSize, size_t outputsDownloadSize) { const char* upload(const uint32_t instance_count, void *stream, enum kernel whichToRun) {
#ifdef TRACE debugPrint("upload (void)");
printf("upload (void)\n");
#endif
auto gpuData = (streamData*)stream; auto gpuData = (streamData*)stream;
// Previous download must finish before data are uploaded // Previous download must finish before data are uploaded
CUDA_CHECK_RETURN(cudaStreamWaitEvent(gpuData->stream, gpuData->deviceToHost, 0)); CUDA_CHECK_RETURN(cudaStreamWaitEvent(gpuData->stream, gpuData->deviceToHost, 0));
// Set instance count; it's re-used when the kernel gets run later // Set instance count; it's re-used when the kernel gets run later
gpuData->length = instance_count; gpuData->length = instance_count;
// If there are more instances uploaded than the stream can handle, that's an error
// At least for now
if (gpuData->length > gpuData->capacity) {
return strdup("upload_powm error: length greater than capacity\n");
}
// It also doesn't take too long to bounds check the requested data transfer sizes // It also doesn't take too long to bounds check the requested data transfer sizes
// to avoid segmentation faults // to avoid segmentation faults
if (gpuData->inputsCapacity < inputsUploadSize) { // Avoid these errors by querying the stream for how many items of a particular kernel it can run before uploading
return strdup("upload error: input upload size greater than capacity\n"); size_t inputsUploadSize = getInputSize(whichToRun) * instance_count + getConstantsSize(whichToRun);
} size_t outputsDownloadSize = getOutputSize(whichToRun) * instance_count;
if (gpuData->constantsCapacity < constantsUploadSize) { if (inputsUploadSize + outputsDownloadSize > gpuData->memCapacity) {
return strdup("upload error: constants upload size greater than capacity\n"); return strdup("upload error: inputs+outputs larger than stream capacity\n");
}
if (gpuData->outputsCapacity < outputsDownloadSize) {
return strdup("upload error: outputs download size greater than capacity\n");
} }
// At this point, everything should be in a good state, so set the needed variables // At this point, everything should be in a good state, so set the needed variables
gpuData->inputsLength = inputsUploadSize; gpuData->inputsLength = inputsUploadSize;
gpuData->outputsLength = outputsDownloadSize; gpuData->outputsLength = outputsDownloadSize;
gpuData->constantsLength = constantsUploadSize; gpuData->whichToRun = whichToRun;
CUDA_CHECK_RETURN(cudaMemcpyAsync(gpuData->gpuInputs, gpuData->cpuInputs, gpuData->inputsLength, // Upload the inputs
cudaMemcpyHostToDevice, gpuData->stream)); CUDA_CHECK_RETURN(cudaMemcpyAsync(gpuData->gpuMem, gpuData->cpuMem, gpuData->inputsLength,
// Currently, we're copying to the constants before each kernel launch
// This might not be necessary, but it's not that much data
CUDA_CHECK_RETURN(cudaMemcpyAsync(gpuData->gpuConstants, gpuData->cpuConstants, gpuData->constantsLength,
cudaMemcpyHostToDevice, gpuData->stream)); cudaMemcpyHostToDevice, gpuData->stream));
// Run should wait on this event for kernel launch // Run should wait on this event for kernel launch
...@@ -521,44 +500,56 @@ extern "C" { ...@@ -521,44 +500,56 @@ extern "C" {
} }
// Run powm for 4K bits // Run powm for 4K bits
const char* run(void *stream, kernel whichToRun) { const char* run(void *stream) {
#ifdef TRACE debugPrint("run (void)");
printf("run (void)\n"); return run<params4096>((streamData*)stream);
#endif
return run<params4096>((streamData*)stream, whichToRun);
} }
const char* download(void *s) { const char* download(void *s) {
#ifdef TRACE debugPrint("download (void)");
printf("download (void)\n");
#endif
auto stream = (streamData*)s; auto stream = (streamData*)s;
// Wait for the kernel to finish running // Wait for the kernel to finish running
CUDA_CHECK_RETURN(cudaStreamWaitEvent(stream->stream, stream->exec, 0)); CUDA_CHECK_RETURN(cudaStreamWaitEvent(stream->stream, stream->exec, 0));
// The kernel ran successfully, so we get the results off the GPU // The kernel ran successfully, so we get the results off the GPU
CUDA_CHECK_RETURN(cudaMemcpyAsync(stream->cpuOutputs, stream->gpuOutputs, stream->outputsLength, cudaMemcpyDeviceToHost, stream->stream)); // Outputs come right after the inputs
// TODO: Do this differently (need to not do arithmetic on void pointer for it to be valid)
// Specifically, I know what the byte length should be in this buffer because the operation is specified
// Basically, we're completing the type manually for bindings compatibility reasons
// Otherwise we'd have to template(?) the streamData struct with the input, output, and constant types
// and cast the cpu and gpu buffers to the right structure depending on the operation.
//
// For now, just instantiate with types for elgamal 4k?
// This is a mess. We should just be able to pass "elgamal" or, at worst, "elgamal<params4096>"
// I'd rather not have to switch all the types and instantiate different classes based on the cryptop but that's a limitation on Cgo.
void *cpuOutputs, *gpuOutputs;
switch (stream->whichToRun) {
case KERNEL_ELGAMAL:
cpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->cpuMem, stream->length);
gpuOutputs = getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(stream->gpuMem, stream->length);
break;
case KERNEL_POWM_ODD:
cpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->cpuMem, stream->length);
gpuOutputs = getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(stream->gpuMem, stream->length);
break;
default:
return strdup("Unknown kernel for download; unable to find location of outputs in buffer\n");
}
CUDA_CHECK_RETURN(cudaMemcpyAsync(cpuOutputs, gpuOutputs, stream->outputsLength, cudaMemcpyDeviceToHost, stream->stream));
CUDA_CHECK_RETURN(cudaEventRecord(stream->deviceToHost, stream->stream)); CUDA_CHECK_RETURN(cudaEventRecord(stream->deviceToHost, stream->stream));
return NULL; return NULL;
} }
struct return_data* getResults(void *stream) { const char* getResults(void *stream) {
#ifdef TRACE debugPrint("getResults (void)");
printf("getResults (void)\n"); return getResults((streamData*)stream);
#endif
return_data* result = (return_data*)malloc(sizeof(*result));
result->result = ((streamData*)stream)->cpuOutputs;
result->error = getResults((streamData*)stream);
return result;
} }
// Call this when starting the program to allocate resources // Call this when starting the program to allocate resources
// Returns stream or error // Returns stream or error
struct return_data* createStream(streamCreateInfo createInfo) { struct return_data* createStream(streamCreateInfo createInfo) {
#ifdef TRACE debugPrint("createStream (streamCreateInfo)");
printf("createStream (streamCreateInfo)\n");
#endif
return_data* result = (return_data*)malloc(sizeof(*result)); return_data* result = (return_data*)malloc(sizeof(*result));
streamData *s = (streamData*)(malloc(sizeof(*s))); streamData *s = (streamData*)(malloc(sizeof(*s)));
result->error = createStream(createInfo, s); result->error = createStream(createInfo, s);
...@@ -569,30 +560,16 @@ extern "C" { ...@@ -569,30 +560,16 @@ extern "C" {
// Call this after execution has completed to deallocate resources // Call this after execution has completed to deallocate resources
// Returns error // Returns error
const char* destroyStream(void *destroyee) { const char* destroyStream(void *destroyee) {
#ifdef TRACE debugPrint("destroyStream (void)");
printf("destroyStream (void)\n");
#endif
auto stream = (streamData*)destroyee; auto stream = (streamData*)destroyee;
// Don't know at what point there could have been errors while creating this stream, // Don't know at what point there could have been errors while creating this stream,
// so make sure things exist before destroying them // so make sure things exist before destroying them
if (stream != NULL) { if (stream != NULL) {
if (stream->gpuInputs != NULL) { if (stream->gpuMem != NULL) {
CUDA_CHECK_RETURN(cudaFree(stream->gpuInputs)); CUDA_CHECK_RETURN(cudaFree(stream->gpuMem));
}
if (stream->gpuOutputs != NULL) {
CUDA_CHECK_RETURN(cudaFree(stream->gpuOutputs));
}
if (stream->gpuConstants != NULL) {
CUDA_CHECK_RETURN(cudaFree(stream->gpuConstants));
}
if (stream->cpuInputs != NULL) {
CUDA_CHECK_RETURN(cudaFreeHost(stream->cpuInputs));
} }
if (stream->cpuOutputs != NULL) { if (stream->cpuMem != NULL) {
CUDA_CHECK_RETURN(cudaFreeHost(stream->cpuOutputs)); CUDA_CHECK_RETURN(cudaFreeHost(stream->cpuMem));
}
if (stream->cpuConstants != NULL) {
CUDA_CHECK_RETURN(cudaFreeHost(stream->cpuConstants));
} }
if (stream->report != NULL) { if (stream->report != NULL) {
CUDA_CHECK_RETURN(cgbn_error_report_free(stream->report)); CUDA_CHECK_RETURN(cgbn_error_report_free(stream->report));
...@@ -613,34 +590,105 @@ extern "C" { ...@@ -613,34 +590,105 @@ extern "C" {
} }
// Call this after execution has completed to write out profile information to the disk // Call this after execution has completed to write out profile information to the disk
const char* stopProfiling() {
CUDA_CHECK_RETURN(cudaProfilerStop());
return NULL;
}
const char* startProfiling() {
CUDA_CHECK_RETURN(cudaProfilerStart());
return NULL;
}
const char* resetDevice() { const char* resetDevice() {
CUDA_CHECK_RETURN(cudaDeviceReset()); CUDA_CHECK_RETURN(cudaDeviceReset());
return NULL; return NULL;
} }
// Return cpu inputs buffer pointer for writing // Return cpu inputs buffer pointer for writing
void* getCpuInputs(void* stream) { // TODO Implement depending on kernel/length
return ((streamData*)stream)->cpuInputs; void* getCpuInputs(void* stream, enum kernel op) {
streamData* s = (streamData*)stream;
switch (op) {
case KERNEL_ELGAMAL:
return getInputs<cmixPrecomp<params4096>::elgamal_constant_t>(s->cpuMem);
break;
case KERNEL_POWM_ODD:
return getInputs<cmixPrecomp<params4096>::mem_t>(s->cpuMem);
break;
case KERNEL_MUL2:
default:
// Unimplemented
return NULL;
break;
}
} }
// Return cpu outputs buffer pointer for reading // Return cpu outputs buffer pointer for reading
void* getCpuOutputs(void* stream) { void* getCpuOutputs(void* stream) {
return ((streamData*)stream)->cpuOutputs; streamData* s = (streamData*)stream;
switch (s->whichToRun) {
case KERNEL_ELGAMAL:
return getOutputs<cmixPrecomp<params4096>::elgamal_input_t, cmixPrecomp<params4096>::elgamal_constant_t>(
s->cpuMem, s->length);
break;
case KERNEL_POWM_ODD:
return getOutputs<cmixPrecomp<params4096>::powm_odd_input_t, cmixPrecomp<params4096>::mem_t>(
s->cpuMem, s->length);
break;
case KERNEL_MUL2:
default:
// Unimplemented
return NULL;
break;
}
} }
// Return cpu constants buffer pointer for writing // Return cpu constants buffer pointer for writing
void* getCpuConstants(void* stream) { void* getCpuConstants(void* stream) {
return ((streamData*)stream)->cpuConstants; return ((streamData*)stream)->cpuMem;
} }
// TODO? Deprecate this and use sizeof / pointer arithmetic instead
// Although, I think we might need this for slicing things on the go side
// TODO? Support different bit lengths
size_t getConstantsSize(enum kernel op) {
switch (op) {
case KERNEL_ELGAMAL:
return sizeof(cmixPrecomp<params4096>::elgamal_constant_t);
break;
case KERNEL_POWM_ODD:
return sizeof(cmixPrecomp<params4096>::mem_t);
break;
case KERNEL_MUL2:
default:
// Unimplemented
return 0;
break;
}
}
size_t getInputSize(enum kernel op) {
switch (op) {
case KERNEL_ELGAMAL:
return sizeof(cmixPrecomp<params4096>::elgamal_input_t);
break;
case KERNEL_POWM_ODD:
return sizeof(cmixPrecomp<params4096>::powm_odd_input_t);
break;
case KERNEL_MUL2:
default:
// Unimplemented
return 0;
break;
}
}
size_t getOutputSize(enum kernel op) {
switch (op) {
case KERNEL_ELGAMAL:
return sizeof(cmixPrecomp<params4096>::elgamal_output_t);
break;
case KERNEL_POWM_ODD:
return sizeof(cmixPrecomp<params4096>::mem_t);
break;
case KERNEL_MUL2:
default:
// Unimplemented
return 0;
break;
}
}
} }
...@@ -33,25 +33,17 @@ enum kernel { ...@@ -33,25 +33,17 @@ enum kernel {
}; };
// Prepare a kernel run // Prepare a kernel run
const char* upload(const uint32_t instance_count, void *stream, size_t inputsUploadSize, size_t constantsUploadSize, size_t outputsDownloadSize); const char* upload(const uint32_t instance_count, void *stream, enum kernel whichToRun);
// Enqueue a kernel run // Enqueue a kernel run
const char* run(void *stream, enum kernel whichToRun); const char* run(void *stream);
// Enqueue download from a previous kernel launch // Enqueue download from a previous kernel launch
const char* download(void *stream); const char* download(void *stream);
// Wait for a results download to finish // Wait for a results download to finish
struct return_data* getResults(void *stream); const char* getResults(void *stream);
struct streamCreateInfo { struct streamCreateInfo {
// How many instances can be invoked in a kernel launch? // How much memory is available for the stream to use?
size_t capacity; size_t capacity;
// What's the size in bytes of the entire input buffer?
// (assumed to be linear in size with number of inputs)
size_t inputsCapacity;
// What's the size in bytes of the entire output buffer?
// (assumed to be linear in size with number of inputs)
size_t outputsCapacity;
// What's the size in bytes of the entire constants buffer?
size_t constantsCapacity;
}; };
...@@ -64,7 +56,7 @@ const char* destroyStream(void *destroyee); ...@@ -64,7 +56,7 @@ const char* destroyStream(void *destroyee);
// Get a pointer to the CPU inputs buffer from a stream // Get a pointer to the CPU inputs buffer from a stream
// Overwrite this memory with inputs before enqueueing an upload // Overwrite this memory with inputs before enqueueing an upload
void* getCpuInputs(void* stream); void* getCpuInputs(void* stream, enum kernel op);
// Get a pointer to the CPU outputs buffer from a stream // Get a pointer to the CPU outputs buffer from a stream
// Read outputs from this memory after calling getResults to synchronize the event // Read outputs from this memory after calling getResults to synchronize the event
...@@ -74,11 +66,12 @@ void* getCpuOutputs(void* stream); ...@@ -74,11 +66,12 @@ void* getCpuOutputs(void* stream);
// Overwrite this memory with constants before enqueueing an upload // Overwrite this memory with constants before enqueueing an upload
void* getCpuConstants(void* stream); void* getCpuConstants(void* stream);
// Call this after execution has completed to write out profile information to the disk // Get memory size required for a certain op's constants buffer
const char* stopProfiling(); size_t getConstantsSize(enum kernel op);
// Get memory size required for a certain op's inputs buffer
// Calling this is optional if you profile from the start of execution. size_t getInputSize(enum kernel op);
const char* startProfiling(); // Get memory size required for a certain op's outputs buffer
size_t getOutputSize(enum kernel op);
// If using the newer profiler, use this instead when kernels have finished // If using the newer profiler, use this instead when kernels have finished
// running to signal the profiler that execution has finished. // running to signal the profiler that execution has finished.
......
...@@ -130,3 +130,10 @@ void random_words(uint32_t *x, uint32_t count) { ...@@ -130,3 +130,10 @@ void random_words(uint32_t *x, uint32_t count) {
for(index=0;index<count;index++) for(index=0;index<count;index++)
x[index]=random_word(); x[index]=random_word();
} }
void debugPrint(const char* s) {
#ifdef TRACE
printf("%s\n", s);
#endif
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment