Sparse arrays are widely used in many scientific applications where memory and time are strictly limited. For a considerable speedup one can use CUDA. Since Mathematica 8 we can conveniently use CUDA directly from Mathematica. However we can use only dense matrices. Neither version 9 nor version 10 did not add CUDA sparse arrays. However, CUSPARSE library of GPU-accelerated sparse matrix routines was introduced in CUDA Toolkit 3.2, November 2010, i.e. 4 years ago! Is it possible to use CUSPARSE directly from Mathematica now?
It is not a simple problem. On the one hand CUDALink doesn't allow you to use routines from the external libraries. On the other hand LibraryLink doesn't allow you to use CUDAMemory
and its reference to GPU memory.
Answer
Yes, it is possible! There is a WorframGPULibrary (WGL), which I discover recently. It is undocumented, however there are beautiful examples in
$InstallationDirectory/SystemFiles/Links/CUDALink/CSource/
It is similar to LibraryLink, but allows CUDAMemory
as an argument. I wrote the code below to call main CUSPARSE routines directly from Mathematica
$$ C := \alpha (AB)^T + \beta C $$
or
$$ c := \alpha Ab + \beta c. $$
Where $A$ is a sparse matrix, $B$ and $C$ are dense matrices, $b$ and $c$ are dense vectors. The motivation of transposition is to improve the memory access and increase CUDA performance (approximately twice). You can always quickly transpose input and output as you want.
- Matrix-matrix and matrix-vector operations.
- Single and double precision.
- Real or complex values.
- Most popular CSR and BSR formats of sparse arrays.
- Up to 30x speedup.
There is almost nothing interesting, just calling appropriate CUSPARSE routines (sparse.cu
):
#include
#include
#define wglState (wglData->state)
#define wglErr (wglData->getError(wglData))
#define WGL_SuccessQ (wglErr->code == WGL_Success)
#define WGL_FailQ (!WGL_SuccessQ)
#define WGL_Type_RealQ(mem) ((mem)->type == WGL_Real_t)
#define WGL_SAFE_CALL(stmt, jmp) stmt; if (WGL_FailQ) { goto jmp; }
#define TRA CUSPARSE_OPERATION_NON_TRANSPOSE
#define TRB CUSPARSE_OPERATION_TRANSPOSE
#define BDR CUSPARSE_DIRECTION_ROW
#define GET(type, ptr) CUDA_Runtime_getDeviceMemoryAs##type(ptr)
#define CSRMM(c, type) cusparse##c##csrmm2(handle, TRA, TRB, m, n, k, nnzb, \
&alpha##c, descrA, GET(type, A_val), \
GET(Integer, A_ptr), GET(Integer, A_ind), \
GET(type, B), n, &beta##c, GET(type, C), m)
#define CSRMV(c, type) cusparse##c##csrmv(handle, TRA, m, k, nnzb, \
&alpha##c, descrA, GET(type, A_val), \
GET(Integer, A_ptr), GET(Integer, A_ind), \
GET(type, B), &beta##c, GET(type, C))
#if CUDA_VERSION >= 6050
#define BSRMM(c, type) cusparse##c##bsrmm(handle, BDR, TRA, TRB, m/bs, n, k/bs, nnzb, \
&alpha##c, descrA, GET(type, A_val), \
GET(Integer, A_ptr), GET(Integer, A_ind), bs, \
GET(type, B), n, &beta##c, GET(type, C), m)
#else
#define BSRMM(c, type) CUSPARSE_STATUS_SUCCESS; return LIBRARY_VERSION_ERROR; // CUDA 6.5 or later is required for bsrmm
#endif
#define BSRMV(c, type) cusparse##c##bsrmv(handle, BDR, TRA, m/bs, k/bs, nnzb, \
&alpha##c, descrA, GET(type, A_val), \
GET(Integer, A_ptr), GET(Integer, A_ind), bs, \
GET(type, B), &beta##c, GET(type, C))
#define SRM(c, type) if (r==2) { \
if (bs==1) ret = CSRMM(c, type); \
else ret = BSRMM(c, type); \
} else { \
if (bs==1) ret = CSRMV(c, type); \
else ret = BSRMV(c, type); \
}
WolframGPULibraryData wglData = NULL;
EXTERN_C DLLEXPORT int osrmm(WolframLibraryData libData, mint Argc, MArgument * Args, MArgument Res) {
int err = LIBRARY_FUNCTION_ERROR;
WGL_Memory_t A_val, A_ptr, A_ind, B, C;
int r, m, n, k, nnzb, bs;
cusparseHandle_t handle = 0;
cusparseMatDescr_t descrA = 0;
cusparseStatus_t ret;
float alphaS, betaS;
double alphaD, betaD;
cuComplex alphaC, betaC;
cuDoubleComplex alphaZ, betaZ;
WGL_SAFE_CALL(wglData->setWolframLibraryData(wglData, libData), cleanup);
WGL_SAFE_CALL(A_val = wglData->findMemory(wglData,MArgument_getInteger(Args[0])), cleanup);
WGL_SAFE_CALL(A_ptr = wglData->findMemory(wglData,MArgument_getInteger(Args[1])), cleanup);
WGL_SAFE_CALL(A_ind = wglData->findMemory(wglData,MArgument_getInteger(Args[2])), cleanup);
m = MArgument_getInteger(Args[3]);
k = MArgument_getInteger(Args[4]);
nnzb = A_ind->flattenedLength;
bs = sqrt(A_val->flattenedLength/nnzb);
WGL_SAFE_CALL(B = wglData->findMemory(wglData, MArgument_getInteger(Args[5])), cleanup);
WGL_SAFE_CALL(C = wglData->findMemory(wglData, MArgument_getInteger(Args[6])), cleanup);
if ((A_val->type != B->type)||(B->type != C->type)||(A_val->type != C->type)) {
return LIBRARY_TYPE_ERROR;
}
r = B->rank;
if ((r != C->rank)||(r > 2)) return LIBRARY_DIMENSION_ERROR;
if (r == 2) {
n = B->dimensions[1];
if ((m != C->dimensions[1])||(n != C->dimensions[0])||(k != B->dimensions[0]))
return LIBRARY_DIMENSION_ERROR;
} else if ((m != C->flattenedLength)||(k != B->flattenedLength))
return LIBRARY_DIMENSION_ERROR;
WGL_SAFE_CALL(CUDA_Runtime_setMemoryAsInput(wglState, A_val, wglErr), cleanup);
WGL_SAFE_CALL(CUDA_Runtime_setMemoryAsInput(wglState, A_ptr, wglErr), cleanup);
WGL_SAFE_CALL(CUDA_Runtime_setMemoryAsInput(wglState, A_ind, wglErr), cleanup);
WGL_SAFE_CALL(CUDA_Runtime_setMemoryAsInput(wglState, B, wglErr), cleanup);
WGL_SAFE_CALL(CUDA_Runtime_setMemoryAsInputOutput(wglState, C, wglErr), cleanup);
cusparseCreate(&handle);
cusparseCreateMatDescr(&descrA);
switch(A_val->type)
{
case WGL_Type_Float:
alphaS = MArgument_getReal(Args[7]);
betaS = MArgument_getReal(Args[9]);
SRM(S, Float)
break;
case WGL_Type_Double:
alphaD = MArgument_getReal(Args[7]);
betaD = MArgument_getReal(Args[9]);
SRM(D, Double);
break;
case WGL_Type_ComplexFloat:
alphaC.x = MArgument_getReal(Args[7]);
alphaC.y = MArgument_getReal(Args[8]);
betaC.x = MArgument_getReal(Args[9]);
betaC.y = MArgument_getReal(Args[10]);
SRM(C, ComplexFloat);
break;
case WGL_Type_ComplexDouble:
alphaZ.x = MArgument_getReal(Args[7]);
alphaZ.y = MArgument_getReal(Args[8]);
betaZ.x = MArgument_getReal(Args[9]);
betaZ.y = MArgument_getReal(Args[10]);
SRM(Z, ComplexDouble);
break;
}
cusparseDestroyMatDescr(descrA);
cusparseDestroy(handle);
err = LIBRARY_NO_ERROR;
cleanup:
if (WGL_SuccessQ) {
CUDA_Runtime_setMemoryAsValidInputOutput(wglState, C, wglErr);
} else {
CUDA_Runtime_setMemoryAsInvalidInputOutput(wglState, C, wglErr);
}
CUDA_Runtime_unsetMemoryAsInput(wglState, A_val, wglErr);
CUDA_Runtime_unsetMemoryAsInput(wglState, A_ptr, wglErr);
CUDA_Runtime_unsetMemoryAsInput(wglState, A_ind, wglErr);
CUDA_Runtime_unsetMemoryAsInput(wglState, B, wglErr);
if (WGL_SuccessQ && err == LIBRARY_NO_ERROR && ret == CUSPARSE_STATUS_SUCCESS) {
return LIBRARY_NO_ERROR;
} else {
return LIBRARY_FUNCTION_ERROR;
}
}
EXTERN_C DLLEXPORT int WolframGPULibrary_initialize(WolframGPULibraryData wglData0) {
wglData = wglData0;
return LIBRARY_NO_ERROR;
}
EXTERN_C DLLEXPORT int WolframLibrary_initialize(WolframLibraryData libData) {
return LIBRARY_NO_ERROR;
}
EXTERN_C DLLEXPORT void WolframLibrary_uninitialize( ) {
return;
}
Notes:
- Data storage in Mathematica and CUSPARSE are different: row-major and column-major respectively. So you have to keep in mind a virtual transposition of all dense matrices.
#define TRB CUSPARSE_OPERATION_TRANSPOSE
means: apply additional virtual transposition to the matrix $B$ to increase memory access. It requires CUDA architecture 2.0 or later. You can switch it to#define TRB CUSPARSE_OPERATION_NON_TRANSPOSE
and perform $C := \alpha (AB^T)^T + \beta C$.- CUDA 6.5 or later is required for bsrmm (matrix-matrix multiplication with BSR format).
Now you can try to compile this code. I say "try" because the proper compilation even more difficult then writing the code. I use Mathematica 10 under Linux with the system installation of CUDA 6.5. My magic code:
$HistoryLength = 0;
SetDirectory@NotebookDirectory[];
Needs["CUDALink`"];
Off[OptionValue::nodef];
CreateLibrary[{"sparse.cu"}, "libsparse.so",
"Compiler" -> NVCCCompiler,
"CUDAArchitecture" ->
CUDALink`Private`getCUDAArchitecture /@ Range@Length@CUDAInformation[],
"CompilerInstallation" -> "/opt/cuda/bin",
"TargetDirectory" -> ".",
"Defines" -> {"CONFIG_USE_CUDA", "CONFIG_USE_CUSPARSE",
If[#, "CONFIG_USE_DOUBLE_PRECISION", ## &[]]},
"LibraryDirectories" -> $CUDALinkLibraryPath,
"Libraries" -> {If[#, "CUDALink_Double", "CUDALink_Single"], cusparse"},
"ShellCommandFunction" -> Print,
"ShellOutputFunction" -> Print] &@
CUDALink`Internal`SupportsDoublePrecisionQ@$CUDADevice;
"CUDAArchitecture" -> ...
put current CUDA architecture tonvcc
. It is necessary becauseCreateLibrary
automatically choose all possible architectures including 1.0. However CUDA 6.5 can't compile for architecture 1.0.Off[OptionValue::nodef]
hide meaningless warning about CUDA architecture."CompilerInstallation" -> ...
location of yournvcc
."TargetDirectory" -> "."
create library in the same folderCONFIG_USE_CUSPARSE
includes CUSPARSE headers. This flag is exist in WGL but disabled by default!If[#, "CONFIG_USE_DOUBLE_PRECISION", ## &[]]
turns onCONFIG_USE_DOUBLE_PRECISION
if your GPU supports double precision. It is necessary for proper type detection.If[#, "CUDALink_Double", "CUDALink_Single"]
is necessary for proper WGL initialization. Also depends on the double precision support."ShellCommandFunction" -> Print
and"ShellOutputFunction" -> Print
show full compilation command and output fromnvcc
.
Now you can load the library
srmm = CUDAFunctionLoad[{FileNameJoin@{NotebookDirectory[], "libsparse.so"}},
"osrmm", {{_Integer, "Input"}, {_Integer, "Input"}, {_Integer, "Input"},
_Integer, _Integer,
{_Integer, "Input"}, {_Integer, "InputOutput"},
_Real, _Real, _Real, _Real}, {16, 16}];
If you compiled the library properly then you will see no errors. There are 10 arguments of the function srmm
. All {_Integer, ...}
means CUDAMemory
because it contains an integer ID.
- 3
CUDAMemory
arguments for the sparse matrix $A$ in CSR or BSR format. - 2 integer arguments are the height and the width of the matrix $A$.
- 2
CUDAMemory
arguments are matrices $B$ and $C$ or vectors $b$ and $c$. - 4 real arguments for the real and the imaginary part of the $\alpha$ and $\beta$.
Definition of the custom cuSparseArray
HoldPattern@uncover@SparseArray@s__ := {s};
Options[cuSparseArray] =
Append[Options[CUDAMemoryLoad], blockSize -> 1];
cuSparseArray[s_SparseArray, opts : OptionsPattern[]] :=
Module[{bs = OptionValue[blockSize], list},
cuSparseArray[
CUDAMemoryCopyToDevice /@
CUDAMemoryLoad @@@ {{list = Identity; Flatten@#4[[3]],
FilterRules[{opts}, Options@CUDAMemoryLoad]}, {#4[[2, 1]],
"Integer32"}, {Flatten@#4[[2, 2]] - 1, "Integer32"}},
bs #2] & @@
uncover@If[bs > 1,
Developer`PartitionMap[list@Normal@# &, s, {bs, bs}], s]];
cuDot[cuSparseArray[{val_, ptr_, ind_}, {h_, w_}], b_CUDAMemory,
c_CUDAMemory, α_: 1, β_: 0] :=
srmm[val, ptr, ind, h, w, b, c, Re@α, Im@α,
Re@β, Im@β];
CUDAMemoryUnload@cuSparseArray[a_, _] ^:= CUDAMemoryUnload @@ a;
SparseArray
is already in CSR format. So we have touncover
it and load the data.If the option
blockSize
is greater then 1 thenSparseArray
will be splitted byblockSize*blockSize
blocks and loaded inBSR
format."Integer32" is necessary for indexes because CUSPARSE use 32-bit integers, not 64-bit integers (at least for my system)
a = N@Flatten[LeviCivitaTensor[4], {{1, 2}, {3, 4}}];
b = RandomReal[1, 16];
c = ConstantArray[0., 16];
ga = cuSparseArray[a];
{gb, gc} = CUDAMemoryLoad /@ {b, c};
cuDot[ga, gb, gc];
CUDAMemoryGet[gc] - a.b
{0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}
Now we can perform a benchmarking for various data types and structures. I use two types of the distribution of nonzero elements in the sparse matrix $A$: random distribution (bs = 1
) and clustered distribution (bs = 16
)
prec = "Single"; (* "Single" or "Double" *)
type = "Real"; (* "Real" or "Complex" *)
bs = 1; (* size of blocks in the sparse array *)
format = "CSR"; (* "CSR" or "BSR", GPU storage format *)
struct = "Vector"; (* "Matrix" or "Vector", multiply by what *)
scale = 2^8; (* change depending on your GPU memory, check ByteCount below *)
rnd = If[type == "Real", RandomReal[1, #] &,
RandomComplex[1 + I, #] &];
{m, k, n} = scale 2^{7, 6, 0};
nnz = scale 2^14/bs^2;
a = KroneckerProduct[#, rnd[bs {1, 1}]] &@
SparseArray[
Sort@Transpose[RandomInteger[{1, #}, nnz] & /@ ({m, k}/bs)] ->
rnd[nnz], {m, k}/bs];
b = rnd@If[struct == "Matrix", {k, n}, k];
c = rnd@If[struct == "Matrix", {n, m}, m];
{α, β} = rnd[2];
repeats = If[struct == "Matrix", 20, 300];
Plus @@ ByteCount /@ {a, b, c}
66 982 088 (Memory used in bytes)
ga = cuSparseArray[a, TargetPrecision -> prec,
blockSize -> If[format == "CSR", 1, bs]];
{gb, gc} =
CUDAMemoryCopyToDevice@
CUDAMemoryLoad[#, TargetPrecision -> prec] & /@ {b, c};
First@AbsoluteTiming@
Do[c = α If[struct == "Matrix", (a.b)\[Transpose],
a.b] + β c, {repeats}]/
First@AbsoluteTiming[
Do[cuDot[ga, gb, gc, α, β], {repeats}];
c2 = CUDAMemoryGet@gc]
Norm[c2 - c, 1]/Norm[c, 1]
CUDAMemoryUnload /@ {ga, gb, gc};
2.3152 (Speedup)
4.31758*10^-6 (Accuracy)
The comprehensive table of benchmarking on my system (GTX 560/Core i7 3820)
Up to 30x speedup! It's not bad for €125 GPU.
I will appreciate for any tests of WGL and CUSPARSE. Thank you for reading such a long post!
Comments
Post a Comment