I am working on an application where I need to multiply two complex sparse matrices. The multiplication works with cusparseSpGEMM when the data type of both input matrices and the result is CUDA_C_32F. However, when I switch the precision to CUDA_C_64F, the call to cusparseSpGEMM_copy fails with an error and crashes the application.
I don't know what's causing this, since the example works properly for the other data types, including the single-precision complex one (CUDA_C_32F), and the documentation states that cusparseSpGEMM supports CUDA_C_64F.
The cusparseSpGEMM example using CUDA_C_64F is below.
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpGEMM
#include <cuComplex.h>
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
#define CHECK_CUDA(func) \
{ \
cudaError_t status = (func); \
if (status != cudaSuccess) { \
printf("CUDA API failed at line %d with error: %s (%d)\n", \
__LINE__, cudaGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
printf("CUSPARSE API failed at line %d with error: %s (%d)\n", \
__LINE__, cusparseGetErrorString(status), status); \
return EXIT_FAILURE; \
} \
}
int main(void) {
// Host problem definition
#define A_NUM_ROWS 4 // C compatibility
const int A_num_rows = 4;
const int A_num_cols = 4;
const int A_nnz = 9;
const int B_num_rows = 4;
const int B_num_cols = 4;
const int B_nnz = 8;
int hA_csrOffsets[] = { 0, 3, 4, 7, 9 };
int hA_columns[] = { 0, 2, 3, 1, 0, 2, 3, 1, 3 };
cuDoubleComplex hA_values[] = { make_cuDoubleComplex(1,0), make_cuDoubleComplex(2,0), make_cuDoubleComplex(3,0),
                                make_cuDoubleComplex(4,0), make_cuDoubleComplex(5,0), make_cuDoubleComplex(6,0),
                                make_cuDoubleComplex(7,0), make_cuDoubleComplex(8,0), make_cuDoubleComplex(9,0) };
int hB_csrOffsets[] = { 0, 2, 4, 7, 8 };
int hB_columns[] = { 0, 3, 1, 3, 0, 1, 2, 1 };
cuDoubleComplex hB_values[] = { make_cuDoubleComplex(1,0), make_cuDoubleComplex(2,0), make_cuDoubleComplex(3,0),
                                make_cuDoubleComplex(4,0), make_cuDoubleComplex(5,0), make_cuDoubleComplex(6,0),
                                make_cuDoubleComplex(7,0), make_cuDoubleComplex(8,0) };
int hC_csrOffsets[] = { 0, 4, 6, 10, 12 };
int hC_columns[] = { 0, 1, 2, 3, 1, 3, 0, 1, 2, 3, 1, 3 };
cuDoubleComplex hC_values[] = { make_cuDoubleComplex(11,0), make_cuDoubleComplex(36,0), make_cuDoubleComplex(14,0),
                                make_cuDoubleComplex(2,0),  make_cuDoubleComplex(12,0), make_cuDoubleComplex(16,0),
                                make_cuDoubleComplex(35,0), make_cuDoubleComplex(92,0), make_cuDoubleComplex(42,0),
                                make_cuDoubleComplex(10,0), make_cuDoubleComplex(96,0), make_cuDoubleComplex(32,0) };
const int C_nnz = 12;
#define C_NUM_NNZ 12 // C compatibility
cuDoubleComplex alpha = make_cuDoubleComplex(1,0);
cuDoubleComplex beta = make_cuDoubleComplex(0,0);
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cudaDataType computeType = CUDA_C_64F;
//--------------------------------------------------------------------------
// Device memory management: Allocate and copy A, B
int* dA_csrOffsets, * dA_columns, * dB_csrOffsets, * dB_columns,
* dC_csrOffsets, * dC_columns;
cuDoubleComplex* dA_values, * dB_values, * dC_values;
// allocate A
CHECK_CUDA(cudaMalloc((void**)&dA_csrOffsets,
(A_num_rows + 1) * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&dA_columns, A_nnz * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&dA_values, A_nnz * sizeof(cuDoubleComplex)))
// allocate B
CHECK_CUDA(cudaMalloc((void**)&dB_csrOffsets,
(B_num_rows + 1) * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&dB_columns, B_nnz * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&dB_values, B_nnz * sizeof(cuDoubleComplex)))
// allocate C offsets
CHECK_CUDA(cudaMalloc((void**)&dC_csrOffsets,
(A_num_rows + 1) * sizeof(int)))
// copy A
CHECK_CUDA(cudaMemcpy(dA_csrOffsets, hA_csrOffsets,
(A_num_rows + 1) * sizeof(int),
cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(dA_columns, hA_columns, A_nnz * sizeof(int),
cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(dA_values, hA_values,
A_nnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice))
// copy B
CHECK_CUDA(cudaMemcpy(dB_csrOffsets, hB_csrOffsets,
(B_num_rows + 1) * sizeof(int),
cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(dB_columns, hB_columns, B_nnz * sizeof(int),
cudaMemcpyHostToDevice))
CHECK_CUDA(cudaMemcpy(dB_values, hB_values,
B_nnz * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice))
//--------------------------------------------------------------------------
// CUSPARSE APIs
cusparseHandle_t handle = NULL;
cusparseSpMatDescr_t matA, matB, matC;
void* dBuffer1 = NULL, * dBuffer2 = NULL;
size_t bufferSize1 = 0, bufferSize2 = 0;
CHECK_CUSPARSE(cusparseCreate(&handle))
// Create sparse matrix A in CSR format
CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
dA_csrOffsets, dA_columns, dA_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F))
CHECK_CUSPARSE(cusparseCreateCsr(&matB, B_num_rows, B_num_cols, B_nnz,
dB_csrOffsets, dB_columns, dB_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F))
CHECK_CUSPARSE(cusparseCreateCsr(&matC, A_num_rows, B_num_cols, 0,
NULL, NULL, NULL,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_C_64F))
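// note: matC is created with nnz = 0 and NULL arrays; its actual size is queried
// with cusparseSpMatGetSize and its device pointers are set after the compute phase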
//--------------------------------------------------------------------------
// SpGEMM Computation
cusparseSpGEMMDescr_t spgemmDesc;
CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDesc))
// ask bufferSize1 bytes for external memory
CHECK_CUSPARSE(
cusparseSpGEMM_workEstimation(handle, opA, opB,
&alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT,
spgemmDesc, &bufferSize1, NULL))
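// the first call with a NULL external buffer only returns the required size in
// bufferSize1; the call below, with dBuffer1 allocated, does the actual estimation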
CHECK_CUDA(cudaMalloc((void**)&dBuffer1, bufferSize1))
// inspect the matrices A and B to understand the memory requirement for
// the next step
CHECK_CUSPARSE(
cusparseSpGEMM_workEstimation(handle, opA, opB,
&alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT,
spgemmDesc, &bufferSize1, dBuffer1))
// ask bufferSize2 bytes for external memory
CHECK_CUSPARSE(
cusparseSpGEMM_compute(handle, opA, opB,
&alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT,
spgemmDesc, &bufferSize2, NULL))
CHECK_CUDA(cudaMalloc((void**)&dBuffer2, bufferSize2))
// compute the intermediate product of A * B
CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, opA, opB,
&alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT,
spgemmDesc, &bufferSize2, dBuffer2))
// get matrix C non-zero entries C_nnz1
int64_t C_num_rows1, C_num_cols1, C_nnz1;
CHECK_CUSPARSE(cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1,
&C_nnz1))
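// for this example, C_nnz1 should match the reference C_nnz (12) used below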
// allocate matrix C
CHECK_CUDA(cudaMalloc((void**)&dC_columns, C_nnz1 * sizeof(int)))
CHECK_CUDA(cudaMalloc((void**)&dC_values, C_nnz1 * sizeof(cuDoubleComplex)))
// update matC with the new pointers
CHECK_CUSPARSE(
cusparseCsrSetPointers(matC, dC_csrOffsets, dC_columns, dC_values))
// if beta != 0, cusparseSpGEMM_copy reuses/updates the values of dC_values
// copy the final products to the matrix C
CHECK_CUSPARSE(
cusparseSpGEMM_copy(handle, opA, opB,
&alpha, matA, matB, &beta, matC,
computeType, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
// destroy matrix/vector descriptors
CHECK_CUSPARSE(cusparseSpGEMM_destroyDescr(spgemmDesc))
CHECK_CUSPARSE(cusparseDestroySpMat(matA))
CHECK_CUSPARSE(cusparseDestroySpMat(matB))
CHECK_CUSPARSE(cusparseDestroySpMat(matC))
CHECK_CUSPARSE(cusparseDestroy(handle))
//--------------------------------------------------------------------------
// device result check
int hC_csrOffsets_tmp[A_NUM_ROWS + 1];
int hC_columns_tmp[C_NUM_NNZ];
cuDoubleComplex hC_values_tmp[C_NUM_NNZ];
CHECK_CUDA(cudaMemcpy(hC_csrOffsets_tmp, dC_csrOffsets,
(A_num_rows + 1) * sizeof(int),
cudaMemcpyDeviceToHost))
CHECK_CUDA(cudaMemcpy(hC_columns_tmp, dC_columns,
C_nnz * sizeof(int),
cudaMemcpyDeviceToHost))
CHECK_CUDA(cudaMemcpy(hC_values_tmp, dC_values,
C_nnz * sizeof(cuDoubleComplex),
cudaMemcpyDeviceToHost))
int correct = 1;
for (int i = 0; i < A_num_rows + 1; i++) {
if (hC_csrOffsets_tmp[i] != hC_csrOffsets[i]) {
printf("hC_csrOffsets[i]: %d \n", hC_csrOffsets[i]);
printf("hC_csrOffsets_tmp[i]: %d \n", hC_csrOffsets_tmp[i]);
correct = 0;
break;
}
//printf("hC_csrOffsets_tmp[i]: %d \n", hC_csrOffsets_tmp[i]);
}
for (int i = 0; i < C_nnz; i++) {
//printf("hC_columns_tmp[i]: %d - hC_values_tmp[i]: %f + j%f \n", hC_columns_tmp[i], cuCreal(hC_values_tmp[i]), cuCimag(hC_values_tmp[i]));
if (hC_columns_tmp[i] != hC_columns[i] ||
cuCreal(hC_values_tmp[i]) != cuCreal(hC_values[i]) ||
cuCimag(hC_values_tmp[i]) != cuCimag(hC_values[i])) {
printf("hC_columns[i]: %d - hC_values[i]: %f + j%f \n", hC_columns[i], cuCreal(hC_values[i]), cuCimag(hC_values[i]));
printf("hC_columns_tmp[i]: %d - hC_values_tmp[i]: %f + j%f \n", hC_columns_tmp[i], cuCreal(hC_values_tmp[i]), cuCimag(hC_values_tmp[i]));
correct = 0;
break;
}
}
if (correct)
printf("spgemm_example test PASSED\n");
else {
printf("spgemm_example test FAILED: wrong result\n");
return EXIT_FAILURE;
}
//--------------------------------------------------------------------------
// device memory deallocation
CHECK_CUDA(cudaFree(dBuffer1))
CHECK_CUDA(cudaFree(dBuffer2))
CHECK_CUDA(cudaFree(dA_csrOffsets))
CHECK_CUDA(cudaFree(dA_columns))
CHECK_CUDA(cudaFree(dA_values))
CHECK_CUDA(cudaFree(dB_csrOffsets))
CHECK_CUDA(cudaFree(dB_columns))
CHECK_CUDA(cudaFree(dB_values))
CHECK_CUDA(cudaFree(dC_csrOffsets))
CHECK_CUDA(cudaFree(dC_columns))
CHECK_CUDA(cudaFree(dC_values))
return EXIT_SUCCESS;
}
As I said above, replacing every cuDoubleComplex with cuFloatComplex and every CUDA_C_64F with CUDA_C_32F makes the example work fine.
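For reference, the single-precision variant only differs in the value type, the helper constructors, and the data type enum; a minimal sketch of the substitutions (not the full listing):
cuFloatComplex hA_values[] = { make_cuFloatComplex(1,0), make_cuFloatComplex(2,0), /* ... */ };
cuFloatComplex alpha = make_cuFloatComplex(1,0);
cuFloatComplex beta  = make_cuFloatComplex(0,0);
cudaDataType computeType = CUDA_C_32F;
// every sizeof(cuDoubleComplex) becomes sizeof(cuFloatComplex), and every
// CUDA_C_64F passed to cusparseCreateCsr becomes CUDA_C_32F, e.g.:
CHECK_CUSPARSE(cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_nnz,
                                 dA_csrOffsets, dA_columns, dA_values,
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F))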
The CUDA Toolkit version I'm using is cuda_11.1.1_456.81_win10.
Thanks in advance for any help with this issue.