Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions cpp/cuda_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,32 @@ const int threads_per_block = 512;

/* Helpers */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val) {
/* atomicAdd on doubles is not implemented on architectures before Pascal.
For older cards, implement our own based on this post:
https://stackoverflow.com/questions/37566987/cuda-atomicadd-for-doubles-definition-error */

unsigned long long int* address_ull = (unsigned long long int*) address;
unsigned long long int old_val = *address_ull;
unsigned long long int new_val;

do {
new_val = old_val;
old_val = atomicCAS(address_ull, new_val,
__double_as_longlong(val + __longlong_as_double(new_val)));
} while (new_val != old_val);

return __longlong_as_double(old_val);
}

#define cudaFreeAsyncMaybe(ptr, stream) cudaFree(ptr)
#define cudaMemsetAsyncMaybe(ptr, v, size, stream) cudaMemset(ptr, v, size)
#else
#define cudaFreeAsyncMaybe(ptr, stream) cudaFreeAsync(ptr, stream)
#define cudaMemsetAsyncMaybe(ptr, v, size, stream) cudaMemsetAsync(ptr, v, size, stream)
#endif

/** Error handling */
inline void _cuda_check_err(const cudaError_t err, const char* file, const int line, const char* function) {
if (err != cudaSuccess) {
Expand Down
24 changes: 12 additions & 12 deletions cpp/sparse_csr_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,
/* We'll create a temporary array to store outputs to and simplify things */
scalar_t* grad_x_ary = nullptr;
cudaMalloc(&grad_x_ary, A_cols * sizeof(scalar_t));
cudaMemsetAsync(grad_x_ary, 0, A_cols * sizeof(scalar_t), main_stream);
cudaMemsetAsyncMaybe(grad_x_ary, 0, A_cols * sizeof(scalar_t), main_stream);

const int grad_x_threads = A_rows;
const dim3 grad_x_blocks((grad_x_threads + threads_per_block - 1) / threads_per_block, 1);
Expand Down Expand Up @@ -250,8 +250,8 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,
int64_t* At_indptr_tmp;
cuda_check_err(cudaMalloc(&At_indptr_tmp, A_columns * sizeof(int64_t)));
cuda_check_err(cudaMalloc(&At_indptr_raw, (A_columns + 1) * sizeof(int64_t)));
cuda_check_err(cudaMemsetAsync(At_indptr_tmp, 0, A_columns * sizeof(int64_t), main_stream));
cuda_check_err(cudaMemsetAsync(At_indptr_raw, 0, sizeof(int64_t), main_stream)); /* Zero out first entry. */
cuda_check_err(cudaMemsetAsyncMaybe(At_indptr_tmp, 0, A_columns * sizeof(int64_t), main_stream));
cuda_check_err(cudaMemsetAsyncMaybe(At_indptr_raw, 0, sizeof(int64_t), main_stream)); /* Zero out first entry. */

/* Compute number of nonzeros per column of A */
cuda_kernel_csr_nnz_per_col<<<(A_rows + threads_per_block - 1) / threads_per_block, threads_per_block, 0, main_stream>>>(
Expand All @@ -261,7 +261,7 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,

/* Now, compute the cumulative sum of nnz to get starting rowptrs of A^T */
cub_cumsum(At_indptr_tmp, At_indptr_raw + 1, A_columns, main_stream);
cuda_check_err(cudaFreeAsync(At_indptr_tmp, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(At_indptr_tmp, main_stream));
At_indptr = torch::from_blob(At_indptr_raw, { static_cast<int64_t>(A_columns + 1) }, cudaFree, int_tens_opts);

/* Move data values into their correct spots */
Expand Down Expand Up @@ -807,7 +807,7 @@ FUNC_IMPL_CUDA(torch::Tensor,
torch doesn't have a native way to do this for tensor accessors */
bool* value_available;
cuda_check_err(cudaMalloc(&value_available, sizeof(bool) * A_rows));
cuda_check_err(cudaMemsetAsync(value_available, 0, sizeof(bool) * A_rows, main_stream));
cuda_check_err(cudaMemsetAsyncMaybe(value_available, 0, sizeof(bool) * A_rows, main_stream));

double* x_raw;
cuda_check_err(cudaMalloc(&x_raw, sizeof(double) * A_rows));
Expand All @@ -825,7 +825,7 @@ FUNC_IMPL_CUDA(torch::Tensor,
}));
cuda_check_kernel_launch_err();

cuda_check_err(cudaFreeAsync(value_available, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(value_available, main_stream));
x_dbl = torch::from_blob(x_raw, { static_cast<int64_t>(A_rows) }, cudaFree, options);
return x_dbl.to(A_data.dtype());
}
Expand Down Expand Up @@ -1196,9 +1196,9 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,
cuda_check_kernel_launch_err();

/* From the row nnz, compute row pointers */
cuda_check_err(cudaMemsetAsync(As_row_indptr_raw, 0, sizeof(int64_t), main_stream));
cuda_check_err(cudaMemsetAsyncMaybe(As_row_indptr_raw, 0, sizeof(int64_t), main_stream));
cub_cumsum(As_row_nnz, As_row_indptr_raw + 1, A_rows, main_stream);
cuda_check_err(cudaFreeAsync(As_row_nnz, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(As_row_nnz, main_stream));

/* Allocate storage for the data and row indices arrays */
int64_t As_nnz;
Expand All @@ -1214,9 +1214,9 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,
vert_fill, vert_queue, vert_mask, tensor_acc(As_data, scalar_t), tensor_acc(As_indices, int64_t), As_row_indptr_raw);
}));
cuda_check_kernel_launch_err();
cuda_check_err(cudaFreeAsync(vert_fill, main_stream));
cuda_check_err(cudaFreeAsync(vert_queue, main_stream));
cuda_check_err(cudaFreeAsync(vert_mask, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(vert_fill, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(vert_queue, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(vert_mask, main_stream));

/* Compute the transpose/csc representation of As so that we have easy column access. */
auto AsT = csr_transpose_forward_cuda(A_rows, A_cols, As_data, As_indices, As_indptr);
Expand All @@ -1235,7 +1235,7 @@ FUNC_IMPL_CUDA(std::vector<torch::Tensor>,
tensor_acc(AsT_data, scalar_t), tensor_acc(AsT_indices, int64_t), tensor_acc(AsT_indptr, int64_t), U_col_nnz);
}));
cuda_check_kernel_launch_err();
cuda_check_err(cudaFreeAsync(U_col_nnz, main_stream));
cuda_check_err(cudaFreeAsyncMaybe(U_col_nnz, main_stream));

/* Transpose back into CSR format */
auto As_f = csr_transpose_forward_cuda(A_cols, A_rows, AsT_data, AsT_indices, AsT_indptr);
Expand Down
4 changes: 2 additions & 2 deletions numml/tests/test_spdmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def test_random_large():

AX_d = AL_d @ X

assert(torch.allclose(AX_d, AL@X))
assert(torch.allclose(AX_d, (AL_c@X_c).cpu()))
assert(tla.norm(AX_d - AL@X) < 1e-4)
assert(tla.norm(AX_d - (AL_c@X_c).cpu()) < 1e-4)

def test_backward_grad_A():
# grad_A := (grad_C * B^T) (*) mask(A)
Expand Down