cusolver based batched LU solver (#22)
* cublas-based sparse LU solver class
* update cuda installs in ci
* add test to ci
* add C++ extensions to gitignore

Co-authored-by: Maurizio Monge <[email protected]>
maurimo and Maurizio Monge authored Dec 21, 2021
1 parent e1d0cd0 commit ec9546c
Showing 10 changed files with 605 additions and 1 deletion.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -117,6 +117,7 @@ run_tests: &run_tests
  working_directory: ~/project
  command: |
    pytest -s theseus/tests/test_theseus_layer.py
    pytest -s theseus/extlib/tests/test_cusolver_lu_solver.py
# -------------------------------------------------------------------------------------
# Jobs
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,4 +11,5 @@ expts/
*.ipynb_checkpoints
examples/*.ipynb_checkpoints
outputs/
examples/outputs/
theseus/extlib/*.so
1 change: 1 addition & 0 deletions requirements/main.txt
@@ -3,3 +3,4 @@ scipy>=1.5.3
scikit-sparse>=0.4.5
# torch>=1.7.1: installed separately for now, with its own instructions (CUDA dependent)
pytest>=6.2.1
pybind11>=2.7.1
18 changes: 18 additions & 0 deletions setup.py
@@ -6,6 +6,8 @@

from pathlib import Path
import setuptools
import os
from torch.utils import cpp_extension as torch_cpp_ext


def parse_requirements_file(path):
@@ -25,6 +27,20 @@ def parse_requirements_file(path):
with open("README.md", "r") as fh:
    long_description = fh.read()

if "CUDA_HOME" in os.environ:
ext_modules = [
torch_cpp_ext.CUDAExtension(
name="theseus.extlib.cusolver_lu_solver",
sources=[
"theseus/extlib/cusolver_lu_solver.cpp",
"theseus/extlib/cusolver_sp_defs.cpp",
],
libraries=["cusolver"],
),
]
else:
ext_modules = []

setuptools.setup(
    name="theseus",
    version=version,
@@ -44,4 +60,6 @@ def parse_requirements_file(path):
    python_requires=">=3.7",
    install_requires=reqs_main,
    extras_require={"dev": reqs_main + reqs_dev},
    cmdclass={"build_ext": torch_cpp_ext.BuildExtension},
    ext_modules=ext_modules,
)
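
Note on building (illustrative, not part of the diff): with this change the CUDA extension is compiled only when CUDA_HOME is set in the environment, e.g. CUDA_HOME=/usr/local/cuda pip install -e . (path illustrative); without it, ext_modules stays empty and theseus installs as a pure-Python package.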
341 changes: 341 additions & 0 deletions theseus/extlib/cusolver_lu_solver.cpp

Large diffs are not rendered by default.
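
Although cusolver_lu_solver.cpp is not rendered here, its Python-facing interface can be inferred from the test added in this commit (see test_cusolver_lu_solver.py below). A minimal usage sketch (illustrative; assumes a CUDA build of the extension, with all tensors on the GPU):

import torch
from theseus.extlib.cusolver_lu_solver import CusolverLUSolver

# CSR structure shared by the whole batch: here a single 2x2 matrix [[2, 1], [1, 3]]
rowPtr = torch.tensor([0, 2, 4], dtype=torch.int).cuda()
colInd = torch.tensor([0, 1, 0, 1], dtype=torch.int).cuda()
vals = torch.tensor([[2.0, 1.0, 1.0, 3.0]], dtype=torch.double).cuda()  # batch of 1

slv = CusolverLUSolver(1, 2, rowPtr, colInd)  # (batch_size, num_cols, rowPtr, colInd)
singularities = slv.factor(vals)              # batched LU factorization of the values
x = torch.tensor([[1.0, 2.0]], dtype=torch.double).cuda()
slv.solve(x)                                  # in place: x becomes the solution of A x = b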

69 changes: 69 additions & 0 deletions theseus/extlib/cusolver_sp_defs.cpp
@@ -0,0 +1,69 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include "cusolver_sp_defs.h"
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/detail/DeviceThreadHandles.h>
#include <ATen/cuda/CUDAContext.h>

// Functions defined in this file are inline, so it can be included multiple times
// in units compiled independently (such as Torch extensions formed by one .cu/.cpp file).
namespace theseus::cusolver_sp {

const char* cusolverGetErrorMessage(cusolverStatus_t status) {
  switch (status) {
    case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS";
    case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED";
    case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED";
    case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE";
    case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH";
    case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED";
    case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR";
    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
    default: return "Unknown cusolver error number";
  }
}

void createCusolverSpHandle(cusolverSpHandle_t* handle) {
  CUSOLVER_CHECK(cusolverSpCreate(handle));
}

// The guard below looks weird, but we adopt the same policy as for the CusolverDn handle in the Torch source.
void destroyCusolverSpHandle(cusolverSpHandle_t handle) {
  // This is because of something dumb in the ordering of
  // destruction. Sometimes at exit, the cuda context (or something)
  // would already be destroyed by the time this gets destroyed. It
  // happens in the fbcode setting. @colesbury and @soumith decided to not destroy
  // the handle as a workaround.
  //   - Comments of @soumith copied from the cuDNN handle pool implementation
#ifndef NO_CUDNN_DESTROY_HANDLE
  cusolverSpDestroy(handle);
#endif
}

using CuSolverSpPoolType = at::cuda::DeviceThreadHandlePool<cusolverSpHandle_t, createCusolverSpHandle, destroyCusolverSpHandle>;

cusolverSpHandle_t getCurrentCUDASolverSpHandle() {
  int device;
  AT_CUDA_CHECK(cudaGetDevice(&device));

  // Thread local PoolWindows are lazily-initialized
  // to avoid initialization issues that caused hangs on Windows.
  // See: https://github.com/pytorch/pytorch/pull/22405
  // This thread-local unique_ptr will be destroyed when the thread terminates,
  // releasing its reserved handles back to the pool.
  static auto pool = std::make_shared<CuSolverSpPoolType>();
  thread_local std::unique_ptr<CuSolverSpPoolType::PoolWindow> myPoolWindow(pool->newPoolWindow());

  auto handle = myPoolWindow->reserve(device);
  auto stream = c10::cuda::getCurrentCUDAStream();
  CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
  return handle;
}

} // namespace theseus::cusolver_sp
25 changes: 25 additions & 0 deletions theseus/extlib/cusolver_sp_defs.h
@@ -0,0 +1,25 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <c10/util/Exception.h>  // provides TORCH_CHECK, used by CUSOLVER_CHECK below
#include <cusolverSp.h>

#define CUSOLVER_CHECK(EXPR)                                          \
  do {                                                                \
    cusolverStatus_t __err = EXPR;                                    \
    TORCH_CHECK(__err == CUSOLVER_STATUS_SUCCESS,                     \
                "cusolver error: ",                                   \
                theseus::cusolver_sp::cusolverGetErrorMessage(__err), \
                ", when calling `" #EXPR "`");                        \
  } while (0)

namespace theseus::cusolver_sp {

const char* cusolverGetErrorMessage(cusolverStatus_t status);

cusolverSpHandle_t getCurrentCUDASolverSpHandle();

} // namespace theseus::cusolver_sp
121 changes: 121 additions & 0 deletions theseus/extlib/tests/test_cusolver_lu_solver.py
@@ -0,0 +1,121 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import pytest # noqa: F401
import torch  # needed for the Torch C++ extension import below to work
from scipy.sparse import csr_matrix

from theseus.utils import random_sparse_binary_matrix


# Ideally we would like to support batch_size <= init_batch_size, but
# because of cublas limitations the two must always be identical.
def check_lu_solver(
    init_batch_size, batch_size, num_rows, num_cols, fill, verbose=False
):
    # this is a necessary assumption, so that the Hessian is full rank
    assert num_rows >= num_cols

    if not torch.cuda.is_available():
        return
    from theseus.extlib.cusolver_lu_solver import CusolverLUSolver

    A_skel = random_sparse_binary_matrix(
        num_rows, num_cols, fill, min_entries_per_col=3
    )
    A_num_cols = num_cols
    A_rowPtr = torch.tensor(A_skel.indptr, dtype=torch.int).cuda()
    A_colInd = torch.tensor(A_skel.indices, dtype=torch.int).cuda()
    A_num_rows = A_rowPtr.size(0) - 1
    A_nnz = A_colInd.size(0)
    A_val = torch.rand((batch_size, A_nnz), dtype=torch.double).cuda()
    b = torch.rand((batch_size, A_num_rows), dtype=torch.double).cuda()

    A_csr = [
        csr_matrix(
            (A_val[i].cpu(), A_colInd.cpu(), A_rowPtr.cpu()), (A_num_rows, A_num_cols)
        )
        for i in range(batch_size)
    ]
    if verbose:
        print("A[0]:\n", A_csr[0].todense())
        print("b[0]:\n", b[0])

    AtA_csr = [(a.T @ a).tocsr() for a in A_csr]
    AtA_rowPtr = torch.tensor(AtA_csr[0].indptr).cuda()
    AtA_colInd = torch.tensor(AtA_csr[0].indices).cuda()
    AtA_val = torch.tensor(np.array([m.data for m in AtA_csr])).cuda()
    AtA_num_rows = AtA_rowPtr.size(0) - 1
    AtA_num_cols = AtA_num_rows
    AtA_nnz = AtA_colInd.size(0)  # noqa: F841

    if verbose:
        print("AtA[0]:\n", AtA_csr[0].todense())

    slv = CusolverLUSolver(init_batch_size, AtA_num_cols, AtA_rowPtr, AtA_colInd)
    singularities = slv.factor(AtA_val)

    if verbose:
        print("singularities:", singularities)

    b = torch.rand((batch_size, A_num_rows), dtype=torch.double).cuda()
    Atb = torch.tensor(
        np.array([A_csr[i].T @ b[i].cpu().numpy() for i in range(batch_size)])
    ).cuda()
    if verbose:
        print("Atb[0]:", Atb[0])

    sol = Atb.clone()
    slv.solve(sol)
    if verbose:
        print("x[0]:", sol[0])

    residuals = [
        AtA_csr[i] @ sol[i].cpu().numpy() - Atb[i].cpu().numpy()
        for i in range(batch_size)
    ]
    if verbose:
        print("residual[0]:", residuals[0])

    assert all(np.linalg.norm(res) < 1e-10 for res in residuals)


def test_lu_solver_1():
    check_lu_solver(init_batch_size=5, batch_size=5, num_rows=50, num_cols=30, fill=0.2)


def test_lu_solver_2():
    check_lu_solver(
        init_batch_size=5, batch_size=5, num_rows=150, num_cols=60, fill=0.2
    )


def test_lu_solver_3():
    check_lu_solver(
        init_batch_size=10, batch_size=10, num_rows=300, num_cols=90, fill=0.2
    )


def test_lu_solver_4():
    check_lu_solver(init_batch_size=5, batch_size=5, num_rows=50, num_cols=30, fill=0.1)


def test_lu_solver_5():
    check_lu_solver(
        init_batch_size=5, batch_size=5, num_rows=150, num_cols=60, fill=0.1
    )


def test_lu_solver_6():
    check_lu_solver(
        init_batch_size=10, batch_size=10, num_rows=300, num_cols=90, fill=0.1
    )


# We would also like to test an irregular batch_size < init_batch_size,
# but this is currently not supported by cublas; maybe in the future.
# def test_lu_solver_7():
#     check_lu_solver(init_batch_size=10, batch_size=5, num_rows=150, num_cols=60, fill=0.2)
1 change: 1 addition & 0 deletions theseus/utils/__init__.py
@@ -3,4 +3,5 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .sparse_matrix_utils import random_sparse_binary_matrix
from .utils import build_mlp, gather_from_rows_cols, numeric_jacobian
26 changes: 26 additions & 0 deletions theseus/utils/sparse_matrix_utils.py
@@ -0,0 +1,26 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
from scipy.sparse import csr_matrix, lil_matrix


def random_sparse_binary_matrix(rows, cols, fill, min_entries_per_col) -> csr_matrix:
    retv = lil_matrix((rows, cols))

    if min_entries_per_col > 0:
        min_entries_per_col = min(rows, min_entries_per_col)
        rows_array = np.arange(rows)
        for c in range(cols):
            # sample distinct rows so each column really gets min_entries_per_col entries
            for r in np.random.choice(rows_array, min_entries_per_col, replace=False):
                retv[r, c] = 1.0

    num_entries = int(fill * rows * cols)
    while retv.getnnz() < num_entries:
        col = np.random.randint(cols)
        row = np.random.randint(rows)
        retv[row, col] = 1.0

    return retv.tocsr()
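
For reference, a quick usage sketch of the helper above (illustrative):

import numpy as np
from theseus.utils import random_sparse_binary_matrix

np.random.seed(0)  # the helper draws from numpy's global RNG
A = random_sparse_binary_matrix(50, 30, fill=0.2, min_entries_per_col=3)
print(A.shape, A.getnnz())  # (50, 30), with at least int(0.2 * 50 * 30) = 300 nonzeros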
