From 5a8731e15241829ee58eeb2815f79a2cb3aad1f5 Mon Sep 17 00:00:00 2001 From: Shangtong Zhang Date: Mon, 14 Mar 2016 00:54:41 +0800 Subject: [PATCH 01/11] Fix bug in weak equality check --- include/boost/numeric/ublas/detail/matrix_assign.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/boost/numeric/ublas/detail/matrix_assign.hpp b/include/boost/numeric/ublas/detail/matrix_assign.hpp index be172dd63..d42d495fc 100644 --- a/include/boost/numeric/ublas/detail/matrix_assign.hpp +++ b/include/boost/numeric/ublas/detail/matrix_assign.hpp @@ -30,7 +30,7 @@ namespace detail { template BOOST_UBLAS_INLINE bool equals (const matrix_expression &e1, const matrix_expression &e2, S epsilon, S min_norm) { - return norm_inf (e1 - e2) < epsilon * + return norm_inf (e1 - e2) <= epsilon * std::max (std::max (norm_inf (e1), norm_inf (e2)), min_norm); } From 986547a60d34a9fae17d875109dc4fb75b520e6e Mon Sep 17 00:00:00 2001 From: Ronald Wampler Date: Mon, 28 Mar 2016 13:01:17 -0400 Subject: [PATCH 02/11] Add norm_2_square --- include/boost/numeric/ublas/functional.hpp | 46 +++++++++++++++++++ .../boost/numeric/ublas/vector_expression.hpp | 10 ++++ test/test_complex_norms.cpp | 33 +++++++++++++ 3 files changed, 89 insertions(+) diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp index b0be20608..3613f7ad4 100644 --- a/include/boost/numeric/ublas/functional.hpp +++ b/include/boost/numeric/ublas/functional.hpp @@ -522,6 +522,52 @@ namespace boost { namespace numeric { namespace ublas { #endif } }; + + template + struct vector_norm_2_square : + public vector_scalar_real_unary_functor { + typedef typename vector_scalar_real_unary_functor::value_type value_type; + typedef typename vector_scalar_real_unary_functor::real_type real_type; + typedef typename vector_scalar_real_unary_functor::result_type result_type; + + template + static BOOST_UBLAS_INLINE + result_type apply (const vector_expression &e) { + real_type t = real_type (); + typedef typename E::size_type vector_size_type; + vector_size_type size (e ().size ()); + for (vector_size_type i = 0; i < size; ++ i) { + real_type u (type_traits::norm_2 (e () (i))); + t += u * u; + } + return t; + } + // Dense case + template + static BOOST_UBLAS_INLINE + result_type apply (D size, I it) { + real_type t = real_type (); + while (-- size >= 0) { + real_type u (type_traits::norm_2 (*it)); + t += u * u; + ++ it; + } + return t; + } + // Sparse case + template + static BOOST_UBLAS_INLINE + result_type apply (I it, const I &it_end) { + real_type t = real_type (); + while (it != it_end) { + real_type u (type_traits::norm_2 (*it)); + t += u * u; + ++ it; + } + return t; + } + }; + template struct vector_norm_inf: public vector_scalar_real_unary_functor { diff --git a/include/boost/numeric/ublas/vector_expression.hpp b/include/boost/numeric/ublas/vector_expression.hpp index d92207c98..6f952a464 100644 --- a/include/boost/numeric/ublas/vector_expression.hpp +++ b/include/boost/numeric/ublas/vector_expression.hpp @@ -1609,6 +1609,16 @@ namespace boost { namespace numeric { namespace ublas { return expression_type (e ()); } + // real: norm_2_square v = sum(v [i] * v [i]) + // complex: norm_2_square v = sum(v [i] * conj (v [i])) + template + BOOST_UBLAS_INLINE + typename vector_scalar_unary_traits >::result_type + norm_2_square (const vector_expression &e) { + typedef typename vector_scalar_unary_traits >::expression_type expression_type; + return expression_type (e ()); + } + // real: norm_inf v = maximum (abs (v [i])) // complex: norm_inf v = maximum (maximum (abs (real (v [i])), abs (imag (v [i])))) template diff --git a/test/test_complex_norms.cpp b/test/test_complex_norms.cpp index d58a788ee..0c37ed45b 100644 --- a/test/test_complex_norms.cpp +++ b/test/test_complex_norms.cpp @@ -42,6 +42,21 @@ BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2 ) { BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL); } +BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2_square ) { + typedef std::complex dComplex; + vector v(4); + for (unsigned int i = 0; i < v.size(); ++i) + v[i] = dComplex(i, i + 1); + + const double expected = 44; + + BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL); + v *= 3.; + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL); +} + + BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_inf ) { typedef std::complex dComplex; vector v(4); @@ -74,6 +89,22 @@ BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2 ) { BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL); } +BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2_square ) { + typedef std::complex dComplex; + vector v(4); + for (unsigned short i = 0; i < v.size(); ++i) { + unsigned short imag(i + 1); + v[i] = dComplex(i, imag); + } + + const double expected = 44; + + BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL); + v *= 3.f; + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL); +} + int main() { BOOST_UBLAS_TEST_BEGIN(); @@ -81,6 +112,8 @@ int main() { BOOST_UBLAS_TEST_DO( test_float_complex_norm_inf ); BOOST_UBLAS_TEST_DO( test_double_complex_norm_2 ); BOOST_UBLAS_TEST_DO( test_float_complex_norm_2 ); + BOOST_UBLAS_TEST_DO( test_double_complex_norm_2_square ); + BOOST_UBLAS_TEST_DO( test_float_complex_norm_2_square ); BOOST_UBLAS_TEST_END(); } From e88fc29650960ad0d1b9f7dbc5ce7fb2b7053a60 Mon Sep 17 00:00:00 2001 From: Ronald Wampler Date: Mon, 28 Mar 2016 13:34:15 -0400 Subject: [PATCH 03/11] Add documentation for norm_2_square --- doc/operations_overview.html | 3 ++- doc/vector_expression.html | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/operations_overview.html b/doc/operations_overview.html index 42d994212..212fcbebf 100644 --- a/doc/operations_overview.html +++ b/doc/operations_overview.html @@ -87,7 +87,8 @@

norms


 t = norm_inf(v); i = index_norm_inf(v);
-t = norm_1(v);   t = norm_2(v); 
+t = norm_1(v);   t = norm_2(v);
+t = norm_2_square(v);
 t = norm_inf(A); i = index_norm_inf(A);
 t = norm_1(A);   t = norm_frobenius(A); 
 
diff --git a/doc/vector_expression.html b/doc/vector_expression.html index 7838007ff..2372ec3df 100644 --- a/doc/vector_expression.html +++ b/doc/vector_expression.html @@ -847,6 +847,11 @@

Prototypes

typename vector_scalar_unary_traits<E, vector_norm_2<typename E::value_type> >::result_type norm_2 (const vector_expression<E> &e); + // norm_2_square v = sum (v [i] * v [i]) + template<class E> + typename vector_scalar_unary_traits<E, vector_norm_2_square<typename E::value_type> >::result_type + norm_2_square (const vector_expression<E> &e); + // norm_inf v = max (abs (v [i])) template<class E> typename vector_scalar_unary_traits<E, vector_norm_inf<typename E::value_type> >::result_type From ad76936f49fe3ad4c981afb5c4aab76df20615de Mon Sep 17 00:00:00 2001 From: Ronald Wampler Date: Thu, 7 Apr 2016 15:10:27 -0400 Subject: [PATCH 04/11] Fix compilation error when BOOST_UBLAS_SCALED_NORM is defined Commit e6b113 changed the `size_type` typedef to be based on the container and in doing so missed the case in vector_norm_2 when BOOST_UBLAS_SCALED_NORM is define. Also added test cases. --- include/boost/numeric/ublas/functional.hpp | 7 ++-- test/Jamfile.v2 | 4 +++ test/test_scaled_norm.cpp | 41 ++++++++++++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 test/test_scaled_norm.cpp diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp index 3613f7ad4..f54b441fe 100644 --- a/include/boost/numeric/ublas/functional.hpp +++ b/include/boost/numeric/ublas/functional.hpp @@ -433,10 +433,10 @@ namespace boost { namespace numeric { namespace ublas { template static BOOST_UBLAS_INLINE result_type apply (const vector_expression &e) { -#ifndef BOOST_UBLAS_SCALED_NORM - real_type t = real_type (); typedef typename E::size_type vector_size_type; vector_size_type size (e ().size ()); +#ifndef BOOST_UBLAS_SCALED_NORM + real_type t = real_type (); for (vector_size_type i = 0; i < size; ++ i) { real_type u (type_traits::norm_2 (e () (i))); t += u * u; @@ -445,8 +445,7 @@ namespace boost { namespace numeric { namespace ublas { #else real_type scale = real_type (); real_type sum_squares (1); - size_type size (e ().size ()); - for (size_type i = 0; i < size; ++ i) { + for (vector_size_type i = 0; i < size; ++ i) { real_type u (type_traits::norm_2 (e () (i))); if ( real_type () /* zero */ == u ) continue; if (scale < u) { diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 77e296c91..f990b76b9 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -189,6 +189,10 @@ test-suite numeric/uBLAS ] [ run test_complex_norms.cpp ] + [ run test_scaled_norm.cpp + : : : + BOOST_UBLAS_SCALED_NORM + ] [ run test_assignment.cpp : : : BOOST_UBLAS_COO_ALWAYS_DO_FULL_SORT diff --git a/test/test_scaled_norm.cpp b/test/test_scaled_norm.cpp new file mode 100644 index 000000000..74065048f --- /dev/null +++ b/test/test_scaled_norm.cpp @@ -0,0 +1,41 @@ +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +#include "utils.hpp" + +using namespace boost::numeric::ublas; + +static const double TOL(1.0e-5); ///< Used for comparing two real numbers. + +BOOST_UBLAS_TEST_DEF ( test_double_scaled_norm_2 ) { + vector v(2); + v[0] = 0; v[1] = 1.0e155; + + const double expected = 1.0e155; + + BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL); +} + +BOOST_UBLAS_TEST_DEF ( test_float_scaled_norm_2 ) { + vector v(2); + v[0] = 0; v[1] = 1.0e20; + + const float expected = 1.0e20; + + BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL); +} + +int main() { + BOOST_UBLAS_TEST_BEGIN(); + + BOOST_UBLAS_TEST_DO( test_double_scaled_norm_2 ); + BOOST_UBLAS_TEST_DO( test_float_scaled_norm_2 ); + + BOOST_UBLAS_TEST_END(); +} From 5dc64f9b240f55f8377260852465b50659a04365 Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Thu, 11 Feb 2016 17:43:07 +0000 Subject: [PATCH 05/11] ublas: improved dense matrix multiplication performance This patch includes the gemm implementation from Michael Lehn to boost::ublas. This modifies the workings of ublas::prod() and ublas::axppy_prod() to use gemm() above a certain matrix size. This patch only contains the basic architecture, and a generic c++ implementation. Signed-off-by: Imre Palik Cc: Michael Lehn --- include/boost/numeric/ublas/detail/gemm.hpp | 345 ++++++++++++++++++ .../boost/numeric/ublas/matrix_expression.hpp | 141 ++++++- include/boost/numeric/ublas/operation.hpp | 155 +++++++- 3 files changed, 622 insertions(+), 19 deletions(-) create mode 100644 include/boost/numeric/ublas/detail/gemm.hpp diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp new file mode 100644 index 000000000..f2a324a0d --- /dev/null +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -0,0 +1,345 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_GEMM_ +#define _BOOST_UBLAS_GEMM_ + +#include +#include +#include +#include +#include +#include + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4092; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 12; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4080; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 24; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 2; // stripe width for lhs + static const unsigned nr = 2; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template + struct is_blocksize { + struct fallback { static const int nr = 0; }; + struct derived : T, fallback {}; + template + struct nonr { + static const bool value = false; + typedef false_type type; + }; + + template static char (&f(typename nonr::type*))[1]; + template static char (&f(...))[2]; + + static bool const value = sizeof(f(0)) == 2; + }; + + template + void + gescal(const typename E::value_type &alpha, matrix_expression &X) + { + typedef typename E::size_type size_type; + + for (size_type i=0; i + void + geaxpy(Index m, Index n, const T &alpha, + const T *X, Index incRowX, Index incColX, + T *Y, Index incRowY, Index incColY) + { + for (Index j=0; j + void + gescal(Index m, Index n, + const TX &alpha, + TX *X, Index incRowX, Index incColX) + { + if (alpha != TX(0)) { + for (Index j=0; j + void + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align); + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + typename boost::aligned_storage::type Pa; + T *P = reinterpret_cast(Pa.address()); + for (unsigned c = 0; c < MR * NR; c++) + P[c] = 0; + + for (Index l=0; l + void + mgemm(Index mc, Index nc, Index kc, TC alpha, + const T *A, const T *B, TC beta, + TC *C, Index incRowC, Index incColC) + { + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + const Index mp = (mc+MR-1) / MR; + const Index np = (nc+NR-1) / NR; + const Index mr_ = mc % MR; + const Index nr_ = nc % NR; + + // #if defined(_OPENMP) + // #pragma omp parallel for + // #endif + for (Index j=0; j(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } else { + std::fill_n(C_, MR*NR, T(0)); + ugemm(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + T(0), + C_, NR, Index(1)); + gescal(mr, nr, beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + geaxpy(mr, nr, TC(1), C_, NR, Index(1), + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } + } + } + } + + //-- Packing blocks ------------------------------------------------------------ + template + void + pack_A(const matrix_expression &A, T *p) + { + typedef typename E::size_type size_type; + + const size_type mc = A ().size1(); + const size_type kc = A ().size2(); + static const size_type MR = BlockSize::mr; + const size_type mp = (mc+MR-1) / MR; + + for (size_type j=0; j + void + pack_B(const matrix_expression &B, T *p) + { + typedef typename E::size_type size_type; + + const size_type kc = B ().size1(); + const size_type nc = B ().size2(); + static const size_type NR = BlockSize::nr; + const size_type np = (nc+NR-1) / NR; + + for (size_type l=0; l + void + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type::type value_type_i; + typedef unbounded_array > array_type_i; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ()); + const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ()); + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + const size_type mb = (m+MC-1) / MC; + const size_type nb = (n+NC-1) / NC; + const size_type kb = (k+KC-1) / KC; + const size_type mc_ = m % MC; + const size_type nc_ = n % NC; + const size_type kc_ = k % KC; + + value_type3 *C_ = &e3()(0,0); + const size_type incRowC = &e3()(1,0) - &e3()(0,0); + const size_type incColC = &e3()(0,1) - &e3()(0,0); + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + array_type_i A(MC * KC); + array_type_i B(NC * KC); + + for (size_type j=0; j Bs = + subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc); + pack_B, value_type_i, BlockSize> + (Bs, &B[0]); + + for (size_type i=0; i As = + subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc); + pack_A, value_type_i, BlockSize> + (As, &A[0]); + + mgemm + (mc, nc, kc, alpha, &A[0], &B[0], beta_, + &C_[i*MC*incRowC+j*NC*incColC], + incRowC, incColC); + } + } + } + } +}}}} +#endif diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index 8e6c2f68b..e58c757da 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -14,6 +14,7 @@ #define _BOOST_UBLAS_MATRIX_EXPRESSION_ #include +#include // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish // Iterators based on ideas of Jeremy Siek @@ -5460,45 +5461,104 @@ namespace boost { namespace numeric { namespace ublas { expression2_closure_type e2_; }; + namespace detail { + template + struct binary_calculate_result_type; + + template + struct binary_calculate_result_type { + typedef matrix_matrix_binary > result_type; + }; + + template + struct binary_calculate_result_type { + typedef matrix

result_type; + }; + } + template struct matrix_matrix_binary_traits { - typedef unknown_storage_tag storage_category; + // typedef unknown_storage_tag storage_category; + typedef typename storage_restrict_traits::storage_category storage_category; typedef unknown_orientation_tag orientation_category; typedef typename promote_traits::promote_type promote_type; typedef matrix_matrix_binary > expression_type; #ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG - typedef expression_type result_type; + // typedef expression_type result_type; + typedef typename detail::binary_calculate_result_type::value>::result_type result_type; #else typedef typename E1::matrix_temporary_type result_type; #endif }; - template + template BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits::result_type prod (const matrix_expression &e1, const matrix_expression &e2, + B, unknown_storage_tag, unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; return expression_type (e1 (), e2 ()); } - // Dispatcher - template + template BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits::result_type prod (const matrix_expression &e1, - const matrix_expression &e2) { + const matrix_expression &e2, + B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + typedef typename matrix_matrix_binary_traits::result_type result_type; + typedef typename result_type::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + return expression_type (e1 (), e2 ()); + } else { + result_type rv(e1 ().size1(), e2 ().size2()); + detail::gemm(result_value(1), e1, e2, + result_value(0), rv, b); + return rv; + } + } + + // Dispatcher + template + BOOST_UBLAS_INLINE + typename enable_if_c::value, + typename matrix_matrix_binary_traits::result_type>::type + prod (const matrix_expression &e1, + const matrix_expression &e2, + B b) { BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); typedef typename matrix_matrix_binary_traits::storage_category storage_category; typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; - return prod (e1, e2, storage_category (), orientation_category ()); + return prod (e1, e2, b, storage_category (), orientation_category ()); + } + + template + BOOST_UBLAS_INLINE + typename matrix_matrix_binary_traits::result_type + prod (const matrix_expression &e1, + const matrix_expression &e2) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename detail::prod_block_size::type> block_sizes; + + return prod (e1, e2, block_sizes()); } template @@ -5529,13 +5589,76 @@ namespace boost { namespace numeric { namespace ublas { return prec_prod (e1, e2, storage_category (), orientation_category ()); } - template + template + BOOST_UBLAS_INLINE + void + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b, + unknown_storage_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + + m.assign(expression_type (e1 (), e2 ())); + } + + template + BOOST_UBLAS_INLINE + void + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + typedef typename matrix_matrix_binary_traits::result_type result_type; + typedef typename result_type::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + m.assign(expression_type (e1 (), e2 ())); + } else { + detail::gemm(result_value(1), e1, e2, + result_value(0), m, b); + } + } + + // dispatcher + template BOOST_UBLAS_INLINE M & + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits::storage_category storage_category; + typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; + + prod (e1, e2, m, b, storage_category(), orientation_category()); + return m; + } + template + BOOST_UBLAS_INLINE + typename enable_if_c::value, + M&>::type prod (const matrix_expression &e1, const matrix_expression &e2, M &m) { - return m.assign (prod (e1, e2)); + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits::storage_category storage_category; + typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; + typedef typename detail::prod_block_size::type> block_sizes; + prod (e1, e2, m, block_sizes(), storage_category(), + orientation_category()); + return m; } template diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 64657ccf3..2a410a082 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -14,7 +14,7 @@ #define _BOOST_UBLAS_OPERATION_ #include - +#include /** \file operation.hpp * \brief This file contains some specialized products. */ @@ -637,7 +637,63 @@ namespace boost { namespace numeric { namespace ublas { return m; } - // Dispatcher + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + S, bool init, B) + { + typedef typename M::value_type value_type; + typedef S storage_category; + typedef typename M::orientation_category orientation_category; + + + if (init) + m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } + + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + dense_proxy_tag, bool init, B) + { + typedef typename M::value_type value_type; + typedef B block_sizes; + if (m.size1() < B::limit || m.size2() < B::limit) { + typedef typename M::storage_category storage_category; + typedef typename M::orientation_category orientation_category; + + if (init) + m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } else { + detail::gemm(value_type(1), e1, e2, + value_type(init? 0 : 1), m, block_sizes()); + return m; + } + } + + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + dense_tag, bool init, B b) + { + return axpy_prod (e1, e2, m, full (), dense_proxy_tag(), + init, b); + } + + // Dispatchers template BOOST_UBLAS_INLINE M & @@ -651,11 +707,16 @@ namespace boost { namespace numeric { namespace ublas { if (init) m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ()); + + return axpy_prod(e1, e2, m, triangular_restriction (), + storage_category (), orientation_category(), + init); } + template BOOST_UBLAS_INLINE - M + typename enable_if_c::value, + M>::type axpy_prod (const matrix_expression &e1, const matrix_expression &e2, TRI) { @@ -690,29 +751,61 @@ namespace boost { namespace numeric { namespace ublas { \param E1 type of a matrix expression \c A \param E2 type of a matrix expression \c X */ + template + BOOST_UBLAS_INLINE + M & + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, bool init, B) { + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); + } + template BOOST_UBLAS_INLINE M & axpy_prod (const matrix_expression &e1, const matrix_expression &e2, M &m, bool init = true) { - typedef typename M::value_type value_type; typedef typename M::storage_category storage_category; - typedef typename M::orientation_category orientation_category; + typedef typename detail::prod_block_size::type> block_sizes; + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); + } - if (init) - m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ()); + template + BOOST_UBLAS_INLINE + typename boost::enable_if_c::value, + M>::type + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, B) { + typedef M matrix_type; + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + matrix_type m (e1 ().size1 (), e2 ().size2 ()); + return axpy_prod(e1, e2, m, full (), storage_category(), true, + block_sizes()); } + template BOOST_UBLAS_INLINE M axpy_prod (const matrix_expression &e1, const matrix_expression &e2) { typedef M matrix_type; + typedef typename M::storage_category storage_category; + typedef typename detail::prod_block_size::type> block_sizes; matrix_type m (e1 ().size1 (), e2 ().size2 ()); - return axpy_prod (e1, e2, m, full (), true); + return axpy_prod(e1, e2, m, full (), storage_category(), true, + block_sizes()); } @@ -825,6 +918,48 @@ namespace boost { namespace numeric { namespace ublas { return opb_prod (e1, e2, m, true); } + /** \brief computes C = alpha * A * B + beta * C in an + optimized fashion. + + \param alpha scalar multiplier + \param e1 the matrix expression \c A + \param e2 the matrix expression \c B + \param beta scalar multiplier + \param e3 the result matrix \c C + + gemm implements the well known gemm operation + + \ingroup blas3 + + \internal + + template parameters: + \param E1 type of a matrix expression \c A + \param E2 type of a matrix expression \c B + \param E3 type of a matrix expression \c C + */ + template + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3) { + typedef typename detail::prod_block_size::type> block_sizes; + detail::gemm(alpha, e1, e2, beta, e3, block_sizes()); + } + + template + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize b) { + detail::gemm(alpha, e1, e2, beta, e3, b); + } + }}} #endif From b20cfb0fac2bb3431ebf24a9eff2cd1aeb0fc6ca Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sat, 27 Feb 2016 15:23:16 +0000 Subject: [PATCH 06/11] boost::ublas: gcc support for optimal matrix multiplication This patch contains an optimised, vectorised kernel, using gcc's SIMD vectors. This reaches matrix multiplication speeds comparable to hand-crafted assembly kernels on x86 Haswell microarchitecture. The kernels are compile-time parametrisable, ans the patch also contains optimised kernel parameters for float, double, and long double. The validity of the parameters are checked during compile-time, but the produced compile error can be relatively obscure if vector_length is too big. For architectures that doesn't support vectorisation for all types, it is possible to specify parameters that cause really suboptimal code generation. But as far as I see code correctness is ensured. Signed-off-by: Imre Palik Cc: Michael Lehn --- .../numeric/ublas/detail/block_sizes.hpp | 91 ++++++++ include/boost/numeric/ublas/detail/gemm.hpp | 212 ++++++++++-------- include/boost/numeric/ublas/detail/vector.hpp | 32 +++ .../boost/numeric/ublas/matrix_expression.hpp | 1 + include/boost/numeric/ublas/operation.hpp | 1 + 5 files changed, 238 insertions(+), 99 deletions(-) create mode 100644 include/boost/numeric/ublas/detail/block_sizes.hpp create mode 100644 include/boost/numeric/ublas/detail/vector.hpp diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp new file mode 100644 index 000000000..9d861de1f --- /dev/null +++ b/include/boost/numeric/ublas/detail/block_sizes.hpp @@ -0,0 +1,91 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_BLOCK_SIZES_ +#define _BOOST_UBLAS_BLOCK_SIZES_ + +#include + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template + struct prod_block_size { + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length); + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 3 * vector_length; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 14; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 16; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 14; // Use gemm from this size + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 1; // stripe width for lhs + static const unsigned nr = 4; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 42; // Use gemm from this size + static const unsigned vector_length = 1; // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template + struct is_blocksize { + struct fallback { static const int nr = 0; }; + struct derived : T, fallback {}; + template + struct nonr { + static const bool value = false; + typedef false_type type; + }; + + template static char (&f(typename nonr::type*))[1]; + template static char (&f(...))[2]; + + static bool const value = sizeof(f(0)) == 2; + }; + + template + struct check_blocksize { + BOOST_STATIC_ASSERT_MSG(T::mc>0 && T::kc>0 && T::nc>0 && T::mr>0 && T::nr>0, + "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(T::mc % T::mr == 0, + "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(T::nc % T::nr == 0, + "NC must be a multiple of NR."); + BOOST_STATIC_ASSERT_MSG(T::vector_length <= 1 || T::nr % T::vector_length == 0, + "NR must be a multiple of vector size"); + BOOST_STATIC_ASSERT_MSG(T::limit >= 2, + "Minimum matrix size for gemm is 2*2"); + }; +}}}} +#endif diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp index f2a324a0d..daf38a119 100644 --- a/include/boost/numeric/ublas/detail/gemm.hpp +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -11,80 +11,26 @@ #include #include -#include +#include +#include #include #include #include +#include +#include +#include namespace boost { namespace numeric { namespace ublas { namespace detail { - template - struct prod_block_size { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4092; - static const unsigned mr = 4; // stripe width for lhs - static const unsigned nr = 12; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template <> - struct prod_block_size { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4080; - static const unsigned mr = 4; // stripe width for lhs - static const unsigned nr = 24; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template <> - struct prod_block_size { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4096; - static const unsigned mr = 2; // stripe width for lhs - static const unsigned nr = 2; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template - struct is_blocksize { - struct fallback { static const int nr = 0; }; - struct derived : T, fallback {}; - template - struct nonr { - static const bool value = false; - typedef false_type type; - }; - - template static char (&f(typename nonr::type*))[1]; - template static char (&f(...))[2]; - - static bool const value = sizeof(f(0)) == 2; - }; - template void gescal(const typename E::value_type &alpha, matrix_expression &X) { typedef typename E::size_type size_type; - for (size_type i=0; i + //-- Micro Kernel ---------------------------------------------------------- +#ifdef BOOST_UBLAS_VECTOR_KERNEL + template + typename enable_if_c::value + && (1 < BlockSize::vector_length) + && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE), + void>::type + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align); + static const unsigned vector_length = BlockSize::vector_length; + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr/vector_length; + + typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE))); + + vx P[MR*NR] = {}; + + const vx *b = (const vx *)B; + for (Index l=0; l + typename enable_if_c::value + || (BlockSize::vector_length <= 1), + void>::type +#else + template void +#endif ugemm(Index kc, TC alpha, const T *A, const T *B, TC beta, TC *C, Index incRowC, Index incColC) { @@ -133,47 +143,47 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align); static const Index MR = BlockSize::mr; static const Index NR = BlockSize::nr; - typename boost::aligned_storage::type Pa; - T *P = reinterpret_cast(Pa.address()); - for (unsigned c = 0; c < MR * NR; c++) - P[c] = 0; + typename aligned_storage::type Pa; + T *P = reinterpret_cast(Pa.address()); + for (unsigned c = 0; c < MR * NR; c++) + P[c] = 0; for (Index l=0; l + //-- Macro Kernel ---------------------------------------------------------- + template void mgemm(Index mc, Index nc, Index kc, TC alpha, const T *A, const T *B, TC beta, @@ -190,9 +200,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { // #pragma omp parallel for // #endif for (Index j=0; j(kc, alpha, + ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], @@ -223,11 +230,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } //-- Packing blocks ------------------------------------------------------------ - template + template void pack_A(const matrix_expression &A, T *p) { typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); const size_type mc = A ().size1(); const size_type kc = A ().size2(); @@ -245,11 +253,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } - template + template void pack_B(const matrix_expression &B, T *p) { typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); const size_type kc = B ().size1(); const size_type nc = B ().size2(); @@ -271,10 +280,15 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { template void gemm(typename E3::value_type alpha, const matrix_expression &e1, - const matrix_expression &e2, + const matrix_expression &e2, typename E3::value_type beta, matrix_expression &e3, - BlockSize) + BlockSize) { +#if defined(BOOST_COMP_GNUC_DETECTION) + check_blocksize check __attribute__ ((unused)); +#else + check_blocksize check; +#endif typedef typename E3::size_type size_type; typedef typename E1::value_type value_type1; typedef typename E2::value_type value_type2; diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp new file mode 100644 index 000000000..3281ae950 --- /dev/null +++ b/include/boost/numeric/ublas/detail/vector.hpp @@ -0,0 +1,32 @@ +// +// Copyright (c) 2016 +// Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_DETAIL_VECTOR_ +#define _BOOST_UBLAS_DETAIL_VECTOR_ +#include + +#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD) +#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 32 +#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 16 +#else +#define _BOOST_UBLAS_VECTOR_SIZE 8 +#endif +#endif + +#ifndef _BOOST_UBLAS_VECTOR_SIZE +#define _BOOST_UBLAS_VECTOR_SIZE 1 +#endif + +#if (defined(BOOST_COMP_GNUC_DETECTION) \ + && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) +#define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE +#endif + +#endif diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index e58c757da..6b9c8fec8 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -15,6 +15,7 @@ #include #include +#include // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish // Iterators based on ideas of Jeremy Siek diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 2a410a082..1e70326a5 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -15,6 +15,7 @@ #include #include +#include /** \file operation.hpp * \brief This file contains some specialized products. */ From cff83d1c77cc23c19380b13262b96cbce8a36e30 Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sun, 28 Feb 2016 18:08:12 +0000 Subject: [PATCH 07/11] boost::ublas increasing the range of BLAS level 3 benchmarks This patch increases the range of BLAS level 3 benchmarks for dense matrices up to 1000*1000 matrices. Signed-off-by: Imre Palik --- benchmarks/bench1/Jamfile.v2 | 2 +- benchmarks/bench1/bench1.cpp | 14 ++++++++++---- benchmarks/bench1/bench13.cpp | 8 ++++++++ benchmarks/bench3/Jamfile.v2 | 3 ++- benchmarks/bench3/bench3.cpp | 14 ++++++++++---- benchmarks/bench3/bench33.cpp | 8 ++++++++ 6 files changed, 39 insertions(+), 10 deletions(-) diff --git a/benchmarks/bench1/Jamfile.v2 b/benchmarks/bench1/Jamfile.v2 index 77b11c77e..bf9c821c6 100644 --- a/benchmarks/bench1/Jamfile.v2 +++ b/benchmarks/bench1/Jamfile.v2 @@ -7,4 +7,4 @@ exe bench1 : bench1.cpp bench11.cpp bench12.cpp bench13.cpp - ; + : : speed ; diff --git a/benchmarks/bench1/bench1.cpp b/benchmarks/bench1/bench1.cpp index 87478e139..300eafa4a 100644 --- a/benchmarks/bench1/bench1.cpp +++ b/benchmarks/bench1/bench1.cpp @@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale) header (type_string + ", 3"); bench_1 () (1000000 * scale); bench_2 () (300000 * scale); - bench_3 () (100000 * scale); + bench_3 () (3000000 * scale); header (type_string + ", 10"); bench_1 () (300000 * scale); bench_2 () (30000 * scale); - bench_3 () (3000 * scale); + bench_3 () (100000 * scale); header (type_string + ", 30"); bench_1 () (100000 * scale); bench_2 () (3000 * scale); - bench_3 () (100 * scale); + bench_3 () (30000 * scale); header (type_string + ", 100"); bench_1 () (30000 * scale); bench_2 () (300 * scale); - bench_3 () (3 * scale); + bench_3 () (1000 * scale); + + header (type_string + ", 300"); + bench_3 () (30 * scale); + + header (type_string + ", 1000"); + bench_3 () (1 * scale); } int main (int argc, char *argv []) { diff --git a/benchmarks/bench1/bench13.cpp b/benchmarks/bench1/bench13.cpp index fadb0b673..2378d46b2 100644 --- a/benchmarks/bench1/bench13.cpp +++ b/benchmarks/bench1/bench13.cpp @@ -166,6 +166,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_DOUBLE @@ -173,6 +175,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_STD_COMPLEX @@ -181,6 +185,8 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #ifdef USE_DOUBLE @@ -188,5 +194,7 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #endif diff --git a/benchmarks/bench3/Jamfile.v2 b/benchmarks/bench3/Jamfile.v2 index 7ce9c9b85..cbaa638a7 100644 --- a/benchmarks/bench3/Jamfile.v2 +++ b/benchmarks/bench3/Jamfile.v2 @@ -7,4 +7,5 @@ exe bench3 : bench3.cpp bench31.cpp bench32.cpp bench33.cpp - ; + : : speed ; + diff --git a/benchmarks/bench3/bench3.cpp b/benchmarks/bench3/bench3.cpp index 390d226ca..72a3db585 100644 --- a/benchmarks/bench3/bench3.cpp +++ b/benchmarks/bench3/bench3.cpp @@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale) header (type_string + ", 3"); bench_1 () (1000000 * scale); bench_2 () (300000 * scale); - bench_3 () (100000 * scale); + bench_3 () (3000000 * scale); header (type_string + ", 10"); bench_1 () (300000 * scale); bench_2 () (30000 * scale); - bench_3 () (3000 * scale); + bench_3 () (100000 * scale); header (type_string + ", 30"); bench_1 () (100000 * scale); bench_2 () (3000 * scale); - bench_3 () (100 * scale); + bench_3 () (30000 * scale); header (type_string + ", 100"); bench_1 () (30000 * scale); bench_2 () (300 * scale); - bench_3 () (3 * scale); + bench_3 () (1000 * scale); + + header (type_string + ", 300"); + bench_3 () (30 * scale); + + header (type_string + ", 1000"); + bench_3 () (1 * scale); } int main (int argc, char *argv []) { diff --git a/benchmarks/bench3/bench33.cpp b/benchmarks/bench3/bench33.cpp index 9b8e1070e..7eeec07ce 100644 --- a/benchmarks/bench3/bench33.cpp +++ b/benchmarks/bench3/bench33.cpp @@ -172,6 +172,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_DOUBLE @@ -179,6 +181,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_STD_COMPLEX @@ -187,6 +191,8 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #ifdef USE_DOUBLE @@ -194,5 +200,7 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #endif From 93545c0695f60be1cd9ffe9491c95f3351acc374 Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sun, 6 Mar 2016 15:16:33 +0000 Subject: [PATCH 08/11] ublas::gemm adding support for clang This patch enables the optimised kernel for clang. Sadly, when compiled by clang, the performance of this kernel on Haswell is approximately half of the kernel compiled by gcc. But it is still way faster than anything else. Signed-off-by: Imre Palik --- include/boost/numeric/ublas/detail/gemm.hpp | 4 ++++ include/boost/numeric/ublas/detail/vector.hpp | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp index daf38a119..2d492e681 100644 --- a/include/boost/numeric/ublas/detail/gemm.hpp +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -86,7 +86,11 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { static const Index MR = BlockSize::mr; static const Index NR = BlockSize::nr/vector_length; +#ifdef BOOST_COMP_CLANG_DETECTION + typedef T vx __attribute__((ext_vector_type (vector_length))); +#else typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE))); +#endif vx P[MR*NR] = {}; diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp index 3281ae950..686200d33 100644 --- a/include/boost/numeric/ublas/detail/vector.hpp +++ b/include/boost/numeric/ublas/detail/vector.hpp @@ -25,7 +25,9 @@ #endif #if (defined(BOOST_COMP_GNUC_DETECTION) \ - && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) + && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \ + || (defined(BOOST_COMP_CLANG_DETECTION) \ + && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0)) #define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE #endif From 8ceb6c0ab7361064ba1e54270fdd40cce4087e0e Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sun, 13 Mar 2016 19:58:30 +0000 Subject: [PATCH 09/11] ublas::gemm making possible to disable gemm as the default dense matrix multiplication If the preprocessor macro BOOST_UBLAS_LEGACY_PRODUCT is defined, then both prod() and axpy_prod() falls back to the legacy matrix multiplication algorithm. Signed-off-by: Imre Palik --- .../boost/numeric/ublas/matrix_expression.hpp | 42 ++++++++++++------- include/boost/numeric/ublas/operation.hpp | 17 +++++--- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index 6b9c8fec8..80fd961b6 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -5463,18 +5463,18 @@ namespace boost { namespace numeric { namespace ublas { }; namespace detail { - template - struct binary_calculate_result_type; - - template - struct binary_calculate_result_type { - typedef matrix_matrix_binary > result_type; - }; - - template - struct binary_calculate_result_type { - typedef matrix

result_type; - }; + template + struct binary_calculate_result_type; + + template + struct binary_calculate_result_type { + typedef matrix_matrix_binary > result_type; + }; + + template + struct binary_calculate_result_type { + typedef matrix

result_type; + }; } template @@ -5495,7 +5495,7 @@ namespace boost { namespace numeric { namespace ublas { template BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits::result_type + typename E2::value_type, E2>::expression_type prod (const matrix_expression &e1, const matrix_expression &e2, B, @@ -5551,15 +5551,24 @@ namespace boost { namespace numeric { namespace ublas { template BOOST_UBLAS_INLINE +#ifndef BOOST_UBLAS_LEGACY_PRODUCT typename matrix_matrix_binary_traits::result_type +#else + typename matrix_matrix_binary_traits::expression_type +#endif prod (const matrix_expression &e1, const matrix_expression &e2) { BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); typedef typename detail::prod_block_size::type> block_sizes; - +#ifndef BOOST_UBLAS_LEGACY_PRODUCT return prod (e1, e2, block_sizes()); +#else + return prod (e1, e2, block_sizes(), unknown_storage_tag(), + unknown_orientation_tag()); +#endif } template @@ -5657,8 +5666,13 @@ namespace boost { namespace numeric { namespace ublas { typedef typename detail::prod_block_size::type> block_sizes; +#ifndef BOOST_UBLAS_LEGACY_PRODUCT prod (e1, e2, m, block_sizes(), storage_category(), orientation_category()); +#else + prod (e1, e2, m, block_sizes(), unknown_storage_tag(), + unknown_orientation_tag()); +#endif return m; } diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 1e70326a5..176669cf8 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -775,8 +775,19 @@ namespace boost { namespace numeric { namespace ublas { typedef typename detail::prod_block_size::type> block_sizes; +#ifndef BOOST_UBLAS_LEGACY_PRODUCT return axpy_prod(e1, e2, m, full (), storage_category (), init, block_sizes()); +#else + typedef typename M::value_type value_type; + typedef typename M::orientation_category orientation_category; + + if (init) + m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); + + return axpy_prod(e1, e2, m, full (), storage_category (), + orientation_category()); +#endif } template @@ -800,13 +811,9 @@ namespace boost { namespace numeric { namespace ublas { axpy_prod (const matrix_expression &e1, const matrix_expression &e2) { typedef M matrix_type; - typedef typename M::storage_category storage_category; - typedef typename detail::prod_block_size::type> block_sizes; matrix_type m (e1 ().size1 (), e2 ().size2 ()); - return axpy_prod(e1, e2, m, full (), storage_category(), true, - block_sizes()); + return axpy_prod(e1, e2, m); } From 752bc93fab7c8d1b640a15d0271e77f3b6afd596 Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sat, 19 Mar 2016 13:13:22 +0000 Subject: [PATCH 10/11] ublas:gemm kernels for complex multiplication This patch contains optimised kernels for multiplying complex matrices, together with the kernel parameters I find decently performing on Haswell. Signed-off-by: Imre Palik --- .../numeric/ublas/detail/block_sizes.hpp | 30 ++ include/boost/numeric/ublas/detail/gemm.hpp | 375 ++++++++++++++++-- .../boost/numeric/ublas/matrix_expression.hpp | 10 +- 3 files changed, 370 insertions(+), 45 deletions(-) diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp index 9d861de1f..da042da58 100644 --- a/include/boost/numeric/ublas/detail/block_sizes.hpp +++ b/include/boost/numeric/ublas/detail/block_sizes.hpp @@ -58,6 +58,36 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); }; + template + struct prod_block_size > { + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register + static const unsigned mc = 255; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 3; // stripe width for lhs + static const unsigned nr = vector_length; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 23; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size > { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 1; // stripe width for lhs + static const unsigned nr = 1; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 68; // Use gemm from this size + static const unsigned vector_length = 1; // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + template struct is_blocksize { struct fallback { static const int nr = 0; }; diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp index 2d492e681..671fa42a0 100644 --- a/include/boost/numeric/ublas/detail/gemm.hpp +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -12,10 +12,11 @@ #include #include #include -#include +#include #include #include #include +#include #include #include #include @@ -186,12 +187,150 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } +#ifdef BOOST_UBLAS_VECTOR_KERNEL + template + typename enable_if_c::value + && (1 < BlockSize::vector_length) + && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE), + void>::type + ugemm(Index kc, TC alpha, const T *Ar, const T *Ai, + const T *Br, const T *Bi, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED (Ar, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Ai, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Br, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Bi, BlockSize::align); + static const unsigned vector_length = BlockSize::vector_length; + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr/vector_length; + +#ifdef BOOST_COMP_CLANG_DETECTION + typedef T vx __attribute__((ext_vector_type (vector_length))); +#else + typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE))); +#endif + + vx Pr[MR*NR] = {}; + vx Pi[MR*NR] = {}; + + const vx *br = (const vx *)Br; + const vx *bi = (const vx *)Bi; + for (Index l=0; l + typename enable_if_c::value + || (BlockSize::vector_length <= 1), + void>::type +#else + template + void +#endif + ugemm(Index kc, TC alpha, const T *Ar, const T *Ai, + const T *Br, const T *Bi, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED(Ar, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Ai, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Br, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Bi, BlockSize::align); + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + typename aligned_storage::type Par; + T *Pr = reinterpret_cast(Par.address()); + typename aligned_storage::type Pai; + T *Pi = reinterpret_cast(Pai.address()); + + for (unsigned c = 0; c < MR * NR; c++) + Pr[c] = Pi[c] = 0; + + for (Index l=0; l + template void mgemm(Index mc, Index nc, Index kc, TC alpha, - const T *A, const T *B, TC beta, - TC *C, Index incRowC, Index incColC) + const T *A, const T *B, TC beta, TC *C, + Index incRowC, Index incColC, BlockSize) { static const Index MR = BlockSize::mr; static const Index NR = BlockSize::nr; @@ -200,9 +339,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { const Index mr_ = mc % MR; const Index nr_ = nc % NR; - // #if defined(_OPENMP) - // #pragma omp parallel for - // #endif for (Index j=0; j + void + mgemm(Index mc, Index nc, Index kc, TC alpha, + std::pair A, std::pairB, + TC beta, TC *C, Index incRowC, Index incColC, BlockSize) + { + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + const Index mp = (mc+MR-1) / MR; + const Index np = (nc+NR-1) / NR; + const Index mr_ = mc % MR; + const Index nr_ = nc % NR; + + for (Index j=0; j + (kc, alpha, + &A.first[i*kc*MR], &A.second[i*kc*MR], + &B.first[j*kc*NR], &B.second[j*kc*NR], + beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } else { + std::fill_n(C_, MR*NR, T(0)); + ugemm + (kc, alpha, + &A.first[i*kc*MR], &A.second[i*kc*MR], + &B.first[j*kc*NR], &B.second[j*kc*NR], + T(0), + C_, NR, Index(1)); + gescal(mr, nr, beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + geaxpy(mr, nr, TC(1), C_, NR, Index(1), + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } + } + } + } + //-- Packing blocks ------------------------------------------------------------ - template + template void - pack_A(const matrix_expression &A, T *p) + pack_A(const matrix_expression &A, T *p, BlockSize) { typedef typename E::size_type size_type; BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); @@ -257,9 +440,34 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } - template + template void - pack_B(const matrix_expression &B, T *p) + pack_A(const matrix_expression &A, std::pair p, BlockSize) + { + typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align); + + const size_type mc = A ().size1(); + const size_type kc = A ().size2(); + static const size_type MR = BlockSize::mr; + const size_type mp = (mc+MR-1) / MR; + + for (size_type j=0; j + void + pack_B(const matrix_expression &B, T *p, BlockSize) { typedef typename E::size_type size_type; BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); @@ -280,13 +488,38 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } + template + void + pack_B(const matrix_expression &B, std::pair p, BlockSize) + { + typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align); + + const size_type kc = B ().size1(); + const size_type nc = B ().size2(); + static const size_type NR = BlockSize::nr; + const size_type np = (nc+NR-1) / NR; + + for (size_type l=0; l + template void - gemm(typename E3::value_type alpha, const matrix_expression &e1, - const matrix_expression &e2, - typename E3::value_type beta, matrix_expression &e3, - BlockSize) + gemm_(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + M A, M B, BlockSize bs) { #if defined(BOOST_COMP_GNUC_DETECTION) check_blocksize check __attribute__ ((unused)); @@ -294,15 +527,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { check_blocksize check; #endif typedef typename E3::size_type size_type; - typedef typename E1::value_type value_type1; - typedef typename E2::value_type value_type2; typedef typename E3::value_type value_type3; - typedef typename common_type::type value_type_i; - typedef unbounded_array > array_type_i; static const size_type MC = BlockSize::mc; static const size_type NC = BlockSize::nc; @@ -323,13 +548,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { const size_type incRowC = &e3()(1,0) - &e3()(0,0); const size_type incColC = &e3()(0,1) - &e3()(0,0); - if (alpha==value_type3(0) || k==0) { - gescal(beta, e3); - return; - } - - array_type_i A(MC * KC); - array_type_i B(NC * KC); + assert(alpha!=value_type3(0) && k!=0); for (size_type j=0; j Bs = subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc); - pack_B, value_type_i, BlockSize> - (Bs, &B[0]); + pack_B(Bs, B, bs); for (size_type i=0; i As = subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc); - pack_A, value_type_i, BlockSize> - (As, &A[0]); + pack_A(As, A, bs); - mgemm - (mc, nc, kc, alpha, &A[0], &B[0], beta_, - &C_[i*MC*incRowC+j*NC*incColC], - incRowC, incColC); + mgemm(mc, nc, kc, alpha, A, B, beta_, + &C_[i*MC*incRowC+j*NC*incColC], + incRowC, incColC, bs); } } } } + + template + typename enable_if_c::type>::value + || !is_complex::value, + void>::type + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize bs) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type::type value_type_i; + typedef unbounded_array > array_type_i; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + array_type_i A(MC * KC); + array_type_i B(NC * KC); + gemm_(alpha, e1, e2, beta, e3, &A[0], &B[0], bs); + } + + template + typename enable_if_c::type>::value + && is_complex::value, + void>::type + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize bs) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type::type value_type_i; + typedef typename value_type_i::value_type value_type_f; + typedef unbounded_array > array_type_f; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + array_type_f Ar(MC * KC); + array_type_f Ai(MC * KC); + array_type_f Br(NC * KC); + array_type_f Bi(NC * KC); + gemm_(alpha, e1, e2, beta, e3, std::make_pair(&Ar[0], &Ai[0]), + std::make_pair(&Br[0], &Bi[0]), bs); + } + }}}} #endif diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index 80fd961b6..36bbf130b 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -5623,16 +5623,14 @@ namespace boost { namespace numeric { namespace ublas { unknown_orientation_tag) { typedef typename matrix_matrix_binary_traits::expression_type expression_type; - typedef typename matrix_matrix_binary_traits::result_type result_type; - typedef typename result_type::value_type result_value; + typedef typename M::value_type result_value; - if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { m.assign(expression_type (e1 (), e2 ())); - } else { + } else { detail::gemm(result_value(1), e1, e2, result_value(0), m, b); - } + } } // dispatcher From 4f4d25105082475fa123b30ea46e29f1491bc0f6 Mon Sep 17 00:00:00 2001 From: Imre Palik Date: Sun, 3 Apr 2016 14:59:34 +0000 Subject: [PATCH 11/11] ublas::gemm support for icc This patch adds vectorised kernel support for icc. It is implemented using cilk arrays. It is tested on icc16, whith older versions your mileage may vary. The resulting kernel is still half the speed of the one compiled by gcc. Signed-off-by: Imre Palik --- include/boost/numeric/ublas/detail/gemm.hpp | 93 +++++++++++++++---- include/boost/numeric/ublas/detail/vector.hpp | 7 +- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp index 671fa42a0..3891101cd 100644 --- a/include/boost/numeric/ublas/detail/gemm.hpp +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -86,7 +86,30 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { static const unsigned vector_length = BlockSize::vector_length; static const Index MR = BlockSize::mr; static const Index NR = BlockSize::nr/vector_length; + static const Index nr = BlockSize::nr; +#ifdef BOOST_COMP_INTEL_DETECTION + static const Index pl = MR * nr; + T P[pl] = {}; + + for (Index l=0; l &A, std::pair p, BlockSize) { typedef typename E::size_type size_type; - BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align); - BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align); + T* re = p.first; + T* im = p.second; + BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align); const size_type mc = A ().size1(); const size_type kc = A ().size2(); @@ -458,8 +513,8 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { for (size_type i0=0; i0 &B, std::pair p, BlockSize) { typedef typename E::size_type size_type; - BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align); - BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align); + T* re = p.first; + T* im = p.second; + BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align); const size_type kc = B ().size1(); const size_type nc = B ().size2(); @@ -506,8 +563,8 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { for (size_type i=0; i= BOOST_VERSION_NUMBER(4,8,0)) \ - || (defined(BOOST_COMP_CLANG_DETECTION) \ +#if defined(BOOST_COMP_INTEL_DETECTION) \ + || (defined(BOOST_COMP_GNUC_DETECTION) \ + && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \ + || (defined(BOOST_COMP_CLANG_DETECTION) \ && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0)) #define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE #endif