diff --git a/benchmarks/bench1/Jamfile.v2 b/benchmarks/bench1/Jamfile.v2 index 77b11c77e..bf9c821c6 100644 --- a/benchmarks/bench1/Jamfile.v2 +++ b/benchmarks/bench1/Jamfile.v2 @@ -7,4 +7,4 @@ exe bench1 : bench1.cpp bench11.cpp bench12.cpp bench13.cpp - ; + : : speed ; diff --git a/benchmarks/bench1/bench1.cpp b/benchmarks/bench1/bench1.cpp index 87478e139..300eafa4a 100644 --- a/benchmarks/bench1/bench1.cpp +++ b/benchmarks/bench1/bench1.cpp @@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale) header (type_string + ", 3"); bench_1 () (1000000 * scale); bench_2 () (300000 * scale); - bench_3 () (100000 * scale); + bench_3 () (3000000 * scale); header (type_string + ", 10"); bench_1 () (300000 * scale); bench_2 () (30000 * scale); - bench_3 () (3000 * scale); + bench_3 () (100000 * scale); header (type_string + ", 30"); bench_1 () (100000 * scale); bench_2 () (3000 * scale); - bench_3 () (100 * scale); + bench_3 () (30000 * scale); header (type_string + ", 100"); bench_1 () (30000 * scale); bench_2 () (300 * scale); - bench_3 () (3 * scale); + bench_3 () (1000 * scale); + + header (type_string + ", 300"); + bench_3 () (30 * scale); + + header (type_string + ", 1000"); + bench_3 () (1 * scale); } int main (int argc, char *argv []) { diff --git a/benchmarks/bench1/bench13.cpp b/benchmarks/bench1/bench13.cpp index fadb0b673..2378d46b2 100644 --- a/benchmarks/bench1/bench13.cpp +++ b/benchmarks/bench1/bench13.cpp @@ -166,6 +166,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_DOUBLE @@ -173,6 +175,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_STD_COMPLEX @@ -181,6 +185,8 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #ifdef USE_DOUBLE @@ -188,5 +194,7 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #endif diff --git a/benchmarks/bench3/Jamfile.v2 b/benchmarks/bench3/Jamfile.v2 index 7ce9c9b85..cbaa638a7 100644 --- a/benchmarks/bench3/Jamfile.v2 +++ b/benchmarks/bench3/Jamfile.v2 @@ -7,4 +7,5 @@ exe bench3 : bench3.cpp bench31.cpp bench32.cpp bench33.cpp - ; + : : speed ; + diff --git a/benchmarks/bench3/bench3.cpp b/benchmarks/bench3/bench3.cpp index 390d226ca..72a3db585 100644 --- a/benchmarks/bench3/bench3.cpp +++ b/benchmarks/bench3/bench3.cpp @@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale) header (type_string + ", 3"); bench_1 () (1000000 * scale); bench_2 () (300000 * scale); - bench_3 () (100000 * scale); + bench_3 () (3000000 * scale); header (type_string + ", 10"); bench_1 () (300000 * scale); bench_2 () (30000 * scale); - bench_3 () (3000 * scale); + bench_3 () (100000 * scale); header (type_string + ", 30"); bench_1 () (100000 * scale); bench_2 () (3000 * scale); - bench_3 () (100 * scale); + bench_3 () (30000 * scale); header (type_string + ", 100"); bench_1 () (30000 * scale); bench_2 () (300 * scale); - bench_3 () (3 * scale); + bench_3 () (1000 * scale); + + header (type_string + ", 300"); + bench_3 () (30 * scale); + + header (type_string + ", 1000"); + bench_3 () (1 * scale); } int main (int argc, char *argv []) { diff --git a/benchmarks/bench3/bench33.cpp b/benchmarks/bench3/bench33.cpp index 9b8e1070e..7eeec07ce 100644 --- a/benchmarks/bench3/bench33.cpp +++ b/benchmarks/bench3/bench33.cpp @@ -172,6 +172,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_DOUBLE @@ -179,6 +181,8 @@ template struct bench_3; template struct bench_3; template struct bench_3; template struct bench_3; +template struct bench_3; +template struct bench_3; #endif #ifdef USE_STD_COMPLEX @@ -187,6 +191,8 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #ifdef USE_DOUBLE @@ -194,5 +200,7 @@ template struct bench_3, 3>; template struct bench_3, 10>; template struct bench_3, 30>; template struct bench_3, 100>; +template struct bench_3, 300>; +template struct bench_3, 1000>; #endif #endif diff --git a/doc/operations_overview.html b/doc/operations_overview.html index 42d994212..212fcbebf 100644 --- a/doc/operations_overview.html +++ b/doc/operations_overview.html @@ -87,7 +87,8 @@

norms


 t = norm_inf(v); i = index_norm_inf(v);
-t = norm_1(v);   t = norm_2(v); 
+t = norm_1(v);   t = norm_2(v);
+t = norm_2_square(v);
 t = norm_inf(A); i = index_norm_inf(A);
 t = norm_1(A);   t = norm_frobenius(A); 
 
diff --git a/doc/vector_expression.html b/doc/vector_expression.html index 7838007ff..2372ec3df 100644 --- a/doc/vector_expression.html +++ b/doc/vector_expression.html @@ -847,6 +847,11 @@

Prototypes

typename vector_scalar_unary_traits<E, vector_norm_2<typename E::value_type> >::result_type norm_2 (const vector_expression<E> &e); + // norm_2_square v = sum (v [i] * v [i]) + template<class E> + typename vector_scalar_unary_traits<E, vector_norm_2_square<typename E::value_type> >::result_type + norm_2_square (const vector_expression<E> &e); + // norm_inf v = max (abs (v [i])) template<class E> typename vector_scalar_unary_traits<E, vector_norm_inf<typename E::value_type> >::result_type diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp new file mode 100644 index 000000000..da042da58 --- /dev/null +++ b/include/boost/numeric/ublas/detail/block_sizes.hpp @@ -0,0 +1,121 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_BLOCK_SIZES_ +#define _BOOST_UBLAS_BLOCK_SIZES_ + +#include + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template + struct prod_block_size { + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length); + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 3 * vector_length; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 14; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 16; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 14; // Use gemm from this size + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 1; // stripe width for lhs + static const unsigned nr = 4; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 42; // Use gemm from this size + static const unsigned vector_length = 1; // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template + struct prod_block_size > { + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register + static const unsigned mc = 255; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 3; // stripe width for lhs + static const unsigned nr = vector_length; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 23; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size > { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 1; // stripe width for lhs + static const unsigned nr = 1; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 68; // Use gemm from this size + static const unsigned vector_length = 1; // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template + struct is_blocksize { + struct fallback { static const int nr = 0; }; + struct derived : T, fallback {}; + template + struct nonr { + static const bool value = false; + typedef false_type type; + }; + + template static char (&f(typename nonr::type*))[1]; + template static char (&f(...))[2]; + + static bool const value = sizeof(f(0)) == 2; + }; + + template + struct check_blocksize { + BOOST_STATIC_ASSERT_MSG(T::mc>0 && T::kc>0 && T::nc>0 && T::mr>0 && T::nr>0, + "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(T::mc % T::mr == 0, + "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(T::nc % T::nr == 0, + "NC must be a multiple of NR."); + BOOST_STATIC_ASSERT_MSG(T::vector_length <= 1 || T::nr % T::vector_length == 0, + "NR must be a multiple of vector size"); + BOOST_STATIC_ASSERT_MSG(T::limit >= 2, + "Minimum matrix size for gemm is 2*2"); + }; +}}}} +#endif diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp new file mode 100644 index 000000000..3891101cd --- /dev/null +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -0,0 +1,717 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_GEMM_ +#define _BOOST_UBLAS_GEMM_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template + void + gescal(const typename E::value_type &alpha, matrix_expression &X) + { + typedef typename E::size_type size_type; + + for (size_type i=0; i + void + geaxpy(Index m, Index n, const T &alpha, + const T *X, Index incRowX, Index incColX, + T *Y, Index incRowY, Index incColY) + { + for (Index j=0; j + void + gescal(Index m, Index n, + const TX &alpha, + TX *X, Index incRowX, Index incColX) + { + if (alpha != TX(0)) { + for (Index j=0; j + typename enable_if_c::value + && (1 < BlockSize::vector_length) + && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE), + void>::type + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align); + static const unsigned vector_length = BlockSize::vector_length; + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr/vector_length; + static const Index nr = BlockSize::nr; +#ifdef BOOST_COMP_INTEL_DETECTION + static const Index pl = MR * nr; + + T P[pl] = {}; + + for (Index l=0; l + typename enable_if_c::value + || (BlockSize::vector_length <= 1), + void>::type +#else + template + void +#endif + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align); + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + typename aligned_storage::type Pa; + T *P = reinterpret_cast(Pa.address()); + for (unsigned c = 0; c < MR * NR; c++) + P[c] = 0; + + for (Index l=0; l + typename enable_if_c::value + && (1 < BlockSize::vector_length) + && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE), + void>::type + ugemm(Index kc, TC alpha, const T *Ar, const T *Ai, + const T *Br, const T *Bi, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED (Ar, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Ai, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Br, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (Bi, BlockSize::align); + static const unsigned vector_length = BlockSize::vector_length; + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr/vector_length; + static const Index nr = BlockSize::nr; +#ifdef BOOST_COMP_INTEL_DETECTION + static const Index pl = MR * nr; + + T Pr[pl] = {}; + T Pi[pl] = {}; + + for (Index l=0; l + typename enable_if_c::value + || (BlockSize::vector_length <= 1), + void>::type +#else + template + void +#endif + ugemm(Index kc, TC alpha, const T *Ar, const T *Ai, + const T *Br, const T *Bi, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED(Ar, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Ai, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Br, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(Bi, BlockSize::align); + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + typename aligned_storage::type Par; + T *Pr = reinterpret_cast(Par.address()); + typename aligned_storage::type Pai; + T *Pi = reinterpret_cast(Pai.address()); + + for (unsigned c = 0; c < MR * NR; c++) + Pr[c] = Pi[c] = 0; + + for (Index l=0; l + void + mgemm(Index mc, Index nc, Index kc, TC alpha, + const T *A, const T *B, TC beta, TC *C, + Index incRowC, Index incColC, BlockSize) + { + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + const Index mp = (mc+MR-1) / MR; + const Index np = (nc+NR-1) / NR; + const Index mr_ = mc % MR; + const Index nr_ = nc % NR; + + for (Index j=0; j(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } else { + std::fill_n(C_, MR*NR, T(0)); + ugemm(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + T(0), + C_, NR, Index(1)); + gescal(mr, nr, beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + geaxpy(mr, nr, TC(1), C_, NR, Index(1), + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } + } + } + } + + template + void + mgemm(Index mc, Index nc, Index kc, TC alpha, + std::pair A, std::pairB, + TC beta, TC *C, Index incRowC, Index incColC, BlockSize) + { + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + const Index mp = (mc+MR-1) / MR; + const Index np = (nc+NR-1) / NR; + const Index mr_ = mc % MR; + const Index nr_ = nc % NR; + + for (Index j=0; j + (kc, alpha, + &A.first[i*kc*MR], &A.second[i*kc*MR], + &B.first[j*kc*NR], &B.second[j*kc*NR], + beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } else { + std::fill_n(C_, MR*NR, T(0)); + ugemm + (kc, alpha, + &A.first[i*kc*MR], &A.second[i*kc*MR], + &B.first[j*kc*NR], &B.second[j*kc*NR], + T(0), + C_, NR, Index(1)); + gescal(mr, nr, beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + geaxpy(mr, nr, TC(1), C_, NR, Index(1), + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } + } + } + } + + //-- Packing blocks ------------------------------------------------------------ + template + void + pack_A(const matrix_expression &A, T *p, BlockSize) + { + typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); + + const size_type mc = A ().size1(); + const size_type kc = A ().size2(); + static const size_type MR = BlockSize::mr; + const size_type mp = (mc+MR-1) / MR; + + for (size_type j=0; j + void + pack_A(const matrix_expression &A, std::pair p, BlockSize) + { + typedef typename E::size_type size_type; + T* re = p.first; + T* im = p.second; + BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align); + + const size_type mc = A ().size1(); + const size_type kc = A ().size2(); + static const size_type MR = BlockSize::mr; + const size_type mp = (mc+MR-1) / MR; + + for (size_type j=0; j + void + pack_B(const matrix_expression &B, T *p, BlockSize) + { + typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); + + const size_type kc = B ().size1(); + const size_type nc = B ().size2(); + static const size_type NR = BlockSize::nr; + const size_type np = (nc+NR-1) / NR; + + for (size_type l=0; l + void + pack_B(const matrix_expression &B, std::pair p, BlockSize) + { + typedef typename E::size_type size_type; + T* re = p.first; + T* im = p.second; + BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align); + + const size_type kc = B ().size1(); + const size_type nc = B ().size2(); + static const size_type NR = BlockSize::nr; + const size_type np = (nc+NR-1) / NR; + + for (size_type l=0; l + void + gemm_(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + M A, M B, BlockSize bs) + { +#if defined(BOOST_COMP_GNUC_DETECTION) + check_blocksize check __attribute__ ((unused)); +#else + check_blocksize check; +#endif + typedef typename E3::size_type size_type; + typedef typename E3::value_type value_type3; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ()); + const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ()); + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + const size_type mb = (m+MC-1) / MC; + const size_type nb = (n+NC-1) / NC; + const size_type kb = (k+KC-1) / KC; + const size_type mc_ = m % MC; + const size_type nc_ = n % NC; + const size_type kc_ = k % KC; + + value_type3 *C_ = &e3()(0,0); + const size_type incRowC = &e3()(1,0) - &e3()(0,0); + const size_type incColC = &e3()(0,1) - &e3()(0,0); + + assert(alpha!=value_type3(0) && k!=0); + + for (size_type j=0; j Bs = + subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc); + pack_B(Bs, B, bs); + + for (size_type i=0; i As = + subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc); + pack_A(As, A, bs); + + mgemm(mc, nc, kc, alpha, A, B, beta_, + &C_[i*MC*incRowC+j*NC*incColC], + incRowC, incColC, bs); + } + } + } + } + + template + typename enable_if_c::type>::value + || !is_complex::value, + void>::type + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize bs) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type::type value_type_i; + typedef unbounded_array > array_type_i; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + array_type_i A(MC * KC); + array_type_i B(NC * KC); + gemm_(alpha, e1, e2, beta, e3, &A[0], &B[0], bs); + } + + template + typename enable_if_c::type>::value + && is_complex::value, + void>::type + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize bs) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type::type value_type_i; + typedef typename value_type_i::value_type value_type_f; + typedef unbounded_array > array_type_f; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + array_type_f Ar(MC * KC); + array_type_f Ai(MC * KC); + array_type_f Br(NC * KC); + array_type_f Bi(NC * KC); + gemm_(alpha, e1, e2, beta, e3, std::make_pair(&Ar[0], &Ai[0]), + std::make_pair(&Br[0], &Bi[0]), bs); + } + +}}}} +#endif diff --git a/include/boost/numeric/ublas/detail/matrix_assign.hpp b/include/boost/numeric/ublas/detail/matrix_assign.hpp index efaf9ab81..b6de2586f 100644 --- a/include/boost/numeric/ublas/detail/matrix_assign.hpp +++ b/include/boost/numeric/ublas/detail/matrix_assign.hpp @@ -30,7 +30,7 @@ namespace detail { template BOOST_UBLAS_INLINE bool equals (const matrix_expression &e1, const matrix_expression &e2, S epsilon, S min_norm) { - return norm_inf (e1 - e2) < epsilon * + return norm_inf (e1 - e2) <= epsilon * std::max (std::max (norm_inf (e1), norm_inf (e2)), min_norm); } diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp new file mode 100644 index 000000000..e0ec2b181 --- /dev/null +++ b/include/boost/numeric/ublas/detail/vector.hpp @@ -0,0 +1,35 @@ +// +// Copyright (c) 2016 +// Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_DETAIL_VECTOR_ +#define _BOOST_UBLAS_DETAIL_VECTOR_ +#include + +#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD) +#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 32 +#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 16 +#else +#define _BOOST_UBLAS_VECTOR_SIZE 8 +#endif +#endif + +#ifndef _BOOST_UBLAS_VECTOR_SIZE +#define _BOOST_UBLAS_VECTOR_SIZE 1 +#endif + +#if defined(BOOST_COMP_INTEL_DETECTION) \ + || (defined(BOOST_COMP_GNUC_DETECTION) \ + && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \ + || (defined(BOOST_COMP_CLANG_DETECTION) \ + && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0)) +#define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE +#endif + +#endif diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp index b0be20608..f54b441fe 100644 --- a/include/boost/numeric/ublas/functional.hpp +++ b/include/boost/numeric/ublas/functional.hpp @@ -433,10 +433,10 @@ namespace boost { namespace numeric { namespace ublas { template static BOOST_UBLAS_INLINE result_type apply (const vector_expression &e) { -#ifndef BOOST_UBLAS_SCALED_NORM - real_type t = real_type (); typedef typename E::size_type vector_size_type; vector_size_type size (e ().size ()); +#ifndef BOOST_UBLAS_SCALED_NORM + real_type t = real_type (); for (vector_size_type i = 0; i < size; ++ i) { real_type u (type_traits::norm_2 (e () (i))); t += u * u; @@ -445,8 +445,7 @@ namespace boost { namespace numeric { namespace ublas { #else real_type scale = real_type (); real_type sum_squares (1); - size_type size (e ().size ()); - for (size_type i = 0; i < size; ++ i) { + for (vector_size_type i = 0; i < size; ++ i) { real_type u (type_traits::norm_2 (e () (i))); if ( real_type () /* zero */ == u ) continue; if (scale < u) { @@ -522,6 +521,52 @@ namespace boost { namespace numeric { namespace ublas { #endif } }; + + template + struct vector_norm_2_square : + public vector_scalar_real_unary_functor { + typedef typename vector_scalar_real_unary_functor::value_type value_type; + typedef typename vector_scalar_real_unary_functor::real_type real_type; + typedef typename vector_scalar_real_unary_functor::result_type result_type; + + template + static BOOST_UBLAS_INLINE + result_type apply (const vector_expression &e) { + real_type t = real_type (); + typedef typename E::size_type vector_size_type; + vector_size_type size (e ().size ()); + for (vector_size_type i = 0; i < size; ++ i) { + real_type u (type_traits::norm_2 (e () (i))); + t += u * u; + } + return t; + } + // Dense case + template + static BOOST_UBLAS_INLINE + result_type apply (D size, I it) { + real_type t = real_type (); + while (-- size >= 0) { + real_type u (type_traits::norm_2 (*it)); + t += u * u; + ++ it; + } + return t; + } + // Sparse case + template + static BOOST_UBLAS_INLINE + result_type apply (I it, const I &it_end) { + real_type t = real_type (); + while (it != it_end) { + real_type u (type_traits::norm_2 (*it)); + t += u * u; + ++ it; + } + return t; + } + }; + template struct vector_norm_inf: public vector_scalar_real_unary_functor { diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index 8e6c2f68b..36bbf130b 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -14,6 +14,8 @@ #define _BOOST_UBLAS_MATRIX_EXPRESSION_ #include +#include +#include // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish // Iterators based on ideas of Jeremy Siek @@ -5460,45 +5462,113 @@ namespace boost { namespace numeric { namespace ublas { expression2_closure_type e2_; }; + namespace detail { + template + struct binary_calculate_result_type; + + template + struct binary_calculate_result_type { + typedef matrix_matrix_binary > result_type; + }; + + template + struct binary_calculate_result_type { + typedef matrix

result_type; + }; + } + template struct matrix_matrix_binary_traits { - typedef unknown_storage_tag storage_category; + // typedef unknown_storage_tag storage_category; + typedef typename storage_restrict_traits::storage_category storage_category; typedef unknown_orientation_tag orientation_category; typedef typename promote_traits::promote_type promote_type; typedef matrix_matrix_binary > expression_type; #ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG - typedef expression_type result_type; + // typedef expression_type result_type; + typedef typename detail::binary_calculate_result_type::value>::result_type result_type; #else typedef typename E1::matrix_temporary_type result_type; #endif }; - template + template BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits::result_type + typename E2::value_type, E2>::expression_type prod (const matrix_expression &e1, const matrix_expression &e2, + B, unknown_storage_tag, unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; return expression_type (e1 (), e2 ()); } - // Dispatcher - template + template BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits::result_type prod (const matrix_expression &e1, - const matrix_expression &e2) { + const matrix_expression &e2, + B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + typedef typename matrix_matrix_binary_traits::result_type result_type; + typedef typename result_type::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + return expression_type (e1 (), e2 ()); + } else { + result_type rv(e1 ().size1(), e2 ().size2()); + detail::gemm(result_value(1), e1, e2, + result_value(0), rv, b); + return rv; + } + } + + // Dispatcher + template + BOOST_UBLAS_INLINE + typename enable_if_c::value, + typename matrix_matrix_binary_traits::result_type>::type + prod (const matrix_expression &e1, + const matrix_expression &e2, + B b) { BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); typedef typename matrix_matrix_binary_traits::storage_category storage_category; typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; - return prod (e1, e2, storage_category (), orientation_category ()); + return prod (e1, e2, b, storage_category (), orientation_category ()); + } + + template + BOOST_UBLAS_INLINE +#ifndef BOOST_UBLAS_LEGACY_PRODUCT + typename matrix_matrix_binary_traits::result_type +#else + typename matrix_matrix_binary_traits::expression_type +#endif + prod (const matrix_expression &e1, + const matrix_expression &e2) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename detail::prod_block_size::type> block_sizes; +#ifndef BOOST_UBLAS_LEGACY_PRODUCT + return prod (e1, e2, block_sizes()); +#else + return prod (e1, e2, block_sizes(), unknown_storage_tag(), + unknown_orientation_tag()); +#endif } template @@ -5529,13 +5599,79 @@ namespace boost { namespace numeric { namespace ublas { return prec_prod (e1, e2, storage_category (), orientation_category ()); } - template + template + BOOST_UBLAS_INLINE + void + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b, + unknown_storage_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + + m.assign(expression_type (e1 (), e2 ())); + } + + template + BOOST_UBLAS_INLINE + void + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits::expression_type expression_type; + typedef typename M::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + m.assign(expression_type (e1 (), e2 ())); + } else { + detail::gemm(result_value(1), e1, e2, + result_value(0), m, b); + } + } + + // dispatcher + template BOOST_UBLAS_INLINE M & + prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, B b) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits::storage_category storage_category; + typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; + + prod (e1, e2, m, b, storage_category(), orientation_category()); + return m; + } + template + BOOST_UBLAS_INLINE + typename enable_if_c::value, + M&>::type prod (const matrix_expression &e1, const matrix_expression &e2, M &m) { - return m.assign (prod (e1, e2)); + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits::storage_category storage_category; + typedef typename matrix_matrix_binary_traits::orientation_category orientation_category; + typedef typename detail::prod_block_size::type> block_sizes; +#ifndef BOOST_UBLAS_LEGACY_PRODUCT + prod (e1, e2, m, block_sizes(), storage_category(), + orientation_category()); +#else + prod (e1, e2, m, block_sizes(), unknown_storage_tag(), + unknown_orientation_tag()); +#endif + return m; } template diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 64657ccf3..176669cf8 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -14,7 +14,8 @@ #define _BOOST_UBLAS_OPERATION_ #include - +#include +#include /** \file operation.hpp * \brief This file contains some specialized products. */ @@ -637,7 +638,63 @@ namespace boost { namespace numeric { namespace ublas { return m; } - // Dispatcher + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + S, bool init, B) + { + typedef typename M::value_type value_type; + typedef S storage_category; + typedef typename M::orientation_category orientation_category; + + + if (init) + m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } + + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + dense_proxy_tag, bool init, B) + { + typedef typename M::value_type value_type; + typedef B block_sizes; + if (m.size1() < B::limit || m.size2() < B::limit) { + typedef typename M::storage_category storage_category; + typedef typename M::orientation_category orientation_category; + + if (init) + m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } else { + detail::gemm(value_type(1), e1, e2, + value_type(init? 0 : 1), m, block_sizes()); + return m; + } + } + + template + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, full, + dense_tag, bool init, B b) + { + return axpy_prod (e1, e2, m, full (), dense_proxy_tag(), + init, b); + } + + // Dispatchers template BOOST_UBLAS_INLINE M & @@ -651,11 +708,16 @@ namespace boost { namespace numeric { namespace ublas { if (init) m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ()); + + return axpy_prod(e1, e2, m, triangular_restriction (), + storage_category (), orientation_category(), + init); } + template BOOST_UBLAS_INLINE - M + typename enable_if_c::value, + M>::type axpy_prod (const matrix_expression &e1, const matrix_expression &e2, TRI) { @@ -690,20 +752,59 @@ namespace boost { namespace numeric { namespace ublas { \param E1 type of a matrix expression \c A \param E2 type of a matrix expression \c X */ + template + BOOST_UBLAS_INLINE + M & + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, + M &m, bool init, B) { + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); + } + template BOOST_UBLAS_INLINE M & axpy_prod (const matrix_expression &e1, const matrix_expression &e2, M &m, bool init = true) { - typedef typename M::value_type value_type; typedef typename M::storage_category storage_category; + typedef typename detail::prod_block_size::type> block_sizes; +#ifndef BOOST_UBLAS_LEGACY_PRODUCT + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); +#else + typedef typename M::value_type value_type; typedef typename M::orientation_category orientation_category; if (init) m.assign (zero_matrix (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ()); + + return axpy_prod(e1, e2, m, full (), storage_category (), + orientation_category()); +#endif + } + + template + BOOST_UBLAS_INLINE + typename boost::enable_if_c::value, + M>::type + axpy_prod (const matrix_expression &e1, + const matrix_expression &e2, B) { + typedef M matrix_type; + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + matrix_type m (e1 ().size1 (), e2 ().size2 ()); + return axpy_prod(e1, e2, m, full (), storage_category(), true, + block_sizes()); } + template BOOST_UBLAS_INLINE M @@ -712,7 +813,7 @@ namespace boost { namespace numeric { namespace ublas { typedef M matrix_type; matrix_type m (e1 ().size1 (), e2 ().size2 ()); - return axpy_prod (e1, e2, m, full (), true); + return axpy_prod(e1, e2, m); } @@ -825,6 +926,48 @@ namespace boost { namespace numeric { namespace ublas { return opb_prod (e1, e2, m, true); } + /** \brief computes C = alpha * A * B + beta * C in an + optimized fashion. + + \param alpha scalar multiplier + \param e1 the matrix expression \c A + \param e2 the matrix expression \c B + \param beta scalar multiplier + \param e3 the result matrix \c C + + gemm implements the well known gemm operation + + \ingroup blas3 + + \internal + + template parameters: + \param E1 type of a matrix expression \c A + \param E2 type of a matrix expression \c B + \param E3 type of a matrix expression \c C + */ + template + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3) { + typedef typename detail::prod_block_size::type> block_sizes; + detail::gemm(alpha, e1, e2, beta, e3, block_sizes()); + } + + template + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression &e1, + const matrix_expression &e2, + typename E3::value_type beta, matrix_expression &e3, + BlockSize b) { + detail::gemm(alpha, e1, e2, beta, e3, b); + } + }}} #endif diff --git a/include/boost/numeric/ublas/vector_expression.hpp b/include/boost/numeric/ublas/vector_expression.hpp index d92207c98..6f952a464 100644 --- a/include/boost/numeric/ublas/vector_expression.hpp +++ b/include/boost/numeric/ublas/vector_expression.hpp @@ -1609,6 +1609,16 @@ namespace boost { namespace numeric { namespace ublas { return expression_type (e ()); } + // real: norm_2_square v = sum(v [i] * v [i]) + // complex: norm_2_square v = sum(v [i] * conj (v [i])) + template + BOOST_UBLAS_INLINE + typename vector_scalar_unary_traits >::result_type + norm_2_square (const vector_expression &e) { + typedef typename vector_scalar_unary_traits >::expression_type expression_type; + return expression_type (e ()); + } + // real: norm_inf v = maximum (abs (v [i])) // complex: norm_inf v = maximum (maximum (abs (real (v [i])), abs (imag (v [i])))) template diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 77e296c91..f990b76b9 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -189,6 +189,10 @@ test-suite numeric/uBLAS ] [ run test_complex_norms.cpp ] + [ run test_scaled_norm.cpp + : : : + BOOST_UBLAS_SCALED_NORM + ] [ run test_assignment.cpp : : : BOOST_UBLAS_COO_ALWAYS_DO_FULL_SORT diff --git a/test/test_complex_norms.cpp b/test/test_complex_norms.cpp index d58a788ee..0c37ed45b 100644 --- a/test/test_complex_norms.cpp +++ b/test/test_complex_norms.cpp @@ -42,6 +42,21 @@ BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2 ) { BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL); } +BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2_square ) { + typedef std::complex dComplex; + vector v(4); + for (unsigned int i = 0; i < v.size(); ++i) + v[i] = dComplex(i, i + 1); + + const double expected = 44; + + BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL); + v *= 3.; + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL); +} + + BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_inf ) { typedef std::complex dComplex; vector v(4); @@ -74,6 +89,22 @@ BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2 ) { BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL); } +BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2_square ) { + typedef std::complex dComplex; + vector v(4); + for (unsigned short i = 0; i < v.size(); ++i) { + unsigned short imag(i + 1); + v[i] = dComplex(i, imag); + } + + const double expected = 44; + + BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL); + v *= 3.f; + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL); +} + int main() { BOOST_UBLAS_TEST_BEGIN(); @@ -81,6 +112,8 @@ int main() { BOOST_UBLAS_TEST_DO( test_float_complex_norm_inf ); BOOST_UBLAS_TEST_DO( test_double_complex_norm_2 ); BOOST_UBLAS_TEST_DO( test_float_complex_norm_2 ); + BOOST_UBLAS_TEST_DO( test_double_complex_norm_2_square ); + BOOST_UBLAS_TEST_DO( test_float_complex_norm_2_square ); BOOST_UBLAS_TEST_END(); } diff --git a/test/test_scaled_norm.cpp b/test/test_scaled_norm.cpp new file mode 100644 index 000000000..74065048f --- /dev/null +++ b/test/test_scaled_norm.cpp @@ -0,0 +1,41 @@ +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +#include "utils.hpp" + +using namespace boost::numeric::ublas; + +static const double TOL(1.0e-5); ///< Used for comparing two real numbers. + +BOOST_UBLAS_TEST_DEF ( test_double_scaled_norm_2 ) { + vector v(2); + v[0] = 0; v[1] = 1.0e155; + + const double expected = 1.0e155; + + BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL); +} + +BOOST_UBLAS_TEST_DEF ( test_float_scaled_norm_2 ) { + vector v(2); + v[0] = 0; v[1] = 1.0e20; + + const float expected = 1.0e20; + + BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) ); + BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL); +} + +int main() { + BOOST_UBLAS_TEST_BEGIN(); + + BOOST_UBLAS_TEST_DO( test_double_scaled_norm_2 ); + BOOST_UBLAS_TEST_DO( test_float_scaled_norm_2 ); + + BOOST_UBLAS_TEST_END(); +}