From 5a8731e15241829ee58eeb2815f79a2cb3aad1f5 Mon Sep 17 00:00:00 2001
From: Shangtong Zhang <zhangshangtong.cpp@icloud.com>
Date: Mon, 14 Mar 2016 00:54:41 +0800
Subject: [PATCH 01/11] Fix bug in weak equality check

---
 include/boost/numeric/ublas/detail/matrix_assign.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/boost/numeric/ublas/detail/matrix_assign.hpp b/include/boost/numeric/ublas/detail/matrix_assign.hpp
index be172dd63..d42d495fc 100644
--- a/include/boost/numeric/ublas/detail/matrix_assign.hpp
+++ b/include/boost/numeric/ublas/detail/matrix_assign.hpp
@@ -30,7 +30,7 @@ namespace detail {
     template<class E1, class E2, class S>
     BOOST_UBLAS_INLINE
     bool equals (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, S epsilon, S min_norm) {
-        return norm_inf (e1 - e2) < epsilon *
+        return norm_inf (e1 - e2) <= epsilon *
                std::max<S> (std::max<S> (norm_inf (e1), norm_inf (e2)), min_norm);
     }
 

From 986547a60d34a9fae17d875109dc4fb75b520e6e Mon Sep 17 00:00:00 2001
From: Ronald Wampler <rdwampler@gmail.com>
Date: Mon, 28 Mar 2016 13:01:17 -0400
Subject: [PATCH 02/11] Add norm_2_square

---
 include/boost/numeric/ublas/functional.hpp    | 46 +++++++++++++++++++
 .../boost/numeric/ublas/vector_expression.hpp | 10 ++++
 test/test_complex_norms.cpp                   | 33 +++++++++++++
 3 files changed, 89 insertions(+)

diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp
index b0be20608..3613f7ad4 100644
--- a/include/boost/numeric/ublas/functional.hpp
+++ b/include/boost/numeric/ublas/functional.hpp
@@ -522,6 +522,52 @@ namespace boost { namespace numeric { namespace ublas {
 #endif
         }
     };
+
+    template<class V>
+    struct vector_norm_2_square :
+        public vector_scalar_real_unary_functor<V> {
+        typedef typename vector_scalar_real_unary_functor<V>::value_type value_type;
+        typedef typename vector_scalar_real_unary_functor<V>::real_type real_type;
+        typedef typename vector_scalar_real_unary_functor<V>::result_type result_type;
+
+        template<class E>
+        static BOOST_UBLAS_INLINE
+        result_type apply (const vector_expression<E> &e) {
+            real_type t = real_type ();
+            typedef typename E::size_type vector_size_type;
+            vector_size_type size (e ().size ());
+            for (vector_size_type i = 0; i < size; ++ i) {
+                real_type u (type_traits<value_type>::norm_2 (e () (i)));
+                t +=  u * u;
+            }
+            return t;
+        }
+        // Dense case
+        template<class D, class I>
+        static BOOST_UBLAS_INLINE
+        result_type apply (D size, I it) {
+            real_type t = real_type ();
+            while (-- size >= 0) {
+                real_type u (type_traits<value_type>::norm_2 (*it));
+                t +=  u * u;
+                ++ it;
+            }
+            return t;
+        }
+        // Sparse case
+        template<class I>
+        static BOOST_UBLAS_INLINE
+        result_type apply (I it, const I &it_end) {
+            real_type t = real_type ();
+            while (it != it_end) {
+                real_type u (type_traits<value_type>::norm_2 (*it));
+                t +=  u * u;
+                ++ it;
+            }
+            return t;
+        }
+    };
+
     template<class V>
     struct vector_norm_inf:
         public vector_scalar_real_unary_functor<V> {
diff --git a/include/boost/numeric/ublas/vector_expression.hpp b/include/boost/numeric/ublas/vector_expression.hpp
index d92207c98..6f952a464 100644
--- a/include/boost/numeric/ublas/vector_expression.hpp
+++ b/include/boost/numeric/ublas/vector_expression.hpp
@@ -1609,6 +1609,16 @@ namespace boost { namespace numeric { namespace ublas {
         return expression_type (e ());
     }
 
+    // real: norm_2_square v = sum(v [i] * v [i])
+    // complex: norm_2_square v = sum(v [i] * conj (v [i]))
+    template<class E>
+    BOOST_UBLAS_INLINE
+    typename vector_scalar_unary_traits<E, vector_norm_2_square<E> >::result_type
+    norm_2_square (const vector_expression<E> &e) {
+        typedef typename vector_scalar_unary_traits<E, vector_norm_2_square<E> >::expression_type expression_type;
+        return expression_type (e ());
+    }
+
     // real: norm_inf v = maximum (abs (v [i]))
     // complex: norm_inf v = maximum (maximum (abs (real (v [i])), abs (imag (v [i]))))
     template<class E>
diff --git a/test/test_complex_norms.cpp b/test/test_complex_norms.cpp
index d58a788ee..0c37ed45b 100644
--- a/test/test_complex_norms.cpp
+++ b/test/test_complex_norms.cpp
@@ -42,6 +42,21 @@ BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2 ) {
     BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL);
 }
 
+BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2_square ) {
+    typedef std::complex<double> dComplex;
+    vector<dComplex> v(4);
+    for (unsigned int i = 0; i < v.size(); ++i)
+        v[i] = dComplex(i, i + 1);
+
+    const double expected = 44;
+
+    BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL);
+    v *= 3.;
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL);
+}
+
+
 BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_inf ) {
     typedef std::complex<float> dComplex;
     vector<dComplex> v(4);
@@ -74,6 +89,22 @@ BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2 ) {
     BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL);
 }
 
+BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2_square ) {
+    typedef std::complex<float> dComplex;
+    vector<dComplex> v(4);
+    for (unsigned short i = 0; i < v.size(); ++i) {
+        unsigned short imag(i + 1);
+        v[i] = dComplex(i, imag);
+    }
+
+    const double expected = 44;
+
+    BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL);
+    v *= 3.f;
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL);
+}
+
 int main() {
     BOOST_UBLAS_TEST_BEGIN();
 
@@ -81,6 +112,8 @@ int main() {
     BOOST_UBLAS_TEST_DO( test_float_complex_norm_inf );
     BOOST_UBLAS_TEST_DO( test_double_complex_norm_2 );
     BOOST_UBLAS_TEST_DO( test_float_complex_norm_2 );
+    BOOST_UBLAS_TEST_DO( test_double_complex_norm_2_square );
+    BOOST_UBLAS_TEST_DO( test_float_complex_norm_2_square );
 
     BOOST_UBLAS_TEST_END();
 }

From e88fc29650960ad0d1b9f7dbc5ce7fb2b7053a60 Mon Sep 17 00:00:00 2001
From: Ronald Wampler <rdwampler@gmail.com>
Date: Mon, 28 Mar 2016 13:34:15 -0400
Subject: [PATCH 03/11] Add documentation for norm_2_square

---
 doc/operations_overview.html | 3 ++-
 doc/vector_expression.html   | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/operations_overview.html b/doc/operations_overview.html
index 42d994212..212fcbebf 100644
--- a/doc/operations_overview.html
+++ b/doc/operations_overview.html
@@ -87,7 +87,8 @@ <h3>norms</h3>
 
 <pre><code>
 t = norm_inf(v); i = index_norm_inf(v);
-t = norm_1(v);   t = norm_2(v); 
+t = norm_1(v);   t = norm_2(v);
+t = norm_2_square(v);
 t = norm_inf(A); i = index_norm_inf(A);
 t = norm_1(A);   t = norm_frobenius(A); 
 </code></pre>
diff --git a/doc/vector_expression.html b/doc/vector_expression.html
index 7838007ff..2372ec3df 100644
--- a/doc/vector_expression.html
+++ b/doc/vector_expression.html
@@ -847,6 +847,11 @@ <h4>Prototypes</h4>
     typename vector_scalar_unary_traits&lt;E, vector_norm_2&lt;typename E::value_type&gt; &gt;::result_type
     norm_2 (const vector_expression&lt;E&gt; &amp;e);
 
+    // norm_2_square v = sum (v [i] * v [i])
+    template&lt;class E&gt;
+    typename vector_scalar_unary_traits&lt;E, vector_norm_2_square&lt;typename E::value_type&gt; &gt;::result_type
+    norm_2_square (const vector_expression&lt;E&gt; &amp;e);
+
     // norm_inf v = max (abs (v [i]))
     template&lt;class E&gt;
     typename vector_scalar_unary_traits&lt;E, vector_norm_inf&lt;typename E::value_type&gt; &gt;::result_type

From ad76936f49fe3ad4c981afb5c4aab76df20615de Mon Sep 17 00:00:00 2001
From: Ronald Wampler <rdwampler@gmail.com>
Date: Thu, 7 Apr 2016 15:10:27 -0400
Subject: [PATCH 04/11] Fix compilation error when BOOST_UBLAS_SCALED_NORM is
 defined

Commit e6b113 changed the `size_type` typedef to be based on the
container and in doing so missed the case in vector_norm_2 when
BOOST_UBLAS_SCALED_NORM is define. Also added test cases.
---
 include/boost/numeric/ublas/functional.hpp |  7 ++--
 test/Jamfile.v2                            |  4 +++
 test/test_scaled_norm.cpp                  | 41 ++++++++++++++++++++++
 3 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 test/test_scaled_norm.cpp

diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp
index 3613f7ad4..f54b441fe 100644
--- a/include/boost/numeric/ublas/functional.hpp
+++ b/include/boost/numeric/ublas/functional.hpp
@@ -433,10 +433,10 @@ namespace boost { namespace numeric { namespace ublas {
         template<class E>
         static BOOST_UBLAS_INLINE
         result_type apply (const vector_expression<E> &e) {
-#ifndef BOOST_UBLAS_SCALED_NORM
-            real_type t = real_type ();
             typedef typename E::size_type vector_size_type;
             vector_size_type size (e ().size ());
+#ifndef BOOST_UBLAS_SCALED_NORM
+            real_type t = real_type ();
             for (vector_size_type i = 0; i < size; ++ i) {
                 real_type u (type_traits<value_type>::norm_2 (e () (i)));
                 t +=  u * u;
@@ -445,8 +445,7 @@ namespace boost { namespace numeric { namespace ublas {
 #else
             real_type scale = real_type ();
             real_type sum_squares (1);
-            size_type size (e ().size ());
-            for (size_type i = 0; i < size; ++ i) {
+            for (vector_size_type i = 0; i < size; ++ i) {
                 real_type u (type_traits<value_type>::norm_2 (e () (i)));
                 if ( real_type () /* zero */ == u ) continue;
                 if (scale < u) {
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 77e296c91..f990b76b9 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -189,6 +189,10 @@ test-suite numeric/uBLAS
       ]
       [ run test_complex_norms.cpp
       ]
+      [ run test_scaled_norm.cpp
+       : : :
+           <define>BOOST_UBLAS_SCALED_NORM
+      ]
       [ run test_assignment.cpp
        : : :
            <define>BOOST_UBLAS_COO_ALWAYS_DO_FULL_SORT
diff --git a/test/test_scaled_norm.cpp b/test/test_scaled_norm.cpp
new file mode 100644
index 000000000..74065048f
--- /dev/null
+++ b/test/test_scaled_norm.cpp
@@ -0,0 +1,41 @@
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+#include "utils.hpp"
+
+using namespace boost::numeric::ublas;
+
+static const double TOL(1.0e-5); ///< Used for comparing two real numbers.
+
+BOOST_UBLAS_TEST_DEF ( test_double_scaled_norm_2 ) {
+    vector<double> v(2);
+    v[0] = 0; v[1] = 1.0e155;
+
+    const double expected = 1.0e155;
+
+    BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL);
+}
+
+BOOST_UBLAS_TEST_DEF ( test_float_scaled_norm_2 ) {
+    vector<float> v(2);
+    v[0] = 0; v[1] = 1.0e20;
+
+    const float expected = 1.0e20;
+
+    BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL);
+}
+
+int main() {
+    BOOST_UBLAS_TEST_BEGIN();
+
+    BOOST_UBLAS_TEST_DO( test_double_scaled_norm_2 );
+    BOOST_UBLAS_TEST_DO( test_float_scaled_norm_2 );
+
+    BOOST_UBLAS_TEST_END();
+}

From 5dc64f9b240f55f8377260852465b50659a04365 Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Thu, 11 Feb 2016 17:43:07 +0000
Subject: [PATCH 05/11] ublas: improved dense matrix multiplication performance

This patch includes the gemm implementation from Michael Lehn to
boost::ublas.

This modifies the workings of ublas::prod() and ublas::axppy_prod()
to use gemm() above a certain matrix size.

This patch only contains the basic architecture, and a generic c++
implementation.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
Cc: Michael Lehn <michael.lehn@uni-ulm.de>
---
 include/boost/numeric/ublas/detail/gemm.hpp   | 345 ++++++++++++++++++
 .../boost/numeric/ublas/matrix_expression.hpp | 141 ++++++-
 include/boost/numeric/ublas/operation.hpp     | 155 +++++++-
 3 files changed, 622 insertions(+), 19 deletions(-)
 create mode 100644 include/boost/numeric/ublas/detail/gemm.hpp

diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
new file mode 100644
index 000000000..f2a324a0d
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -0,0 +1,345 @@
+//
+//  Copyright (c) 2016
+//  Michael Lehn, Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_GEMM_
+#define _BOOST_UBLAS_GEMM_
+
+#include <boost/type_traits/common_type.hpp>
+#include <boost/type_traits/aligned_storage.hpp>
+#include <boost/align/aligned_allocator.hpp>
+#include <boost/align/assume_aligned.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+    template <typename T>
+    struct prod_block_size {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4092;
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 12; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 26; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<float> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4080;
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 24; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 26; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<long double> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 2; // stripe width for lhs
+        static const unsigned nr = 2; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 26; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template<typename T>
+    struct is_blocksize {
+        struct fallback { static const int nr = 0; };
+        struct derived : T, fallback {};
+        template<int C1>
+       struct nonr {
+           static const bool value = false;
+           typedef false_type type;
+       };
+
+         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
+         template<typename C> static char (&f(...))[2];
+
+         static bool const value = sizeof(f<derived>(0)) == 2;
+    };
+
+    template <typename E>
+    void
+    gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
+    {
+        typedef typename E::size_type  size_type;
+
+       for (size_type i=0; i<X().size1(); ++i) {
+            for (size_type j=0; j<X().size2(); ++j) {
+               X()(i,j) *= alpha;
+            }
+        }
+    }
+
+    template <typename Index, typename T>
+    void
+    geaxpy(Index m, Index n, const T &alpha,
+           const T *X, Index incRowX, Index incColX,
+           T       *Y, Index incRowY, Index incColY)
+    {
+        for (Index j=0; j<n; ++j) {
+            for (Index i=0; i<m; ++i) {
+                Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
+            }
+        }
+    }
+
+    template <typename Index, typename TX>
+    void
+    gescal(Index m, Index n,
+           const TX &alpha,
+           TX *X, Index incRowX, Index incColX)
+    {
+      if (alpha != TX(0)) {
+            for (Index j=0; j<n; ++j) {
+                for (Index i=0; i<m; ++i) {
+                    X[i*incRowX+j*incColX] *= alpha;
+                }
+            }
+        } else {
+            for (Index j=0; j<n; ++j) {
+                for (Index i=0; i<m; ++i) {
+                    X[i*incRowX+j*incColX] = 0;
+                }
+            }
+        }
+    }
+
+    //-- Micro Kernel --------------------------------------------------------------
+    template <typename Index, typename T, typename TC, typename BlockSize>
+    void
+    ugemm(Index kc, TC alpha, const T *A, const T *B,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+       typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
+       T *P = reinterpret_cast<T*>(Pa.address());
+       for (unsigned c = 0; c < MR * NR; c++)
+         P[c] = 0;
+
+        for (Index l=0; l<kc; ++l) {
+           for (Index i=0; i<MR; ++i) {
+              for (Index j=0; j<NR; ++j) {
+                    P[i* NR+j] += A[i]*B[j];
+                }
+            }
+           A += MR;
+           B += NR;
+        }
+
+       if (alpha!=TC(1)) {
+           for (Index i=0; i<MR; ++i) {
+               for (Index j=0; j<NR; ++j) {
+                   P[i*NR+j] *= alpha;
+               }
+           }
+       }
+
+       if (beta == TC(0)) {
+           for (Index i=0; i<MR; ++i) {
+               for (Index j=0; j<NR; ++j) {
+                C[i*incRowC+j*incColC] = P[i*NR+j];
+               }
+           }
+       } else {
+           for (Index i=0; i<MR; ++i) {
+               for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += P[i*NR+j];
+               }
+           }
+       }
+    }
+
+    //-- Macro Kernel --------------------------------------------------------------
+       template <typename Index, typename T, typename TC, typename BlockSize>
+    void
+    mgemm(Index mc, Index nc, Index kc, TC alpha,
+          const T *A, const T *B, TC beta,
+          TC *C, Index incRowC, Index incColC)
+    {
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        const Index mp  = (mc+MR-1) / MR;
+        const Index np  = (nc+NR-1) / NR;
+        const Index mr_ = mc % MR;
+        const Index nr_ = nc % NR;
+
+        // #if defined(_OPENMP)
+        // #pragma omp parallel for
+        // #endif
+        for (Index j=0; j<np; ++j) {
+      // __builtin_prefetch(B + j * kc * NR, 0);
+      // __builtin_prefetch(B + j * kc * NR + 8, 0);
+      // __builtin_prefetch(B + j * kc * NR + 16, 0);
+            const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
+            TC C_[MR*NR];
+
+            for (Index i=0; i<mp; ++i) {
+                const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+                if (mr==MR && nr==NR) {
+                 ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                          &A[i*kc*MR], &B[j*kc*NR],
+                          beta,
+                          &C[i*MR*incRowC+j*NR*incColC],
+                          incRowC, incColC);
+                } else {
+                    std::fill_n(C_, MR*NR, T(0));
+                    ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                          &A[i*kc*MR], &B[j*kc*NR],
+                          T(0),
+                          C_, NR, Index(1));
+                    gescal(mr, nr, beta,
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                    geaxpy(mr, nr, TC(1), C_, NR, Index(1),
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                }
+            }
+        }
+    }
+
+    //-- Packing blocks ------------------------------------------------------------
+       template <typename E, typename T, typename BlockSize>
+    void
+    pack_A(const matrix_expression<E> &A, T *p)
+    {
+        typedef typename E::size_type  size_type;
+
+        const size_type mc = A ().size1();
+        const size_type kc = A ().size2();
+        static const size_type MR = BlockSize::mr;
+        const size_type mp = (mc+MR-1) / MR;
+
+        for (size_type j=0; j<kc; ++j) {
+            for (size_type l=0; l<mp; ++l) {
+                for (size_type i0=0; i0<MR; ++i0) {
+                    size_type i  = l*MR + i0;
+                    size_type nu = l*MR*kc + j*MR + i0;
+                    p[nu]        = (i<mc) ? A()(i,j) : T(0);
+                }
+            }
+        }
+    }
+
+       template <typename E, typename T, typename BlockSize>
+    void
+    pack_B(const matrix_expression<E> &B, T *p)
+    {
+        typedef typename E::size_type  size_type;
+
+        const size_type kc = B ().size1();
+        const size_type nc = B ().size2();
+        static const size_type NR = BlockSize::nr;
+        const size_type np = (nc+NR-1) / NR;
+
+        for (size_type l=0; l<np; ++l) {
+            for (size_type j0=0; j0<NR; ++j0) {
+                for (size_type i=0; i<kc; ++i) {
+                    size_type j  = l*NR+j0;
+                    size_type nu = l*NR*kc + i*NR + j0;
+                    p[nu]        = (j<nc) ? B()(i,j) : T(0);
+                }
+            }
+        }
+    }
+
+    //-- Frame routine -------------------------------------------------------------
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    void
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+        const matrix_expression<E2> &e2,
+         typename E3::value_type beta, matrix_expression<E3> &e3,
+        BlockSize)
+    {
+        typedef typename E3::size_type  size_type;
+        typedef typename E1::value_type value_type1;
+        typedef typename E2::value_type value_type2;
+        typedef typename E3::value_type value_type3;
+        typedef typename common_type<value_type1,
+                                     value_type2,
+                                     value_type3>::type value_type_i;
+        typedef unbounded_array<value_type_i,
+                                typename alignment::aligned_allocator<value_type_i,
+                                                                      BlockSize::align> > array_type_i;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
+        const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+        const size_type mb = (m+MC-1) / MC;
+        const size_type nb = (n+NC-1) / NC;
+        const size_type kb = (k+KC-1) / KC;
+        const size_type mc_ = m % MC;
+        const size_type nc_ = n % NC;
+        const size_type kc_ = k % KC;
+
+        value_type3 *C_ = &e3()(0,0);
+        const size_type incRowC = &e3()(1,0) - &e3()(0,0);
+        const size_type incColC = &e3()(0,1) - &e3()(0,0);
+
+        if (alpha==value_type3(0) || k==0) {
+            gescal(beta, e3);
+            return;
+        }
+
+        array_type_i A(MC * KC);
+        array_type_i B(NC * KC);
+
+        for (size_type j=0; j<nb; ++j) {
+            size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;
+
+            for (size_type l=0; l<kb; ++l) {
+                size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
+                value_type3 beta_ = (l==0) ? beta : value_type3(1);
+
+                const matrix_range<const E2> Bs =
+                    subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc);
+                pack_B<matrix_range<const E2>, value_type_i, BlockSize>
+                    (Bs, &B[0]);
+
+                for (size_type i=0; i<mb; ++i) {
+                    size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;
+
+                    const matrix_range<const E1> As =
+                        subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
+                    pack_A<matrix_range<const E1>, value_type_i, BlockSize>
+                        (As, &A[0]);
+
+                    mgemm<size_type, value_type_i, value_type3, BlockSize>
+                        (mc, nc, kc, alpha, &A[0], &B[0], beta_,
+                         &C_[i*MC*incRowC+j*NC*incColC],
+                         incRowC, incColC);
+                }
+            }
+        }
+    }
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 8e6c2f68b..e58c757da 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -14,6 +14,7 @@
 #define _BOOST_UBLAS_MATRIX_EXPRESSION_
 
 #include <boost/numeric/ublas/vector_expression.hpp>
+#include <boost/numeric/ublas/detail/gemm.hpp>
 
 // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
 // Iterators based on ideas of Jeremy Siek
@@ -5460,45 +5461,104 @@ namespace boost { namespace numeric { namespace ublas {
         expression2_closure_type e2_;
     };
 
+    namespace detail {
+      template<class E1, class E2, class P, bool s>
+      struct binary_calculate_result_type;
+
+      template<class E1, class E2, class P>
+      struct binary_calculate_result_type<E1, E2, P, false> {
+       typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type;
+      };
+
+      template<class E1, class E2, class P>
+      struct binary_calculate_result_type<E1, E2, P, true> {
+       typedef matrix<P> result_type;
+      };
+    }
+
     template<class T1, class E1, class T2, class E2>
     struct matrix_matrix_binary_traits {
-        typedef unknown_storage_tag storage_category;
+      //        typedef unknown_storage_tag storage_category;
+      typedef typename storage_restrict_traits<typename E1::storage_category, typename E2::storage_category>::storage_category storage_category;
         typedef unknown_orientation_tag orientation_category;
         typedef typename promote_traits<T1, T2>::promote_type promote_type;
         typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, promote_type> > expression_type;
 #ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG
-        typedef expression_type result_type;
+      //        typedef expression_type result_type;
+      typedef typename detail::binary_calculate_result_type<E1, E2, promote_type, boost::is_base_of<dense_proxy_tag, storage_category>::value>::result_type result_type;
 #else
         typedef typename E1::matrix_temporary_type result_type;
 #endif
     };
 
-    template<class E1, class E2>
+    template<class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                          typename E2::value_type, E2>::result_type
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2,
+         B,
           unknown_storage_tag,
           unknown_orientation_tag) {
+
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::expression_type expression_type;
         return expression_type (e1 (), e2 ());
     }
 
-    // Dispatcher
-    template<class E1, class E2>
+    template<class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                          typename E2::value_type, E2>::result_type
     prod (const matrix_expression<E1> &e1,
-          const matrix_expression<E2> &e2) {
+          const matrix_expression<E2> &e2,
+         B b,
+          dense_proxy_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::result_type result_type;
+       typedef typename result_type::value_type result_value;
+
+       if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+           return expression_type (e1 (), e2 ());
+       } else {
+            result_type rv(e1 ().size1(), e2 ().size2());
+           detail::gemm(result_value(1), e1, e2,
+                        result_value(0), rv, b);
+           return rv;
+       }
+    }
+
+    // Dispatcher
+    template<class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    typename enable_if_c<detail::is_blocksize<B>::value,
+                        typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                             typename E2::value_type, E2>::result_type>::type
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+         B b) {
         BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::storage_category storage_category;
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::orientation_category orientation_category;
-        return prod (e1, e2, storage_category (), orientation_category ());
+        return prod (e1, e2, b, storage_category (), orientation_category ());
+    }
+
+    template<class E1, class E2>
+    BOOST_UBLAS_INLINE
+    typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                         typename E2::value_type, E2>::result_type
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2) {
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+       typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                     typename E2::value_type>::type> block_sizes;
+
+        return prod (e1, e2, block_sizes());
     }
 
     template<class E1, class E2>
@@ -5529,13 +5589,76 @@ namespace boost { namespace numeric { namespace ublas {
         return prec_prod (e1, e2, storage_category (), orientation_category ());
     }
 
-    template<class M, class E1, class E2>
+    template<class M, class E1, class E2, typename B>
+    BOOST_UBLAS_INLINE
+    void
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b,
+         unknown_storage_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+
+        m.assign(expression_type (e1 (), e2 ()));
+    }
+
+    template<class M, class E1, class E2, typename B>
+    BOOST_UBLAS_INLINE
+    void
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b,
+         dense_proxy_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::result_type result_type;
+       typedef typename result_type::value_type result_value;
+
+       if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+            m.assign(expression_type (e1 (), e2 ()));
+       } else {
+           detail::gemm(result_value(1), e1, e2,
+                        result_value(0), m, b);
+       }
+    }
+
+    // dispatcher
+      template<class M, class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     M &
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b) {
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::storage_category storage_category;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::orientation_category orientation_category;
+
+        prod (e1, e2, m, b, storage_category(), orientation_category());
+       return m;
+    }
+    template<class M, class E1, class E2>
+    BOOST_UBLAS_INLINE
+    typename enable_if_c<!detail::is_blocksize<M>::value,
+                        M&>::type
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2,
           M &m) {
-        return m.assign (prod (e1, e2));
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::storage_category storage_category;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::orientation_category orientation_category;
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename M::value_type>::type> block_sizes;
+        prod (e1, e2, m, block_sizes(), storage_category(),
+              orientation_category());
+        return m;
     }
 
     template<class M, class E1, class E2>
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 64657ccf3..2a410a082 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -14,7 +14,7 @@
 #define _BOOST_UBLAS_OPERATION_
 
 #include <boost/numeric/ublas/matrix_proxy.hpp>
-
+#include <boost/numeric/ublas/detail/gemm.hpp>
 /** \file operation.hpp
  *  \brief This file contains some specialized products.
  */
@@ -637,7 +637,63 @@ namespace boost { namespace numeric { namespace ublas {
         return m;
     }
 
-    // Dispatcher
+    template<class M, class E1, class E2, class S, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              S, bool init, B)
+    {
+        typedef typename M::value_type value_type;
+       typedef S storage_category;
+       typedef typename M::orientation_category orientation_category;
+
+
+       if (init)
+         m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+       return axpy_prod (e1, e2, m, full (), storage_category (),
+                         orientation_category ());
+    }
+
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              dense_proxy_tag, bool init, B)
+    {
+        typedef typename M::value_type value_type;
+       typedef B block_sizes;
+        if (m.size1() < B::limit || m.size2() < B::limit) {
+            typedef typename M::storage_category storage_category;
+            typedef typename M::orientation_category orientation_category;
+
+            if (init)
+                m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+            return axpy_prod (e1, e2, m, full (), storage_category (),
+                             orientation_category ());
+        } else {
+            detail::gemm(value_type(1), e1, e2,
+                        value_type(init? 0 : 1), m, block_sizes());
+            return m;
+        }
+    }
+
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              dense_tag, bool init, B b)
+    {
+       return axpy_prod (e1, e2, m, full (), dense_proxy_tag(),
+                         init, b);
+    }
+
+    // Dispatchers
     template<class M, class E1, class E2, class TRI>
     BOOST_UBLAS_INLINE
     M &
@@ -651,11 +707,16 @@ namespace boost { namespace numeric { namespace ublas {
 
         if (init)
             m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
-        return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ());
+
+        return axpy_prod(e1, e2, m, triangular_restriction (),
+                        storage_category (), orientation_category(),
+                        init);
     }
+
     template<class M, class E1, class E2, class TRI>
     BOOST_UBLAS_INLINE
-    M
+    typename enable_if_c<!detail::is_blocksize<TRI>::value,
+                               M>::type
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2,
                TRI) {
@@ -690,29 +751,61 @@ namespace boost { namespace numeric { namespace ublas {
           \param E1 type of a matrix expression \c A
           \param E2 type of a matrix expression \c X
   */
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M &
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, bool init, B) {
+        typedef typename M::storage_category storage_category;
+       typedef B block_sizes;
+
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                        init, block_sizes());
+    }
+
     template<class M, class E1, class E2>
     BOOST_UBLAS_INLINE
     M &
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2,
                M &m, bool init = true) {
-        typedef typename M::value_type value_type;
         typedef typename M::storage_category storage_category;
-        typedef typename M::orientation_category orientation_category;
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename M::value_type>::type> block_sizes;
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                        init, block_sizes());
+    }
 
-        if (init)
-            m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
-        return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ());
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    typename boost::enable_if_c<detail::is_blocksize<B>::value,
+                               M>::type
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2, B) {
+        typedef M matrix_type;
+        typedef typename M::storage_category storage_category;
+       typedef B block_sizes;
+
+        matrix_type m (e1 ().size1 (), e2 ().size2 ());
+        return axpy_prod(e1, e2, m, full (), storage_category(), true,
+                        block_sizes());
     }
+
     template<class M, class E1, class E2>
     BOOST_UBLAS_INLINE
     M
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2) {
         typedef M matrix_type;
+        typedef typename M::storage_category storage_category;
+       typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                     typename E2::value_type>::type> block_sizes;
 
         matrix_type m (e1 ().size1 (), e2 ().size2 ());
-        return axpy_prod (e1, e2, m, full (), true);
+        return axpy_prod(e1, e2, m, full (), storage_category(), true,
+                        block_sizes());
     }
 
 
@@ -825,6 +918,48 @@ namespace boost { namespace numeric { namespace ublas {
         return opb_prod (e1, e2, m, true);
     }
 
+  /** \brief computes <tt>C = alpha * A * B + beta * C</tt> in an
+          optimized fashion.
+
+         \param alpha scalar multiplier
+          \param e1 the matrix expression \c A
+          \param e2 the matrix expression \c B
+         \param beta scalar multiplier
+          \param e3  the result matrix \c C
+
+         <tt>gemm</tt> implements the well known gemm operation
+
+          \ingroup blas3
+
+          \internal
+
+          template parameters:
+          \param E1 type of a matrix expression \c A
+          \param E2 type of a matrix expression \c B
+          \param E3 type of a matrix expression \c C
+  */
+    template <typename E1, typename E2, typename E3>
+    BOOST_UBLAS_INLINE
+    void
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+        typename E3::value_type beta, matrix_expression<E3> &e3) {
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename E3::value_type>::type> block_sizes;
+       detail::gemm(alpha, e1, e2, beta, e3, block_sizes());
+    }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    BOOST_UBLAS_INLINE
+    void
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+        typename E3::value_type beta, matrix_expression<E3> &e3,
+        BlockSize b) {
+       detail::gemm(alpha, e1, e2, beta, e3, b);
+    }
+
 }}}
 
 #endif

From b20cfb0fac2bb3431ebf24a9eff2cd1aeb0fc6ca Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sat, 27 Feb 2016 15:23:16 +0000
Subject: [PATCH 06/11] boost::ublas: gcc support for optimal matrix
 multiplication

This patch contains an optimised, vectorised kernel, using gcc's
SIMD vectors.  This reaches matrix multiplication speeds
comparable to hand-crafted assembly kernels on x86 Haswell
microarchitecture.

The kernels are compile-time parametrisable, ans the patch also
contains optimised kernel parameters for float, double, and long
double.

The validity of the parameters are checked during compile-time,
but the produced compile error can be relatively obscure if
vector_length is too big.

For architectures that doesn't support vectorisation for all types, it is
possible to specify parameters that cause really suboptimal code generation.
But as far as I see code correctness is ensured.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
Cc: Michael Lehn <michael.lehn@uni-ulm.de>
---
 .../numeric/ublas/detail/block_sizes.hpp      |  91 ++++++++
 include/boost/numeric/ublas/detail/gemm.hpp   | 212 ++++++++++--------
 include/boost/numeric/ublas/detail/vector.hpp |  32 +++
 .../boost/numeric/ublas/matrix_expression.hpp |   1 +
 include/boost/numeric/ublas/operation.hpp     |   1 +
 5 files changed, 238 insertions(+), 99 deletions(-)
 create mode 100644 include/boost/numeric/ublas/detail/block_sizes.hpp
 create mode 100644 include/boost/numeric/ublas/detail/vector.hpp

diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp
new file mode 100644
index 000000000..9d861de1f
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/block_sizes.hpp
@@ -0,0 +1,91 @@
+//
+//  Copyright (c) 2016
+//  Michael Lehn, Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_BLOCK_SIZES_
+#define _BOOST_UBLAS_BLOCK_SIZES_
+
+#include <boost/numeric/ublas/detail/vector.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+    template <typename T>
+    struct prod_block_size {
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length);
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 3 * vector_length; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 14; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<float> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 16; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 14; // Use gemm from this size
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<long double> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 1; // stripe width for lhs
+        static const unsigned nr = 4; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 42; // Use gemm from this size
+        static const unsigned vector_length = 1; // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template<typename T>
+    struct is_blocksize {
+        struct fallback { static const int nr = 0; };
+        struct derived : T, fallback {};
+        template<int C1>
+        struct nonr {
+            static const bool value = false;
+            typedef false_type type;
+        };
+
+         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
+         template<typename C> static char (&f(...))[2];
+
+         static bool const value = sizeof(f<derived>(0)) == 2;
+    };
+
+    template<typename T>
+    struct check_blocksize {
+        BOOST_STATIC_ASSERT_MSG(T::mc>0 && T::kc>0 && T::nc>0 && T::mr>0 && T::nr>0,
+                                "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(T::mc % T::mr == 0,
+                                "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(T::nc % T::nr == 0,
+                                "NC must be a multiple of NR.");
+        BOOST_STATIC_ASSERT_MSG(T::vector_length <= 1 || T::nr % T::vector_length == 0,
+                                "NR must be a multiple of vector size");
+        BOOST_STATIC_ASSERT_MSG(T::limit >= 2,
+                                "Minimum matrix size for gemm is 2*2");
+    };
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index f2a324a0d..daf38a119 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -11,80 +11,26 @@
 
 #include <boost/type_traits/common_type.hpp>
 #include <boost/type_traits/aligned_storage.hpp>
-#include <boost/align/aligned_allocator.hpp>
+#include <boost/type_traits/is_arithmetic.hpp>
+#include <boost/align/aligned_alloc.hpp>
 #include <boost/align/assume_aligned.hpp>
 #include <boost/static_assert.hpp>
 #include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/detail/vector.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
+#include <boost/predef/compiler.h>
 
 namespace boost { namespace numeric { namespace ublas { namespace detail {
 
-    template <typename T>
-    struct prod_block_size {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4092;
-        static const unsigned mr = 4; // stripe width for lhs
-        static const unsigned nr = 12; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template <>
-    struct prod_block_size<float> {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4080;
-        static const unsigned mr = 4; // stripe width for lhs
-        static const unsigned nr = 24; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template <>
-    struct prod_block_size<long double> {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4096;
-        static const unsigned mr = 2; // stripe width for lhs
-        static const unsigned nr = 2; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template<typename T>
-    struct is_blocksize {
-        struct fallback { static const int nr = 0; };
-        struct derived : T, fallback {};
-        template<int C1>
-       struct nonr {
-           static const bool value = false;
-           typedef false_type type;
-       };
-
-         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
-         template<typename C> static char (&f(...))[2];
-
-         static bool const value = sizeof(f<derived>(0)) == 2;
-    };
-
     template <typename E>
     void
     gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
     {
         typedef typename E::size_type  size_type;
 
-       for (size_type i=0; i<X().size1(); ++i) {
+        for (size_type i=0; i<X().size1(); ++i) {
             for (size_type j=0; j<X().size2(); ++j) {
-               X()(i,j) *= alpha;
+                X()(i,j) *= alpha;
             }
         }
     }
@@ -123,9 +69,73 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
-    //-- Micro Kernel --------------------------------------------------------------
-    template <typename Index, typename T, typename TC, typename BlockSize>
+    //-- Micro Kernel ----------------------------------------------------------
+#ifdef BOOST_UBLAS_VECTOR_KERNEL
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<is_arithmetic<T>::value
+                         && (1 < BlockSize::vector_length)
+                         && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE),
+                         void>::type
+    ugemm(Index kc, TC alpha, const T *A, const T *B,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align);
+        static const unsigned vector_length = BlockSize::vector_length;
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr/vector_length;
+
+        typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+
+        vx P[MR*NR] = {};
+
+        const vx *b = (const vx *)B;
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] += A[i]*b[j];
+                }
+            }
+            A += MR;
+            b += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
+        }
+
+        const T *p = (const T *) P;
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] = p[i*NR * vector_length +j];
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += p[i*NR * vector_length+j];
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<!is_arithmetic<T>::value
+                         || (BlockSize::vector_length <= 1),
+                         void>::type
+#else
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
     void
+#endif
     ugemm(Index kc, TC alpha, const T *A, const T *B,
           TC beta, TC *C, Index incRowC, Index incColC)
     {
@@ -133,47 +143,47 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr;
-       typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
-       T *P = reinterpret_cast<T*>(Pa.address());
-       for (unsigned c = 0; c < MR * NR; c++)
-         P[c] = 0;
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
+        T *P = reinterpret_cast<T*>(Pa.address());
+        for (unsigned c = 0; c < MR * NR; c++)
+          P[c] = 0;
 
         for (Index l=0; l<kc; ++l) {
-           for (Index i=0; i<MR; ++i) {
+            for (Index i=0; i<MR; ++i) {
               for (Index j=0; j<NR; ++j) {
                     P[i* NR+j] += A[i]*B[j];
                 }
             }
-           A += MR;
-           B += NR;
+            A += MR;
+            B += NR;
         }
 
-       if (alpha!=TC(1)) {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
-                   P[i*NR+j] *= alpha;
-               }
-           }
-       }
-
-       if (beta == TC(0)) {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
+        }
+
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
                 C[i*incRowC+j*incColC] = P[i*NR+j];
-               }
-           }
-       } else {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
                     C[i*incRowC+j*incColC] *= beta;
                     C[i*incRowC+j*incColC] += P[i*NR+j];
-               }
-           }
-       }
+                }
+            }
+        }
     }
 
-    //-- Macro Kernel --------------------------------------------------------------
-       template <typename Index, typename T, typename TC, typename BlockSize>
+    //-- Macro Kernel ----------------------------------------------------------
+        template <typename Index, typename T, typename TC, typename BlockSize>
     void
     mgemm(Index mc, Index nc, Index kc, TC alpha,
           const T *A, const T *B, TC beta,
@@ -190,9 +200,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         // #pragma omp parallel for
         // #endif
         for (Index j=0; j<np; ++j) {
-      // __builtin_prefetch(B + j * kc * NR, 0);
-      // __builtin_prefetch(B + j * kc * NR + 8, 0);
-      // __builtin_prefetch(B + j * kc * NR + 16, 0);
             const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
             TC C_[MR*NR];
 
@@ -200,7 +207,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
 
                 if (mr==MR && nr==NR) {
-                 ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                  ugemm<Index, T, TC, BlockSize>(kc, alpha,
                           &A[i*kc*MR], &B[j*kc*NR],
                           beta,
                           &C[i*MR*incRowC+j*NR*incColC],
@@ -223,11 +230,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     }
 
     //-- Packing blocks ------------------------------------------------------------
-       template <typename E, typename T, typename BlockSize>
+        template <typename E, typename T, typename BlockSize>
     void
     pack_A(const matrix_expression<E> &A, T *p)
     {
         typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
 
         const size_type mc = A ().size1();
         const size_type kc = A ().size2();
@@ -245,11 +253,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
-       template <typename E, typename T, typename BlockSize>
+        template <typename E, typename T, typename BlockSize>
     void
     pack_B(const matrix_expression<E> &B, T *p)
     {
         typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
 
         const size_type kc = B ().size1();
         const size_type nc = B ().size2();
@@ -271,10 +280,15 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     template <typename E1, typename E2, typename E3, typename BlockSize>
     void
     gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
-        const matrix_expression<E2> &e2,
+         const matrix_expression<E2> &e2,
          typename E3::value_type beta, matrix_expression<E3> &e3,
-        BlockSize)
+         BlockSize)
     {
+#if defined(BOOST_COMP_GNUC_DETECTION)
+        check_blocksize<BlockSize> check __attribute__ ((unused));
+#else
+        check_blocksize<BlockSize> check;
+#endif
         typedef typename E3::size_type  size_type;
         typedef typename E1::value_type value_type1;
         typedef typename E2::value_type value_type2;
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
new file mode 100644
index 000000000..3281ae950
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -0,0 +1,32 @@
+//
+//  Copyright (c) 2016
+//  Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_DETAIL_VECTOR_
+#define _BOOST_UBLAS_DETAIL_VECTOR_
+#include <boost/predef/hardware/simd.h>
+
+#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD)
+#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 32
+#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 16
+#else
+#define _BOOST_UBLAS_VECTOR_SIZE 8
+#endif
+#endif
+
+#ifndef _BOOST_UBLAS_VECTOR_SIZE
+#define _BOOST_UBLAS_VECTOR_SIZE 1
+#endif
+
+#if (defined(BOOST_COMP_GNUC_DETECTION)                           \
+     && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0))
+#define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE
+#endif
+
+#endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index e58c757da..6b9c8fec8 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/numeric/ublas/vector_expression.hpp>
 #include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 
 // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
 // Iterators based on ideas of Jeremy Siek
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 2a410a082..1e70326a5 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/numeric/ublas/matrix_proxy.hpp>
 #include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 /** \file operation.hpp
  *  \brief This file contains some specialized products.
  */

From cff83d1c77cc23c19380b13262b96cbce8a36e30 Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sun, 28 Feb 2016 18:08:12 +0000
Subject: [PATCH 07/11] boost::ublas increasing the range of BLAS level 3
 benchmarks

This patch increases the range of BLAS level 3 benchmarks for dense
matrices up to 1000*1000 matrices.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
---
 benchmarks/bench1/Jamfile.v2  |  2 +-
 benchmarks/bench1/bench1.cpp  | 14 ++++++++++----
 benchmarks/bench1/bench13.cpp |  8 ++++++++
 benchmarks/bench3/Jamfile.v2  |  3 ++-
 benchmarks/bench3/bench3.cpp  | 14 ++++++++++----
 benchmarks/bench3/bench33.cpp |  8 ++++++++
 6 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/benchmarks/bench1/Jamfile.v2 b/benchmarks/bench1/Jamfile.v2
index 77b11c77e..bf9c821c6 100644
--- a/benchmarks/bench1/Jamfile.v2
+++ b/benchmarks/bench1/Jamfile.v2
@@ -7,4 +7,4 @@
 
 exe bench1
     : bench1.cpp bench11.cpp bench12.cpp bench13.cpp
-    ;
+    : : <optimization>speed ;
diff --git a/benchmarks/bench1/bench1.cpp b/benchmarks/bench1/bench1.cpp
index 87478e139..300eafa4a 100644
--- a/benchmarks/bench1/bench1.cpp
+++ b/benchmarks/bench1/bench1.cpp
@@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale)
     header (type_string + ", 3");
     bench_1<scalar, 3> () (1000000 * scale);
     bench_2<scalar, 3> () (300000 * scale);
-    bench_3<scalar, 3> () (100000 * scale);
+    bench_3<scalar, 3> () (3000000 * scale);
 
     header (type_string + ", 10");
     bench_1<scalar, 10> () (300000 * scale);
     bench_2<scalar, 10> () (30000 * scale);
-    bench_3<scalar, 10> () (3000 * scale);
+    bench_3<scalar, 10> () (100000 * scale);
 
     header (type_string + ", 30");
     bench_1<scalar, 30> () (100000 * scale);
     bench_2<scalar, 30> () (3000 * scale);
-    bench_3<scalar, 30> () (100 * scale);
+    bench_3<scalar, 30> () (30000 * scale);
 
     header (type_string + ", 100");
     bench_1<scalar, 100> () (30000 * scale);
     bench_2<scalar, 100> () (300 * scale);
-    bench_3<scalar, 100> () (3 * scale);
+    bench_3<scalar, 100> () (1000 * scale);
+
+    header (type_string + ", 300");
+    bench_3<scalar, 300> () (30 * scale);
+
+    header (type_string + ", 1000");
+    bench_3<scalar, 1000> () (1 * scale);
 }
 
 int main (int argc, char *argv []) {
diff --git a/benchmarks/bench1/bench13.cpp b/benchmarks/bench1/bench13.cpp
index fadb0b673..2378d46b2 100644
--- a/benchmarks/bench1/bench13.cpp
+++ b/benchmarks/bench1/bench13.cpp
@@ -166,6 +166,8 @@ template struct bench_3<float, 3>;
 template struct bench_3<float, 10>;
 template struct bench_3<float, 30>;
 template struct bench_3<float, 100>;
+template struct bench_3<float, 300>;
+template struct bench_3<float, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -173,6 +175,8 @@ template struct bench_3<double, 3>;
 template struct bench_3<double, 10>;
 template struct bench_3<double, 30>;
 template struct bench_3<double, 100>;
+template struct bench_3<double, 300>;
+template struct bench_3<double, 1000>;
 #endif
 
 #ifdef USE_STD_COMPLEX
@@ -181,6 +185,8 @@ template struct bench_3<std::complex<float>, 3>;
 template struct bench_3<std::complex<float>, 10>;
 template struct bench_3<std::complex<float>, 30>;
 template struct bench_3<std::complex<float>, 100>;
+template struct bench_3<std::complex<float>, 300>;
+template struct bench_3<std::complex<float>, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -188,5 +194,7 @@ template struct bench_3<std::complex<double>, 3>;
 template struct bench_3<std::complex<double>, 10>;
 template struct bench_3<std::complex<double>, 30>;
 template struct bench_3<std::complex<double>, 100>;
+template struct bench_3<std::complex<double>, 300>;
+template struct bench_3<std::complex<double>, 1000>;
 #endif
 #endif
diff --git a/benchmarks/bench3/Jamfile.v2 b/benchmarks/bench3/Jamfile.v2
index 7ce9c9b85..cbaa638a7 100644
--- a/benchmarks/bench3/Jamfile.v2
+++ b/benchmarks/bench3/Jamfile.v2
@@ -7,4 +7,5 @@
 
 exe bench3
     : bench3.cpp bench31.cpp bench32.cpp bench33.cpp
-    ;
+    : : <optimization>speed ;
+
diff --git a/benchmarks/bench3/bench3.cpp b/benchmarks/bench3/bench3.cpp
index 390d226ca..72a3db585 100644
--- a/benchmarks/bench3/bench3.cpp
+++ b/benchmarks/bench3/bench3.cpp
@@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale)
     header (type_string + ", 3");
     bench_1<scalar, 3> () (1000000 * scale);
     bench_2<scalar, 3> () (300000 * scale);
-    bench_3<scalar, 3> () (100000 * scale);
+    bench_3<scalar, 3> () (3000000 * scale);
 
     header (type_string + ", 10");
     bench_1<scalar, 10> () (300000 * scale);
     bench_2<scalar, 10> () (30000 * scale);
-    bench_3<scalar, 10> () (3000 * scale);
+    bench_3<scalar, 10> () (100000 * scale);
 
     header (type_string + ", 30");
     bench_1<scalar, 30> () (100000 * scale);
     bench_2<scalar, 30> () (3000 * scale);
-    bench_3<scalar, 30> () (100 * scale);
+    bench_3<scalar, 30> () (30000 * scale);
 
     header (type_string + ", 100");
     bench_1<scalar, 100> () (30000 * scale);
     bench_2<scalar, 100> () (300 * scale);
-    bench_3<scalar, 100> () (3 * scale);
+    bench_3<scalar, 100> () (1000 * scale);
+
+    header (type_string + ", 300");
+    bench_3<scalar, 300> () (30 * scale);
+
+    header (type_string + ", 1000");
+    bench_3<scalar, 1000> () (1 * scale);
 }
 
 int main (int argc, char *argv []) {
diff --git a/benchmarks/bench3/bench33.cpp b/benchmarks/bench3/bench33.cpp
index 9b8e1070e..7eeec07ce 100644
--- a/benchmarks/bench3/bench33.cpp
+++ b/benchmarks/bench3/bench33.cpp
@@ -172,6 +172,8 @@ template struct bench_3<float, 3>;
 template struct bench_3<float, 10>;
 template struct bench_3<float, 30>;
 template struct bench_3<float, 100>;
+template struct bench_3<float, 300>;
+template struct bench_3<float, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -179,6 +181,8 @@ template struct bench_3<double, 3>;
 template struct bench_3<double, 10>;
 template struct bench_3<double, 30>;
 template struct bench_3<double, 100>;
+template struct bench_3<double, 300>;
+template struct bench_3<double, 1000>;
 #endif
 
 #ifdef USE_STD_COMPLEX
@@ -187,6 +191,8 @@ template struct bench_3<std::complex<float>, 3>;
 template struct bench_3<std::complex<float>, 10>;
 template struct bench_3<std::complex<float>, 30>;
 template struct bench_3<std::complex<float>, 100>;
+template struct bench_3<std::complex<float>, 300>;
+template struct bench_3<std::complex<float>, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -194,5 +200,7 @@ template struct bench_3<std::complex<double>, 3>;
 template struct bench_3<std::complex<double>, 10>;
 template struct bench_3<std::complex<double>, 30>;
 template struct bench_3<std::complex<double>, 100>;
+template struct bench_3<std::complex<double>, 300>;
+template struct bench_3<std::complex<double>, 1000>;
 #endif
 #endif

From 93545c0695f60be1cd9ffe9491c95f3351acc374 Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sun, 6 Mar 2016 15:16:33 +0000
Subject: [PATCH 08/11] ublas::gemm adding support for clang

This patch enables the optimised kernel for clang.
Sadly, when compiled by clang, the performance of this kernel on
Haswell is approximately half of the kernel compiled by gcc.
But it is still way faster than anything else.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
---
 include/boost/numeric/ublas/detail/gemm.hpp   | 4 ++++
 include/boost/numeric/ublas/detail/vector.hpp | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index daf38a119..2d492e681 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -86,7 +86,11 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr/vector_length;
 
+#ifdef BOOST_COMP_CLANG_DETECTION
+        typedef T vx __attribute__((ext_vector_type (vector_length)));
+#else
         typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+#endif
 
         vx P[MR*NR] = {};
 
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
index 3281ae950..686200d33 100644
--- a/include/boost/numeric/ublas/detail/vector.hpp
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -25,7 +25,9 @@
 #endif
 
 #if (defined(BOOST_COMP_GNUC_DETECTION)                           \
-     && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0))
+     && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \
+    || (defined(BOOST_COMP_CLANG_DETECTION)                       \
+        && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0))
 #define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE
 #endif
 

From 8ceb6c0ab7361064ba1e54270fdd40cce4087e0e Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sun, 13 Mar 2016 19:58:30 +0000
Subject: [PATCH 09/11] ublas::gemm making possible to disable gemm as the
 default dense matrix multiplication

If the preprocessor macro BOOST_UBLAS_LEGACY_PRODUCT is defined, then
both prod() and axpy_prod() falls back to the legacy matrix
multiplication algorithm.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
---
 .../boost/numeric/ublas/matrix_expression.hpp | 42 ++++++++++++-------
 include/boost/numeric/ublas/operation.hpp     | 17 +++++---
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 6b9c8fec8..80fd961b6 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -5463,18 +5463,18 @@ namespace boost { namespace numeric { namespace ublas {
     };
 
     namespace detail {
-      template<class E1, class E2, class P, bool s>
-      struct binary_calculate_result_type;
-
-      template<class E1, class E2, class P>
-      struct binary_calculate_result_type<E1, E2, P, false> {
-       typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type;
-      };
-
-      template<class E1, class E2, class P>
-      struct binary_calculate_result_type<E1, E2, P, true> {
-       typedef matrix<P> result_type;
-      };
+        template<class E1, class E2, class P, bool s>
+        struct binary_calculate_result_type;
+
+        template<class E1, class E2, class P>
+        struct binary_calculate_result_type<E1, E2, P, false> {
+            typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type;
+        };
+
+        template<class E1, class E2, class P>
+        struct binary_calculate_result_type<E1, E2, P, true> {
+            typedef matrix<P> result_type;
+        };
     }
 
     template<class T1, class E1, class T2, class E2>
@@ -5495,7 +5495,7 @@ namespace boost { namespace numeric { namespace ublas {
     template<class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
-                                         typename E2::value_type, E2>::result_type
+                                         typename E2::value_type, E2>::expression_type
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2,
          B,
@@ -5551,15 +5551,24 @@ namespace boost { namespace numeric { namespace ublas {
 
     template<class E1, class E2>
     BOOST_UBLAS_INLINE
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                          typename E2::value_type, E2>::result_type
+#else
+    typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                         typename E2::value_type, E2>::expression_type
+#endif
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2) {
         BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
                                                                      typename E2::value_type>::type> block_sizes;
-
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
         return prod (e1, e2, block_sizes());
+#else
+        return prod (e1, e2, block_sizes(), unknown_storage_tag(),
+                     unknown_orientation_tag());
+#endif
     }
 
     template<class E1, class E2>
@@ -5657,8 +5666,13 @@ namespace boost { namespace numeric { namespace ublas {
         typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
                                                                       typename E2::value_type,
                                                                       typename M::value_type>::type> block_sizes;
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
         prod (e1, e2, m, block_sizes(), storage_category(),
               orientation_category());
+#else
+        prod (e1, e2, m, block_sizes(), unknown_storage_tag(),
+              unknown_orientation_tag());
+#endif
         return m;
     }
 
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 1e70326a5..176669cf8 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -775,8 +775,19 @@ namespace boost { namespace numeric { namespace ublas {
         typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
                                                                       typename E2::value_type,
                                                                       typename M::value_type>::type> block_sizes;
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
         return axpy_prod(e1, e2, m, full (), storage_category (),
                         init, block_sizes());
+#else
+        typedef typename M::value_type value_type;
+        typedef typename M::orientation_category orientation_category;
+
+        if (init)
+            m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                         orientation_category());
+#endif
     }
 
     template<class M, class E1, class E2, class B>
@@ -800,13 +811,9 @@ namespace boost { namespace numeric { namespace ublas {
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2) {
         typedef M matrix_type;
-        typedef typename M::storage_category storage_category;
-       typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
-                                                                     typename E2::value_type>::type> block_sizes;
 
         matrix_type m (e1 ().size1 (), e2 ().size2 ());
-        return axpy_prod(e1, e2, m, full (), storage_category(), true,
-                        block_sizes());
+        return axpy_prod(e1, e2, m);
     }
 
 

From 752bc93fab7c8d1b640a15d0271e77f3b6afd596 Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sat, 19 Mar 2016 13:13:22 +0000
Subject: [PATCH 10/11] ublas:gemm kernels for complex multiplication

This patch contains optimised kernels for multiplying complex matrices,
together with the kernel parameters I find decently performing on Haswell.

Signed-off-by: Imre Palik <imre_palik@yahoo.co.uk>
---
 .../numeric/ublas/detail/block_sizes.hpp      |  30 ++
 include/boost/numeric/ublas/detail/gemm.hpp   | 375 ++++++++++++++++--
 .../boost/numeric/ublas/matrix_expression.hpp |  10 +-
 3 files changed, 370 insertions(+), 45 deletions(-)

diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp
index 9d861de1f..da042da58 100644
--- a/include/boost/numeric/ublas/detail/block_sizes.hpp
+++ b/include/boost/numeric/ublas/detail/block_sizes.hpp
@@ -58,6 +58,36 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
     };
 
+    template <typename T>
+    struct prod_block_size<std::complex<T> > {
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register
+        static const unsigned mc = 255;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 3; // stripe width for lhs
+        static const unsigned nr = vector_length; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 23; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<std::complex<long double> > {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 1; // stripe width for lhs
+        static const unsigned nr = 1; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 68; // Use gemm from this size
+        static const unsigned vector_length = 1; // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
     template<typename T>
     struct is_blocksize {
         struct fallback { static const int nr = 0; };
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index 2d492e681..671fa42a0 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -12,10 +12,11 @@
 #include <boost/type_traits/common_type.hpp>
 #include <boost/type_traits/aligned_storage.hpp>
 #include <boost/type_traits/is_arithmetic.hpp>
-#include <boost/align/aligned_alloc.hpp>
+#include <boost/align/aligned_allocator.hpp>
 #include <boost/align/assume_aligned.hpp>
 #include <boost/static_assert.hpp>
 #include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/storage.hpp>
 #include <boost/numeric/ublas/detail/vector.hpp>
 #include <boost/numeric/ublas/detail/block_sizes.hpp>
 #include <boost/predef/compiler.h>
@@ -186,12 +187,150 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
+#ifdef BOOST_UBLAS_VECTOR_KERNEL
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<is_arithmetic<T>::value
+                         && (1 < BlockSize::vector_length)
+                         && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE),
+                         void>::type
+    ugemm(Index kc, TC alpha, const T *Ar, const T *Ai,
+          const T *Br, const T *Bi,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED (Ar, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Ai, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Br, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Bi, BlockSize::align);
+        static const unsigned vector_length = BlockSize::vector_length;
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr/vector_length;
+
+#ifdef BOOST_COMP_CLANG_DETECTION
+        typedef T vx __attribute__((ext_vector_type (vector_length)));
+#else
+        typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+#endif
+
+        vx Pr[MR*NR] = {};
+        vx Pi[MR*NR] = {};
+
+        const vx *br = (const vx *)Br;
+        const vx *bi = (const vx *)Bi;
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i * NR + j] += Ar[i] * br[j] - Ai[i] * bi[j];
+                    Pi[i * NR + j] += Ar[i] * bi[j] + Ai[i] * br[j];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            br += NR;
+            bi += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i*NR+j] *= alpha.real();
+                    Pi[i*NR+j] *= alpha.imag();
+                }
+            }
+        }
+
+        const T *pr = (const T *) Pr;
+        const T *pi = (const T *) Pi;
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] =
+                        TC(pr[i*NR * vector_length +j], pi[i*NR * vector_length +j]);
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] +=
+                          TC(pr[i*NR * vector_length+j], pi[i*NR * vector_length+j]);
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<!is_arithmetic<T>::value
+                         || (BlockSize::vector_length <= 1),
+                         void>::type
+#else
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    void
+#endif
+    ugemm(Index kc, TC alpha, const T *Ar, const T *Ai,
+          const T *Br, const T *Bi,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED(Ar, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Ai, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Br, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Bi, BlockSize::align);
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Par;
+        T *Pr = reinterpret_cast<T*>(Par.address());
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pai;
+        T *Pi = reinterpret_cast<T*>(Pai.address());
+
+        for (unsigned c = 0; c < MR * NR; c++)
+          Pr[c] = Pi[c] = 0;
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+              for (Index j=0; j<NR; ++j) {
+                    Pr[i * NR + j] += Ar[i] * Br[j] - Ai[i] * Bi[j];
+                    Pi[i * NR + j] += Ar[i] * Bi[j] + Ai[i] * Br[j];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            Br += NR;
+            Bi += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i*NR+j] *= alpha.real();
+                    Pi[i*NR+j] *= alpha.imag();;
+                }
+            }
+        }
+
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] = TC(Pr[i*NR+j], Pi[i*NR+j]);
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += TC(Pr[i*NR+j], Pi[i*NR+j]);
+                }
+            }
+        }
+    }
+
     //-- Macro Kernel ----------------------------------------------------------
-        template <typename Index, typename T, typename TC, typename BlockSize>
+    template <typename Index, typename T, typename TC, typename BlockSize>
     void
     mgemm(Index mc, Index nc, Index kc, TC alpha,
-          const T *A, const T *B, TC beta,
-          TC *C, Index incRowC, Index incColC)
+          const T *A, const T *B, TC beta, TC *C,
+          Index incRowC, Index incColC, BlockSize)
     {
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr;
@@ -200,9 +339,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         const Index mr_ = mc % MR;
         const Index nr_ = nc % NR;
 
-        // #if defined(_OPENMP)
-        // #pragma omp parallel for
-        // #endif
         for (Index j=0; j<np; ++j) {
             const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
             TC C_[MR*NR];
@@ -233,10 +369,57 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
+    template <typename Index, typename T, typename TC, typename BlockSize>
+    void
+    mgemm(Index mc, Index nc, Index kc, TC alpha,
+          std::pair<T*, T*> A, std::pair<T*, T*>B,
+          TC beta, TC *C, Index incRowC, Index incColC, BlockSize)
+    {
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        const Index mp  = (mc+MR-1) / MR;
+        const Index np  = (nc+NR-1) / NR;
+        const Index mr_ = mc % MR;
+        const Index nr_ = nc % NR;
+
+        for (Index j=0; j<np; ++j) {
+            const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
+            TC C_[MR*NR];
+
+            for (Index i=0; i<mp; ++i) {
+                const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+                if (mr==MR && nr==NR) {
+                    ugemm<Index, T, TC, BlockSize>
+                        (kc, alpha,
+                         &A.first[i*kc*MR], &A.second[i*kc*MR],
+                         &B.first[j*kc*NR], &B.second[j*kc*NR],
+                         beta,
+                         &C[i*MR*incRowC+j*NR*incColC],
+                         incRowC, incColC);
+                } else {
+                    std::fill_n(C_, MR*NR, T(0));
+                    ugemm<Index, T, TC, BlockSize>
+                        (kc, alpha,
+                         &A.first[i*kc*MR], &A.second[i*kc*MR],
+                         &B.first[j*kc*NR], &B.second[j*kc*NR],
+                         T(0),
+                         C_, NR, Index(1));
+                    gescal(mr, nr, beta,
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                    geaxpy(mr, nr, TC(1), C_, NR, Index(1),
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                }
+            }
+        }
+    }
+
     //-- Packing blocks ------------------------------------------------------------
-        template <typename E, typename T, typename BlockSize>
+    template <typename E, typename T, typename BlockSize>
     void
-    pack_A(const matrix_expression<E> &A, T *p)
+    pack_A(const matrix_expression<E> &A, T *p, BlockSize)
     {
         typedef typename E::size_type  size_type;
         BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
@@ -257,9 +440,34 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
-        template <typename E, typename T, typename BlockSize>
+    template <typename E, typename T, typename BlockSize>
     void
-    pack_B(const matrix_expression<E> &B, T *p)
+    pack_A(const matrix_expression<E> &A, std::pair<T*, T*> p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align);
+
+        const size_type mc = A ().size1();
+        const size_type kc = A ().size2();
+        static const size_type MR = BlockSize::mr;
+        const size_type mp = (mc+MR-1) / MR;
+
+        for (size_type j=0; j<kc; ++j) {
+            for (size_type l=0; l<mp; ++l) {
+                for (size_type i0=0; i0<MR; ++i0) {
+                    size_type i  = l*MR + i0;
+                    size_type nu = l*MR*kc + j*MR + i0;
+                    p.first[nu]  = (i<mc) ? A()(i,j).real() : T(0);
+                    p.second[nu] = (i<mc) ? A()(i,j).imag() : T(0);
+                }
+            }
+        }
+    }
+
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_B(const matrix_expression<E> &B, T *p, BlockSize)
     {
         typedef typename E::size_type  size_type;
         BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
@@ -280,13 +488,38 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_B(const matrix_expression<E> &B, std::pair<T*, T*> p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align);
+
+        const size_type kc = B ().size1();
+        const size_type nc = B ().size2();
+        static const size_type NR = BlockSize::nr;
+        const size_type np = (nc+NR-1) / NR;
+
+        for (size_type l=0; l<np; ++l) {
+            for (size_type j0=0; j0<NR; ++j0) {
+                for (size_type i=0; i<kc; ++i) {
+                    size_type j  = l*NR+j0;
+                    size_type nu = l*NR*kc + i*NR + j0;
+                    p.first[nu]  = (j<nc) ? B()(i,j).real() : T(0);
+                    p.second[nu] = (j<nc) ? B()(i,j).imag() : T(0);
+                }
+            }
+        }
+    }
+
     //-- Frame routine -------------------------------------------------------------
-    template <typename E1, typename E2, typename E3, typename BlockSize>
+    template <typename E1, typename E2, typename E3, typename M, typename BlockSize>
     void
-    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
-         const matrix_expression<E2> &e2,
-         typename E3::value_type beta, matrix_expression<E3> &e3,
-         BlockSize)
+    gemm_(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          typename E3::value_type beta, matrix_expression<E3> &e3,
+          M A, M B, BlockSize bs)
     {
 #if defined(BOOST_COMP_GNUC_DETECTION)
         check_blocksize<BlockSize> check __attribute__ ((unused));
@@ -294,15 +527,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         check_blocksize<BlockSize> check;
 #endif
         typedef typename E3::size_type  size_type;
-        typedef typename E1::value_type value_type1;
-        typedef typename E2::value_type value_type2;
         typedef typename E3::value_type value_type3;
-        typedef typename common_type<value_type1,
-                                     value_type2,
-                                     value_type3>::type value_type_i;
-        typedef unbounded_array<value_type_i,
-                                typename alignment::aligned_allocator<value_type_i,
-                                                                      BlockSize::align> > array_type_i;
 
         static const size_type MC = BlockSize::mc;
         static const size_type NC = BlockSize::nc;
@@ -323,13 +548,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         const size_type incRowC = &e3()(1,0) - &e3()(0,0);
         const size_type incColC = &e3()(0,1) - &e3()(0,0);
 
-        if (alpha==value_type3(0) || k==0) {
-            gescal(beta, e3);
-            return;
-        }
-
-        array_type_i A(MC * KC);
-        array_type_i B(NC * KC);
+        assert(alpha!=value_type3(0) && k!=0);
 
         for (size_type j=0; j<nb; ++j) {
             size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;
@@ -340,24 +559,102 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
 
                 const matrix_range<const E2> Bs =
                     subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc);
-                pack_B<matrix_range<const E2>, value_type_i, BlockSize>
-                    (Bs, &B[0]);
+                pack_B(Bs, B, bs);
 
                 for (size_type i=0; i<mb; ++i) {
                     size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;
 
                     const matrix_range<const E1> As =
                         subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
-                    pack_A<matrix_range<const E1>, value_type_i, BlockSize>
-                        (As, &A[0]);
+                    pack_A(As, A, bs);
 
-                    mgemm<size_type, value_type_i, value_type3, BlockSize>
-                        (mc, nc, kc, alpha, &A[0], &B[0], beta_,
-                         &C_[i*MC*incRowC+j*NC*incColC],
-                         incRowC, incColC);
+                    mgemm(mc, nc, kc, alpha, A, B, beta_,
+                          &C_[i*MC*incRowC+j*NC*incColC],
+                          incRowC, incColC, bs);
                 }
             }
         }
     }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    typename enable_if_c<!is_complex<typename common_type<typename E1::value_type,
+                                                          typename E2::value_type>::type>::value
+                         || !is_complex<typename E3::value_type>::value,
+                         void>::type
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+         typename E3::value_type beta, matrix_expression<E3> &e3,
+         BlockSize bs)
+    {
+        typedef typename E3::size_type  size_type;
+        typedef typename E1::value_type value_type1;
+        typedef typename E2::value_type value_type2;
+        typedef typename E3::value_type value_type3;
+        typedef typename common_type<value_type1,
+                                     value_type2,
+                                     value_type3>::type value_type_i;
+        typedef unbounded_array<value_type_i,
+                                typename alignment::aligned_allocator<value_type_i,
+                                                                      BlockSize::align> > array_type_i;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+
+        if (alpha==value_type3(0) || k==0) {
+            gescal(beta, e3);
+            return;
+        }
+
+        array_type_i A(MC * KC);
+        array_type_i B(NC * KC);
+        gemm_(alpha, e1, e2, beta, e3, &A[0], &B[0], bs);
+    }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    typename enable_if_c<is_complex<typename common_type<typename E1::value_type,
+                                                         typename E2::value_type>::type>::value
+                         && is_complex<typename E3::value_type>::value,
+                         void>::type
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+         typename E3::value_type beta, matrix_expression<E3> &e3,
+         BlockSize bs)
+    {
+        typedef typename E3::size_type  size_type;
+        typedef typename E1::value_type value_type1;
+        typedef typename E2::value_type value_type2;
+        typedef typename E3::value_type value_type3;
+        typedef typename common_type<value_type1,
+                                     value_type2,
+                                     value_type3>::type value_type_i;
+        typedef typename value_type_i::value_type value_type_f;
+        typedef unbounded_array<value_type_f,
+                                typename alignment::aligned_allocator<value_type_f,
+                                                                      BlockSize::align> > array_type_f;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+
+        if (alpha==value_type3(0) || k==0) {
+            gescal(beta, e3);
+            return;
+        }
+
+        array_type_f Ar(MC * KC);
+        array_type_f Ai(MC * KC);
+        array_type_f Br(NC * KC);
+        array_type_f Bi(NC * KC);
+        gemm_(alpha, e1, e2, beta, e3, std::make_pair(&Ar[0], &Ai[0]),
+              std::make_pair(&Br[0], &Bi[0]), bs);
+    }
+
 }}}}
 #endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 80fd961b6..36bbf130b 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -5623,16 +5623,14 @@ namespace boost { namespace numeric { namespace ublas {
           unknown_orientation_tag) {
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::expression_type expression_type;
-        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
-                                                     typename E2::value_type, E2>::result_type result_type;
-       typedef typename result_type::value_type result_value;
+        typedef typename M::value_type result_value;
 
-       if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+        if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
             m.assign(expression_type (e1 (), e2 ()));
-       } else {
+        } else {
            detail::gemm(result_value(1), e1, e2,
                         result_value(0), m, b);
-       }
+        }
     }
 
     // dispatcher

From 4f4d25105082475fa123b30ea46e29f1491bc0f6 Mon Sep 17 00:00:00 2001
From: Imre Palik <imre_palik@yahoo.co.uk>
Date: Sun, 3 Apr 2016 14:59:34 +0000
Subject: [PATCH 11/11] ublas::gemm support for icc

This patch adds vectorised kernel support for icc.  It is implemented
using cilk arrays.  It is tested on icc16, whith older versions
your mileage may vary.

The resulting kernel is still half the speed of the one compiled by
gcc.

Signed-off-by: Imre Palik <imrep@yahoo.co.uk>
---
 include/boost/numeric/ublas/detail/gemm.hpp   | 93 +++++++++++++++----
 include/boost/numeric/ublas/detail/vector.hpp |  7 +-
 2 files changed, 79 insertions(+), 21 deletions(-)

diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index 671fa42a0..3891101cd 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -86,7 +86,30 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         static const unsigned vector_length = BlockSize::vector_length;
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr/vector_length;
+        static const Index nr = BlockSize::nr;
+#ifdef BOOST_COMP_INTEL_DETECTION
+        static const Index pl = MR * nr;
 
+        T P[pl] = {};
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    P[i*nr+j:vector_length] += A[i]*B[j:vector_length];
+                }
+            }
+            A += MR;
+            B += nr;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    P[i*nr+j:vector_length] *= alpha;
+                }
+            }
+        }
+#else
 #ifdef BOOST_COMP_CLANG_DETECTION
         typedef T vx __attribute__((ext_vector_type (vector_length)));
 #else
@@ -113,19 +136,19 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 }
             }
         }
-
+#endif
         const T *p = (const T *) P;
         if (beta == TC(0)) {
             for (Index i=0; i<MR; ++i) {
-                for (Index j=0; j<NR * vector_length; ++j) {
-                    C[i*incRowC+j*incColC] = p[i*NR * vector_length +j];
+                for (Index j=0; j< nr; ++j) {
+                    C[i*incRowC+j*incColC] = p[i * nr +j];
                 }
             }
         } else {
             for (Index i=0; i<MR; ++i) {
-                for (Index j=0; j<NR * vector_length; ++j) {
+                for (Index j=0; j< nr; ++j) {
                     C[i*incRowC+j*incColC] *= beta;
-                    C[i*incRowC+j*incColC] += p[i*NR * vector_length+j];
+                    C[i*incRowC+j*incColC] += p[i * nr + j];
                 }
             }
         }
@@ -205,7 +228,37 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         static const unsigned vector_length = BlockSize::vector_length;
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr/vector_length;
+        static const Index nr = BlockSize::nr;
+#ifdef BOOST_COMP_INTEL_DETECTION
+        static const Index pl = MR * nr;
+
+        T Pr[pl] = {};
+        T Pi[pl] = {};
 
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                  Pr[i * nr + j:vector_length] +=
+                      Ar[i] * Br[j:vector_length] - Ai[i] * Bi[j:vector_length];
+                  Pi[i * nr + j:vector_length] +=
+                      Ar[i] * Bi[j:vector_length] + Ai[i] * Br[j:vector_length];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            Br += nr;
+            Bi += nr;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    Pr[i*nr+j:vector_length] *= alpha.real();
+                    Pi[i*nr+j:vector_length] *= alpha.imag();
+                }
+            }
+        }
+#else
 #ifdef BOOST_COMP_CLANG_DETECTION
         typedef T vx __attribute__((ext_vector_type (vector_length)));
 #else
@@ -238,22 +291,22 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 }
             }
         }
-
+#endif
         const T *pr = (const T *) Pr;
         const T *pi = (const T *) Pi;
         if (beta == TC(0)) {
             for (Index i=0; i<MR; ++i) {
-                for (Index j=0; j<NR * vector_length; ++j) {
+                for (Index j=0; j< nr; ++j) {
                     C[i*incRowC+j*incColC] =
-                        TC(pr[i*NR * vector_length +j], pi[i*NR * vector_length +j]);
+                        TC(pr[i * nr + j], pi[i * nr  + j]);
                 }
             }
         } else {
             for (Index i=0; i<MR; ++i) {
-                for (Index j=0; j<NR * vector_length; ++j) {
+                for (Index j=0; j< nr; ++j) {
                     C[i*incRowC+j*incColC] *= beta;
                     C[i*incRowC+j*incColC] +=
-                          TC(pr[i*NR * vector_length+j], pi[i*NR * vector_length+j]);
+                        TC(pr[i * nr + j], pi[i * nr + j]);
                 }
             }
         }
@@ -445,8 +498,10 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     pack_A(const matrix_expression<E> &A, std::pair<T*, T*> p, BlockSize)
     {
         typedef typename E::size_type  size_type;
-        BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align);
-        BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align);
+        T* re = p.first;
+        T* im = p.second;
+        BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align);
 
         const size_type mc = A ().size1();
         const size_type kc = A ().size2();
@@ -458,8 +513,8 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 for (size_type i0=0; i0<MR; ++i0) {
                     size_type i  = l*MR + i0;
                     size_type nu = l*MR*kc + j*MR + i0;
-                    p.first[nu]  = (i<mc) ? A()(i,j).real() : T(0);
-                    p.second[nu] = (i<mc) ? A()(i,j).imag() : T(0);
+                    re[nu]  = (i<mc) ? A()(i,j).real() : T(0);
+                    im[nu] = (i<mc) ? A()(i,j).imag() : T(0);
                 }
             }
         }
@@ -493,8 +548,10 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     pack_B(const matrix_expression<E> &B, std::pair<T*, T*> p, BlockSize)
     {
         typedef typename E::size_type  size_type;
-        BOOST_ALIGN_ASSUME_ALIGNED(p.first, BlockSize::align);
-        BOOST_ALIGN_ASSUME_ALIGNED(p.second, BlockSize::align);
+        T* re = p.first;
+        T* im = p.second;
+        BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align);
 
         const size_type kc = B ().size1();
         const size_type nc = B ().size2();
@@ -506,8 +563,8 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 for (size_type i=0; i<kc; ++i) {
                     size_type j  = l*NR+j0;
                     size_type nu = l*NR*kc + i*NR + j0;
-                    p.first[nu]  = (j<nc) ? B()(i,j).real() : T(0);
-                    p.second[nu] = (j<nc) ? B()(i,j).imag() : T(0);
+                    re[nu] = (j<nc) ? B()(i,j).real() : T(0);
+                    im[nu] = (j<nc) ? B()(i,j).imag() : T(0);
                 }
             }
         }
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
index 686200d33..e0ec2b181 100644
--- a/include/boost/numeric/ublas/detail/vector.hpp
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -24,9 +24,10 @@
 #define _BOOST_UBLAS_VECTOR_SIZE 1
 #endif
 
-#if (defined(BOOST_COMP_GNUC_DETECTION)                           \
-     && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \
-    || (defined(BOOST_COMP_CLANG_DETECTION)                       \
+#if defined(BOOST_COMP_INTEL_DETECTION)                              \
+    || (defined(BOOST_COMP_GNUC_DETECTION)                           \
+        && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \
+    || (defined(BOOST_COMP_CLANG_DETECTION)                          \
         && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0))
 #define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE
 #endif