diff --git a/benchmarks/bench1/Jamfile.v2 b/benchmarks/bench1/Jamfile.v2
index 77b11c77e..bf9c821c6 100644
--- a/benchmarks/bench1/Jamfile.v2
+++ b/benchmarks/bench1/Jamfile.v2
@@ -7,4 +7,4 @@
 
 exe bench1
     : bench1.cpp bench11.cpp bench12.cpp bench13.cpp
-    ;
+    : : <optimization>speed ;
diff --git a/benchmarks/bench1/bench1.cpp b/benchmarks/bench1/bench1.cpp
index 87478e139..300eafa4a 100644
--- a/benchmarks/bench1/bench1.cpp
+++ b/benchmarks/bench1/bench1.cpp
@@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale)
     header (type_string + ", 3");
     bench_1<scalar, 3> () (1000000 * scale);
     bench_2<scalar, 3> () (300000 * scale);
-    bench_3<scalar, 3> () (100000 * scale);
+    bench_3<scalar, 3> () (3000000 * scale);
 
     header (type_string + ", 10");
     bench_1<scalar, 10> () (300000 * scale);
     bench_2<scalar, 10> () (30000 * scale);
-    bench_3<scalar, 10> () (3000 * scale);
+    bench_3<scalar, 10> () (100000 * scale);
 
     header (type_string + ", 30");
     bench_1<scalar, 30> () (100000 * scale);
     bench_2<scalar, 30> () (3000 * scale);
-    bench_3<scalar, 30> () (100 * scale);
+    bench_3<scalar, 30> () (30000 * scale);
 
     header (type_string + ", 100");
     bench_1<scalar, 100> () (30000 * scale);
     bench_2<scalar, 100> () (300 * scale);
-    bench_3<scalar, 100> () (3 * scale);
+    bench_3<scalar, 100> () (1000 * scale);
+
+    header (type_string + ", 300");
+    bench_3<scalar, 300> () (30 * scale);
+
+    header (type_string + ", 1000");
+    bench_3<scalar, 1000> () (1 * scale);
 }
 
 int main (int argc, char *argv []) {
diff --git a/benchmarks/bench1/bench13.cpp b/benchmarks/bench1/bench13.cpp
index fadb0b673..2378d46b2 100644
--- a/benchmarks/bench1/bench13.cpp
+++ b/benchmarks/bench1/bench13.cpp
@@ -166,6 +166,8 @@ template struct bench_3<float, 3>;
 template struct bench_3<float, 10>;
 template struct bench_3<float, 30>;
 template struct bench_3<float, 100>;
+template struct bench_3<float, 300>;
+template struct bench_3<float, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -173,6 +175,8 @@ template struct bench_3<double, 3>;
 template struct bench_3<double, 10>;
 template struct bench_3<double, 30>;
 template struct bench_3<double, 100>;
+template struct bench_3<double, 300>;
+template struct bench_3<double, 1000>;
 #endif
 
 #ifdef USE_STD_COMPLEX
@@ -181,6 +185,8 @@ template struct bench_3<std::complex<float>, 3>;
 template struct bench_3<std::complex<float>, 10>;
 template struct bench_3<std::complex<float>, 30>;
 template struct bench_3<std::complex<float>, 100>;
+template struct bench_3<std::complex<float>, 300>;
+template struct bench_3<std::complex<float>, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -188,5 +194,7 @@ template struct bench_3<std::complex<double>, 3>;
 template struct bench_3<std::complex<double>, 10>;
 template struct bench_3<std::complex<double>, 30>;
 template struct bench_3<std::complex<double>, 100>;
+template struct bench_3<std::complex<double>, 300>;
+template struct bench_3<std::complex<double>, 1000>;
 #endif
 #endif
diff --git a/benchmarks/bench3/Jamfile.v2 b/benchmarks/bench3/Jamfile.v2
index 7ce9c9b85..cbaa638a7 100644
--- a/benchmarks/bench3/Jamfile.v2
+++ b/benchmarks/bench3/Jamfile.v2
@@ -7,4 +7,5 @@
 
 exe bench3
     : bench3.cpp bench31.cpp bench32.cpp bench33.cpp
-    ;
+    : : <optimization>speed ;
+
diff --git a/benchmarks/bench3/bench3.cpp b/benchmarks/bench3/bench3.cpp
index 390d226ca..72a3db585 100644
--- a/benchmarks/bench3/bench3.cpp
+++ b/benchmarks/bench3/bench3.cpp
@@ -76,22 +76,28 @@ void do_bench (std::string type_string, int scale)
     header (type_string + ", 3");
     bench_1<scalar, 3> () (1000000 * scale);
     bench_2<scalar, 3> () (300000 * scale);
-    bench_3<scalar, 3> () (100000 * scale);
+    bench_3<scalar, 3> () (3000000 * scale);
 
     header (type_string + ", 10");
     bench_1<scalar, 10> () (300000 * scale);
     bench_2<scalar, 10> () (30000 * scale);
-    bench_3<scalar, 10> () (3000 * scale);
+    bench_3<scalar, 10> () (100000 * scale);
 
     header (type_string + ", 30");
     bench_1<scalar, 30> () (100000 * scale);
     bench_2<scalar, 30> () (3000 * scale);
-    bench_3<scalar, 30> () (100 * scale);
+    bench_3<scalar, 30> () (30000 * scale);
 
     header (type_string + ", 100");
     bench_1<scalar, 100> () (30000 * scale);
     bench_2<scalar, 100> () (300 * scale);
-    bench_3<scalar, 100> () (3 * scale);
+    bench_3<scalar, 100> () (1000 * scale);
+
+    header (type_string + ", 300");
+    bench_3<scalar, 300> () (30 * scale);
+
+    header (type_string + ", 1000");
+    bench_3<scalar, 1000> () (1 * scale);
 }
 
 int main (int argc, char *argv []) {
diff --git a/benchmarks/bench3/bench33.cpp b/benchmarks/bench3/bench33.cpp
index 9b8e1070e..7eeec07ce 100644
--- a/benchmarks/bench3/bench33.cpp
+++ b/benchmarks/bench3/bench33.cpp
@@ -172,6 +172,8 @@ template struct bench_3<float, 3>;
 template struct bench_3<float, 10>;
 template struct bench_3<float, 30>;
 template struct bench_3<float, 100>;
+template struct bench_3<float, 300>;
+template struct bench_3<float, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -179,6 +181,8 @@ template struct bench_3<double, 3>;
 template struct bench_3<double, 10>;
 template struct bench_3<double, 30>;
 template struct bench_3<double, 100>;
+template struct bench_3<double, 300>;
+template struct bench_3<double, 1000>;
 #endif
 
 #ifdef USE_STD_COMPLEX
@@ -187,6 +191,8 @@ template struct bench_3<std::complex<float>, 3>;
 template struct bench_3<std::complex<float>, 10>;
 template struct bench_3<std::complex<float>, 30>;
 template struct bench_3<std::complex<float>, 100>;
+template struct bench_3<std::complex<float>, 300>;
+template struct bench_3<std::complex<float>, 1000>;
 #endif
 
 #ifdef USE_DOUBLE
@@ -194,5 +200,7 @@ template struct bench_3<std::complex<double>, 3>;
 template struct bench_3<std::complex<double>, 10>;
 template struct bench_3<std::complex<double>, 30>;
 template struct bench_3<std::complex<double>, 100>;
+template struct bench_3<std::complex<double>, 300>;
+template struct bench_3<std::complex<double>, 1000>;
 #endif
 #endif
diff --git a/doc/operations_overview.html b/doc/operations_overview.html
index 42d994212..212fcbebf 100644
--- a/doc/operations_overview.html
+++ b/doc/operations_overview.html
@@ -87,7 +87,8 @@ <h3>norms</h3>
 
 <pre><code>
 t = norm_inf(v); i = index_norm_inf(v);
-t = norm_1(v);   t = norm_2(v); 
+t = norm_1(v);   t = norm_2(v);
+t = norm_2_square(v);
 t = norm_inf(A); i = index_norm_inf(A);
 t = norm_1(A);   t = norm_frobenius(A); 
 </code></pre>
diff --git a/doc/vector_expression.html b/doc/vector_expression.html
index 7838007ff..2372ec3df 100644
--- a/doc/vector_expression.html
+++ b/doc/vector_expression.html
@@ -847,6 +847,11 @@ <h4>Prototypes</h4>
     typename vector_scalar_unary_traits&lt;E, vector_norm_2&lt;typename E::value_type&gt; &gt;::result_type
     norm_2 (const vector_expression&lt;E&gt; &amp;e);
 
+    // norm_2_square v = sum (v [i] * v [i])
+    template&lt;class E&gt;
+    typename vector_scalar_unary_traits&lt;E, vector_norm_2_square&lt;typename E::value_type&gt; &gt;::result_type
+    norm_2_square (const vector_expression&lt;E&gt; &amp;e);
+
     // norm_inf v = max (abs (v [i]))
     template&lt;class E&gt;
     typename vector_scalar_unary_traits&lt;E, vector_norm_inf&lt;typename E::value_type&gt; &gt;::result_type
diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp
new file mode 100644
index 000000000..da042da58
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/block_sizes.hpp
@@ -0,0 +1,121 @@
+//
+//  Copyright (c) 2016
+//  Michael Lehn, Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_BLOCK_SIZES_
+#define _BOOST_UBLAS_BLOCK_SIZES_
+
+#include <boost/numeric/ublas/detail/vector.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+    template <typename T>
+    struct prod_block_size {
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length);
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 3 * vector_length; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 14; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<float> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 16; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 14; // Use gemm from this size
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<long double> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 1; // stripe width for lhs
+        static const unsigned nr = 4; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 42; // Use gemm from this size
+        static const unsigned vector_length = 1; // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <typename T>
+    struct prod_block_size<std::complex<T> > {
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(T); // Number of elements in a vector register
+        static const unsigned mc = 255;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 3; // stripe width for lhs
+        static const unsigned nr = vector_length; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 23; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<std::complex<long double> > {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 1; // stripe width for lhs
+        static const unsigned nr = 1; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 68; // Use gemm from this size
+        static const unsigned vector_length = 1; // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template<typename T>
+    struct is_blocksize {
+        struct fallback { static const int nr = 0; };
+        struct derived : T, fallback {};
+        template<int C1>
+        struct nonr {
+            static const bool value = false;
+            typedef false_type type;
+        };
+
+         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
+         template<typename C> static char (&f(...))[2];
+
+         static bool const value = sizeof(f<derived>(0)) == 2;
+    };
+
+    template<typename T>
+    struct check_blocksize {
+        BOOST_STATIC_ASSERT_MSG(T::mc>0 && T::kc>0 && T::nc>0 && T::mr>0 && T::nr>0,
+                                "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(T::mc % T::mr == 0,
+                                "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(T::nc % T::nr == 0,
+                                "NC must be a multiple of NR.");
+        BOOST_STATIC_ASSERT_MSG(T::vector_length <= 1 || T::nr % T::vector_length == 0,
+                                "NR must be a multiple of vector size");
+        BOOST_STATIC_ASSERT_MSG(T::limit >= 2,
+                                "Minimum matrix size for gemm is 2*2");
+    };
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
new file mode 100644
index 000000000..3891101cd
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -0,0 +1,717 @@
+//
+//  Copyright (c) 2016
+//  Michael Lehn, Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_GEMM_
+#define _BOOST_UBLAS_GEMM_
+
+#include <boost/type_traits/common_type.hpp>
+#include <boost/type_traits/aligned_storage.hpp>
+#include <boost/type_traits/is_arithmetic.hpp>
+#include <boost/align/aligned_allocator.hpp>
+#include <boost/align/assume_aligned.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/storage.hpp>
+#include <boost/numeric/ublas/detail/vector.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
+#include <boost/predef/compiler.h>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+    template <typename E>
+    void
+    gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
+    {
+        typedef typename E::size_type  size_type;
+
+        for (size_type i=0; i<X().size1(); ++i) {
+            for (size_type j=0; j<X().size2(); ++j) {
+                X()(i,j) *= alpha;
+            }
+        }
+    }
+
+    template <typename Index, typename T>
+    void
+    geaxpy(Index m, Index n, const T &alpha,
+           const T *X, Index incRowX, Index incColX,
+           T       *Y, Index incRowY, Index incColY)
+    {
+        for (Index j=0; j<n; ++j) {
+            for (Index i=0; i<m; ++i) {
+                Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
+            }
+        }
+    }
+
+    template <typename Index, typename TX>
+    void
+    gescal(Index m, Index n,
+           const TX &alpha,
+           TX *X, Index incRowX, Index incColX)
+    {
+      if (alpha != TX(0)) {
+            for (Index j=0; j<n; ++j) {
+                for (Index i=0; i<m; ++i) {
+                    X[i*incRowX+j*incColX] *= alpha;
+                }
+            }
+        } else {
+            for (Index j=0; j<n; ++j) {
+                for (Index i=0; i<m; ++i) {
+                    X[i*incRowX+j*incColX] = 0;
+                }
+            }
+        }
+    }
+
+    //-- Micro Kernel ----------------------------------------------------------
+#ifdef BOOST_UBLAS_VECTOR_KERNEL
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<is_arithmetic<T>::value
+                         && (1 < BlockSize::vector_length)
+                         && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE),
+                         void>::type
+    ugemm(Index kc, TC alpha, const T *A, const T *B,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align);
+        static const unsigned vector_length = BlockSize::vector_length;
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr/vector_length;
+        static const Index nr = BlockSize::nr;
+#ifdef BOOST_COMP_INTEL_DETECTION
+        static const Index pl = MR * nr;
+
+        T P[pl] = {};
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    P[i*nr+j:vector_length] += A[i]*B[j:vector_length];
+                }
+            }
+            A += MR;
+            B += nr;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    P[i*nr+j:vector_length] *= alpha;
+                }
+            }
+        }
+#else
+#ifdef BOOST_COMP_CLANG_DETECTION
+        typedef T vx __attribute__((ext_vector_type (vector_length)));
+#else
+        typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+#endif
+
+        vx P[MR*NR] = {};
+
+        const vx *b = (const vx *)B;
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] += A[i]*b[j];
+                }
+            }
+            A += MR;
+            b += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
+        }
+#endif
+        const T *p = (const T *) P;
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j< nr; ++j) {
+                    C[i*incRowC+j*incColC] = p[i * nr +j];
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j< nr; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += p[i * nr + j];
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<!is_arithmetic<T>::value
+                         || (BlockSize::vector_length <= 1),
+                         void>::type
+#else
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    void
+#endif
+    ugemm(Index kc, TC alpha, const T *A, const T *B,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
+        T *P = reinterpret_cast<T*>(Pa.address());
+        for (unsigned c = 0; c < MR * NR; c++)
+          P[c] = 0;
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+              for (Index j=0; j<NR; ++j) {
+                    P[i* NR+j] += A[i]*B[j];
+                }
+            }
+            A += MR;
+            B += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
+        }
+
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                C[i*incRowC+j*incColC] = P[i*NR+j];
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += P[i*NR+j];
+                }
+            }
+        }
+    }
+
+#ifdef BOOST_UBLAS_VECTOR_KERNEL
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<is_arithmetic<T>::value
+                         && (1 < BlockSize::vector_length)
+                         && (BlockSize::vector_length * sizeof(T) <= _BOOST_UBLAS_VECTOR_SIZE),
+                         void>::type
+    ugemm(Index kc, TC alpha, const T *Ar, const T *Ai,
+          const T *Br, const T *Bi,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED (Ar, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Ai, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Br, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (Bi, BlockSize::align);
+        static const unsigned vector_length = BlockSize::vector_length;
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr/vector_length;
+        static const Index nr = BlockSize::nr;
+#ifdef BOOST_COMP_INTEL_DETECTION
+        static const Index pl = MR * nr;
+
+        T Pr[pl] = {};
+        T Pi[pl] = {};
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                  Pr[i * nr + j:vector_length] +=
+                      Ar[i] * Br[j:vector_length] - Ai[i] * Bi[j:vector_length];
+                  Pi[i * nr + j:vector_length] +=
+                      Ar[i] * Bi[j:vector_length] + Ai[i] * Br[j:vector_length];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            Br += nr;
+            Bi += nr;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<nr; j += vector_length) {
+                    Pr[i*nr+j:vector_length] *= alpha.real();
+                    Pi[i*nr+j:vector_length] *= alpha.imag();
+                }
+            }
+        }
+#else
+#ifdef BOOST_COMP_CLANG_DETECTION
+        typedef T vx __attribute__((ext_vector_type (vector_length)));
+#else
+        typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+#endif
+
+        vx Pr[MR*NR] = {};
+        vx Pi[MR*NR] = {};
+
+        const vx *br = (const vx *)Br;
+        const vx *bi = (const vx *)Bi;
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i * NR + j] += Ar[i] * br[j] - Ai[i] * bi[j];
+                    Pi[i * NR + j] += Ar[i] * bi[j] + Ai[i] * br[j];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            br += NR;
+            bi += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i*NR+j] *= alpha.real();
+                    Pi[i*NR+j] *= alpha.imag();
+                }
+            }
+        }
+#endif
+        const T *pr = (const T *) Pr;
+        const T *pi = (const T *) Pi;
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j< nr; ++j) {
+                    C[i*incRowC+j*incColC] =
+                        TC(pr[i * nr + j], pi[i * nr  + j]);
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j< nr; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] +=
+                        TC(pr[i * nr + j], pi[i * nr + j]);
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<!is_arithmetic<T>::value
+                         || (BlockSize::vector_length <= 1),
+                         void>::type
+#else
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    void
+#endif
+    ugemm(Index kc, TC alpha, const T *Ar, const T *Ai,
+          const T *Br, const T *Bi,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED(Ar, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Ai, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Br, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(Bi, BlockSize::align);
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Par;
+        T *Pr = reinterpret_cast<T*>(Par.address());
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pai;
+        T *Pi = reinterpret_cast<T*>(Pai.address());
+
+        for (unsigned c = 0; c < MR * NR; c++)
+          Pr[c] = Pi[c] = 0;
+
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+              for (Index j=0; j<NR; ++j) {
+                    Pr[i * NR + j] += Ar[i] * Br[j] - Ai[i] * Bi[j];
+                    Pi[i * NR + j] += Ar[i] * Bi[j] + Ai[i] * Br[j];
+                }
+            }
+            Ar += MR;
+            Ai += MR;
+            Br += NR;
+            Bi += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    Pr[i*NR+j] *= alpha.real();
+                    Pi[i*NR+j] *= alpha.imag();;
+                }
+            }
+        }
+
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] = TC(Pr[i*NR+j], Pi[i*NR+j]);
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += TC(Pr[i*NR+j], Pi[i*NR+j]);
+                }
+            }
+        }
+    }
+
+    //-- Macro Kernel ----------------------------------------------------------
+    template <typename Index, typename T, typename TC, typename BlockSize>
+    void
+    mgemm(Index mc, Index nc, Index kc, TC alpha,
+          const T *A, const T *B, TC beta, TC *C,
+          Index incRowC, Index incColC, BlockSize)
+    {
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        const Index mp  = (mc+MR-1) / MR;
+        const Index np  = (nc+NR-1) / NR;
+        const Index mr_ = mc % MR;
+        const Index nr_ = nc % NR;
+
+        for (Index j=0; j<np; ++j) {
+            const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
+            TC C_[MR*NR];
+
+            for (Index i=0; i<mp; ++i) {
+                const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+                if (mr==MR && nr==NR) {
+                  ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                          &A[i*kc*MR], &B[j*kc*NR],
+                          beta,
+                          &C[i*MR*incRowC+j*NR*incColC],
+                          incRowC, incColC);
+                } else {
+                    std::fill_n(C_, MR*NR, T(0));
+                    ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                          &A[i*kc*MR], &B[j*kc*NR],
+                          T(0),
+                          C_, NR, Index(1));
+                    gescal(mr, nr, beta,
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                    geaxpy(mr, nr, TC(1), C_, NR, Index(1),
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC, typename BlockSize>
+    void
+    mgemm(Index mc, Index nc, Index kc, TC alpha,
+          std::pair<T*, T*> A, std::pair<T*, T*>B,
+          TC beta, TC *C, Index incRowC, Index incColC, BlockSize)
+    {
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr;
+        const Index mp  = (mc+MR-1) / MR;
+        const Index np  = (nc+NR-1) / NR;
+        const Index mr_ = mc % MR;
+        const Index nr_ = nc % NR;
+
+        for (Index j=0; j<np; ++j) {
+            const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
+            TC C_[MR*NR];
+
+            for (Index i=0; i<mp; ++i) {
+                const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+                if (mr==MR && nr==NR) {
+                    ugemm<Index, T, TC, BlockSize>
+                        (kc, alpha,
+                         &A.first[i*kc*MR], &A.second[i*kc*MR],
+                         &B.first[j*kc*NR], &B.second[j*kc*NR],
+                         beta,
+                         &C[i*MR*incRowC+j*NR*incColC],
+                         incRowC, incColC);
+                } else {
+                    std::fill_n(C_, MR*NR, T(0));
+                    ugemm<Index, T, TC, BlockSize>
+                        (kc, alpha,
+                         &A.first[i*kc*MR], &A.second[i*kc*MR],
+                         &B.first[j*kc*NR], &B.second[j*kc*NR],
+                         T(0),
+                         C_, NR, Index(1));
+                    gescal(mr, nr, beta,
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                    geaxpy(mr, nr, TC(1), C_, NR, Index(1),
+                           &C[i*MR*incRowC+j*NR*incColC],
+                           incRowC, incColC);
+                }
+            }
+        }
+    }
+
+    //-- Packing blocks ------------------------------------------------------------
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_A(const matrix_expression<E> &A, T *p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
+
+        const size_type mc = A ().size1();
+        const size_type kc = A ().size2();
+        static const size_type MR = BlockSize::mr;
+        const size_type mp = (mc+MR-1) / MR;
+
+        for (size_type j=0; j<kc; ++j) {
+            for (size_type l=0; l<mp; ++l) {
+                for (size_type i0=0; i0<MR; ++i0) {
+                    size_type i  = l*MR + i0;
+                    size_type nu = l*MR*kc + j*MR + i0;
+                    p[nu]        = (i<mc) ? A()(i,j) : T(0);
+                }
+            }
+        }
+    }
+
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_A(const matrix_expression<E> &A, std::pair<T*, T*> p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        T* re = p.first;
+        T* im = p.second;
+        BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align);
+
+        const size_type mc = A ().size1();
+        const size_type kc = A ().size2();
+        static const size_type MR = BlockSize::mr;
+        const size_type mp = (mc+MR-1) / MR;
+
+        for (size_type j=0; j<kc; ++j) {
+            for (size_type l=0; l<mp; ++l) {
+                for (size_type i0=0; i0<MR; ++i0) {
+                    size_type i  = l*MR + i0;
+                    size_type nu = l*MR*kc + j*MR + i0;
+                    re[nu]  = (i<mc) ? A()(i,j).real() : T(0);
+                    im[nu] = (i<mc) ? A()(i,j).imag() : T(0);
+                }
+            }
+        }
+    }
+
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_B(const matrix_expression<E> &B, T *p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
+
+        const size_type kc = B ().size1();
+        const size_type nc = B ().size2();
+        static const size_type NR = BlockSize::nr;
+        const size_type np = (nc+NR-1) / NR;
+
+        for (size_type l=0; l<np; ++l) {
+            for (size_type j0=0; j0<NR; ++j0) {
+                for (size_type i=0; i<kc; ++i) {
+                    size_type j  = l*NR+j0;
+                    size_type nu = l*NR*kc + i*NR + j0;
+                    p[nu]        = (j<nc) ? B()(i,j) : T(0);
+                }
+            }
+        }
+    }
+
+    template <typename E, typename T, typename BlockSize>
+    void
+    pack_B(const matrix_expression<E> &B, std::pair<T*, T*> p, BlockSize)
+    {
+        typedef typename E::size_type  size_type;
+        T* re = p.first;
+        T* im = p.second;
+        BOOST_ALIGN_ASSUME_ALIGNED(re, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED(im, BlockSize::align);
+
+        const size_type kc = B ().size1();
+        const size_type nc = B ().size2();
+        static const size_type NR = BlockSize::nr;
+        const size_type np = (nc+NR-1) / NR;
+
+        for (size_type l=0; l<np; ++l) {
+            for (size_type j0=0; j0<NR; ++j0) {
+                for (size_type i=0; i<kc; ++i) {
+                    size_type j  = l*NR+j0;
+                    size_type nu = l*NR*kc + i*NR + j0;
+                    re[nu] = (j<nc) ? B()(i,j).real() : T(0);
+                    im[nu] = (j<nc) ? B()(i,j).imag() : T(0);
+                }
+            }
+        }
+    }
+
+    //-- Frame routine -------------------------------------------------------------
+    template <typename E1, typename E2, typename E3, typename M, typename BlockSize>
+    void
+    gemm_(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          typename E3::value_type beta, matrix_expression<E3> &e3,
+          M A, M B, BlockSize bs)
+    {
+#if defined(BOOST_COMP_GNUC_DETECTION)
+        check_blocksize<BlockSize> check __attribute__ ((unused));
+#else
+        check_blocksize<BlockSize> check;
+#endif
+        typedef typename E3::size_type  size_type;
+        typedef typename E3::value_type value_type3;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
+        const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+        const size_type mb = (m+MC-1) / MC;
+        const size_type nb = (n+NC-1) / NC;
+        const size_type kb = (k+KC-1) / KC;
+        const size_type mc_ = m % MC;
+        const size_type nc_ = n % NC;
+        const size_type kc_ = k % KC;
+
+        value_type3 *C_ = &e3()(0,0);
+        const size_type incRowC = &e3()(1,0) - &e3()(0,0);
+        const size_type incColC = &e3()(0,1) - &e3()(0,0);
+
+        assert(alpha!=value_type3(0) && k!=0);
+
+        for (size_type j=0; j<nb; ++j) {
+            size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;
+
+            for (size_type l=0; l<kb; ++l) {
+                size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
+                value_type3 beta_ = (l==0) ? beta : value_type3(1);
+
+                const matrix_range<const E2> Bs =
+                    subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc);
+                pack_B(Bs, B, bs);
+
+                for (size_type i=0; i<mb; ++i) {
+                    size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;
+
+                    const matrix_range<const E1> As =
+                        subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
+                    pack_A(As, A, bs);
+
+                    mgemm(mc, nc, kc, alpha, A, B, beta_,
+                          &C_[i*MC*incRowC+j*NC*incColC],
+                          incRowC, incColC, bs);
+                }
+            }
+        }
+    }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    typename enable_if_c<!is_complex<typename common_type<typename E1::value_type,
+                                                          typename E2::value_type>::type>::value
+                         || !is_complex<typename E3::value_type>::value,
+                         void>::type
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+         typename E3::value_type beta, matrix_expression<E3> &e3,
+         BlockSize bs)
+    {
+        typedef typename E3::size_type  size_type;
+        typedef typename E1::value_type value_type1;
+        typedef typename E2::value_type value_type2;
+        typedef typename E3::value_type value_type3;
+        typedef typename common_type<value_type1,
+                                     value_type2,
+                                     value_type3>::type value_type_i;
+        typedef unbounded_array<value_type_i,
+                                typename alignment::aligned_allocator<value_type_i,
+                                                                      BlockSize::align> > array_type_i;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+
+        if (alpha==value_type3(0) || k==0) {
+            gescal(beta, e3);
+            return;
+        }
+
+        array_type_i A(MC * KC);
+        array_type_i B(NC * KC);
+        gemm_(alpha, e1, e2, beta, e3, &A[0], &B[0], bs);
+    }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    typename enable_if_c<is_complex<typename common_type<typename E1::value_type,
+                                                         typename E2::value_type>::type>::value
+                         && is_complex<typename E3::value_type>::value,
+                         void>::type
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+         typename E3::value_type beta, matrix_expression<E3> &e3,
+         BlockSize bs)
+    {
+        typedef typename E3::size_type  size_type;
+        typedef typename E1::value_type value_type1;
+        typedef typename E2::value_type value_type2;
+        typedef typename E3::value_type value_type3;
+        typedef typename common_type<value_type1,
+                                     value_type2,
+                                     value_type3>::type value_type_i;
+        typedef typename value_type_i::value_type value_type_f;
+        typedef unbounded_array<value_type_f,
+                                typename alignment::aligned_allocator<value_type_f,
+                                                                      BlockSize::align> > array_type_f;
+
+        static const size_type MC = BlockSize::mc;
+        static const size_type NC = BlockSize::nc;
+
+        const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+        static const size_type KC = BlockSize::kc;
+
+        if (alpha==value_type3(0) || k==0) {
+            gescal(beta, e3);
+            return;
+        }
+
+        array_type_f Ar(MC * KC);
+        array_type_f Ai(MC * KC);
+        array_type_f Br(NC * KC);
+        array_type_f Bi(NC * KC);
+        gemm_(alpha, e1, e2, beta, e3, std::make_pair(&Ar[0], &Ai[0]),
+              std::make_pair(&Br[0], &Bi[0]), bs);
+    }
+
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/detail/matrix_assign.hpp b/include/boost/numeric/ublas/detail/matrix_assign.hpp
index efaf9ab81..b6de2586f 100644
--- a/include/boost/numeric/ublas/detail/matrix_assign.hpp
+++ b/include/boost/numeric/ublas/detail/matrix_assign.hpp
@@ -30,7 +30,7 @@ namespace detail {
     template<class E1, class E2, class S>
     BOOST_UBLAS_INLINE
     bool equals (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, S epsilon, S min_norm) {
-        return norm_inf (e1 - e2) < epsilon *
+        return norm_inf (e1 - e2) <= epsilon *
                std::max<S> (std::max<S> (norm_inf (e1), norm_inf (e2)), min_norm);
     }
 
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
new file mode 100644
index 000000000..e0ec2b181
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -0,0 +1,35 @@
+//
+//  Copyright (c) 2016
+//  Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_DETAIL_VECTOR_
+#define _BOOST_UBLAS_DETAIL_VECTOR_
+#include <boost/predef/hardware/simd.h>
+
+#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD)
+#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 32
+#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 16
+#else
+#define _BOOST_UBLAS_VECTOR_SIZE 8
+#endif
+#endif
+
+#ifndef _BOOST_UBLAS_VECTOR_SIZE
+#define _BOOST_UBLAS_VECTOR_SIZE 1
+#endif
+
+#if defined(BOOST_COMP_INTEL_DETECTION)                              \
+    || (defined(BOOST_COMP_GNUC_DETECTION)                           \
+        && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)) \
+    || (defined(BOOST_COMP_CLANG_DETECTION)                          \
+        && BOOST_COMP_CLANG_DETECTION >= BOOST_VERSION_NUMBER(3,5,0))
+#define BOOST_UBLAS_VECTOR_KERNEL = _BOOST_UBLAS_VECTOR_SIZE
+#endif
+
+#endif
diff --git a/include/boost/numeric/ublas/functional.hpp b/include/boost/numeric/ublas/functional.hpp
index b0be20608..f54b441fe 100644
--- a/include/boost/numeric/ublas/functional.hpp
+++ b/include/boost/numeric/ublas/functional.hpp
@@ -433,10 +433,10 @@ namespace boost { namespace numeric { namespace ublas {
         template<class E>
         static BOOST_UBLAS_INLINE
         result_type apply (const vector_expression<E> &e) {
-#ifndef BOOST_UBLAS_SCALED_NORM
-            real_type t = real_type ();
             typedef typename E::size_type vector_size_type;
             vector_size_type size (e ().size ());
+#ifndef BOOST_UBLAS_SCALED_NORM
+            real_type t = real_type ();
             for (vector_size_type i = 0; i < size; ++ i) {
                 real_type u (type_traits<value_type>::norm_2 (e () (i)));
                 t +=  u * u;
@@ -445,8 +445,7 @@ namespace boost { namespace numeric { namespace ublas {
 #else
             real_type scale = real_type ();
             real_type sum_squares (1);
-            size_type size (e ().size ());
-            for (size_type i = 0; i < size; ++ i) {
+            for (vector_size_type i = 0; i < size; ++ i) {
                 real_type u (type_traits<value_type>::norm_2 (e () (i)));
                 if ( real_type () /* zero */ == u ) continue;
                 if (scale < u) {
@@ -522,6 +521,52 @@ namespace boost { namespace numeric { namespace ublas {
 #endif
         }
     };
+
+    template<class V>
+    struct vector_norm_2_square :
+        public vector_scalar_real_unary_functor<V> {
+        typedef typename vector_scalar_real_unary_functor<V>::value_type value_type;
+        typedef typename vector_scalar_real_unary_functor<V>::real_type real_type;
+        typedef typename vector_scalar_real_unary_functor<V>::result_type result_type;
+
+        template<class E>
+        static BOOST_UBLAS_INLINE
+        result_type apply (const vector_expression<E> &e) {
+            real_type t = real_type ();
+            typedef typename E::size_type vector_size_type;
+            vector_size_type size (e ().size ());
+            for (vector_size_type i = 0; i < size; ++ i) {
+                real_type u (type_traits<value_type>::norm_2 (e () (i)));
+                t +=  u * u;
+            }
+            return t;
+        }
+        // Dense case
+        template<class D, class I>
+        static BOOST_UBLAS_INLINE
+        result_type apply (D size, I it) {
+            real_type t = real_type ();
+            while (-- size >= 0) {
+                real_type u (type_traits<value_type>::norm_2 (*it));
+                t +=  u * u;
+                ++ it;
+            }
+            return t;
+        }
+        // Sparse case
+        template<class I>
+        static BOOST_UBLAS_INLINE
+        result_type apply (I it, const I &it_end) {
+            real_type t = real_type ();
+            while (it != it_end) {
+                real_type u (type_traits<value_type>::norm_2 (*it));
+                t +=  u * u;
+                ++ it;
+            }
+            return t;
+        }
+    };
+
     template<class V>
     struct vector_norm_inf:
         public vector_scalar_real_unary_functor<V> {
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 8e6c2f68b..36bbf130b 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -14,6 +14,8 @@
 #define _BOOST_UBLAS_MATRIX_EXPRESSION_
 
 #include <boost/numeric/ublas/vector_expression.hpp>
+#include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 
 // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
 // Iterators based on ideas of Jeremy Siek
@@ -5460,45 +5462,113 @@ namespace boost { namespace numeric { namespace ublas {
         expression2_closure_type e2_;
     };
 
+    namespace detail {
+        template<class E1, class E2, class P, bool s>
+        struct binary_calculate_result_type;
+
+        template<class E1, class E2, class P>
+        struct binary_calculate_result_type<E1, E2, P, false> {
+            typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type;
+        };
+
+        template<class E1, class E2, class P>
+        struct binary_calculate_result_type<E1, E2, P, true> {
+            typedef matrix<P> result_type;
+        };
+    }
+
     template<class T1, class E1, class T2, class E2>
     struct matrix_matrix_binary_traits {
-        typedef unknown_storage_tag storage_category;
+      //        typedef unknown_storage_tag storage_category;
+      typedef typename storage_restrict_traits<typename E1::storage_category, typename E2::storage_category>::storage_category storage_category;
         typedef unknown_orientation_tag orientation_category;
         typedef typename promote_traits<T1, T2>::promote_type promote_type;
         typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, promote_type> > expression_type;
 #ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG
-        typedef expression_type result_type;
+      //        typedef expression_type result_type;
+      typedef typename detail::binary_calculate_result_type<E1, E2, promote_type, boost::is_base_of<dense_proxy_tag, storage_category>::value>::result_type result_type;
 #else
         typedef typename E1::matrix_temporary_type result_type;
 #endif
     };
 
-    template<class E1, class E2>
+    template<class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
-                                         typename E2::value_type, E2>::result_type
+                                         typename E2::value_type, E2>::expression_type
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2,
+         B,
           unknown_storage_tag,
           unknown_orientation_tag) {
+
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::expression_type expression_type;
         return expression_type (e1 (), e2 ());
     }
 
-    // Dispatcher
-    template<class E1, class E2>
+    template<class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                          typename E2::value_type, E2>::result_type
     prod (const matrix_expression<E1> &e1,
-          const matrix_expression<E2> &e2) {
+          const matrix_expression<E2> &e2,
+         B b,
+          dense_proxy_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::result_type result_type;
+       typedef typename result_type::value_type result_value;
+
+       if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+           return expression_type (e1 (), e2 ());
+       } else {
+            result_type rv(e1 ().size1(), e2 ().size2());
+           detail::gemm(result_value(1), e1, e2,
+                        result_value(0), rv, b);
+           return rv;
+       }
+    }
+
+    // Dispatcher
+    template<class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    typename enable_if_c<detail::is_blocksize<B>::value,
+                        typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                             typename E2::value_type, E2>::result_type>::type
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+         B b) {
         BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::storage_category storage_category;
         typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
                                                      typename E2::value_type, E2>::orientation_category orientation_category;
-        return prod (e1, e2, storage_category (), orientation_category ());
+        return prod (e1, e2, b, storage_category (), orientation_category ());
+    }
+
+    template<class E1, class E2>
+    BOOST_UBLAS_INLINE
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
+    typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                         typename E2::value_type, E2>::result_type
+#else
+    typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                         typename E2::value_type, E2>::expression_type
+#endif
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2) {
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+       typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                     typename E2::value_type>::type> block_sizes;
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
+        return prod (e1, e2, block_sizes());
+#else
+        return prod (e1, e2, block_sizes(), unknown_storage_tag(),
+                     unknown_orientation_tag());
+#endif
     }
 
     template<class E1, class E2>
@@ -5529,13 +5599,79 @@ namespace boost { namespace numeric { namespace ublas {
         return prec_prod (e1, e2, storage_category (), orientation_category ());
     }
 
-    template<class M, class E1, class E2>
+    template<class M, class E1, class E2, typename B>
+    BOOST_UBLAS_INLINE
+    void
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b,
+         unknown_storage_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+
+        m.assign(expression_type (e1 (), e2 ()));
+    }
+
+    template<class M, class E1, class E2, typename B>
+    BOOST_UBLAS_INLINE
+    void
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b,
+         dense_proxy_tag,
+          unknown_orientation_tag) {
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::expression_type expression_type;
+        typedef typename M::value_type result_value;
+
+        if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+            m.assign(expression_type (e1 (), e2 ()));
+        } else {
+           detail::gemm(result_value(1), e1, e2,
+                        result_value(0), m, b);
+        }
+    }
+
+    // dispatcher
+      template<class M, class E1, class E2, class B>
     BOOST_UBLAS_INLINE
     M &
+    prod (const matrix_expression<E1> &e1,
+          const matrix_expression<E2> &e2,
+          M &m, B b) {
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::storage_category storage_category;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::orientation_category orientation_category;
+
+        prod (e1, e2, m, b, storage_category(), orientation_category());
+       return m;
+    }
+    template<class M, class E1, class E2>
+    BOOST_UBLAS_INLINE
+    typename enable_if_c<!detail::is_blocksize<M>::value,
+                        M&>::type
     prod (const matrix_expression<E1> &e1,
           const matrix_expression<E2> &e2,
           M &m) {
-        return m.assign (prod (e1, e2));
+        BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0);
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::storage_category storage_category;
+        typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+                                                     typename E2::value_type, E2>::orientation_category orientation_category;
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename M::value_type>::type> block_sizes;
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
+        prod (e1, e2, m, block_sizes(), storage_category(),
+              orientation_category());
+#else
+        prod (e1, e2, m, block_sizes(), unknown_storage_tag(),
+              unknown_orientation_tag());
+#endif
+        return m;
     }
 
     template<class M, class E1, class E2>
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 64657ccf3..176669cf8 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -14,7 +14,8 @@
 #define _BOOST_UBLAS_OPERATION_
 
 #include <boost/numeric/ublas/matrix_proxy.hpp>
-
+#include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 /** \file operation.hpp
  *  \brief This file contains some specialized products.
  */
@@ -637,7 +638,63 @@ namespace boost { namespace numeric { namespace ublas {
         return m;
     }
 
-    // Dispatcher
+    template<class M, class E1, class E2, class S, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              S, bool init, B)
+    {
+        typedef typename M::value_type value_type;
+       typedef S storage_category;
+       typedef typename M::orientation_category orientation_category;
+
+
+       if (init)
+         m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+       return axpy_prod (e1, e2, m, full (), storage_category (),
+                         orientation_category ());
+    }
+
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              dense_proxy_tag, bool init, B)
+    {
+        typedef typename M::value_type value_type;
+       typedef B block_sizes;
+        if (m.size1() < B::limit || m.size2() < B::limit) {
+            typedef typename M::storage_category storage_category;
+            typedef typename M::orientation_category orientation_category;
+
+            if (init)
+                m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+            return axpy_prod (e1, e2, m, full (), storage_category (),
+                             orientation_category ());
+        } else {
+            detail::gemm(value_type(1), e1, e2,
+                        value_type(init? 0 : 1), m, block_sizes());
+            return m;
+        }
+    }
+
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M&
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, full,
+              dense_tag, bool init, B b)
+    {
+       return axpy_prod (e1, e2, m, full (), dense_proxy_tag(),
+                         init, b);
+    }
+
+    // Dispatchers
     template<class M, class E1, class E2, class TRI>
     BOOST_UBLAS_INLINE
     M &
@@ -651,11 +708,16 @@ namespace boost { namespace numeric { namespace ublas {
 
         if (init)
             m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
-        return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ());
+
+        return axpy_prod(e1, e2, m, triangular_restriction (),
+                        storage_category (), orientation_category(),
+                        init);
     }
+
     template<class M, class E1, class E2, class TRI>
     BOOST_UBLAS_INLINE
-    M
+    typename enable_if_c<!detail::is_blocksize<TRI>::value,
+                               M>::type
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2,
                TRI) {
@@ -690,20 +752,59 @@ namespace boost { namespace numeric { namespace ublas {
           \param E1 type of a matrix expression \c A
           \param E2 type of a matrix expression \c X
   */
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    M &
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2,
+               M &m, bool init, B) {
+        typedef typename M::storage_category storage_category;
+       typedef B block_sizes;
+
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                        init, block_sizes());
+    }
+
     template<class M, class E1, class E2>
     BOOST_UBLAS_INLINE
     M &
     axpy_prod (const matrix_expression<E1> &e1,
                const matrix_expression<E2> &e2,
                M &m, bool init = true) {
-        typedef typename M::value_type value_type;
         typedef typename M::storage_category storage_category;
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename M::value_type>::type> block_sizes;
+#ifndef BOOST_UBLAS_LEGACY_PRODUCT
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                        init, block_sizes());
+#else
+        typedef typename M::value_type value_type;
         typedef typename M::orientation_category orientation_category;
 
         if (init)
             m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
-        return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ());
+
+        return axpy_prod(e1, e2, m, full (), storage_category (),
+                         orientation_category());
+#endif
+    }
+
+    template<class M, class E1, class E2, class B>
+    BOOST_UBLAS_INLINE
+    typename boost::enable_if_c<detail::is_blocksize<B>::value,
+                               M>::type
+    axpy_prod (const matrix_expression<E1> &e1,
+               const matrix_expression<E2> &e2, B) {
+        typedef M matrix_type;
+        typedef typename M::storage_category storage_category;
+       typedef B block_sizes;
+
+        matrix_type m (e1 ().size1 (), e2 ().size2 ());
+        return axpy_prod(e1, e2, m, full (), storage_category(), true,
+                        block_sizes());
     }
+
     template<class M, class E1, class E2>
     BOOST_UBLAS_INLINE
     M
@@ -712,7 +813,7 @@ namespace boost { namespace numeric { namespace ublas {
         typedef M matrix_type;
 
         matrix_type m (e1 ().size1 (), e2 ().size2 ());
-        return axpy_prod (e1, e2, m, full (), true);
+        return axpy_prod(e1, e2, m);
     }
 
 
@@ -825,6 +926,48 @@ namespace boost { namespace numeric { namespace ublas {
         return opb_prod (e1, e2, m, true);
     }
 
+  /** \brief computes <tt>C = alpha * A * B + beta * C</tt> in an
+          optimized fashion.
+
+         \param alpha scalar multiplier
+          \param e1 the matrix expression \c A
+          \param e2 the matrix expression \c B
+         \param beta scalar multiplier
+          \param e3  the result matrix \c C
+
+         <tt>gemm</tt> implements the well known gemm operation
+
+          \ingroup blas3
+
+          \internal
+
+          template parameters:
+          \param E1 type of a matrix expression \c A
+          \param E2 type of a matrix expression \c B
+          \param E3 type of a matrix expression \c C
+  */
+    template <typename E1, typename E2, typename E3>
+    BOOST_UBLAS_INLINE
+    void
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+        typename E3::value_type beta, matrix_expression<E3> &e3) {
+        typedef typename detail::prod_block_size<typename common_type<typename E1::value_type,
+                                                                      typename E2::value_type,
+                                                                      typename E3::value_type>::type> block_sizes;
+       detail::gemm(alpha, e1, e2, beta, e3, block_sizes());
+    }
+
+    template <typename E1, typename E2, typename E3, typename BlockSize>
+    BOOST_UBLAS_INLINE
+    void
+    gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+         const matrix_expression<E2> &e2,
+        typename E3::value_type beta, matrix_expression<E3> &e3,
+        BlockSize b) {
+       detail::gemm(alpha, e1, e2, beta, e3, b);
+    }
+
 }}}
 
 #endif
diff --git a/include/boost/numeric/ublas/vector_expression.hpp b/include/boost/numeric/ublas/vector_expression.hpp
index d92207c98..6f952a464 100644
--- a/include/boost/numeric/ublas/vector_expression.hpp
+++ b/include/boost/numeric/ublas/vector_expression.hpp
@@ -1609,6 +1609,16 @@ namespace boost { namespace numeric { namespace ublas {
         return expression_type (e ());
     }
 
+    // real: norm_2_square v = sum(v [i] * v [i])
+    // complex: norm_2_square v = sum(v [i] * conj (v [i]))
+    template<class E>
+    BOOST_UBLAS_INLINE
+    typename vector_scalar_unary_traits<E, vector_norm_2_square<E> >::result_type
+    norm_2_square (const vector_expression<E> &e) {
+        typedef typename vector_scalar_unary_traits<E, vector_norm_2_square<E> >::expression_type expression_type;
+        return expression_type (e ());
+    }
+
     // real: norm_inf v = maximum (abs (v [i]))
     // complex: norm_inf v = maximum (maximum (abs (real (v [i])), abs (imag (v [i]))))
     template<class E>
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index 77e296c91..f990b76b9 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -189,6 +189,10 @@ test-suite numeric/uBLAS
       ]
       [ run test_complex_norms.cpp
       ]
+      [ run test_scaled_norm.cpp
+       : : :
+           <define>BOOST_UBLAS_SCALED_NORM
+      ]
       [ run test_assignment.cpp
        : : :
            <define>BOOST_UBLAS_COO_ALWAYS_DO_FULL_SORT
diff --git a/test/test_complex_norms.cpp b/test/test_complex_norms.cpp
index d58a788ee..0c37ed45b 100644
--- a/test/test_complex_norms.cpp
+++ b/test/test_complex_norms.cpp
@@ -42,6 +42,21 @@ BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2 ) {
     BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL);
 }
 
+BOOST_UBLAS_TEST_DEF ( test_double_complex_norm_2_square ) {
+    typedef std::complex<double> dComplex;
+    vector<dComplex> v(4);
+    for (unsigned int i = 0; i < v.size(); ++i)
+        v[i] = dComplex(i, i + 1);
+
+    const double expected = 44;
+
+    BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL);
+    v *= 3.;
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL);
+}
+
+
 BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_inf ) {
     typedef std::complex<float> dComplex;
     vector<dComplex> v(4);
@@ -74,6 +89,22 @@ BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2 ) {
     BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - (3.0*expected)) < TOL);
 }
 
+BOOST_UBLAS_TEST_DEF ( test_float_complex_norm_2_square ) {
+    typedef std::complex<float> dComplex;
+    vector<dComplex> v(4);
+    for (unsigned short i = 0; i < v.size(); ++i) {
+        unsigned short imag(i + 1);
+        v[i] = dComplex(i, imag);
+    }
+
+    const double expected = 44;
+
+    BOOST_UBLAS_DEBUG_TRACE( "square norm is " << norm_2_square(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - expected) < TOL);
+    v *= 3.f;
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2_square(v) - (9.0*expected)) < TOL);
+}
+
 int main() {
     BOOST_UBLAS_TEST_BEGIN();
 
@@ -81,6 +112,8 @@ int main() {
     BOOST_UBLAS_TEST_DO( test_float_complex_norm_inf );
     BOOST_UBLAS_TEST_DO( test_double_complex_norm_2 );
     BOOST_UBLAS_TEST_DO( test_float_complex_norm_2 );
+    BOOST_UBLAS_TEST_DO( test_double_complex_norm_2_square );
+    BOOST_UBLAS_TEST_DO( test_float_complex_norm_2_square );
 
     BOOST_UBLAS_TEST_END();
 }
diff --git a/test/test_scaled_norm.cpp b/test/test_scaled_norm.cpp
new file mode 100644
index 000000000..74065048f
--- /dev/null
+++ b/test/test_scaled_norm.cpp
@@ -0,0 +1,41 @@
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/numeric/ublas/vector.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+#include "utils.hpp"
+
+using namespace boost::numeric::ublas;
+
+static const double TOL(1.0e-5); ///< Used for comparing two real numbers.
+
+BOOST_UBLAS_TEST_DEF ( test_double_scaled_norm_2 ) {
+    vector<double> v(2);
+    v[0] = 0; v[1] = 1.0e155;
+
+    const double expected = 1.0e155;
+
+    BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL);
+}
+
+BOOST_UBLAS_TEST_DEF ( test_float_scaled_norm_2 ) {
+    vector<float> v(2);
+    v[0] = 0; v[1] = 1.0e20;
+
+    const float expected = 1.0e20;
+
+    BOOST_UBLAS_DEBUG_TRACE( "norm is " << norm_2(v) );
+    BOOST_UBLAS_TEST_CHECK(std::abs(norm_2(v) - expected) < TOL);
+}
+
+int main() {
+    BOOST_UBLAS_TEST_BEGIN();
+
+    BOOST_UBLAS_TEST_DO( test_double_scaled_norm_2 );
+    BOOST_UBLAS_TEST_DO( test_float_scaled_norm_2 );
+
+    BOOST_UBLAS_TEST_END();
+}