diff --git a/build.sh b/build.sh
index ae7aa09bd..0fd50ebcf 100755
--- a/build.sh
+++ b/build.sh
@@ -42,6 +42,7 @@ HAVE_EXPAT=0
 HAVE_GIT2=1
 HAVE_LSTREAM=1
 HAVE_CPP20=0
+HAVE_OPENMP=0
 
 RUBYINCLUDE=""
 RUBYINCLUDE2=""
@@ -103,6 +104,9 @@ while [ "$*" != "" ]; do
   -without-qtbinding)
     HAVE_QTBINDINGS=0
     ;;
+  -with-openmp)
+    HAVE_OPENMP=1
+    ;;
   -without-qt-uitools)
     HAVE_QT_UITOOLS=0
     ;;
@@ -255,6 +259,7 @@ while [ "$*" != "" ]; do
     echo "  -with-qtbinding       Create Qt bindings for ruby scripts [default]"
     echo "  -without-qtbinding    Don't create Qt bindings for ruby scripts"
     echo "  -without-qt-uitools   Don't include uitools in Qt binding"
+    echo "  -with-openmp          Enable OpenMP parallelization for hierarchical processing"
     echo "  -with-64bit-coord     Use long (64bit) coordinates - EXPERIMENTAL FEATURE"
     echo "                          (only available for gcc>=4.4 for 64bit build)"
     echo "  -without-64bit-coord  Don't use long (64bit) coordinates [default]"
@@ -601,6 +606,7 @@ echo "      HAVE_PNG=$HAVE_PNG"
 echo "      HAVE_EXPAT=$HAVE_EXPAT"
 echo "      HAVE_GIT2=$HAVE_GIT2"
 echo "      HAVE_LSTREAM=$HAVE_LSTREAM"
+echo "      HAVE_OPENMP=$HAVE_OPENMP"
 echo "      RPATH=$RPATH"
 
 mkdir -p $BUILD
@@ -676,6 +682,7 @@ qmake_options=(
   HAVE_GIT2="$HAVE_GIT2"
   HAVE_LSTREAM="$HAVE_LSTREAM"
   HAVE_CPP20="$HAVE_CPP20"
+  HAVE_OPENMP="$HAVE_OPENMP"
   PREFIX="$BIN"
   RPATH="$RPATH"
   KLAYOUT_VERSION="$KLAYOUT_VERSION"
diff --git a/src/db/db/dbCompoundOperation.cc b/src/db/db/dbCompoundOperation.cc
index dc9953582..fd94ded4e 100644
--- a/src/db/db/dbCompoundOperation.cc
+++ b/src/db/db/dbCompoundOperation.cc
@@ -766,46 +766,52 @@ CompoundRegionGeometricalBoolOperationNode::implement_bool (CompoundRegionOperat
   one_a.push_back (std::unordered_set<T1> ());
 
   shape_interactions<T, T> computed_a;
-  child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
 
-  if (one_a.front ().empty ()) {
-
-    if (m_op == GeometricalOp::And || m_op == GeometricalOp::Not) {
-
-      //  .. no results ..
+  std::vector<std::unordered_set<T2> > one_b;
+  one_b.push_back (std::unordered_set<T2> ());
 
-    } else {
+  shape_interactions<T, T> computed_b;
 
-      std::vector<std::unordered_set<T2> > one_b;
-      one_b.push_back (std::unordered_set<T2> ());
+  bool can_parallel = (m_op != GeometricalOp::And && m_op != GeometricalOp::Not);
 
-      shape_interactions<T, T> computed_b;
+#if defined(_OPENMP)
+  if (can_parallel && proc->threads() > 0) {
+    #pragma omp task shared(one_a, computed_a, cache, layout, cell, interactions, proc)
+    {
+      child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
+    }
+    #pragma omp task shared(one_b, computed_b, cache, layout, cell, interactions, proc)
+    {
+      child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
+    }
+    #pragma omp taskwait
+  } else
+#endif
+  {
+    child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
+    //  Or/Xor always need B: with an empty A, B alone is the result (copied below).
+    //  And/Not can skip B when A is empty because their result is empty in that case.
+    if (can_parallel || ! one_a.front ().empty ()) {
       child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
+    } else {
+      return; // And or Not with an empty A: nothing to do, results remain empty
+    }
+  }
 
+  if (one_a.front ().empty ()) {
+    if (!can_parallel) {
+      //  .. no results ..
+    } else {
       copy_results (results, one_b);
-
     }
-
   } else {
-
-    std::vector<std::unordered_set<T2> > one_b;
-    one_b.push_back (std::unordered_set<T2> ());
-
-    shape_interactions<T, T> computed_b;
-    child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
-
     if (one_b.front ().empty ()) {
-
       if (m_op != GeometricalOp::And) {
         copy_results (results, one_a);
       }
-
     } else {
-
       run_bool (m_op, layout, one_a.front (), one_b.front (), results.front ());
-
     }
-
   }
 }
 
@@ -934,30 +940,54 @@ void compound_region_generic_operation_node<TS, TI, TR>::implement_compute_local
   shape_interactions<TTS, TTI> self_interactions_heap;
   const shape_interactions<TTS, TTI> &self_interactions = interactions_for_child (interactions, 0, self_interactions_heap);
 
-  self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
-
-  db::generic_shape_iterator <TS> is (self_result.front ().begin (), self_result.front ().end ());
-
   std::vector<db::generic_shape_iterator<TI> > iiv;
   std::vector<std::unordered_set<TI> > intruder_results;
-  intruder_results.reserve (children () - 1);  //  important, so that the memory layout will not change while we generate them
+  intruder_results.resize (children () - 1);  //  allocate memory upfront
 
-  for (unsigned int ci = 1; ci < children (); ++ci) {
+  #if defined(_OPENMP)
+  if (proc->threads() > 0) {
+    #pragma omp task shared(self_result, self_interactions_heap, cache, layout, cell, interactions, proc)
+    {
+      self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
+    }
+    for (unsigned int ci = 1; ci < children (); ++ci) {
+      #pragma omp task shared(intruder_results, cache, layout, cell, interactions, proc) firstprivate(ci)
+      {
+        const CompoundRegionOperationNode *intruder = child (ci);
+        std::vector<std::unordered_set<TI> > intruder_result;
+        intruder_result.push_back (std::unordered_set<TI> ());
+
+        shape_interactions<TTS, TTI> intruder_interactions_heap;
+        const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
+
+        intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
+        intruder_results[ci - 1] = std::move(intruder_result.front());
+      }
+    }
+    #pragma omp taskwait
+  } else
+  #endif
+  {
+    self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
 
-    const CompoundRegionOperationNode *intruder = child (ci);
-    std::vector<std::unordered_set<TI> > intruder_result;
-    intruder_result.push_back (std::unordered_set<TI> ());
+    for (unsigned int ci = 1; ci < children (); ++ci) {
 
-    shape_interactions<TTS, TTI> intruder_interactions_heap;
-    const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
+      const CompoundRegionOperationNode *intruder = child (ci);
+      std::vector<std::unordered_set<TI> > intruder_result;
+      intruder_result.push_back (std::unordered_set<TI> ());
 
-    intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
+      shape_interactions<TTS, TTI> intruder_interactions_heap;
+      const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
 
-    intruder_results.push_back (std::unordered_set<TI> ());
-    intruder_results.back ().swap (intruder_result.front ());
+      intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
+      intruder_results[ci - 1] = std::move(intruder_result.front());
+    }
+  }
 
-    iiv.push_back (db::generic_shape_iterator<TI> (intruder_results.back ().begin (), intruder_results.back ().end ()));
+  db::generic_shape_iterator <TS> is (self_result.front ().begin (), self_result.front ().end ());
 
+  for (unsigned int ci = 1; ci < children (); ++ci) {
+    iiv.push_back (db::generic_shape_iterator <TI> (intruder_results[ci - 1].begin (), intruder_results[ci - 1].end ()));
   }
 
   db::local_processor <TS, TI, TR> lproc (layout);
diff --git a/src/db/db/dbCompoundOperation.h b/src/db/db/dbCompoundOperation.h
index cfbd5d1b7..fdac1351e 100644
--- a/src/db/db/dbCompoundOperation.h
+++ b/src/db/db/dbCompoundOperation.h
@@ -57,12 +57,15 @@ class CompoundRegionOperationNode;
  *  This cache is important to avoid duplicate evaluation of the same node in
  *  a diamond-graph structure of nodes.
  */
+#include "tlThreads.h"
+
 class DB_PUBLIC CompoundRegionOperationCache
 {
 public:
   template <class TR>
   std::pair<bool, std::vector<std::unordered_set<TR> > *> get (const CompoundRegionOperationNode *node)
   {
+    tl::MutexLocker lock (&m_mutex);
     bool valid = false;
     std::vector<std::unordered_set<TR> > *cache = 0;
     get_cache (cache, valid, node);
@@ -70,6 +73,7 @@ class DB_PUBLIC CompoundRegionOperationCache
   }
 
 private:
+  tl::Mutex m_mutex;
   std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonRefWithProperties> > > m_cache_polyref_wp;
   std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonWithProperties> > > m_cache_poly_wp;
   std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::EdgeWithProperties> > > m_cache_edge_wp;
diff --git a/src/db/db/dbHierNetworkProcessor.cc b/src/db/db/dbHierNetworkProcessor.cc
index 0ef16ef41..c20e9c9d1 100644
--- a/src/db/db/dbHierNetworkProcessor.cc
+++ b/src/db/db/dbHierNetworkProcessor.cc
@@ -2945,10 +2945,23 @@ template <class T>
 void
 hier_clusters<T>::build_hier_connections_for_cells (cell_clusters_box_converter<T> &cbc, const db::Layout &layout, const std::vector<db::cell_index_type> &cells, const db::Connectivity &conn, const std::set<db::cell_index_type> *breakout_cells, tl::RelativeProgress &progress, instance_interaction_cache_type &instance_interaction_cache, bool separate_attributes)
 {
+#if defined(_OPENMP)
+  //  NOTE(review): assumes build_hier_connections may run concurrently for the cells of
+  //  one batch and that "++progress" does not throw inside the region - TODO confirm.
+  #pragma omp parallel for schedule(dynamic)
+  for (long long i = 0; i < static_cast<long long> (cells.size ()); ++i) {
+    db::cell_index_type c = cells[i];
+    build_hier_connections (cbc, layout, layout.cell (c), conn, breakout_cells, instance_interaction_cache, separate_attributes);
+    //  named critical section: does not serialize against unrelated unnamed ones
+    #pragma omp critical (hier_clusters_progress)
+    ++progress;
+  }
+#else
   for (std::vector<db::cell_index_type>::const_iterator c = cells.begin (); c != cells.end (); ++c) {
     build_hier_connections (cbc, layout, layout.cell (*c), conn, breakout_cells, instance_interaction_cache, separate_attributes);
     ++progress;
   }
+#endif
 }
 
 namespace {
diff --git a/src/db/db/dbHierNetworkProcessor.h b/src/db/db/dbHierNetworkProcessor.h
index e24605270..2176d1e07 100644
--- a/src/db/db/dbHierNetworkProcessor.h
+++ b/src/db/db/dbHierNetworkProcessor.h
@@ -1110,6 +1110,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache
 
   size_t size () const
   {
+    tl::MutexLocker lock (&m_mutex);
     MemStatisticsSimple ms;
     ms << m_map;
     return ms.used ();
@@ -1127,6 +1128,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache
 
   const Value *find (db::cell_index_type ci1, db::cell_index_type ci2, const Key &key) const
   {
+    tl::MutexLocker lock (&m_mutex);
     typename std::map <std::pair<db::cell_index_type, db::cell_index_type>, std::list <std::pair<Key, Value> > >::iterator i1 = m_map.find (std::make_pair (ci1, ci2));
     if (i1 == m_map.end ()) {
       ++m_misses;
@@ -1156,6 +1158,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache
   {
     const size_t instance_cache_variant_threshold = 20;
 
+    tl::MutexLocker lock (&m_mutex);
     std::list <std::pair<Key, Value> > &m = m_map [std::make_pair (ci1, ci2)];
     if (m.size () >= instance_cache_variant_threshold) {
       m.pop_back ();
@@ -1166,6 +1169,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache
   }
 
 private:
+  mutable tl::Mutex m_mutex;
   mutable size_t m_hits, m_misses;
   mutable std::map <std::pair<db::cell_index_type, db::cell_index_type>, std::list <std::pair<Key, Value> > > m_map;
 };
diff --git a/src/db/db/dbHierProcessor.cc b/src/db/db/dbHierProcessor.cc
index 26cc7608e..df6b76381 100644
--- a/src/db/db/dbHierProcessor.cc
+++ b/src/db/db/dbHierProcessor.cc
@@ -890,18 +890,41 @@ void local_processor<TS, TI, TR>::compute_contexts (local_processor_contexts<TS,
 
     tl::SelfTimer timer (tl::verbosity () > base_verbosity () + 10, tl::to_string (tr ("Computing contexts for ")) + description (op));
 
+#if defined(_OPENMP)
+    //  With OpenMP, context computation is distributed through OpenMP tasks
+    //  (see issue_compute_contexts), so no tl::Job worker pool is set up -
+    //  regardless of the thread count. The null job pointer also disables
+    //  the "mp_cc_job->start ()" path further down.
+    mp_cc_job.reset (0);
+#else
     if (threads () > 0) {
       mp_cc_job.reset (new tl::Job<local_processor_context_computation_worker<TS, TI, TR> > (threads ()));
     } else {
       mp_cc_job.reset (0);
     }
+#endif
 
     contexts.clear ();
     contexts.set_intruder_layers (intruder_layers);
     contexts.set_subject_layer (subject_layer);
 
     typename local_processor_cell_contexts<TS, TI, TR>::context_key_type intruders;
+#if defined(_OPENMP)
+    if (threads() > 0) {
+      int nthreads = threads();
+      #pragma omp parallel num_threads(nthreads) shared(contexts, intruders)
+      {
+        #pragma omp single
+        {
+          issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
+        }
+      }
+    } else {
+      issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
+    }
+#else
     issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
+#endif
 
     if (mp_cc_job.get ()) {
       mp_cc_job->start ();
@@ -926,11 +949,24 @@ void local_processor<TS, TI, TR>::issue_compute_contexts (local_processor_contex
 {
   bool is_small_job = subject_cell->begin ().at_end ();
 
+#if defined(_OPENMP)
+  if (! is_small_job && threads() > 0) {
+    typename local_processor_cell_contexts<TS, TI, TR>::context_key_type my_intruders;
+    my_intruders.swap (intruders);
+    #pragma omp task shared(contexts) firstprivate(parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist)
+    {
+      compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist);
+    }
+  } else {
+    compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
+  }
+#else
   if (! is_small_job && mp_cc_job.get ()) {
     mp_cc_job->schedule (new local_processor_context_computation_task<TS, TI, TR> (this, contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist));
   } else {
     compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
   }
+#endif
 }
 
 template <class TS, class TI, class TR>
@@ -1164,8 +1200,6 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
 
   if (threads () > 0) {
 
-    std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
-
     //  schedule computation jobs in "waves": we need to make sure they are executed
     //  bottom-up. So we identify a new bunch of cells each time we pass through the cell set
     //  and proceed until all cells are removed.
@@ -1188,6 +1222,8 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
       std::vector<db::cell_index_type> next_cells_bu;
       next_cells_bu.reserve (cells_bu.size ());
 
+      std::vector<local_processor_result_computation_task<TS, TI, TR>*> tasks;
+
       for (std::vector<db::cell_index_type>::const_iterator bu = cells_bu.begin (); bu != cells_bu.end (); ++bu) {
 
         tl::MutexLocker locker (& contexts.lock ());
@@ -1197,7 +1233,7 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
 
           if (later.find (*bu) == later.end ()) {
 
-            rc_job->schedule (new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
+            tasks.push_back(new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
             any = true;
 
           } else {
@@ -1218,20 +1254,37 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
         break;
       }
 
-      if (rc_job.get ()) {
-
+      if (!tasks.empty()) {
         try {
-
-          rc_job->start ();
+#if defined(_OPENMP)
+          int nthreads = threads();
+          #pragma omp parallel for num_threads(nthreads) schedule(dynamic)
+          for (long long i = 0; i < (long long)tasks.size(); ++i) {
+            tasks[i]->perform();
+          }
+#else
+          std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
+          for (size_t i = 0; i < tasks.size(); ++i) {
+            rc_job->schedule(tasks[i]);
+          }
+          rc_job->start();
           while (! rc_job->wait (10)) {
             progress.set (get_progress ());
           }
-
+#endif
         } catch (...) {
-          rc_job->terminate ();
+#if defined(_OPENMP)
+          //  OpenMP mode only: the tasks are still owned here. In tl::Job mode they were handed to rc_job->schedule () and deleting them again would be a double delete.
+          for (size_t i = 0; i < tasks.size(); ++i) { delete tasks[i]; }
+#endif
           throw;
         }
-
+#if defined(_OPENMP)
+        for (size_t i = 0; i < tasks.size(); ++i) {
+          delete tasks[i];
+        }
+        progress.set(get_progress());
+#endif
       }
 
     }
diff --git a/src/drc/drc/built-in-macros/_drc_engine.rb b/src/drc/drc/built-in-macros/_drc_engine.rb
index d8d207390..8827d059c 100644
--- a/src/drc/drc/built-in-macros/_drc_engine.rb
+++ b/src/drc/drc/built-in-macros/_drc_engine.rb
@@ -1238,11 +1238,11 @@ def flat
     
     # %DRC%
     # @name threads
-    # @brief Specifies the number of CPU cores to use in tiling mode
+    # @brief Specifies the number of CPU cores to use in tiling and hierarchical mode
     # @synopsis threads(n)
     # @synopsis threads
-    # If using threads, tiles are distributed on multiple CPU cores for
-    # parallelization. Still, all tiles must be processed before the 
+    # If using threads, tiles or hierarchical cells are distributed on multiple CPU cores for
+    # parallelization. Still, all tiles or cells must be processed before the 
     # operation proceeds with the next statement. 
     #
     # Without an argument, "threads" will return the current number of 
diff --git a/src/klayout.pri b/src/klayout.pri
index c21ae29d8..5abc73988 100644
--- a/src/klayout.pri
+++ b/src/klayout.pri
@@ -216,6 +216,20 @@ msvc {
     }
   }
 
+  equals(HAVE_OPENMP, "1") {
+    msvc {
+      QMAKE_CXXFLAGS += /openmp
+      QMAKE_LFLAGS += /openmp
+    } else:macx {
+      QMAKE_CXXFLAGS += -Xpreprocessor -fopenmp
+      LIBS += -lomp
+    } else {
+      QMAKE_CXXFLAGS += -fopenmp
+      QMAKE_LFLAGS += -fopenmp
+    }
+    # _OPENMP is predefined by the compiler when OpenMP is enabled - it must not be set through DEFINES
+  }
+
   win32 {
 
     QMAKE_LFLAGS += -Wl,--exclude-all-symbols