diff --git a/build.sh b/build.sh index ae7aa09bd..0fd50ebcf 100755 --- a/build.sh +++ b/build.sh @@ -42,6 +42,7 @@ HAVE_EXPAT=0 HAVE_GIT2=1 HAVE_LSTREAM=1 HAVE_CPP20=0 +HAVE_OPENMP=0 RUBYINCLUDE="" RUBYINCLUDE2="" @@ -103,6 +104,12 @@ while [ "$*" != "" ]; do -without-qtbinding) HAVE_QTBINDINGS=0 ;; + -with-openmp) + HAVE_OPENMP=1 + ;; + -without-openmp) + HAVE_OPENMP=0 + ;; -without-qt-uitools) HAVE_QT_UITOOLS=0 ;; @@ -255,6 +262,8 @@ while [ "$*" != "" ]; do echo " -with-qtbinding Create Qt bindings for ruby scripts [default]" echo " -without-qtbinding Don't create Qt bindings for ruby scripts" echo " -without-qt-uitools Don't include uitools in Qt binding" + echo " -with-openmp Enable OpenMP parallelization for hierarchical processing" + echo " -without-openmp Don't use OpenMP parallelization [default]" echo " -with-64bit-coord Use long (64bit) coordinates - EXPERIMENTAL FEATURE" echo " (only available for gcc>=4.4 for 64bit build)" echo " -without-64bit-coord Don't use long (64bit) coordinates [default]" @@ -601,6 +610,7 @@ echo " HAVE_PNG=$HAVE_PNG" echo " HAVE_EXPAT=$HAVE_EXPAT" echo " HAVE_GIT2=$HAVE_GIT2" echo " HAVE_LSTREAM=$HAVE_LSTREAM" +echo " HAVE_OPENMP=$HAVE_OPENMP" echo " RPATH=$RPATH" mkdir -p $BUILD @@ -676,6 +686,7 @@ qmake_options=( HAVE_GIT2="$HAVE_GIT2" HAVE_LSTREAM="$HAVE_LSTREAM" HAVE_CPP20="$HAVE_CPP20" + HAVE_OPENMP="$HAVE_OPENMP" PREFIX="$BIN" RPATH="$RPATH" KLAYOUT_VERSION="$KLAYOUT_VERSION" diff --git a/src/db/db/dbCompoundOperation.cc b/src/db/db/dbCompoundOperation.cc index dc9953582..fd94ded4e 100644 --- a/src/db/db/dbCompoundOperation.cc +++ b/src/db/db/dbCompoundOperation.cc @@ -766,46 +766,52 @@ CompoundRegionGeometricalBoolOperationNode::implement_bool (CompoundRegionOperat one_a.push_back (std::unordered_set ()); shape_interactions computed_a; - child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc); - if (one_a.front ().empty ()) { - - if (m_op == GeometricalOp::And || m_op == GeometricalOp::Not) { - - // .. no results .. 
+ std::vector > one_b; + one_b.push_back (std::unordered_set ()); - } else { + shape_interactions computed_b; - std::vector > one_b; - one_b.push_back (std::unordered_set ()); + bool can_parallel = (m_op != GeometricalOp::And && m_op != GeometricalOp::Not); /* operations other than And/Not always need both inputs, so A and B may be computed concurrently; And/Not can skip B when A is empty */ - shape_interactions computed_b; +#if defined(_OPENMP) + if (can_parallel && proc->threads() > 0) { + #pragma omp task shared(one_a, computed_a, cache, layout, cell, interactions, proc) + { + child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc); + } + #pragma omp task shared(one_b, computed_b, cache, layout, cell, interactions, proc) + { + child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc); + } + #pragma omp taskwait + } else +#endif + { + child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc); + /* B is needed whenever A is non-empty, and for operations other than And/Not also when A is empty (the result is then a copy of B) */ + if (can_parallel || !one_a.front().empty()) { child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc); + } else { + /* And or Not with an empty A: nothing to do, results remain empty */ + return; + } + } + if (one_a.front ().empty ()) { + if (!can_parallel) { + // .. no results .. 
+ } else { copy_results (results, one_b); - } - } else { - - std::vector > one_b; - one_b.push_back (std::unordered_set ()); - - shape_interactions computed_b; - child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc); - if (one_b.front ().empty ()) { - if (m_op != GeometricalOp::And) { copy_results (results, one_a); } - } else { - run_bool (m_op, layout, one_a.front (), one_b.front (), results.front ()); - } - } } @@ -934,30 +940,54 @@ void compound_region_generic_operation_node::implement_compute_local shape_interactions self_interactions_heap; const shape_interactions &self_interactions = interactions_for_child (interactions, 0, self_interactions_heap); - self->compute_local (cache, layout, cell, self_interactions, self_result, proc); - - db::generic_shape_iterator is (self_result.front ().begin (), self_result.front ().end ()); - std::vector > iiv; std::vector > intruder_results; - intruder_results.reserve (children () - 1); // important, so that the memory layout will not change while we generate them + intruder_results.resize (children () - 1); // allocate memory upfront - for (unsigned int ci = 1; ci < children (); ++ci) { + #if defined(_OPENMP) + if (proc->threads() > 0) { /* the self child and every intruder child are evaluated as separate OpenMP tasks */ + #pragma omp task shared(self_result, self_interactions_heap, cache, layout, cell, interactions, proc) + { + self->compute_local (cache, layout, cell, self_interactions, self_result, proc); /* NOTE(review): self_interactions is a reference that is not listed in the shared() clause - confirm how the task's data-sharing rules treat it */ + } + for (unsigned int ci = 1; ci < children (); ++ci) { + #pragma omp task shared(intruder_results, cache, layout, cell, interactions, proc) firstprivate(ci) + { + const CompoundRegionOperationNode *intruder = child (ci); + std::vector > intruder_result; + intruder_result.push_back (std::unordered_set ()); + + shape_interactions intruder_interactions_heap; + const shape_interactions &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap); + + intruder->compute_local (cache, layout, cell, 
intruder_interactions, intruder_result, proc); + intruder_results[ci - 1] = std::move(intruder_result.front()); /* child ci stores its result in slot ci - 1 of the pre-sized vector */ + } + } + #pragma omp taskwait + } else + #endif + { + self->compute_local (cache, layout, cell, self_interactions, self_result, proc); - const CompoundRegionOperationNode *intruder = child (ci); - std::vector > intruder_result; - intruder_result.push_back (std::unordered_set ()); + for (unsigned int ci = 1; ci < children (); ++ci) { - shape_interactions intruder_interactions_heap; - const shape_interactions &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap); + const CompoundRegionOperationNode *intruder = child (ci); + std::vector > intruder_result; + intruder_result.push_back (std::unordered_set ()); - intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc); + shape_interactions intruder_interactions_heap; + const shape_interactions &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap); - intruder_results.push_back (std::unordered_set ()); - intruder_results.back ().swap (intruder_result.front ()); + intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc); + intruder_results[ci - 1] = std::move(intruder_result.front()); + } + } - iiv.push_back (db::generic_shape_iterator (intruder_results.back ().begin (), intruder_results.back ().end ())); + db::generic_shape_iterator is (self_result.front ().begin (), self_result.front ().end ()); /* the iterators are created only after all child results have been computed */ + for (unsigned int ci = 1; ci < children (); ++ci) { + iiv.push_back (db::generic_shape_iterator (intruder_results[ci - 1].begin (), intruder_results[ci - 1].end ())); } db::local_processor lproc (layout); diff --git a/src/db/db/dbCompoundOperation.h b/src/db/db/dbCompoundOperation.h index cfbd5d1b7..fdac1351e 100644 --- a/src/db/db/dbCompoundOperation.h +++ b/src/db/db/dbCompoundOperation.h @@ -57,12 +57,15 @@ class CompoundRegionOperationNode; * This cache is
important to avoid duplicate evaluation of the same node in * a diamond-graph structure of nodes. */ +#include "tlThreads.h" /* NOTE(review): this include sits between the class's doc comment and the class - consider moving it to the file's include block */ + class DB_PUBLIC CompoundRegionOperationCache { public: template std::pair > *> get (const CompoundRegionOperationNode *node) { + tl::MutexLocker lock (&m_mutex); /* NOTE(review): the lock is released when get() returns while the returned cache pointer stays in use by the caller - confirm this is sufficient for concurrent callers */ bool valid = false; std::vector > *cache = 0; get_cache (cache, valid, node); @@ -70,6 +73,7 @@ class DB_PUBLIC CompoundRegionOperationCache } private: + tl::Mutex m_mutex; /* locked by get() for the duration of the cache lookup */ std::map > > m_cache_polyref_wp; std::map > > m_cache_poly_wp; std::map > > m_cache_edge_wp; diff --git a/src/db/db/dbHierNetworkProcessor.cc b/src/db/db/dbHierNetworkProcessor.cc index 0ef16ef41..c20e9c9d1 100644 --- a/src/db/db/dbHierNetworkProcessor.cc +++ b/src/db/db/dbHierNetworkProcessor.cc @@ -2945,10 +2945,23 @@ template void hier_clusters::build_hier_connections_for_cells (cell_clusters_box_converter &cbc, const db::Layout &layout, const std::vector &cells, const db::Connectivity &conn, const std::set *breakout_cells, tl::RelativeProgress &progress, instance_interaction_cache_type &instance_interaction_cache, bool separate_attributes) { +#if defined(_OPENMP) + #pragma omp parallel for schedule(dynamic) + for (long long i = 0; i < (long long)cells.size (); ++i) { + db::cell_index_type c = cells[i]; + build_hier_connections (cbc, layout, layout.cell (c), conn, breakout_cells, instance_interaction_cache, separate_attributes); /* NOTE(review): runs concurrently for the cells of this batch without consulting any thread-count setting - confirm build_hier_connections is safe to call from several threads */ + + #pragma omp critical + { + ++progress; /* progress is shared between the threads, hence the critical section */ + } + } +#else for (std::vector::const_iterator c = cells.begin (); c != cells.end (); ++c) { build_hier_connections (cbc, layout, layout.cell (*c), conn, breakout_cells, instance_interaction_cache, separate_attributes); ++progress; } +#endif } namespace { diff --git a/src/db/db/dbHierNetworkProcessor.h b/src/db/db/dbHierNetworkProcessor.h index e24605270..2176d1e07 100644 --- a/src/db/db/dbHierNetworkProcessor.h +++ b/src/db/db/dbHierNetworkProcessor.h @@ -1110,6 +1110,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache size_t size () 
const { + tl::MutexLocker lock (&m_mutex); MemStatisticsSimple ms; ms << m_map; return ms.used (); @@ -1127,6 +1128,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache const Value *find (db::cell_index_type ci1, db::cell_index_type ci2, const Key &key) const { + tl::MutexLocker lock (&m_mutex); /* NOTE(review): find() returns a pointer while insert() can pop_back() list entries once this lock is released - confirm a returned pointer cannot be invalidated by a concurrent insert */ typename std::map , std::list > >::iterator i1 = m_map.find (std::make_pair (ci1, ci2)); if (i1 == m_map.end ()) { ++m_misses; @@ -1156,6 +1158,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache { const size_t instance_cache_variant_threshold = 20; + tl::MutexLocker lock (&m_mutex); std::list > &m = m_map [std::make_pair (ci1, ci2)]; if (m.size () >= instance_cache_variant_threshold) { m.pop_back (); @@ -1166,6 +1169,7 @@ class DB_PUBLIC_TEMPLATE instance_interaction_cache } private: + mutable tl::Mutex m_mutex; /* mutable because the const members size() and find() lock it as well */ mutable size_t m_hits, m_misses; mutable std::map , std::list > > m_map; }; diff --git a/src/db/db/dbHierProcessor.cc b/src/db/db/dbHierProcessor.cc index 26cc7608e..df6b76381 100644 --- a/src/db/db/dbHierProcessor.cc +++ b/src/db/db/dbHierProcessor.cc @@ -890,18 +890,41 @@ void local_processor::compute_contexts (local_processor_contexts base_verbosity () + 10, tl::to_string (tr ("Computing contexts for ")) + description (op)); +#if defined(_OPENMP) + /* OpenMP build: the context computation is dispatched through OpenMP tasks + * (see the "omp task" in issue_compute_contexts), so no tl::Job is created + * for any thread count. A null mp_cc_job also disables the + * "if (mp_cc_job.get ())" block that follows the context computation. */ + mp_cc_job.reset (0); +#else if (threads () > 0) { mp_cc_job.reset (new tl::Job > (threads ())); } else { mp_cc_job.reset (0); } +#endif contexts.clear (); contexts.set_intruder_layers (intruder_layers); contexts.set_subject_layer (subject_layer); typename local_processor_cell_contexts::context_key_type intruders; +#if defined(_OPENMP) + if (threads() > 0) { + int nthreads = threads(); + #pragma omp parallel num_threads(nthreads) shared(contexts, intruders) + { + #pragma omp single + { + issue_compute_contexts 
(contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ()); + } + } + } else { + issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ()); + } +#else issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ()); +#endif if (mp_cc_job.get ()) { mp_cc_job->start (); @@ -926,11 +949,24 @@ void local_processor::issue_compute_contexts (local_processor_contex { bool is_small_job = subject_cell->begin ().at_end (); +#if defined(_OPENMP) + if (! is_small_job && threads() > 0) { /* jobs not flagged as small are deferred as OpenMP tasks, all others are computed inline */ + typename local_processor_cell_contexts::context_key_type my_intruders; + my_intruders.swap (intruders); /* takes the intruder set out of the caller's variable; the task then works on its firstprivate copy of my_intruders */ + #pragma omp task shared(contexts) firstprivate(parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist) + { + compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist); + } + } else { + compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist); + } +#else if (! is_small_job && mp_cc_job.get ()) { mp_cc_job->schedule (new local_processor_context_computation_task (this, contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist)); } else { compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist); } +#endif } template @@ -1164,8 +1200,6 @@ local_processor::compute_results (local_processor_contexts 0) { - std::unique_ptr > > rc_job (new tl::Job > (threads ())); - // schedule computation jobs in "waves": we need to make sure they are executed // bottom-up. So we identify a new bunch of cells each time we pass through the cell set // and proceed until all cells are removed. 
@@ -1188,6 +1222,8 @@ local_processor::compute_results (local_processor_contexts next_cells_bu; next_cells_bu.reserve (cells_bu.size ()); + std::vector*> tasks; /* result computation tasks collected for the current wave; they are executed after the collection loop */ + for (std::vector::const_iterator bu = cells_bu.begin (); bu != cells_bu.end (); ++bu) { tl::MutexLocker locker (& contexts.lock ()); @@ -1197,7 +1233,7 @@ local_processor::compute_results (local_processor_contextsschedule (new local_processor_result_computation_task (this, contexts, cpc->first, &cpc->second, op, output_layers)); + tasks.push_back(new local_processor_result_computation_task (this, contexts, cpc->first, &cpc->second, op, output_layers)); any = true; } else { @@ -1218,20 +1254,37 @@ local_processor::compute_results (local_processor_contextsstart (); +#if defined(_OPENMP) + int nthreads = threads(); + #pragma omp parallel for num_threads(nthreads) schedule(dynamic) + for (long long i = 0; i < (long long)tasks.size(); ++i) { + tasks[i]->perform(); /* NOTE(review): an exception thrown by perform() must be caught inside the parallel region (OpenMP rule); the catch (...) after the loop cannot handle it - confirm perform() does not throw */ + } +#else + std::unique_ptr > > rc_job (new tl::Job > (threads ())); + for (size_t i = 0; i < tasks.size(); ++i) { + rc_job->schedule(tasks[i]); /* presumably hands ownership of the task over to the job - TODO confirm */ + } + rc_job->start(); while (! rc_job->wait (10)) { progress.set (get_progress ()); } - +#endif } catch (...) 
{ - rc_job->terminate (); +#if defined(_OPENMP) + /* Only the OpenMP path still owns the task objects at this point. In the tl::Job path they were handed over through rc_job->schedule() and must not be deleted a second time (NOTE(review): assumes tl::Job takes ownership of scheduled tasks, as the original code never deleted them - confirm). */ + for (size_t i = 0; i < tasks.size(); ++i) { delete tasks[i]; } +#endif throw; } - +#if defined(_OPENMP) + for (size_t i = 0; i < tasks.size(); ++i) { + delete tasks[i]; + } + progress.set(get_progress()); +#endif } } diff --git a/src/drc/drc/built-in-macros/_drc_engine.rb b/src/drc/drc/built-in-macros/_drc_engine.rb index d8d207390..8827d059c 100644 --- a/src/drc/drc/built-in-macros/_drc_engine.rb +++ b/src/drc/drc/built-in-macros/_drc_engine.rb @@ -1238,11 +1238,11 @@ def flat # %DRC% # @name threads - # @brief Specifies the number of CPU cores to use in tiling mode + # @brief Specifies the number of CPU cores to use in tiling and hierarchical mode # @synopsis threads(n) # @synopsis threads - # If using threads, tiles are distributed on multiple CPU cores for - # parallelization. Still, all tiles must be processed before the + # If using threads, tiles or hierarchical cells are distributed on multiple CPU cores for + # parallelization. Still, all tiles or cells must be processed before the # operation proceeds with the next statement. # # Without an argument, "threads" will return the current number of diff --git a/src/klayout.pri b/src/klayout.pri index c21ae29d8..5abc73988 100644 --- a/src/klayout.pri +++ b/src/klayout.pri @@ -216,6 +216,20 @@ msvc { } } + equals(HAVE_OPENMP, "1") { + msvc { + QMAKE_CXXFLAGS += /openmp + QMAKE_LFLAGS += /openmp + } else:macx { + QMAKE_CXXFLAGS += -Xpreprocessor -fopenmp + LIBS += -lomp + } else { + QMAKE_CXXFLAGS += -fopenmp + QMAKE_LFLAGS += -fopenmp + } + # _OPENMP is predefined by the compiler once OpenMP is enabled - it must not be defined here + } + win32 { QMAKE_LFLAGS += -Wl,--exclude-all-symbols