From f493936fcd1a3f06608af41df4546ee8e7632103 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 8 Oct 2019 15:21:51 -0700 Subject: [PATCH 01/40] Initial copy-and-paste implementation from MPI. --- build_all.sh | 7 ++- gasnet/.gitignore | 1 + gasnet/Makefile | 31 ++++++++++ gasnet/seq.cc | 143 ++++++++++++++++++++++++++++++++++++++++++++++ get_deps.sh | 3 +- 5 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 gasnet/.gitignore create mode 100644 gasnet/Makefile create mode 100644 gasnet/seq.cc diff --git a/build_all.sh b/build_all.sh index 7fa23b99..c3004a59 100755 --- a/build_all.sh +++ b/build_all.sh @@ -47,10 +47,15 @@ if [[ $USE_MPI_OPENMP -eq 1 ]]; then ) fi -if [[ $USE_GASNET -eq 1 ]]; then +if [[ $USE_GASNET -eq 1 || $TASKBENCH_USE_GASNET -eq 1 ]]; then make -C "$GASNET_DIR" fi +if [[ $TASKBENCH_USE_GASNET -eq 1 ]]; then + make -C gasnet clean + make -C gasnet -j$THREADS +fi + if [[ $TASKBENCH_USE_HWLOC -eq 1 ]]; then pushd "$HWLOC_SRC_DIR" if [[ ! -d build ]]; then diff --git a/gasnet/.gitignore b/gasnet/.gitignore new file mode 100644 index 00000000..9403f506 --- /dev/null +++ b/gasnet/.gitignore @@ -0,0 +1 @@ +/seq diff --git a/gasnet/Makefile b/gasnet/Makefile new file mode 100644 index 00000000..ff7d614f --- /dev/null +++ b/gasnet/Makefile @@ -0,0 +1,31 @@ +MPICXX ?= mpicxx + +DEBUG ?= 0 + +CXXFLAGS ?= +CXXFLAGS += -std=c++11 -I../core +CXXFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-par --cflags) + +LDFLAGS ?= +LDFLAGS += -L../core -lcore_s +LDFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-par --libs) + +ifeq ($(strip $(DEBUG)),0) + CXXFLAGS += -O3 +else + CXXFLAGS += -O0 -ggdb +endif + +include ../core/make_blas.mk + +BIN := seq + +.PHONY: all +all: $(BIN) + +$(BIN): %:%.cc + $(MPICXX) -o $@ $(CXXFLAGS) $< $(LDFLAGS) + +.PHONY: clean +clean: + rm -f *.o $(BIN) diff --git a/gasnet/seq.cc b/gasnet/seq.cc new file mode 100644 index 
00000000..ea72a749 --- /dev/null +++ b/gasnet/seq.cc @@ -0,0 +1,143 @@ +/* Copyright 2019 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "core.h" +#include "timer.h" + +#include "gasnetex.h" + +int main(int argc, char *argv[]) +{ + gex_Client_t client; + gex_EP_t ep; + gex_TM_t tm; + gex_Client_Init(&client, &ep, &tm, "main", &argc, &argv, 0); + + gex_Rank_t rank = gex_TM_QueryRank(tm); + gex_Rank_t n_ranks = gex_TM_QuerySize(tm); + + App app(argc, argv); + if (rank == 0) app.display(); + + std::vector > scratch; + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + size_t scratch_bytes = graph.scratch_bytes_per_task; + scratch.emplace_back(scratch_bytes * n_points); + TaskGraph::prepare_scratch(scratch.back().data(), scratch.back().size()); + } + + double elapsed_time = 0.0; + for (int iter = 0; iter < 2; ++iter) { + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + double start_time = Timer::get_cur_time(); + + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + size_t scratch_bytes = graph.scratch_bytes_per_task; + char *scratch_ptr = scratch[graph.graph_index].data(); + + std::vector 
rank_by_point(graph.max_width); + std::vector tag_bits_by_point(graph.max_width); + for (gex_Rank_t r = 0; r < n_ranks; ++r) { + long r_first_point = r * graph.max_width / n_ranks; + long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; + for (long p = r_first_point; p <= r_last_point; ++p) { + rank_by_point[p] = r; + tag_bits_by_point[p] = p - r_first_point; + // Has to fit in 7 bits because MPI only guarrantees that + // tags can use 15 bits. + assert((tag_bits_by_point[p] & ~0x7F) == 0); + } + } + + long max_deps = 0; + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + for (long point = first_point; point <= last_point; ++point) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + max_deps = std::max(max_deps, deps); + } + } + + // Create input and output buffers. + std::vector > > inputs(n_points); + std::vector > input_ptr(n_points); + std::vector > input_bytes(n_points); + std::vector n_inputs(n_points); + std::vector > outputs(n_points); + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + auto &point_inputs = inputs[point_index]; + auto &point_input_ptr = input_ptr[point_index]; + auto &point_input_bytes = input_bytes[point_index]; + + point_inputs.resize(max_deps); + point_input_ptr.resize(max_deps); + point_input_bytes.resize(max_deps); + + for (long dep = 0; dep < max_deps; ++dep) { + point_inputs[dep].resize(graph.output_bytes_per_task); + point_input_ptr[dep] = point_inputs[dep].data(); + point_input_bytes[dep] = point_inputs[dep].size(); + } + + auto &point_outputs = outputs[point_index]; + point_outputs.resize(graph.output_bytes_per_task); + } + + // Cache dependencies. 
+ std::vector > > > dependencies(graph.max_dependence_sets()); + std::vector > > > reverse_dependencies(graph.max_dependence_sets()); + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + dependencies[dset].resize(n_points); + reverse_dependencies[dset].resize(n_points); + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + dependencies[dset][point_index] = graph.dependencies(dset, point); + reverse_dependencies[dset][point_index] = graph.reverse_dependencies(dset, point); + } + } + + for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + } + } + + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + double stop_time = Timer::get_cur_time(); + elapsed_time = stop_time - start_time; + } + + if (rank == 0) { + app.report_timing(elapsed_time); + } +} diff --git a/get_deps.sh b/get_deps.sh index 1018eeb3..735fbcd7 100755 --- a/get_deps.sh +++ b/get_deps.sh @@ -53,6 +53,7 @@ cat >>deps/env.sh < Date: Sun, 13 Oct 2019 22:20:03 -0700 Subject: [PATCH 02/40] Runs correctly with RAW dependencies. 
--- gasnet/seq.cc | 324 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 286 insertions(+), 38 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index ea72a749..3b6c2c02 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -22,58 +22,187 @@ #include "gasnetex.h" +#define CHECK_OK(x) assert((x) == GASNET_OK); + +class AutoLock { +public: + AutoLock(gex_HSL_t &lock_) : lock(&lock_) { + gex_HSL_Lock(lock); + } + ~AutoLock() { + gex_HSL_Unlock(lock); + } + +private: + gex_HSL_t *lock; +}; + +struct RankState { + gex_Rank_t rank; + gex_Rank_t n_ranks; + std::vector graphs; + std::vector > > > inputs; + std::vector > > input_ready; + std::vector > > input_ptr; + std::vector > > input_bytes; + std::vector > > outputs; + std::vector > output_ready; + std::vector > > scratch; + std::vector > > > > dependencies; + std::vector > > > > reverse_dependencies; +}; + +RankState state; +gex_HSL_t state_lock; + +static bool check_and_run(long graph_index, long timestep, long point) { + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long dset = graph.dependence_set_at_timestep(timestep); + + auto &point_inputs = state.inputs[graph_index][point_index]; + auto &point_input_ready = state.input_ready[graph_index][point_index]; + auto &point_input_ptr = state.input_ptr[graph_index][point_index]; + auto &point_input_bytes = state.input_bytes[graph_index][point_index]; + auto &point_output = state.outputs[graph_index][point_index]; + auto &point_output_ready = state.output_ready[graph_index][point_index]; + auto &point_scratch = state.scratch[graph_index][point_index]; + auto &point_deps = 
state.dependencies[graph_index][dset][point_index]; + + long n_inputs = 0; + for (auto interval : point_deps) { + long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); + long last_dep = std::min(interval.second + 1, last_offset + last_width); + assert(first_dep <= last_dep); + n_inputs += last_dep - first_dep; + } + + bool ready = point_output_ready; + for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { + ready = ready && point_input_ready[input_idx]; + } + printf("check_and_run graph %ld timestep %ld point %ld ready %d\n", graph_index, timestep, point, ready); + if (ready) { + graph.execute_point(timestep, point, + point_output.data(), point_output.size(), + point_input_ptr.data(), point_input_bytes.data(), n_inputs, + point_scratch.data(), point_scratch.size()); + + point_input_ready.assign(point_input_ready.size(), 0); + point_output_ready = 0; + + return true; + } + + return false; +} + +static void recv_handler(gex_Token_t token, void *buffer, size_t size, + gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t source_point, gex_AM_Arg_t dest_point) +{ + printf("recv_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); + fflush(stdout); + + AutoLock guard(state_lock); + + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point = dest_point; + long point_index = point - first_point; + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long dset = graph.dependence_set_at_timestep(timestep); + + auto &point_inputs = state.inputs[graph_index][point_index]; + auto &point_input_ready = state.input_ready[graph_index][point_index]; + auto &point_deps = state.dependencies[graph_index][dset][point_index]; + + long input_idx 
= 0; + for (auto interval : point_deps) { + long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); + long last_dep = std::min(interval.second + 1, last_offset + last_width); + assert(first_dep <= last_dep); + input_idx += std::min(last_dep, (long)source_point) - std::min(first_dep, (long)source_point); + } + + point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); + point_input_ready[input_idx]++; + + check_and_run(graph_index, timestep, dest_point); +} + +const int N_HANDLERS = 1; + +gex_AM_Entry_t handlers[N_HANDLERS] = { + gex_AM_Entry_t { + .gex_index = 0, + .gex_fnptr = (void (*)())recv_handler, + .gex_flags = GEX_FLAG_AM_MEDIUM | GEX_FLAG_AM_REQUEST, + .gex_nargs = 4, + .gex_cdata = NULL, + .gex_name = "recv handler", + }, +}; + int main(int argc, char *argv[]) { gex_Client_t client; gex_EP_t ep; gex_TM_t tm; - gex_Client_Init(&client, &ep, &tm, "main", &argc, &argv, 0); + CHECK_OK(gex_Client_Init(&client, &ep, &tm, "main", &argc, &argv, 0)); gex_Rank_t rank = gex_TM_QueryRank(tm); gex_Rank_t n_ranks = gex_TM_QuerySize(tm); + state.rank = rank; + state.n_ranks = n_ranks; + + uintptr_t max_size = 0; // gasnet_getMaxLocalSegmentSize(); // don't need this with AM Medium + gex_Segment_t segment; + CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); + + CHECK_OK(gex_EP_RegisterHandlers(ep, handlers, N_HANDLERS)); App app(argc, argv); if (rank == 0) app.display(); - std::vector > scratch; - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - long n_points = last_point - first_point + 1; + gex_HSL_Init(&state_lock); - size_t scratch_bytes = graph.scratch_bytes_per_task; - scratch.emplace_back(scratch_bytes * n_points); - TaskGraph::prepare_scratch(scratch.back().data(), scratch.back().size()); - } + state.graphs = app.graphs; + + state.inputs.resize(app.graphs.size()); + 
state.input_ready.resize(app.graphs.size()); + state.input_ptr.resize(app.graphs.size()); + state.input_bytes.resize(app.graphs.size()); + state.outputs.resize(app.graphs.size()); + state.output_ready.resize(app.graphs.size()); + state.scratch.resize(app.graphs.size()); + state.dependencies.resize(app.graphs.size()); + state.reverse_dependencies.resize(app.graphs.size()); double elapsed_time = 0.0; for (int iter = 0; iter < 2; ++iter) { - gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); - - double start_time = Timer::get_cur_time(); - for (auto graph : app.graphs) { long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; long n_points = last_point - first_point + 1; - size_t scratch_bytes = graph.scratch_bytes_per_task; - char *scratch_ptr = scratch[graph.graph_index].data(); - - std::vector rank_by_point(graph.max_width); - std::vector tag_bits_by_point(graph.max_width); - for (gex_Rank_t r = 0; r < n_ranks; ++r) { - long r_first_point = r * graph.max_width / n_ranks; - long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; - for (long p = r_first_point; p <= r_last_point; ++p) { - rank_by_point[p] = r; - tag_bits_by_point[p] = p - r_first_point; - // Has to fit in 7 bits because MPI only guarrantees that - // tags can use 15 bits. - assert((tag_bits_by_point[p] & ~0x7F) == 0); - } - } - long max_deps = 0; for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { for (long point = first_point; point <= last_point; ++point) { @@ -86,35 +215,60 @@ int main(int argc, char *argv[]) } // Create input and output buffers. 
- std::vector > > inputs(n_points); - std::vector > input_ptr(n_points); - std::vector > input_bytes(n_points); - std::vector n_inputs(n_points); - std::vector > outputs(n_points); + auto &inputs = state.inputs[graph.graph_index]; + auto &input_ready = state.input_ready[graph.graph_index]; + auto &input_ptr = state.input_ptr[graph.graph_index]; + auto &input_bytes = state.input_bytes[graph.graph_index]; + auto &outputs = state.outputs[graph.graph_index]; + auto &output_ready = state.output_ready[graph.graph_index]; + auto &scratch = state.scratch[graph.graph_index]; + + inputs.resize(n_points); + input_ready.resize(n_points); + input_ptr.resize(n_points); + input_bytes.resize(n_points); + outputs.resize(n_points); + output_ready.resize(n_points); + scratch.resize(n_points); + for (long point = first_point; point <= last_point; ++point) { long point_index = point - first_point; auto &point_inputs = inputs[point_index]; + auto &point_input_ready = input_ready[point_index]; auto &point_input_ptr = input_ptr[point_index]; auto &point_input_bytes = input_bytes[point_index]; point_inputs.resize(max_deps); + point_input_ready.resize(max_deps); point_input_ptr.resize(max_deps); point_input_bytes.resize(max_deps); for (long dep = 0; dep < max_deps; ++dep) { point_inputs[dep].resize(graph.output_bytes_per_task); + point_input_ready[dep] = 0; point_input_ptr[dep] = point_inputs[dep].data(); point_input_bytes[dep] = point_inputs[dep].size(); } auto &point_outputs = outputs[point_index]; point_outputs.resize(graph.output_bytes_per_task); + + auto &point_output_ready = output_ready[point_index]; + point_output_ready = 0; + + auto &point_scratch = scratch[point_index]; + point_scratch.resize(graph.scratch_bytes_per_task); + TaskGraph::prepare_scratch(point_scratch.data(), point_scratch.size()); } // Cache dependencies. 
- std::vector > > > dependencies(graph.max_dependence_sets()); - std::vector > > > reverse_dependencies(graph.max_dependence_sets()); + auto &dependencies = state.dependencies[graph.graph_index]; + auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; + + dependencies.resize(n_points); + reverse_dependencies.resize(n_points); + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { dependencies[dset].resize(n_points); reverse_dependencies[dset].resize(n_points); @@ -126,8 +280,100 @@ int main(int argc, char *argv[]) reverse_dependencies[dset][point_index] = graph.reverse_dependencies(dset, point); } } + } + + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + double start_time = Timer::get_cur_time(); + + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + auto &outputs = state.outputs[graph.graph_index]; + auto &output_ready = state.output_ready[graph.graph_index]; + + auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; + + std::vector rank_by_point(graph.max_width); + for (gex_Rank_t r = 0; r < n_ranks; ++r) { + long r_first_point = r * graph.max_width / n_ranks; + long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; + for (long p = r_first_point; p <= r_last_point; ++p) { + rank_by_point[p] = r; + } + } for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long dset = graph.dependence_set_at_timestep(timestep); + auto &rev_deps = reverse_dependencies[dset]; + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + auto &point_output = outputs[point_index]; + auto 
&point_rev_deps = rev_deps[point_index]; + + // Send data for RAW dependencies + if (point >= last_offset && point < last_offset + last_width) { + for (auto interval : point_rev_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < offset || dep >= offset + width) { + continue; + } + + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, + point_output.data(), point_output.size(), + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, + (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); + } + } + } + } + + // Wait for local completion so it's safe to override output buffers + gex_NBI_Wait(GEX_EC_ALL, 0); + + { + AutoLock guard(state_lock); + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + auto &point_output_ready = output_ready[point_index]; + point_output_ready = 1; + } + } + + // Spin for inputs until task is complete + { + bool complete = false; + while (!complete) { + { + AutoLock guard(state_lock); + + complete = true; + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + auto &point_output_ready = output_ready[point_index]; + complete = point_output_ready == 0 || check_and_run(graph.graph_index, timestep, point); + } + } + + if (!complete) { + CHECK_OK(gasnet_AMPoll()); + } + } + } } } @@ -140,4 +386,6 @@ int main(int argc, char *argv[]) if (rank == 0) { app.report_timing(elapsed_time); } + + gex_HSL_Destroy(&state_lock); } From 4373d218f03e67c237e23dff63befd47bf6683b5 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 09:59:17 -0700 Subject: [PATCH 03/40] Event loop implementation. 
--- gasnet/seq.cc | 231 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 82 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 3b6c2c02..915c8c60 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -38,45 +38,96 @@ class AutoLock { }; struct RankState { + gex_HSL_t lock; gex_Rank_t rank; gex_Rank_t n_ranks; std::vector graphs; + std::vector > timesteps; std::vector > > > inputs; std::vector > > input_ready; std::vector > > input_ptr; std::vector > > input_bytes; std::vector > > outputs; - std::vector > output_ready; + std::vector > output_empty; std::vector > > scratch; std::vector > > > > dependencies; std::vector > > > > reverse_dependencies; }; RankState state; -gex_HSL_t state_lock; -static bool check_and_run(long graph_index, long timestep, long point) { +static bool is_complete() { + AutoLock guard(state.lock); + + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graphs = state.graphs; + + bool complete = true; + for (auto graph : graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + auto &point_timestep = state.timesteps[graph.graph_index][point_index]; + complete = complete && point_timestep == graph.timesteps; + if (!complete) break; + } + if (!complete) break; + } + return complete; +} + +static long timestep_to_send(long graph_index, long point) { + AutoLock guard(state.lock); + auto rank = state.rank; auto n_ranks = state.n_ranks; auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; long point_index = point - first_point; - long last_offset = graph.offset_at_timestep(timestep-1); - long last_width = graph.width_at_timestep(timestep-1); + auto &point_timestep = 
state.timesteps[graph.graph_index][point_index]; - long dset = graph.dependence_set_at_timestep(timestep); + if (point_timestep > 0 && point_timestep < graph.timesteps) { + return point_timestep; + } + return -1; +} +static bool check_and_run(long graph_index, long point) { + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + auto &point_timestep = state.timesteps[graph_index][point_index]; auto &point_inputs = state.inputs[graph_index][point_index]; auto &point_input_ready = state.input_ready[graph_index][point_index]; auto &point_input_ptr = state.input_ptr[graph_index][point_index]; auto &point_input_bytes = state.input_bytes[graph_index][point_index]; auto &point_output = state.outputs[graph_index][point_index]; - auto &point_output_ready = state.output_ready[graph_index][point_index]; + auto &point_output_empty = state.output_empty[graph_index][point_index]; auto &point_scratch = state.scratch[graph_index][point_index]; + + long timestep = point_timestep; + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long dset = graph.dependence_set_at_timestep(timestep); + auto &point_deps = state.dependencies[graph_index][dset][point_index]; long n_inputs = 0; @@ -87,9 +138,10 @@ static bool check_and_run(long graph_index, long timestep, long point) { n_inputs += last_dep - first_dep; } - bool ready = point_output_ready; + bool ready = point_timestep < graph.timesteps && point_output_empty; for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { ready = ready && point_input_ready[input_idx]; + if (!ready) break; } printf("check_and_run graph %ld timestep %ld point %ld ready %d\n", graph_index, timestep, point, ready); if (ready) { @@ -98,8 +150,9 @@ static bool check_and_run(long 
graph_index, long timestep, long point) { point_input_ptr.data(), point_input_bytes.data(), n_inputs, point_scratch.data(), point_scratch.size()); + ++point_timestep; point_input_ready.assign(point_input_ready.size(), 0); - point_output_ready = 0; + point_output_empty = 0; return true; } @@ -113,7 +166,7 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, printf("recv_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); fflush(stdout); - AutoLock guard(state_lock); + AutoLock guard(state.lock); auto rank = state.rank; auto n_ranks = state.n_ranks; @@ -145,7 +198,7 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready[input_idx]++; - check_and_run(graph_index, timestep, dest_point); + check_and_run(graph_index, dest_point); } const int N_HANDLERS = 1; @@ -182,20 +235,23 @@ int main(int argc, char *argv[]) App app(argc, argv); if (rank == 0) app.display(); - gex_HSL_Init(&state_lock); + gex_HSL_Init(&state.lock); state.graphs = app.graphs; + state.timesteps.resize(app.graphs.size()); state.inputs.resize(app.graphs.size()); state.input_ready.resize(app.graphs.size()); state.input_ptr.resize(app.graphs.size()); state.input_bytes.resize(app.graphs.size()); state.outputs.resize(app.graphs.size()); - state.output_ready.resize(app.graphs.size()); + state.output_empty.resize(app.graphs.size()); state.scratch.resize(app.graphs.size()); state.dependencies.resize(app.graphs.size()); state.reverse_dependencies.resize(app.graphs.size()); + std::vector > graph_rank_by_point(app.graphs.size()); + double elapsed_time = 0.0; for (int iter = 0; iter < 2; ++iter) { for (auto graph : app.graphs) { @@ -214,26 +270,40 @@ int main(int argc, char *argv[]) } } - // Create input and output buffers. + // Initialize data structures. 
+ graph_rank_by_point[graph.graph_index].resize(graph.max_width); + for (gex_Rank_t r = 0; r < n_ranks; ++r) { + long r_first_point = r * graph.max_width / n_ranks; + long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; + for (long p = r_first_point; p <= r_last_point; ++p) { + graph_rank_by_point[graph.graph_index][p] = r; + } + } + + auto &timesteps = state.timesteps[graph.graph_index]; auto &inputs = state.inputs[graph.graph_index]; auto &input_ready = state.input_ready[graph.graph_index]; auto &input_ptr = state.input_ptr[graph.graph_index]; auto &input_bytes = state.input_bytes[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; - auto &output_ready = state.output_ready[graph.graph_index]; + auto &output_empty = state.output_empty[graph.graph_index]; auto &scratch = state.scratch[graph.graph_index]; + timesteps.resize(n_points); inputs.resize(n_points); input_ready.resize(n_points); input_ptr.resize(n_points); input_bytes.resize(n_points); outputs.resize(n_points); - output_ready.resize(n_points); + output_empty.resize(n_points); scratch.resize(n_points); for (long point = first_point; point <= last_point; ++point) { long point_index = point - first_point; + auto &point_timestep = timesteps[point_index]; + point_timestep = 0; + auto &point_inputs = inputs[point_index]; auto &point_input_ready = input_ready[point_index]; auto &point_input_ptr = input_ptr[point_index]; @@ -254,8 +324,8 @@ int main(int argc, char *argv[]) auto &point_outputs = outputs[point_index]; point_outputs.resize(graph.output_bytes_per_task); - auto &point_output_ready = output_ready[point_index]; - point_output_ready = 0; + auto &point_output_empty = output_empty[point_index]; + point_output_empty = 0; auto &point_scratch = scratch[point_index]; point_scratch.resize(graph.scratch_bytes_per_task); @@ -286,95 +356,92 @@ int main(int argc, char *argv[]) double start_time = Timer::get_cur_time(); - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - long n_points = last_point - first_point + 1; + while (!is_complete()) { + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width
/ n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - long n_points = last_point - first_point + 1; + while (!is_complete()) { + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - auto &outputs = state.outputs[graph.graph_index]; - auto &output_ready = state.output_ready[graph.graph_index]; + auto &outputs = state.outputs[graph.graph_index]; - auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; + auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; - std::vector rank_by_point(graph.max_width); - for (gex_Rank_t r = 0; r < n_ranks; ++r) { - long r_first_point = r * graph.max_width / n_ranks; - long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; - for (long p = r_first_point; p <= r_last_point; ++p) { - rank_by_point[p] = r; - } - } - - for (long timestep = 0; timestep < graph.timesteps; ++timestep) { - long offset = graph.offset_at_timestep(timestep); - long width = graph.width_at_timestep(timestep); - - long last_offset = graph.offset_at_timestep(timestep-1); - long last_width = graph.width_at_timestep(timestep-1); - - long dset = graph.dependence_set_at_timestep(timestep); - auto &rev_deps = reverse_dependencies[dset]; + auto &rank_by_point = graph_rank_by_point[graph.graph_index]; for (long point = first_point; point <= last_point; ++point) { long point_index = point - first_point; - auto &point_output = outputs[point_index]; - auto &point_rev_deps = rev_deps[point_index]; - - // Send data for RAW dependencies - if (point >= last_offset && point < last_offset + last_width) { - for (auto interval : point_rev_deps) { - for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < offset || dep >= offset + width) { - continue; + long timestep = timestep_to_send(graph.graph_index, point); + if (timestep > 0) { + long offset = graph.offset_at_timestep(timestep); + long width 
= graph.width_at_timestep(timestep); + + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long dset = graph.dependence_set_at_timestep(timestep); + auto &rev_deps = reverse_dependencies[dset]; + + auto &point_output = outputs[point_index]; + auto &point_rev_deps = rev_deps[point_index]; + + // Send data for RAW dependencies + if (point >= last_offset && point < last_offset + last_width) { + for (auto interval : point_rev_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < offset || dep >= offset + width) { + continue; + } + + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, + point_output.data(), point_output.size(), + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, + (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); } - - CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, - point_output.data(), point_output.size(), - GEX_EVENT_GROUP, 0, - (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, - (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); } } } } + } + + // Wait for local completion so it's safe to override output buffers. + gex_NBI_Wait(GEX_EC_LC, 0); - // Wait for local completion so it's safe to override output buffers - gex_NBI_Wait(GEX_EC_ALL, 0); + // Mark readiness of the output buffers. 
+ { + AutoLock guard(state.lock); - { - AutoLock guard(state_lock); + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + auto &output_empty = state.output_empty[graph.graph_index]; for (long point = first_point; point <= last_point; ++point) { long point_index = point - first_point; - auto &point_output_ready = output_ready[point_index]; - point_output_ready = 1; + auto &point_output_empty = output_empty[point_index]; + point_output_empty = 1; } } + } - // Spin for inputs until task is complete - { - bool complete = false; - while (!complete) { - { - AutoLock guard(state_lock); - - complete = true; - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + // Run any ready tasks. + { + AutoLock guard(state.lock); - auto &point_output_ready = output_ready[point_index]; - complete = point_output_ready == 0 || check_and_run(graph.graph_index, timestep, point); - } - } + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - if (!complete) { - CHECK_OK(gasnet_AMPoll()); - } + for (long point = first_point; point <= last_point; ++point) { + check_and_run(graph.graph_index, point); } } } + + // Poll the network to make sure we're making progress. + CHECK_OK(gasnet_AMPoll()); } gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); @@ -387,5 +454,5 @@ int main(int argc, char *argv[]) app.report_timing(elapsed_time); } - gex_HSL_Destroy(&state_lock); + gex_HSL_Destroy(&state.lock); } From ce62f605c8a253d1370fe28704901049d8950dae Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 11:14:02 -0700 Subject: [PATCH 04/40] Now with fields. 
--- gasnet/seq.cc | 138 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 50 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 915c8c60..efa57816 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include "core.h" #include "timer.h" @@ -41,14 +43,15 @@ struct RankState { gex_HSL_t lock; gex_Rank_t rank; gex_Rank_t n_ranks; + long num_fields; std::vector graphs; std::vector > timesteps; - std::vector > > > inputs; + std::vector > > > > inputs; std::vector > > input_ready; - std::vector > > input_ptr; - std::vector > > input_bytes; - std::vector > > outputs; - std::vector > output_empty; + std::vector > > > input_ptr; + std::vector > > > input_bytes; + std::vector > > > outputs; + std::vector > > output_empty; std::vector > > scratch; std::vector > > > > dependencies; std::vector > > > > reverse_dependencies; @@ -94,9 +97,13 @@ static long timestep_to_send(long graph_index, long point) { long point_index = point - first_point; - auto &point_timestep = state.timesteps[graph.graph_index][point_index]; + auto &point_timestep = state.timesteps[graph_index][point_index]; + + long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; - if (point_timestep > 0 && point_timestep < graph.timesteps) { + auto &point_output_empty = state.output_empty[graph_index][point_index][last_field]; + + if (point_timestep > 0 && point_timestep < graph.timesteps && point_output_empty == 0) { return point_timestep; } return -1; @@ -113,21 +120,24 @@ static bool check_and_run(long graph_index, long point) { long point_index = point - first_point; auto &point_timestep = state.timesteps[graph_index][point_index]; - auto &point_inputs = state.inputs[graph_index][point_index]; - auto &point_input_ready = state.input_ready[graph_index][point_index]; - auto &point_input_ptr = state.input_ptr[graph_index][point_index]; - auto &point_input_bytes = 
state.input_bytes[graph_index][point_index]; - auto &point_output = state.outputs[graph_index][point_index]; - auto &point_output_empty = state.output_empty[graph_index][point_index]; - auto &point_scratch = state.scratch[graph_index][point_index]; long timestep = point_timestep; + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long field = timestep % state.num_fields; + long last_offset = graph.offset_at_timestep(timestep-1); long last_width = graph.width_at_timestep(timestep-1); long dset = graph.dependence_set_at_timestep(timestep); + auto &point_inputs = state.inputs[graph_index][point_index][last_field]; + auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; + auto &point_input_ptr = state.input_ptr[graph_index][point_index][last_field]; + auto &point_input_bytes = state.input_bytes[graph_index][point_index][last_field]; + auto &point_output = state.outputs[graph_index][point_index][field]; + auto &point_output_empty = state.output_empty[graph_index][point_index][field]; + auto &point_scratch = state.scratch[graph_index][point_index]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; long n_inputs = 0; @@ -138,22 +148,19 @@ static bool check_and_run(long graph_index, long point) { n_inputs += last_dep - first_dep; } - bool ready = point_timestep < graph.timesteps && point_output_empty; - for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { - ready = ready && point_input_ready[input_idx]; - if (!ready) break; - } - printf("check_and_run graph %ld timestep %ld point %ld ready %d\n", graph_index, timestep, point, ready); + bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; if (ready) { + printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d\n", graph_index, timestep, point, last_field, ready); graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), 
point_input_bytes.data(), n_inputs, point_scratch.data(), point_scratch.size()); - ++point_timestep; - point_input_ready.assign(point_input_ready.size(), 0); + point_input_ready = 0; point_output_empty = 0; + ++point_timestep; + return true; } @@ -163,9 +170,6 @@ static bool check_and_run(long graph_index, long point) { static void recv_handler(gex_Token_t token, void *buffer, size_t size, gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t source_point, gex_AM_Arg_t dest_point) { - printf("recv_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); - fflush(stdout); - AutoLock guard(state.lock); auto rank = state.rank; @@ -178,13 +182,15 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, long point = dest_point; long point_index = point - first_point; + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long last_offset = graph.offset_at_timestep(timestep-1); long last_width = graph.width_at_timestep(timestep-1); long dset = graph.dependence_set_at_timestep(timestep); - auto &point_inputs = state.inputs[graph_index][point_index]; - auto &point_input_ready = state.input_ready[graph_index][point_index]; + auto &point_inputs = state.inputs[graph_index][point_index][last_field]; + auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; long input_idx = 0; @@ -195,8 +201,10 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, input_idx += std::min(last_dep, (long)source_point) - std::min(first_dep, (long)source_point); } + printf("recv_handler graph %d timestep %d source %d dest %d last_field %ld input_idx %ld input %p\n", graph_index, timestep, source_point, dest_point, input_idx, last_field, point_inputs[input_idx].data()); + point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); - point_input_ready[input_idx]++; + 
point_input_ready++; check_and_run(graph_index, dest_point); } @@ -237,6 +245,8 @@ int main(int argc, char *argv[]) gex_HSL_Init(&state.lock); + state.num_fields = 5; + state.graphs = app.graphs; state.timesteps.resize(app.graphs.size()); @@ -252,6 +262,8 @@ int main(int argc, char *argv[]) std::vector > graph_rank_by_point(app.graphs.size()); + std::vector > sends; + double elapsed_time = 0.0; for (int iter = 0; iter < 2; ++iter) { for (auto graph : app.graphs) { @@ -308,24 +320,40 @@ int main(int argc, char *argv[]) auto &point_input_ready = input_ready[point_index]; auto &point_input_ptr = input_ptr[point_index]; auto &point_input_bytes = input_bytes[point_index]; + auto &point_outputs = outputs[point_index]; + auto &point_output_empty = output_empty[point_index]; - point_inputs.resize(max_deps); - point_input_ready.resize(max_deps); - point_input_ptr.resize(max_deps); - point_input_bytes.resize(max_deps); + point_inputs.resize(state.num_fields); + point_input_ready.resize(state.num_fields); + point_input_ptr.resize(state.num_fields); + point_input_bytes.resize(state.num_fields); + point_outputs.resize(state.num_fields); + point_output_empty.resize(state.num_fields); + + for (long field = 0; field < state.num_fields; ++field) { + auto &field_inputs = point_inputs[field]; + auto &field_input_ptr = point_input_ptr[field]; + auto &field_input_bytes = point_input_bytes[field]; + + field_inputs.resize(max_deps); + field_input_ptr.resize(max_deps); + field_input_bytes.resize(max_deps); + + for (long dep = 0; dep < max_deps; ++dep) { + field_inputs[dep].resize(graph.output_bytes_per_task); + field_input_ptr[dep] = field_inputs[dep].data(); + field_input_bytes[dep] = field_inputs[dep].size(); + } - for (long dep = 0; dep < max_deps; ++dep) { - point_inputs[dep].resize(graph.output_bytes_per_task); - point_input_ready[dep] = 0; - point_input_ptr[dep] = point_inputs[dep].data(); - point_input_bytes[dep] = point_inputs[dep].size(); - } + auto &field_input_ready = 
point_input_ready[field]; + field_input_ready = 0; - auto &point_outputs = outputs[point_index]; - point_outputs.resize(graph.output_bytes_per_task); + auto &field_outputs = point_outputs[field]; + field_outputs.resize(graph.output_bytes_per_task); - auto &point_output_empty = output_empty[point_index]; - point_output_empty = 0; + auto &field_output_empty = point_output_empty[field]; + field_output_empty = 1; + } auto &point_scratch = scratch[point_index]; point_scratch.resize(graph.scratch_bytes_per_task); @@ -378,10 +406,12 @@ int main(int argc, char *argv[]) long last_offset = graph.offset_at_timestep(timestep-1); long last_width = graph.width_at_timestep(timestep-1); + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long dset = graph.dependence_set_at_timestep(timestep); auto &rev_deps = reverse_dependencies[dset]; - auto &point_output = outputs[point_index]; + auto &point_output = outputs[point_index][last_field]; auto &point_rev_deps = rev_deps[point_index]; // Send data for RAW dependencies @@ -397,6 +427,7 @@ int main(int argc, char *argv[]) GEX_EVENT_GROUP, 0, (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); + sends.push_back(std::tuple(graph.graph_index, timestep, point)); } } } @@ -410,21 +441,28 @@ int main(int argc, char *argv[]) // Mark readiness of the output buffers. 
{ AutoLock guard(state.lock); + for (auto &send : sends) { + long graph_index; + long timestep; + long point; + std::tie(graph_index, timestep, point) = send; + + printf("local completion for graph %ld timestep %ld point %ld\n", graph_index, timestep, point); + + auto &graph = state.graphs[graph_index]; - for (auto graph : app.graphs) { long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - auto &output_empty = state.output_empty[graph.graph_index]; + long point_index = point - first_point; - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + long last_field = (timestep + state.num_fields - 1) % state.num_fields; - auto &point_output_empty = output_empty[point_index]; - point_output_empty = 1; - } + auto &point_output_empty = state.output_empty[graph_index][point_index][last_field]; + point_output_empty = 1; } } + sends.clear(); // Run any ready tasks. { From 09d43a0f8c85488f2076d2072cbaf103477e06ea Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 11:32:01 -0700 Subject: [PATCH 05/40] Remove debug printfs and add exit call. 
--- gasnet/seq.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index efa57816..30c9a140 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -150,7 +150,6 @@ static bool check_and_run(long graph_index, long point) { bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; if (ready) { - printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d\n", graph_index, timestep, point, last_field, ready); graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), n_inputs, @@ -201,8 +200,6 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, input_idx += std::min(last_dep, (long)source_point) - std::min(first_dep, (long)source_point); } - printf("recv_handler graph %d timestep %d source %d dest %d last_field %ld input_idx %ld input %p\n", graph_index, timestep, source_point, dest_point, input_idx, last_field, point_inputs[input_idx].data()); - point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready++; @@ -447,8 +444,6 @@ int main(int argc, char *argv[]) long point; std::tie(graph_index, timestep, point) = send; - printf("local completion for graph %ld timestep %ld point %ld\n", graph_index, timestep, point); - auto &graph = state.graphs[graph_index]; long first_point = rank * graph.max_width / n_ranks; @@ -493,4 +488,8 @@ int main(int argc, char *argv[]) } gex_HSL_Destroy(&state.lock); + + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + gasnet_exit(0); } From c5a483ba2e1f8e8f4c0240c2addc81ef936032dd Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 11:45:41 -0700 Subject: [PATCH 06/40] Revert "Remove debug printfs and add exit call." This reverts commit 5f4b2775546d79c52c5d2afb271e73139d0f34ac. 
--- gasnet/seq.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 30c9a140..efa57816 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -150,6 +150,7 @@ static bool check_and_run(long graph_index, long point) { bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; if (ready) { + printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d\n", graph_index, timestep, point, last_field, ready); graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), n_inputs, @@ -200,6 +201,8 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, input_idx += std::min(last_dep, (long)source_point) - std::min(first_dep, (long)source_point); } + printf("recv_handler graph %d timestep %d source %d dest %d last_field %ld input_idx %ld input %p\n", graph_index, timestep, source_point, dest_point, input_idx, last_field, point_inputs[input_idx].data()); + point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready++; @@ -444,6 +447,8 @@ int main(int argc, char *argv[]) long point; std::tie(graph_index, timestep, point) = send; + printf("local completion for graph %ld timestep %ld point %ld\n", graph_index, timestep, point); + auto &graph = state.graphs[graph_index]; long first_point = rank * graph.max_width / n_ranks; @@ -488,8 +493,4 @@ int main(int argc, char *argv[]) } gex_HSL_Destroy(&state.lock); - - gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); - - gasnet_exit(0); } From 55da8e4fab85c31612cc2dfacc749f74e17ae86c Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 11:51:51 -0700 Subject: [PATCH 07/40] Fix for trivial case. 
--- gasnet/seq.cc | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index efa57816..b91c2b0a 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -129,7 +129,11 @@ static bool check_and_run(long graph_index, long point) { long last_offset = graph.offset_at_timestep(timestep-1); long last_width = graph.width_at_timestep(timestep-1); + long next_offset = graph.offset_at_timestep(timestep+1); + long next_width = graph.width_at_timestep(timestep+1); + long dset = graph.dependence_set_at_timestep(timestep); + long next_dset = graph.dependence_set_at_timestep(timestep+1); auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; @@ -139,6 +143,7 @@ static bool check_and_run(long graph_index, long point) { auto &point_output_empty = state.output_empty[graph_index][point_index][field]; auto &point_scratch = state.scratch[graph_index][point_index]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; + auto &point_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; long n_inputs = 0; for (auto interval : point_deps) { @@ -148,16 +153,26 @@ static bool check_and_run(long graph_index, long point) { n_inputs += last_dep - first_dep; } + long n_outputs = 0; + for (auto interval : point_rev_deps) { + long first_dep = std::min(std::max(interval.first, next_offset), next_offset + next_width); + long last_dep = std::min(interval.second + 1, next_offset + next_width); + assert(first_dep <= last_dep); + n_outputs += last_dep - first_dep; + } + bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; + printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d)\n", + graph_index, timestep, point, last_field, ready, + point_timestep < graph.timesteps, point_input_ready == n_inputs, 
point_output_empty); if (ready) { - printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d\n", graph_index, timestep, point, last_field, ready); graph.execute_point(timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), n_inputs, point_scratch.data(), point_scratch.size()); point_input_ready = 0; - point_output_empty = 0; + point_output_empty = n_outputs == 0; ++point_timestep; From 7694803dec00167e8fb4d69d8ce3001898db5a8b Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 14 Oct 2019 11:52:15 -0700 Subject: [PATCH 08/40] GASNet build and test configuration. --- .travis.yml | 1 + get_deps.sh | 7 ++++++- test_all.sh | 11 +++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 98e907f8..2adab9f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,6 +19,7 @@ env: matrix: - TASKBENCH_USE_MPI=1 - USE_MPI_OPENMP=1 + - TASKBENCH_USE_GASNET=1 GASNET_DEBUG=1 CONDUIT=mpi - USE_LEGION=1 # USE_GASNET=1 CONDUIT=mpi - USE_PYGION=1 - USE_REGENT=1 diff --git a/get_deps.sh b/get_deps.sh index 735fbcd7..2f85920b 100755 --- a/get_deps.sh +++ b/get_deps.sh @@ -84,7 +84,12 @@ if [[ $USE_GASNET -eq 1 || $TASKBENCH_USE_GASNET -eq 1 ]]; then export GASNET_DIR="$PWD"/deps/gasnet cat >>deps/env.sh < Date: Tue, 15 Oct 2019 09:53:47 -0700 Subject: [PATCH 09/40] Fix bug in stencil_1d_periodic. 
--- gasnet/seq.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index b91c2b0a..eca128bd 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -165,6 +165,9 @@ static bool check_and_run(long graph_index, long point) { printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d)\n", graph_index, timestep, point, last_field, ready, point_timestep < graph.timesteps, point_input_ready == n_inputs, point_output_empty); + for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { + printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); + } if (ready) { graph.execute_point(timestep, point, point_output.data(), point_output.size(), @@ -208,15 +211,24 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; + printf("recv_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); long input_idx = 0; for (auto interval : point_deps) { long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); long last_dep = std::min(interval.second + 1, last_offset + last_width); assert(first_dep <= last_dep); - input_idx += std::min(last_dep, (long)source_point) - std::min(first_dep, (long)source_point); + if (first_dep <= source_point && source_point <= last_dep) { + first_dep = std::min(first_dep, (long)source_point); + last_dep = std::min(last_dep, (long)source_point); + } + printf(" interval %ld %ld first_dep %ld last_dep %ld\n", interval.first, interval.second, first_dep, last_dep); + input_idx += last_dep - first_dep; + if (first_dep <= source_point && source_point <= last_dep) { + break; + } } - printf("recv_handler graph %d timestep %d source 
%d dest %d last_field %ld input_idx %ld input %p\n", graph_index, timestep, source_point, dest_point, input_idx, last_field, point_inputs[input_idx].data()); + printf(" input_idx %ld input %p\n", input_idx, point_inputs[input_idx].data()); point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready++; From 5efce8f13ca7d096c28c92f909524ad9630e96fc Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 10:25:29 -0700 Subject: [PATCH 10/40] Fixes for DOM. --- gasnet/seq.cc | 64 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index eca128bd..8262a6cc 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -76,7 +76,20 @@ static bool is_complete() { long point_index = point - first_point; auto &point_timestep = state.timesteps[graph.graph_index][point_index]; - complete = complete && point_timestep == graph.timesteps; + + // Copy so we don't modify the global value. 
+ long timestep = point_timestep; + + for (; timestep < graph.timesteps; ++timestep) { + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + if (point >= offset && point < offset + width) + break; + printf("is_complete point %ld timestep %ld offset %ld width %ld\n", point, timestep, offset, width); + } + + complete = complete && timestep == graph.timesteps; if (!complete) break; } if (!complete) break; @@ -121,19 +134,26 @@ static bool check_and_run(long graph_index, long point) { auto &point_timestep = state.timesteps[graph_index][point_index]; - long timestep = point_timestep; + for (; point_timestep < graph.timesteps; ++point_timestep) { + long offset = graph.offset_at_timestep(point_timestep); + long width = graph.width_at_timestep(point_timestep); - long last_field = (timestep + state.num_fields - 1) % state.num_fields; - long field = timestep % state.num_fields; + if (point >= offset && point < offset + width) + break; + } + assert(point_timestep < graph.timesteps); - long last_offset = graph.offset_at_timestep(timestep-1); - long last_width = graph.width_at_timestep(timestep-1); + long last_offset = graph.offset_at_timestep(point_timestep-1); + long last_width = graph.width_at_timestep(point_timestep-1); - long next_offset = graph.offset_at_timestep(timestep+1); - long next_width = graph.width_at_timestep(timestep+1); + long next_offset = graph.offset_at_timestep(point_timestep+1); + long next_width = graph.width_at_timestep(point_timestep+1); - long dset = graph.dependence_set_at_timestep(timestep); - long next_dset = graph.dependence_set_at_timestep(timestep+1); + long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; + long field = point_timestep % state.num_fields; + + long dset = graph.dependence_set_at_timestep(point_timestep); + long next_dset = graph.dependence_set_at_timestep(point_timestep+1); auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto 
&point_input_ready = state.input_ready[graph_index][point_index][last_field]; @@ -162,14 +182,15 @@ static bool check_and_run(long graph_index, long point) { } bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; - printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d)\n", - graph_index, timestep, point, last_field, ready, - point_timestep < graph.timesteps, point_input_ready == n_inputs, point_output_empty); + printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d) n_inputs %ld n_outputs %ld input_ready %ld\n", + graph_index, point_timestep, point, last_field, ready, + point_timestep < graph.timesteps, point_input_ready == n_inputs, point_output_empty, + n_inputs, n_outputs, point_input_ready); for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); } if (ready) { - graph.execute_point(timestep, point, + graph.execute_point(point_timestep, point, point_output.data(), point_output.size(), point_input_ptr.data(), point_input_bytes.data(), n_inputs, point_scratch.data(), point_scratch.size()); @@ -412,6 +433,7 @@ int main(int argc, char *argv[]) double start_time = Timer::get_cur_time(); while (!is_complete()) { + // Send data for RAW dependencies for (auto graph : app.graphs) { long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; @@ -441,7 +463,6 @@ int main(int argc, char *argv[]) auto &point_output = outputs[point_index][last_field]; auto &point_rev_deps = rev_deps[point_index]; - // Send data for RAW dependencies if (point >= last_offset && point < last_offset + last_width) { for (auto interval : point_rev_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { @@ -449,6 +470,8 @@ int 
main(int argc, char *argv[]) continue; } + printf("send graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, point_output.data(), point_output.size(), GEX_EVENT_GROUP, 0, @@ -500,7 +523,16 @@ int main(int argc, char *argv[]) long last_point = (rank + 1) * graph.max_width / n_ranks - 1; for (long point = first_point; point <= last_point; ++point) { - check_and_run(graph.graph_index, point); + long point_index = point - first_point; + + long timestep = state.timesteps[graph.graph_index][point_index]; + + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + if (point >= offset && point < offset + width) { + check_and_run(graph.graph_index, point); + } } } } From 504640e2ccc16d3c90b0185395124c4ec99d6a3d Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 10:39:42 -0700 Subject: [PATCH 11/40] Fix a timestep bug. --- gasnet/seq.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 8262a6cc..832a3f61 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -26,6 +26,8 @@ #define CHECK_OK(x) assert((x) == GASNET_OK); +#define printf(...) 
do {} while(false) + class AutoLock { public: AutoLock(gex_HSL_t &lock_) : lock(&lock_) { @@ -140,7 +142,11 @@ static bool check_and_run(long graph_index, long point) { if (point >= offset && point < offset + width) break; + printf("advancing graph %ld timestep %ld point %ld\n", + graph_index, point_timestep, point); } + printf("check_and_run graph %ld timestep %ld point %ld\n", + graph_index, point_timestep, point); assert(point_timestep < graph.timesteps); long last_offset = graph.offset_at_timestep(point_timestep-1); @@ -182,7 +188,7 @@ static bool check_and_run(long graph_index, long point) { } bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; - printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d) n_inputs %ld n_outputs %ld input_ready %ld\n", + printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d) n_inputs %ld n_outputs %ld input_ready %d\n", graph_index, point_timestep, point, last_field, ready, point_timestep < graph.timesteps, point_input_ready == n_inputs, point_output_empty, n_inputs, n_outputs, point_input_ready); @@ -198,6 +204,7 @@ static bool check_and_run(long graph_index, long point) { point_input_ready = 0; point_output_empty = n_outputs == 0; + printf("incrementing graph %ld timestep %ld point %ld\n", graph_index, point_timestep, point); ++point_timestep; return true; @@ -527,6 +534,10 @@ int main(int argc, char *argv[]) long timestep = state.timesteps[graph.graph_index][point_index]; + if (timestep >= graph.timesteps) { + continue; + } + long offset = graph.offset_at_timestep(timestep); long width = graph.width_at_timestep(timestep); From 37dc877ad11defc97e98a583322aa3cfc468bed5 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 15:15:30 -0700 Subject: [PATCH 12/40] Some progress on WAR dependencies. Back to being able to run stencil. 
--- gasnet/seq.cc | 277 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 218 insertions(+), 59 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 832a3f61..b091f9e7 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -50,6 +50,8 @@ struct RankState { std::vector > timesteps; std::vector > > > > inputs; std::vector > > input_ready; + std::vector > > input_consumed; + std::vector > > remote_input_empty; std::vector > > > input_ptr; std::vector > > > input_bytes; std::vector > > > outputs; @@ -88,7 +90,6 @@ static bool is_complete() { if (point >= offset && point < offset + width) break; - printf("is_complete point %ld timestep %ld offset %ld width %ld\n", point, timestep, offset, width); } complete = complete && timestep == graph.timesteps; @@ -99,7 +100,7 @@ static bool is_complete() { return complete; } -static long timestep_to_send(long graph_index, long point) { +static std::pair timestep_to_send(long graph_index, long point) { AutoLock guard(state.lock); auto rank = state.rank; @@ -114,14 +115,27 @@ static long timestep_to_send(long graph_index, long point) { auto &point_timestep = state.timesteps[graph_index][point_index]; - long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; + long timestep = point_timestep - 1; // gets incremented at the end of check_and_run, so decrement here + + long field = timestep % state.num_fields; + long last_field = (timestep + state.num_fields - 1) % state.num_fields; - auto &point_output_empty = state.output_empty[graph_index][point_index][last_field]; + auto &point_output_empty = state.output_empty[graph_index][point_index][field]; + auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; - if (point_timestep > 0 && point_timestep < graph.timesteps && point_output_empty == 0) { - return point_timestep; + long raw_timestep = -1, war_timestep = -1; + if (timestep >= 0 && timestep < graph.timesteps - 1) { + if (point_output_empty == 0) { + 
raw_timestep = timestep; + } + if (point_input_consumed == 1) { + war_timestep = timestep; + } } - return -1; + printf("timestep_to_send graph %ld timestep %ld point %ld field %ld last_field %ld output_empty %d input_consumed %d raw %ld war %ld\n", + graph_index, timestep, point, field, last_field, point_output_empty, point_input_consumed, + raw_timestep, war_timestep); + return std::pair(raw_timestep, war_timestep); } static bool check_and_run(long graph_index, long point) { @@ -142,11 +156,9 @@ static bool check_and_run(long graph_index, long point) { if (point >= offset && point < offset + width) break; - printf("advancing graph %ld timestep %ld point %ld\n", - graph_index, point_timestep, point); } - printf("check_and_run graph %ld timestep %ld point %ld\n", - graph_index, point_timestep, point); + // printf("check_and_run graph %ld timestep %ld point %ld\n", + // graph_index, point_timestep, point); assert(point_timestep < graph.timesteps); long last_offset = graph.offset_at_timestep(point_timestep-1); @@ -163,6 +175,7 @@ static bool check_and_run(long graph_index, long point) { auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; + auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; auto &point_input_ptr = state.input_ptr[graph_index][point_index][last_field]; auto &point_input_bytes = state.input_bytes[graph_index][point_index][last_field]; auto &point_output = state.outputs[graph_index][point_index][field]; @@ -187,14 +200,14 @@ static bool check_and_run(long graph_index, long point) { n_outputs += last_dep - first_dep; } - bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty; - printf("check_and_run graph %ld timestep %ld point %ld last_field %ld ready %d (timestep %d input %d output %d) n_inputs %ld n_outputs %ld input_ready %d\n", - graph_index, point_timestep, point, 
last_field, ready, - point_timestep < graph.timesteps, point_input_ready == n_inputs, point_output_empty, - n_inputs, n_outputs, point_input_ready); - for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { - printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); - } + bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty == 1 && point_input_consumed == 0; + printf("before check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld output_empty %d input_consumed %d\n", + graph_index, point_timestep, point, field, last_field, + point_input_ready, n_inputs, + point_output_empty, point_input_consumed); + // for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { + // printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); + // } if (ready) { graph.execute_point(point_timestep, point, point_output.data(), point_output.size(), @@ -202,9 +215,14 @@ static bool check_and_run(long graph_index, long point) { point_scratch.data(), point_scratch.size()); point_input_ready = 0; + point_input_consumed = 1; point_output_empty = n_outputs == 0; - printf("incrementing graph %ld timestep %ld point %ld\n", graph_index, point_timestep, point); + printf("after check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld output_empty %d input_consumed %d\n", + graph_index, point_timestep, point, field, last_field, + point_input_ready, n_inputs, + point_output_empty, point_input_consumed); + ++point_timestep; return true; @@ -213,7 +231,7 @@ static bool check_and_run(long graph_index, long point) { return false; } -static void recv_handler(gex_Token_t token, void *buffer, size_t size, +static void RAW_handler(gex_Token_t token, void *buffer, size_t size, gex_AM_Arg_t graph_index, 
gex_AM_Arg_t timestep, gex_AM_Arg_t source_point, gex_AM_Arg_t dest_point) { AutoLock guard(state.lock); @@ -228,52 +246,90 @@ static void recv_handler(gex_Token_t token, void *buffer, size_t size, long point = dest_point; long point_index = point - first_point; - long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long field = timestep % state.num_fields; - long last_offset = graph.offset_at_timestep(timestep-1); - long last_width = graph.width_at_timestep(timestep-1); + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); long dset = graph.dependence_set_at_timestep(timestep); - auto &point_inputs = state.inputs[graph_index][point_index][last_field]; - auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; + auto &point_inputs = state.inputs[graph_index][point_index][field]; + auto &point_input_ready = state.input_ready[graph_index][point_index][field]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; - printf("recv_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); + printf("RAW_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); long input_idx = 0; for (auto interval : point_deps) { - long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); - long last_dep = std::min(interval.second + 1, last_offset + last_width); + long first_dep = std::min(std::max(interval.first, offset), offset + width); + long last_dep = std::min(interval.second + 1, offset + width); assert(first_dep <= last_dep); if (first_dep <= source_point && source_point <= last_dep) { first_dep = std::min(first_dep, (long)source_point); last_dep = std::min(last_dep, (long)source_point); } - printf(" interval %ld %ld first_dep %ld last_dep %ld\n", interval.first, interval.second, first_dep, last_dep); input_idx += last_dep - first_dep; if (first_dep <= 
source_point && source_point <= last_dep) { break; } } - printf(" input_idx %ld input %p\n", input_idx, point_inputs[input_idx].data()); - point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready++; - check_and_run(graph_index, dest_point); + // FIXME: there is a bug that occurs like this: + // + // * send RAW + // * run RAW handler (note: happens before the send returns) + // * run task + // * increments timestep + // * send WAR + // * check fails because the timestep has already been incremented + // + // in other words, there is (probably) a missing synchronization between the WAR send completion and the task being able to run + + // check_and_run(graph_index, dest_point); +} + +static void WAR_handler(gex_Token_t token, + gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t point) +{ + AutoLock guard(state.lock); + + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + + printf("WAR_handler graph %d timestep %d dest %d last_field %ld\n", graph_index, timestep, point, last_field); + + auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][last_field]; + point_remote_input_empty++; } -const int N_HANDLERS = 1; +const int N_HANDLERS = 2; gex_AM_Entry_t handlers[N_HANDLERS] = { gex_AM_Entry_t { .gex_index = 0, - .gex_fnptr = (void (*)())recv_handler, + .gex_fnptr = (void (*)())RAW_handler, .gex_flags = GEX_FLAG_AM_MEDIUM | GEX_FLAG_AM_REQUEST, .gex_nargs = 4, .gex_cdata = NULL, - .gex_name = "recv handler", + .gex_name = "RAW handler", + }, + gex_AM_Entry_t { + .gex_index = 0, + .gex_fnptr = (void (*)())WAR_handler, + .gex_flags = GEX_FLAG_AM_SHORT | GEX_FLAG_AM_REQUEST, + .gex_nargs = 3, + .gex_cdata = NULL, + 
.gex_name = "WAR handler", }, }; @@ -307,6 +363,8 @@ int main(int argc, char *argv[]) state.timesteps.resize(app.graphs.size()); state.inputs.resize(app.graphs.size()); state.input_ready.resize(app.graphs.size()); + state.input_consumed.resize(app.graphs.size()); + state.remote_input_empty.resize(app.graphs.size()); state.input_ptr.resize(app.graphs.size()); state.input_bytes.resize(app.graphs.size()); state.outputs.resize(app.graphs.size()); @@ -317,7 +375,8 @@ int main(int argc, char *argv[]) std::vector > graph_rank_by_point(app.graphs.size()); - std::vector > sends; + std::vector > sends_raw; + std::vector > sends_war; double elapsed_time = 0.0; for (int iter = 0; iter < 2; ++iter) { @@ -350,6 +409,8 @@ int main(int argc, char *argv[]) auto ×teps = state.timesteps[graph.graph_index]; auto &inputs = state.inputs[graph.graph_index]; auto &input_ready = state.input_ready[graph.graph_index]; + auto &input_consumed = state.input_consumed[graph.graph_index]; + auto &remote_input_empty = state.remote_input_empty[graph.graph_index]; auto &input_ptr = state.input_ptr[graph.graph_index]; auto &input_bytes = state.input_bytes[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; @@ -359,6 +420,8 @@ int main(int argc, char *argv[]) timesteps.resize(n_points); inputs.resize(n_points); input_ready.resize(n_points); + input_consumed.resize(n_points); + remote_input_empty.resize(n_points); input_ptr.resize(n_points); input_bytes.resize(n_points); outputs.resize(n_points); @@ -373,6 +436,8 @@ int main(int argc, char *argv[]) auto &point_inputs = inputs[point_index]; auto &point_input_ready = input_ready[point_index]; + auto &point_input_consumed = input_consumed[point_index]; + auto &point_remote_input_empty = remote_input_empty[point_index]; auto &point_input_ptr = input_ptr[point_index]; auto &point_input_bytes = input_bytes[point_index]; auto &point_outputs = outputs[point_index]; @@ -380,6 +445,8 @@ int main(int argc, char *argv[]) 
point_inputs.resize(state.num_fields); point_input_ready.resize(state.num_fields); + point_input_consumed.resize(state.num_fields); + point_remote_input_empty.resize(state.num_fields); point_input_ptr.resize(state.num_fields); point_input_bytes.resize(state.num_fields); point_outputs.resize(state.num_fields); @@ -403,6 +470,12 @@ int main(int argc, char *argv[]) auto &field_input_ready = point_input_ready[field]; field_input_ready = 0; + auto &field_input_consumed = point_input_consumed[field]; + field_input_consumed = 0; + + auto &field_remote_input_empty = point_remote_input_empty[field]; + field_remote_input_empty = 0; + auto &field_outputs = point_outputs[field]; field_outputs.resize(graph.output_bytes_per_task); @@ -445,6 +518,7 @@ int main(int argc, char *argv[]) long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + auto &remote_input_empty = state.remote_input_empty[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; @@ -454,37 +528,97 @@ int main(int argc, char *argv[]) for (long point = first_point; point <= last_point; ++point) { long point_index = point - first_point; - long timestep = timestep_to_send(graph.graph_index, point); - if (timestep > 0) { + long timestep = timestep_to_send(graph.graph_index, point).first; + if (timestep >= 0) { long offset = graph.offset_at_timestep(timestep); long width = graph.width_at_timestep(timestep); - long last_offset = graph.offset_at_timestep(timestep-1); - long last_width = graph.width_at_timestep(timestep-1); + long next_offset = graph.offset_at_timestep(timestep+1); + long next_width = graph.width_at_timestep(timestep+1); + + long prev_offset = graph.offset_at_timestep(timestep - state.num_fields + 1); + long prev_width = graph.width_at_timestep(timestep - state.num_fields + 1); - long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long 
field = timestep % state.num_fields; long dset = graph.dependence_set_at_timestep(timestep); - auto &rev_deps = reverse_dependencies[dset]; - auto &point_output = outputs[point_index][last_field]; - auto &point_rev_deps = rev_deps[point_index]; + auto &point_remote_input_empty = remote_input_empty[point_index][field]; + auto &point_output = outputs[point_index][field]; + auto &point_rev_deps = reverse_dependencies[dset][point_index]; - if (point >= last_offset && point < last_offset + last_width) { + if (point >= offset && point < offset + width) { + long n_war_deps = 0; for (auto interval : point_rev_deps) { + long start = std::max(interval.first, prev_offset); + long stop = std::min(interval.second + 1, prev_offset + prev_width); + if (stop > start) { + n_war_deps += stop - start; + } + } + + printf("considering RAW graph %ld timestep %ld point %ld field %ld remote_input_empty %d n_war_deps %ld\n", + graph.graph_index, timestep, point, field, point_remote_input_empty, n_war_deps); + if (point_remote_input_empty == n_war_deps) { + for (auto interval : point_rev_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < next_offset || dep >= next_offset + next_width) { + continue; + } + + printf("send RAW graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); + + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, + point_output.data(), point_output.size(), + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, + (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); + sends_raw.push_back(std::tuple(graph.graph_index, timestep, point)); + } + } + } + } + } + } + } + + // Send data for WAR dependencies + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + auto &dependencies = state.dependencies[graph.graph_index]; + + auto &rank_by_point = 
graph_rank_by_point[graph.graph_index]; + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + long timestep = timestep_to_send(graph.graph_index, point).second; + if (timestep >= 0) { + + long offset = graph.offset_at_timestep(timestep); + long width = graph.width_at_timestep(timestep); + + long next_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); + long next_width = graph.width_at_timestep(timestep + state.num_fields - 1); + + long dset = graph.dependence_set_at_timestep(timestep); + + auto &point_deps = dependencies[dset][point_index]; + + if (point >= offset && point < offset + width) { + for (auto interval : point_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < offset || dep >= offset + width) { + if (dep < next_offset || dep >= next_offset + next_width) { continue; } - printf("send graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); + printf("send WAR graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); - CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, - point_output.data(), point_output.size(), - GEX_EVENT_GROUP, 0, - (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, - (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); - sends.push_back(std::tuple(graph.graph_index, timestep, point)); + CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[dep], handlers[1].gex_index, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, (gex_AM_Arg_t)dep)); + sends_war.push_back(std::tuple(graph.graph_index, timestep, point)); } } } @@ -498,13 +632,35 @@ int main(int argc, char *argv[]) // Mark readiness of the output buffers. 
{ AutoLock guard(state.lock); - for (auto &send : sends) { + for (auto &send : sends_raw) { long graph_index; long timestep; long point; std::tie(graph_index, timestep, point) = send; - printf("local completion for graph %ld timestep %ld point %ld\n", graph_index, timestep, point); + auto &graph = state.graphs[graph_index]; + + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long field = timestep % state.num_fields; + + auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; + point_remote_input_empty = 0; + + auto &point_output_empty = state.output_empty[graph_index][point_index][field]; + point_output_empty = 1; + + printf("marking RAW graph %ld timestep %ld point %ld field %ld remote_input_empty %d output_empty %d\n", + graph_index, timestep, point, field, point_remote_input_empty, point_output_empty); + } + for (auto &send : sends_war) { + long graph_index; + long timestep; + long point; + std::tie(graph_index, timestep, point) = send; auto &graph = state.graphs[graph_index]; @@ -515,11 +671,14 @@ int main(int argc, char *argv[]) long last_field = (timestep + state.num_fields - 1) % state.num_fields; - auto &point_output_empty = state.output_empty[graph_index][point_index][last_field]; - point_output_empty = 1; + auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; + point_input_consumed = 0; + + printf("marking WAR graph %ld timestep %ld point %ld last_field %ld input_consumed %d\n", graph_index, timestep, point, last_field, point_input_consumed); } } - sends.clear(); + sends_raw.clear(); + sends_war.clear(); // Run any ready tasks. { From 1383493380b50a1025cf3365bf488f8358cf6ecb Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 15:22:24 -0700 Subject: [PATCH 13/40] Fix trivial, stencil/no_comm still working. 
--- gasnet/seq.cc | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index b091f9e7..6f19d0bb 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -167,6 +167,9 @@ static bool check_and_run(long graph_index, long point) { long next_offset = graph.offset_at_timestep(point_timestep+1); long next_width = graph.width_at_timestep(point_timestep+1); + long next_field_offset = graph.offset_at_timestep(point_timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(point_timestep + state.num_fields - 1); + long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; long field = point_timestep % state.num_fields; @@ -200,6 +203,14 @@ static bool check_and_run(long graph_index, long point) { n_outputs += last_dep - first_dep; } + long n_consumed = 0; + for (auto interval : point_deps) { + long first_dep = std::min(std::max(interval.first, next_field_offset), next_field_offset + next_field_width); + long last_dep = std::min(interval.second + 1, next_field_offset + next_field_width); + assert(first_dep <= last_dep); + n_consumed += last_dep - first_dep; + } + bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty == 1 && point_input_consumed == 0; printf("before check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld output_empty %d input_consumed %d\n", graph_index, point_timestep, point, field, last_field, @@ -215,7 +226,7 @@ static bool check_and_run(long graph_index, long point) { point_scratch.data(), point_scratch.size()); point_input_ready = 0; - point_input_consumed = 1; + point_input_consumed = n_consumed != 0; point_output_empty = n_outputs == 0; printf("after check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld output_empty %d input_consumed %d\n", @@ -600,8 +611,8 @@ int main(int argc, char *argv[]) long offset = 
graph.offset_at_timestep(timestep); long width = graph.width_at_timestep(timestep); - long next_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); - long next_width = graph.width_at_timestep(timestep + state.num_fields - 1); + long next_field_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(timestep + state.num_fields - 1); long dset = graph.dependence_set_at_timestep(timestep); @@ -610,7 +621,7 @@ int main(int argc, char *argv[]) if (point >= offset && point < offset + width) { for (auto interval : point_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < next_offset || dep >= next_offset + next_width) { + if (dep < next_field_offset || dep >= next_field_offset + next_field_width) { continue; } From d44aecad2673c65e963c77470deafb624c9ce759 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 15:39:35 -0700 Subject: [PATCH 14/40] Fix for dom. Trivial, no_comm, stencil, dom now pass in 1 rank. Stencil fails in 2 ranks. 
--- gasnet/seq.cc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 6f19d0bb..f2dcaa46 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -159,7 +159,9 @@ static bool check_and_run(long graph_index, long point) { } // printf("check_and_run graph %ld timestep %ld point %ld\n", // graph_index, point_timestep, point); - assert(point_timestep < graph.timesteps); + if (point_timestep >= graph.timesteps) { + return false; + } long last_offset = graph.offset_at_timestep(point_timestep-1); long last_width = graph.width_at_timestep(point_timestep-1); @@ -190,7 +192,7 @@ static bool check_and_run(long graph_index, long point) { long n_inputs = 0; for (auto interval : point_deps) { long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); - long last_dep = std::min(interval.second + 1, last_offset + last_width); + long last_dep = std::min(std::max(interval.second + 1, last_offset), last_offset + last_width); assert(first_dep <= last_dep); n_inputs += last_dep - first_dep; } @@ -198,7 +200,7 @@ static bool check_and_run(long graph_index, long point) { long n_outputs = 0; for (auto interval : point_rev_deps) { long first_dep = std::min(std::max(interval.first, next_offset), next_offset + next_width); - long last_dep = std::min(interval.second + 1, next_offset + next_width); + long last_dep = std::min(std::max(interval.second + 1, next_offset), next_offset + next_width); assert(first_dep <= last_dep); n_outputs += last_dep - first_dep; } @@ -206,7 +208,7 @@ static bool check_and_run(long graph_index, long point) { long n_consumed = 0; for (auto interval : point_deps) { long first_dep = std::min(std::max(interval.first, next_field_offset), next_field_offset + next_field_width); - long last_dep = std::min(interval.second + 1, next_field_offset + next_field_width); + long last_dep = std::min(std::max(interval.second + 1, next_field_offset), next_field_offset + next_field_width); 
assert(first_dep <= last_dep); n_consumed += last_dep - first_dep; } @@ -708,12 +710,7 @@ int main(int argc, char *argv[]) continue; } - long offset = graph.offset_at_timestep(timestep); - long width = graph.width_at_timestep(timestep); - - if (point >= offset && point < offset + width) { - check_and_run(graph.graph_index, point); - } + check_and_run(graph.graph_index, point); } } } From acfbf84bfca0257a252b8633699cd52cc0aa366b Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 16:28:07 -0700 Subject: [PATCH 15/40] Workaround for synchronization bug. Still broken on dom 4 ranks. --- gasnet/seq.cc | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index f2dcaa46..e80b979e 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -52,6 +52,7 @@ struct RankState { std::vector > > input_ready; std::vector > > input_consumed; std::vector > > remote_input_empty; + std::vector > > remote_input_timestep; std::vector > > > input_ptr; std::vector > > > input_bytes; std::vector > > > outputs; @@ -319,10 +320,18 @@ static void WAR_handler(gex_Token_t token, long last_field = (timestep + state.num_fields - 1) % state.num_fields; - printf("WAR_handler graph %d timestep %d dest %d last_field %ld\n", graph_index, timestep, point, last_field); - auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][last_field]; + auto &point_remote_input_timestep = state.remote_input_timestep[graph_index][point_index][last_field]; + + if (point_remote_input_timestep != timestep) { + point_remote_input_empty = 0; + point_remote_input_timestep = timestep; + } + point_remote_input_empty++; + + printf("WAR_handler graph %d timestep %d dest %d last_field %ld remote_input_empty %d\n", + graph_index, timestep, point, last_field, point_remote_input_empty); } const int N_HANDLERS = 2; @@ -378,6 +387,7 @@ int main(int argc, char *argv[]) state.input_ready.resize(app.graphs.size()); 
state.input_consumed.resize(app.graphs.size()); state.remote_input_empty.resize(app.graphs.size()); + state.remote_input_timestep.resize(app.graphs.size()); state.input_ptr.resize(app.graphs.size()); state.input_bytes.resize(app.graphs.size()); state.outputs.resize(app.graphs.size()); @@ -424,6 +434,7 @@ int main(int argc, char *argv[]) auto &input_ready = state.input_ready[graph.graph_index]; auto &input_consumed = state.input_consumed[graph.graph_index]; auto &remote_input_empty = state.remote_input_empty[graph.graph_index]; + auto &remote_input_timestep = state.remote_input_timestep[graph.graph_index]; auto &input_ptr = state.input_ptr[graph.graph_index]; auto &input_bytes = state.input_bytes[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; @@ -435,6 +446,7 @@ int main(int argc, char *argv[]) input_ready.resize(n_points); input_consumed.resize(n_points); remote_input_empty.resize(n_points); + remote_input_timestep.resize(n_points); input_ptr.resize(n_points); input_bytes.resize(n_points); outputs.resize(n_points); @@ -451,6 +463,7 @@ int main(int argc, char *argv[]) auto &point_input_ready = input_ready[point_index]; auto &point_input_consumed = input_consumed[point_index]; auto &point_remote_input_empty = remote_input_empty[point_index]; + auto &point_remote_input_timestep = remote_input_timestep[point_index]; auto &point_input_ptr = input_ptr[point_index]; auto &point_input_bytes = input_bytes[point_index]; auto &point_outputs = outputs[point_index]; @@ -460,6 +473,7 @@ int main(int argc, char *argv[]) point_input_ready.resize(state.num_fields); point_input_consumed.resize(state.num_fields); point_remote_input_empty.resize(state.num_fields); + point_remote_input_timestep.resize(state.num_fields); point_input_ptr.resize(state.num_fields); point_input_bytes.resize(state.num_fields); point_outputs.resize(state.num_fields); @@ -489,6 +503,9 @@ int main(int argc, char *argv[]) auto &field_remote_input_empty = point_remote_input_empty[field]; 
field_remote_input_empty = 0; + auto &field_remote_input_timestep = point_remote_input_timestep[field]; + field_remote_input_timestep = 0; + auto &field_outputs = point_outputs[field]; field_outputs.resize(graph.output_bytes_per_task); @@ -658,16 +675,17 @@ int main(int argc, char *argv[]) long point_index = point - first_point; + long last_field = (timestep + state.num_fields - 1) % state.num_fields; long field = timestep % state.num_fields; - auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; - point_remote_input_empty = 0; + // auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; + // point_remote_input_empty = 0; auto &point_output_empty = state.output_empty[graph_index][point_index][field]; point_output_empty = 1; - printf("marking RAW graph %ld timestep %ld point %ld field %ld remote_input_empty %d output_empty %d\n", - graph_index, timestep, point, field, point_remote_input_empty, point_output_empty); + printf("marking RAW graph %ld timestep %ld point %ld last_field %ld field %ld output_empty %d\n", + graph_index, timestep, point, last_field, field, point_output_empty); } for (auto &send : sends_war) { long graph_index; From b4b2bcf3efaedd95eb545fad03a41d5c9d300823 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 20:44:26 -0700 Subject: [PATCH 16/40] Switch to a different structure, WAR dependencies on tasks. DOM is now unhappy. --- gasnet/seq.cc | 103 ++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 62 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index e80b979e..cdf7012f 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -26,7 +26,7 @@ #define CHECK_OK(x) assert((x) == GASNET_OK); -#define printf(...) do {} while(false) +// #define printf(...) 
do {} while(false) class AutoLock { public: @@ -52,7 +52,6 @@ struct RankState { std::vector > > input_ready; std::vector > > input_consumed; std::vector > > remote_input_empty; - std::vector > > remote_input_timestep; std::vector > > > input_ptr; std::vector > > > input_bytes; std::vector > > > outputs; @@ -170,6 +169,9 @@ static bool check_and_run(long graph_index, long point) { long next_offset = graph.offset_at_timestep(point_timestep+1); long next_width = graph.width_at_timestep(point_timestep+1); + long last_field_offset = graph.offset_at_timestep(point_timestep - state.num_fields + 1); + long last_field_width = graph.width_at_timestep(point_timestep - state.num_fields + 1); + long next_field_offset = graph.offset_at_timestep(point_timestep + state.num_fields - 1); long next_field_width = graph.width_at_timestep(point_timestep + state.num_fields - 1); @@ -182,6 +184,7 @@ static bool check_and_run(long graph_index, long point) { auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; + auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; auto &point_input_ptr = state.input_ptr[graph_index][point_index][last_field]; auto &point_input_bytes = state.input_bytes[graph_index][point_index][last_field]; auto &point_output = state.outputs[graph_index][point_index][field]; @@ -206,19 +209,29 @@ static bool check_and_run(long graph_index, long point) { n_outputs += last_dep - first_dep; } - long n_consumed = 0; + long n_war_in = 0; + for (auto interval : point_deps) { + long first_dep = std::min(std::max(interval.first, last_field_offset), last_field_offset + last_field_width); + long last_dep = std::min(std::max(interval.second + 1, last_field_offset), last_field_offset + last_field_width); + assert(first_dep <= last_dep); + n_war_in += last_dep 
- first_dep; + } + + long n_war_out = 0; for (auto interval : point_deps) { long first_dep = std::min(std::max(interval.first, next_field_offset), next_field_offset + next_field_width); long last_dep = std::min(std::max(interval.second + 1, next_field_offset), next_field_offset + next_field_width); assert(first_dep <= last_dep); - n_consumed += last_dep - first_dep; + n_war_out += last_dep - first_dep; } - bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_output_empty == 1 && point_input_consumed == 0; - printf("before check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld output_empty %d input_consumed %d\n", + bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_remote_input_empty == n_war_in && point_output_empty == 1 && point_input_consumed == 0; + printf("before check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld remote_input_empty %d %p n_war_in %ld output_empty %d input_consumed %d last_field_offset %ld width %ld\n", graph_index, point_timestep, point, field, last_field, point_input_ready, n_inputs, - point_output_empty, point_input_consumed); + point_remote_input_empty, &point_remote_input_empty, n_war_in, + point_output_empty, point_input_consumed, + last_field_offset, last_field_width); // for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { // printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); // } @@ -229,13 +242,15 @@ static bool check_and_run(long graph_index, long point) { point_scratch.data(), point_scratch.size()); point_input_ready = 0; - point_input_consumed = n_consumed != 0; + point_remote_input_empty = 0; + point_input_consumed = n_war_out != 0; point_output_empty = n_outputs == 0; - printf("after check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d 
n_inputs %ld output_empty %d input_consumed %d\n", - graph_index, point_timestep, point, field, last_field, - point_input_ready, n_inputs, - point_output_empty, point_input_consumed); + printf("after check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld remote_input_empty %d n_war_in %ld output_empty %d input_consumed %d\n", + graph_index, point_timestep, point, field, last_field, + point_input_ready, n_inputs, + point_remote_input_empty, n_war_in, + point_output_empty, point_input_consumed); ++point_timestep; @@ -321,17 +336,10 @@ static void WAR_handler(gex_Token_t token, long last_field = (timestep + state.num_fields - 1) % state.num_fields; auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][last_field]; - auto &point_remote_input_timestep = state.remote_input_timestep[graph_index][point_index][last_field]; - - if (point_remote_input_timestep != timestep) { - point_remote_input_empty = 0; - point_remote_input_timestep = timestep; - } - point_remote_input_empty++; - printf("WAR_handler graph %d timestep %d dest %d last_field %ld remote_input_empty %d\n", - graph_index, timestep, point, last_field, point_remote_input_empty); + printf("WAR_handler graph %d timestep %d dest %d last_field %ld remote_input_empty %d %p\n", + graph_index, timestep, point, last_field, point_remote_input_empty, &point_remote_input_empty); } const int N_HANDLERS = 2; @@ -387,7 +395,6 @@ int main(int argc, char *argv[]) state.input_ready.resize(app.graphs.size()); state.input_consumed.resize(app.graphs.size()); state.remote_input_empty.resize(app.graphs.size()); - state.remote_input_timestep.resize(app.graphs.size()); state.input_ptr.resize(app.graphs.size()); state.input_bytes.resize(app.graphs.size()); state.outputs.resize(app.graphs.size()); @@ -434,7 +441,6 @@ int main(int argc, char *argv[]) auto &input_ready = state.input_ready[graph.graph_index]; auto &input_consumed = 
state.input_consumed[graph.graph_index]; auto &remote_input_empty = state.remote_input_empty[graph.graph_index]; - auto &remote_input_timestep = state.remote_input_timestep[graph.graph_index]; auto &input_ptr = state.input_ptr[graph.graph_index]; auto &input_bytes = state.input_bytes[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; @@ -446,7 +452,6 @@ int main(int argc, char *argv[]) input_ready.resize(n_points); input_consumed.resize(n_points); remote_input_empty.resize(n_points); - remote_input_timestep.resize(n_points); input_ptr.resize(n_points); input_bytes.resize(n_points); outputs.resize(n_points); @@ -463,7 +468,6 @@ int main(int argc, char *argv[]) auto &point_input_ready = input_ready[point_index]; auto &point_input_consumed = input_consumed[point_index]; auto &point_remote_input_empty = remote_input_empty[point_index]; - auto &point_remote_input_timestep = remote_input_timestep[point_index]; auto &point_input_ptr = input_ptr[point_index]; auto &point_input_bytes = input_bytes[point_index]; auto &point_outputs = outputs[point_index]; @@ -473,7 +477,6 @@ int main(int argc, char *argv[]) point_input_ready.resize(state.num_fields); point_input_consumed.resize(state.num_fields); point_remote_input_empty.resize(state.num_fields); - point_remote_input_timestep.resize(state.num_fields); point_input_ptr.resize(state.num_fields); point_input_bytes.resize(state.num_fields); point_outputs.resize(state.num_fields); @@ -503,9 +506,6 @@ int main(int argc, char *argv[]) auto &field_remote_input_empty = point_remote_input_empty[field]; field_remote_input_empty = 0; - auto &field_remote_input_timestep = point_remote_input_timestep[field]; - field_remote_input_timestep = 0; - auto &field_outputs = point_outputs[field]; field_outputs.resize(graph.output_bytes_per_task); @@ -548,7 +548,6 @@ int main(int argc, char *argv[]) long first_point = rank * graph.max_width / n_ranks; long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - auto 
&remote_input_empty = state.remote_input_empty[graph.graph_index]; auto &outputs = state.outputs[graph.graph_index]; auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; @@ -566,45 +565,28 @@ int main(int argc, char *argv[]) long next_offset = graph.offset_at_timestep(timestep+1); long next_width = graph.width_at_timestep(timestep+1); - long prev_offset = graph.offset_at_timestep(timestep - state.num_fields + 1); - long prev_width = graph.width_at_timestep(timestep - state.num_fields + 1); - long field = timestep % state.num_fields; long dset = graph.dependence_set_at_timestep(timestep); - auto &point_remote_input_empty = remote_input_empty[point_index][field]; auto &point_output = outputs[point_index][field]; auto &point_rev_deps = reverse_dependencies[dset][point_index]; if (point >= offset && point < offset + width) { - long n_war_deps = 0; for (auto interval : point_rev_deps) { - long start = std::max(interval.first, prev_offset); - long stop = std::min(interval.second + 1, prev_offset + prev_width); - if (stop > start) { - n_war_deps += stop - start; - } - } - - printf("considering RAW graph %ld timestep %ld point %ld field %ld remote_input_empty %d n_war_deps %ld\n", - graph.graph_index, timestep, point, field, point_remote_input_empty, n_war_deps); - if (point_remote_input_empty == n_war_deps) { - for (auto interval : point_rev_deps) { - for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < next_offset || dep >= next_offset + next_width) { - continue; - } - - printf("send RAW graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); - - CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, - point_output.data(), point_output.size(), - GEX_EVENT_GROUP, 0, - (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, - (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); - sends_raw.push_back(std::tuple(graph.graph_index, timestep, point)); + for (long dep = interval.first; 
dep <= interval.second; dep++) { + if (dep < next_offset || dep >= next_offset + next_width) { + continue; } + + printf("send RAW graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); + + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, + point_output.data(), point_output.size(), + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, + (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); + sends_raw.push_back(std::tuple(graph.graph_index, timestep, point)); } } } @@ -678,9 +660,6 @@ int main(int argc, char *argv[]) long last_field = (timestep + state.num_fields - 1) % state.num_fields; long field = timestep % state.num_fields; - // auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; - // point_remote_input_empty = 0; - auto &point_output_empty = state.output_empty[graph_index][point_index][field]; point_output_empty = 1; From e38f3a49981fc50d4152ff2852d272b9cc76fa7a Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 21:09:27 -0700 Subject: [PATCH 17/40] Fix for dom on 1 rank, still failing multiple ranks. 
--- gasnet/seq.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index cdf7012f..471177dc 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -180,6 +180,7 @@ static bool check_and_run(long graph_index, long point) { long dset = graph.dependence_set_at_timestep(point_timestep); long next_dset = graph.dependence_set_at_timestep(point_timestep+1); + long last_field_dset = graph.dependence_set_at_timestep(point_timestep - state.num_fields + 1); auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; @@ -192,6 +193,7 @@ static bool check_and_run(long graph_index, long point) { auto &point_scratch = state.scratch[graph_index][point_index]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; auto &point_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; + auto &point_last_field_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; long n_inputs = 0; for (auto interval : point_deps) { @@ -210,9 +212,12 @@ static bool check_and_run(long graph_index, long point) { } long n_war_in = 0; - for (auto interval : point_deps) { + for (auto interval : point_last_field_rev_deps) { long first_dep = std::min(std::max(interval.first, last_field_offset), last_field_offset + last_field_width); long last_dep = std::min(std::max(interval.second + 1, last_field_offset), last_field_offset + last_field_width); + printf("computing n_war_in interval %ld %ld last_field_offset %ld width %ld first %ld last %ld\n", + interval.first, interval.second, last_field_offset, last_field_width, + first_dep, last_dep); assert(first_dep <= last_dep); n_war_in += last_dep - first_dep; } From 60502bab2ab8c10ca6ecd587894526fe310bc798 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 15 Oct 2019 21:30:10 -0700 Subject: [PATCH 18/40] Fix dom multiple ranks. Still unhappy on fft, random. 
--- gasnet/seq.cc | 52 +++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 471177dc..f1cbc927 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -26,7 +26,7 @@ #define CHECK_OK(x) assert((x) == GASNET_OK); -// #define printf(...) do {} while(false) +#define printf(...) do {} while(false) class AutoLock { public: @@ -181,6 +181,7 @@ static bool check_and_run(long graph_index, long point) { long dset = graph.dependence_set_at_timestep(point_timestep); long next_dset = graph.dependence_set_at_timestep(point_timestep+1); long last_field_dset = graph.dependence_set_at_timestep(point_timestep - state.num_fields + 1); + long next_field_dset = graph.dependence_set_at_timestep(point_timestep + state.num_fields - 1); auto &point_inputs = state.inputs[graph_index][point_index][last_field]; auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; @@ -193,7 +194,8 @@ static bool check_and_run(long graph_index, long point) { auto &point_scratch = state.scratch[graph_index][point_index]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; auto &point_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; - auto &point_last_field_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; + auto &point_last_field_rev_deps = state.reverse_dependencies[graph_index][last_field_dset][point_index]; + auto &point_next_field_rev_deps = state.reverse_dependencies[graph_index][next_field_dset][point_index]; long n_inputs = 0; for (auto interval : point_deps) { @@ -548,6 +550,28 @@ int main(int argc, char *argv[]) double start_time = Timer::get_cur_time(); while (!is_complete()) { + // Run any ready tasks. 
+ { + AutoLock guard(state.lock); + + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + long timestep = state.timesteps[graph.graph_index][point_index]; + + if (timestep >= graph.timesteps) { + continue; + } + + check_and_run(graph.graph_index, point); + } + } + } + // Send data for RAW dependencies for (auto graph : app.graphs) { long first_point = rank * graph.max_width / n_ranks; @@ -577,6 +601,8 @@ int main(int argc, char *argv[]) auto &point_output = outputs[point_index][field]; auto &point_rev_deps = reverse_dependencies[dset][point_index]; + printf("considering RAW graph %ld timestep %ld point %ld offset %ld width %ld next offset %ld width %ld\n", + graph.graph_index, timestep, point, offset, width, next_offset, next_width); if (point >= offset && point < offset + width) { for (auto interval : point_rev_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { @@ -695,28 +721,6 @@ int main(int argc, char *argv[]) sends_raw.clear(); sends_war.clear(); - // Run any ready tasks. - { - AutoLock guard(state.lock); - - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; - - long timestep = state.timesteps[graph.graph_index][point_index]; - - if (timestep >= graph.timesteps) { - continue; - } - - check_and_run(graph.graph_index, point); - } - } - } - // Poll the network to make sure we're making progress. CHECK_OK(gasnet_AMPoll()); } From dad10eed484f289dfd7f1f7800d0851a6fd3778a Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 11:13:20 -0700 Subject: [PATCH 19/40] Fixes for FFT. 
--- gasnet/seq.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index f1cbc927..f5e54c0d 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -287,7 +287,7 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, long offset = graph.offset_at_timestep(timestep); long width = graph.width_at_timestep(timestep); - long dset = graph.dependence_set_at_timestep(timestep); + long dset = graph.dependence_set_at_timestep(timestep+1); auto &point_inputs = state.inputs[graph_index][point_index][field]; auto &point_input_ready = state.input_ready[graph_index][point_index][field]; @@ -529,8 +529,8 @@ int main(int argc, char *argv[]) auto &dependencies = state.dependencies[graph.graph_index]; auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; - dependencies.resize(n_points); - reverse_dependencies.resize(n_points); + dependencies.resize(graph.max_dependence_sets()); + reverse_dependencies.resize(graph.max_dependence_sets()); for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { dependencies[dset].resize(n_points); @@ -596,7 +596,7 @@ int main(int argc, char *argv[]) long field = timestep % state.num_fields; - long dset = graph.dependence_set_at_timestep(timestep); + long dset = graph.dependence_set_at_timestep(timestep + 1); auto &point_output = outputs[point_index][field]; auto &point_rev_deps = reverse_dependencies[dset][point_index]; From 4aaa1de5836029a065cc6d82d079c5bf09ad90d3 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 11:17:02 -0700 Subject: [PATCH 20/40] Fixes for the core to accept dsets for negative timesteps. 
--- core/core.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/core/core.cc b/core/core.cc index ef88307e..a13d7202 100644 --- a/core/core.cc +++ b/core/core.cc @@ -238,14 +238,26 @@ long TaskGraph::dependence_set_at_timestep(long timestep) const case DependenceType::TREE: return 0; case DependenceType::FFT: - return (timestep + max_dependence_sets() - 1) % max_dependence_sets(); + { + long dset = (timestep - 1) % max_dependence_sets(); + if (dset < 0) { + dset += max_dependence_sets(); + } + return dset; + } case DependenceType::ALL_TO_ALL: case DependenceType::NEAREST: return 0; case DependenceType::SPREAD: case DependenceType::RANDOM_NEAREST: case DependenceType::RANDOM_SPREAD: - return timestep % max_dependence_sets(); + { + long dset = timestep % max_dependence_sets(); + if (dset < 0) { + dset += max_dependence_sets(); + } + return dset; + } default: assert(false && "unexpected dependence type"); }; From 3e6ad5e0cdf9cb26881c9f3d419fa6495b897e41 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 11:18:32 -0700 Subject: [PATCH 21/40] Clean up debug prints. --- gasnet/seq.cc | 53 --------------------------------------------------- 1 file changed, 53 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index f5e54c0d..6dbf9810 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -26,8 +26,6 @@ #define CHECK_OK(x) assert((x) == GASNET_OK); -#define printf(...) 
do {} while(false) - class AutoLock { public: AutoLock(gex_HSL_t &lock_) : lock(&lock_) { @@ -132,9 +130,6 @@ static std::pair timestep_to_send(long graph_index, long point) { war_timestep = timestep; } } - printf("timestep_to_send graph %ld timestep %ld point %ld field %ld last_field %ld output_empty %d input_consumed %d raw %ld war %ld\n", - graph_index, timestep, point, field, last_field, point_output_empty, point_input_consumed, - raw_timestep, war_timestep); return std::pair(raw_timestep, war_timestep); } @@ -157,8 +152,6 @@ static bool check_and_run(long graph_index, long point) { if (point >= offset && point < offset + width) break; } - // printf("check_and_run graph %ld timestep %ld point %ld\n", - // graph_index, point_timestep, point); if (point_timestep >= graph.timesteps) { return false; } @@ -217,9 +210,6 @@ static bool check_and_run(long graph_index, long point) { for (auto interval : point_last_field_rev_deps) { long first_dep = std::min(std::max(interval.first, last_field_offset), last_field_offset + last_field_width); long last_dep = std::min(std::max(interval.second + 1, last_field_offset), last_field_offset + last_field_width); - printf("computing n_war_in interval %ld %ld last_field_offset %ld width %ld first %ld last %ld\n", - interval.first, interval.second, last_field_offset, last_field_width, - first_dep, last_dep); assert(first_dep <= last_dep); n_war_in += last_dep - first_dep; } @@ -233,15 +223,6 @@ static bool check_and_run(long graph_index, long point) { } bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_remote_input_empty == n_war_in && point_output_empty == 1 && point_input_consumed == 0; - printf("before check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld remote_input_empty %d %p n_war_in %ld output_empty %d input_consumed %d last_field_offset %ld width %ld\n", - graph_index, point_timestep, point, field, last_field, - point_input_ready, n_inputs, 
- point_remote_input_empty, &point_remote_input_empty, n_war_in, - point_output_empty, point_input_consumed, - last_field_offset, last_field_width); - // for (long input_idx = 0; input_idx < n_inputs; ++input_idx) { - // printf(" input %ld content %ld %ld\n", input_idx, ((long *)(point_input_ptr[input_idx]))[0], ((long *)(point_input_ptr[input_idx]))[1]); - // } if (ready) { graph.execute_point(point_timestep, point, point_output.data(), point_output.size(), @@ -253,12 +234,6 @@ static bool check_and_run(long graph_index, long point) { point_input_consumed = n_war_out != 0; point_output_empty = n_outputs == 0; - printf("after check_and_run graph %ld timestep %ld point %ld field %ld last_field %ld input_ready %d n_inputs %ld remote_input_empty %d n_war_in %ld output_empty %d input_consumed %d\n", - graph_index, point_timestep, point, field, last_field, - point_input_ready, n_inputs, - point_remote_input_empty, n_war_in, - point_output_empty, point_input_consumed); - ++point_timestep; return true; @@ -293,7 +268,6 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, auto &point_input_ready = state.input_ready[graph_index][point_index][field]; auto &point_deps = state.dependencies[graph_index][dset][point_index]; - printf("RAW_handler graph %d timestep %d source %d dest %d\n", graph_index, timestep, source_point, dest_point); long input_idx = 0; for (auto interval : point_deps) { long first_dep = std::min(std::max(interval.first, offset), offset + width); @@ -311,19 +285,6 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); point_input_ready++; - - // FIXME: there is a bug that occurs like this: - // - // * send RAW - // * run RAW handler (note: happens before the send returns) - // * run task - // * increments timestep - // * send WAR - // * check fails because the timestep has already been incremented - // - // in other words, there is (probably) a 
missing synchronization between the WAR send completion and the task being able to run - - // check_and_run(graph_index, dest_point); } static void WAR_handler(gex_Token_t token, @@ -344,9 +305,6 @@ static void WAR_handler(gex_Token_t token, auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][last_field]; point_remote_input_empty++; - - printf("WAR_handler graph %d timestep %d dest %d last_field %ld remote_input_empty %d %p\n", - graph_index, timestep, point, last_field, point_remote_input_empty, &point_remote_input_empty); } const int N_HANDLERS = 2; @@ -601,8 +559,6 @@ int main(int argc, char *argv[]) auto &point_output = outputs[point_index][field]; auto &point_rev_deps = reverse_dependencies[dset][point_index]; - printf("considering RAW graph %ld timestep %ld point %ld offset %ld width %ld next offset %ld width %ld\n", - graph.graph_index, timestep, point, offset, width, next_offset, next_width); if (point >= offset && point < offset + width) { for (auto interval : point_rev_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { @@ -610,8 +566,6 @@ int main(int argc, char *argv[]) continue; } - printf("send RAW graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); - CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, point_output.data(), point_output.size(), GEX_EVENT_GROUP, 0, @@ -657,8 +611,6 @@ int main(int argc, char *argv[]) continue; } - printf("send WAR graph %ld timestep %ld source %ld dest %ld\n", graph.graph_index, timestep, point, dep); - CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[dep], handlers[1].gex_index, 0, (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, (gex_AM_Arg_t)dep)); sends_war.push_back(std::tuple(graph.graph_index, timestep, point)); @@ -693,9 +645,6 @@ int main(int argc, char *argv[]) auto &point_output_empty = state.output_empty[graph_index][point_index][field]; point_output_empty = 1; - - printf("marking RAW 
graph %ld timestep %ld point %ld last_field %ld field %ld output_empty %d\n", - graph_index, timestep, point, last_field, field, point_output_empty); } for (auto &send : sends_war) { long graph_index; @@ -714,8 +663,6 @@ int main(int argc, char *argv[]) auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; point_input_consumed = 0; - - printf("marking WAR graph %ld timestep %ld point %ld last_field %ld input_consumed %d\n", graph_index, timestep, point, last_field, point_input_consumed); } } sends_raw.clear(); From 34196eba89a0eb67509986b820547af854310929 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 15:35:19 -0700 Subject: [PATCH 22/40] We don't actually use the segment so there's no need to attach it.... --- gasnet/seq.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 6dbf9810..6083af89 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -340,9 +340,9 @@ int main(int argc, char *argv[]) state.rank = rank; state.n_ranks = n_ranks; - uintptr_t max_size = 0; // gasnet_getMaxLocalSegmentSize(); // don't need this with AM Medium - gex_Segment_t segment; - CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); + // uintptr_t max_size = 0; // gasnet_getMaxLocalSegmentSize(); // don't need this with AM Medium + // gex_Segment_t segment; + // CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); CHECK_OK(gex_EP_RegisterHandlers(ep, handlers, N_HANDLERS)); From dfd0a824c7ac53745e6abf6bbee8f83f1c2b4932 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 15:35:40 -0700 Subject: [PATCH 23/40] Switch to GASNet seq library. 
--- gasnet/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gasnet/Makefile b/gasnet/Makefile index ff7d614f..8b98ec64 100644 --- a/gasnet/Makefile +++ b/gasnet/Makefile @@ -4,11 +4,11 @@ DEBUG ?= 0 CXXFLAGS ?= CXXFLAGS += -std=c++11 -I../core -CXXFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-par --cflags) +CXXFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-seq --cflags) LDFLAGS ?= LDFLAGS += -L../core -lcore_s -LDFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-par --libs) +LDFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$(CONDUIT)-seq --libs) ifeq ($(strip $(DEBUG)),0) CXXFLAGS += -O3 From 0936342c655d76ff0beaf80449eade131f745095 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 16 Oct 2019 15:36:36 -0700 Subject: [PATCH 24/40] Fetch the Task Bench version of GASNet. --- get_deps.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/get_deps.sh b/get_deps.sh index 2f85920b..92989d26 100755 --- a/get_deps.sh +++ b/get_deps.sh @@ -92,7 +92,7 @@ else fi export CONDUIT=$CONDUIT EOF - git clone https://github.com/StanfordLegion/gasnet.git "$GASNET_DIR" + git clone -b task-bench https://github.com/StanfordLegion/gasnet.git "$GASNET_DIR" fi if [[ $TASKBENCH_USE_HWLOC -eq 1 ]]; then From 8a75a2aa447e712d80db51526f246b7ebaa2bb5d Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 3 Mar 2020 15:28:07 -0800 Subject: [PATCH 25/40] Bump copyright year. --- gasnet/seq.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 6083af89..9ee4abc9 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 Stanford University +/* Copyright 2020 Stanford University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From c3d3992570a6a9ad0921bab93e49ef61e095bf06 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 4 Mar 2020 16:50:32 -0800 Subject: [PATCH 26/40] Work on new version of GASNet implementation. --- gasnet/Makefile | 6 +- gasnet/seq.cc | 723 +++++++++++++++++++++++------------------------- 2 files changed, 354 insertions(+), 375 deletions(-) diff --git a/gasnet/Makefile b/gasnet/Makefile index 8b98ec64..df9619d6 100644 --- a/gasnet/Makefile +++ b/gasnet/Makefile @@ -13,17 +13,19 @@ LDFLAGS += $(shell PKG_CONFIG_PATH="$(GASNET)/lib/pkgconfig" pkg-config gasnet-$ ifeq ($(strip $(DEBUG)),0) CXXFLAGS += -O3 else - CXXFLAGS += -O0 -ggdb + CXXFLAGS += -O0 -ggdb -DBOUNDS_CHECKS endif include ../core/make_blas.mk BIN := seq +HEADERS := multi_dimensional_array.h + .PHONY: all all: $(BIN) -$(BIN): %:%.cc +$(BIN): %:%.cc $(HEADERS) $(MPICXX) -o $@ $(CXXFLAGS) $< $(LDFLAGS) .PHONY: clean diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 9ee4abc9..3299720f 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -22,6 +22,8 @@ #include "core.h" #include "timer.h" +#include "multi_dimensional_array.h" + #include "gasnetex.h" #define CHECK_OK(x) assert((x) == GASNET_OK); @@ -44,96 +46,77 @@ struct RankState { gex_Rank_t rank; gex_Rank_t n_ranks; long num_fields; + long complete; std::vector graphs; - std::vector > timesteps; - std::vector > > > > inputs; - std::vector > > input_ready; - std::vector > > input_consumed; - std::vector > > remote_input_empty; - std::vector > > > input_ptr; - std::vector > > > input_bytes; - std::vector > > > outputs; - std::vector > > output_empty; - std::vector > > scratch; - std::vector > > > > dependencies; - std::vector > > > > reverse_dependencies; + std::vector > task_ready_queue; + Array<2, long> timestep; + Array<5, char> inputs; + Array<3, int> input_ready; + Array<3, int> input_consumed; + Array<3, int> remote_input_empty; + Array<4, const char *> input_ptr; + Array<4, size_t> input_bytes; + Array<4, char> outputs; + Array<3, int> 
output_empty; + Array<3, int> n_raw_in; + Array<3, int> n_raw_out; + Array<3, int> n_war_in; + Array<3, int> n_war_out; + Array<3, char> scratch; + Array<3, std::vector > > dependencies; + Array<3, std::vector > > reverse_dependencies; }; RankState state; -static bool is_complete() { - AutoLock guard(state.lock); - - auto rank = state.rank; - auto n_ranks = state.n_ranks; - - auto &graphs = state.graphs; - - bool complete = true; - for (auto graph : graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; - - auto &point_timestep = state.timesteps[graph.graph_index][point_index]; - - // Copy so we don't modify the global value. - long timestep = point_timestep; +template +T clamp(T value, T low, T high) { + return std::min(std::max(value, low), high); +} - for (; timestep < graph.timesteps; ++timestep) { - long offset = graph.offset_at_timestep(timestep); - long width = graph.width_at_timestep(timestep); +// IMPORTANT: must be called with state lock held +static bool check_task_ready(long graph_index, long point_index, long timestep) { + auto &graph = state.graphs[graph_index]; - if (point >= offset && point < offset + width) - break; - } + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long field = timestep % state.num_fields; - complete = complete && timestep == graph.timesteps; - if (!complete) break; - } - if (!complete) break; - } - return complete; + const auto point_input_ready = state.input_ready(graph_index, point_index, last_field); + const auto point_input_consumed = state.input_consumed(graph_index, point_index, last_field); + const auto point_remote_input_empty = state.remote_input_empty(graph_index, point_index, field); + const auto point_output_empty = state.output_empty(graph_index, point_index, field); + const auto point_n_raw_in = 
state.n_raw_in(graph_index, point_index, timestep); + const auto point_n_war_in = state.n_war_in(graph_index, point_index, timestep); + + bool result = timestep < graph.timesteps && point_input_ready == point_n_raw_in && point_remote_input_empty == point_n_war_in && point_output_empty == 1 && point_input_consumed == 0; + printf("checking timestep %ld point_index %ld RAW in %d (of %d) WAR in %d (of %d) output empty %d input consumed %d result %d\n", + timestep, point_index, + point_input_ready, point_n_raw_in, + point_remote_input_empty, point_n_war_in, + point_output_empty, point_input_consumed, result); + return result; } -static std::pair timestep_to_send(long graph_index, long point) { - AutoLock guard(state.lock); - - auto rank = state.rank; - auto n_ranks = state.n_ranks; - +// IMPORTANT: must be called with state lock held +static void advance_timestep(long graph_index, long point, long point_index) { auto &graph = state.graphs[graph_index]; - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - - long point_index = point - first_point; + auto &point_timestep = state.timestep(graph_index, point_index); - auto &point_timestep = state.timesteps[graph_index][point_index]; - - long timestep = point_timestep - 1; // gets incremented at the end of check_and_run, so decrement here + for (; point_timestep < graph.timesteps; ++point_timestep) { + long offset = graph.offset_at_timestep(point_timestep); + long width = graph.width_at_timestep(point_timestep); - long field = timestep % state.num_fields; - long last_field = (timestep + state.num_fields - 1) % state.num_fields; + if (point >= offset && point < offset + width) + break; - auto &point_output_empty = state.output_empty[graph_index][point_index][field]; - auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; + printf("advance skipping timestep %ld point %ld\n", point_timestep, point); - long raw_timestep = -1, 
war_timestep = -1; - if (timestep >= 0 && timestep < graph.timesteps - 1) { - if (point_output_empty == 0) { - raw_timestep = timestep; - } - if (point_input_consumed == 1) { - war_timestep = timestep; - } } - return std::pair(raw_timestep, war_timestep); } -static bool check_and_run(long graph_index, long point) { +// IMPORTANT: must be called with state lock held +static std::pair run_task_body(long graph_index, long point, long timestep) { auto rank = state.rank; auto n_ranks = state.n_ranks; @@ -143,103 +126,50 @@ static bool check_and_run(long graph_index, long point) { long point_index = point - first_point; - auto &point_timestep = state.timesteps[graph_index][point_index]; - - for (; point_timestep < graph.timesteps; ++point_timestep) { - long offset = graph.offset_at_timestep(point_timestep); - long width = graph.width_at_timestep(point_timestep); - - if (point >= offset && point < offset + width) - break; - } - if (point_timestep >= graph.timesteps) { - return false; - } - - long last_offset = graph.offset_at_timestep(point_timestep-1); - long last_width = graph.width_at_timestep(point_timestep-1); - - long next_offset = graph.offset_at_timestep(point_timestep+1); - long next_width = graph.width_at_timestep(point_timestep+1); - - long last_field_offset = graph.offset_at_timestep(point_timestep - state.num_fields + 1); - long last_field_width = graph.width_at_timestep(point_timestep - state.num_fields + 1); - - long next_field_offset = graph.offset_at_timestep(point_timestep + state.num_fields - 1); - long next_field_width = graph.width_at_timestep(point_timestep + state.num_fields - 1); + auto &point_timestep = state.timestep(graph_index, point_index); + printf("sanity check timestep given %ld actual %ld\n", timestep, point_timestep); + assert(point_timestep == timestep); long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; long field = point_timestep % state.num_fields; - long dset = 
graph.dependence_set_at_timestep(point_timestep); - long next_dset = graph.dependence_set_at_timestep(point_timestep+1); - long last_field_dset = graph.dependence_set_at_timestep(point_timestep - state.num_fields + 1); - long next_field_dset = graph.dependence_set_at_timestep(point_timestep + state.num_fields - 1); - - auto &point_inputs = state.inputs[graph_index][point_index][last_field]; - auto &point_input_ready = state.input_ready[graph_index][point_index][last_field]; - auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; - auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][field]; - auto &point_input_ptr = state.input_ptr[graph_index][point_index][last_field]; - auto &point_input_bytes = state.input_bytes[graph_index][point_index][last_field]; - auto &point_output = state.outputs[graph_index][point_index][field]; - auto &point_output_empty = state.output_empty[graph_index][point_index][field]; - auto &point_scratch = state.scratch[graph_index][point_index]; - auto &point_deps = state.dependencies[graph_index][dset][point_index]; - auto &point_rev_deps = state.reverse_dependencies[graph_index][next_dset][point_index]; - auto &point_last_field_rev_deps = state.reverse_dependencies[graph_index][last_field_dset][point_index]; - auto &point_next_field_rev_deps = state.reverse_dependencies[graph_index][next_field_dset][point_index]; - - long n_inputs = 0; - for (auto interval : point_deps) { - long first_dep = std::min(std::max(interval.first, last_offset), last_offset + last_width); - long last_dep = std::min(std::max(interval.second + 1, last_offset), last_offset + last_width); - assert(first_dep <= last_dep); - n_inputs += last_dep - first_dep; - } - - long n_outputs = 0; - for (auto interval : point_rev_deps) { - long first_dep = std::min(std::max(interval.first, next_offset), next_offset + next_width); - long last_dep = std::min(std::max(interval.second + 1, next_offset), next_offset + 
next_width); - assert(first_dep <= last_dep); - n_outputs += last_dep - first_dep; - } - - long n_war_in = 0; - for (auto interval : point_last_field_rev_deps) { - long first_dep = std::min(std::max(interval.first, last_field_offset), last_field_offset + last_field_width); - long last_dep = std::min(std::max(interval.second + 1, last_field_offset), last_field_offset + last_field_width); - assert(first_dep <= last_dep); - n_war_in += last_dep - first_dep; - } - - long n_war_out = 0; - for (auto interval : point_deps) { - long first_dep = std::min(std::max(interval.first, next_field_offset), next_field_offset + next_field_width); - long last_dep = std::min(std::max(interval.second + 1, next_field_offset), next_field_offset + next_field_width); - assert(first_dep <= last_dep); - n_war_out += last_dep - first_dep; + auto &point_input_ready = state.input_ready(graph_index, point_index, last_field); + auto &point_input_consumed = state.input_consumed(graph_index, point_index, last_field); + auto &point_remote_input_empty = state.remote_input_empty(graph_index, point_index, field); + auto point_input_ptr = state.input_ptr.empty() ? nullptr : &state.input_ptr(graph_index, point_index, last_field, 0); + auto point_input_bytes = state.input_bytes.empty() ? nullptr : &state.input_bytes(graph_index, point_index, last_field, 0); + auto &point_output = state.outputs(graph_index, point_index, field, 0); + auto &point_output_empty = state.output_empty(graph_index, point_index, field); + auto point_scratch = state.scratch.empty() ? 
nullptr : &state.scratch(graph_index, point_index, 0); + const auto point_n_raw_in = state.n_raw_in(graph_index, point_index, timestep); + const auto point_n_raw_out = state.n_raw_out(graph_index, point_index, timestep); + const auto point_n_war_out = state.n_war_out(graph_index, point_index, timestep); + + printf("running task timestep %ld point %ld\n", timestep, point); + + graph.execute_point(point_timestep, point, + &point_output, graph.output_bytes_per_task, + point_input_ptr, point_input_bytes, point_n_raw_in, + point_scratch, graph.scratch_bytes_per_task); + + point_input_ready = 0; + point_remote_input_empty = 0; + point_input_consumed = point_n_war_out != 0; + point_output_empty = point_n_raw_out == 0; + + ++point_timestep; + advance_timestep(graph_index, point, point_index); + if (point_timestep >= graph.timesteps) { + ++state.complete; + } else if (check_task_ready(graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph_index, point_index, point_timestep)); } - bool ready = point_timestep < graph.timesteps && point_input_ready == n_inputs && point_remote_input_empty == n_war_in && point_output_empty == 1 && point_input_consumed == 0; - if (ready) { - graph.execute_point(point_timestep, point, - point_output.data(), point_output.size(), - point_input_ptr.data(), point_input_bytes.data(), n_inputs, - point_scratch.data(), point_scratch.size()); + long send_raw = point_output_empty == 0 && timestep < graph.timesteps-1 ? timestep : -1; + long send_war = point_input_consumed == 1 && timestep < graph.timesteps-1 ? 
timestep : -1; - point_input_ready = 0; - point_remote_input_empty = 0; - point_input_consumed = n_war_out != 0; - point_output_empty = n_outputs == 0; - - ++point_timestep; - - return true; - } - - return false; + return std::pair(send_raw, send_war); } static void RAW_handler(gex_Token_t token, void *buffer, size_t size, @@ -264,14 +194,13 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, long dset = graph.dependence_set_at_timestep(timestep+1); - auto &point_inputs = state.inputs[graph_index][point_index][field]; - auto &point_input_ready = state.input_ready[graph_index][point_index][field]; - auto &point_deps = state.dependencies[graph_index][dset][point_index]; + auto &point_input_ready = state.input_ready(graph_index, point_index, field); + auto &point_deps = state.dependencies(graph_index, dset, point_index); long input_idx = 0; for (auto interval : point_deps) { - long first_dep = std::min(std::max(interval.first, offset), offset + width); - long last_dep = std::min(interval.second + 1, offset + width); + long first_dep = clamp(interval.first, offset, offset + width); + long last_dep = clamp(interval.second + 1, offset, offset + width); assert(first_dep <= last_dep); if (first_dep <= source_point && source_point <= last_dep) { first_dep = std::min(first_dep, (long)source_point); @@ -283,8 +212,19 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, } } - point_inputs[input_idx].assign((char *)buffer, ((char *)buffer) + size); + std::copy( + (char *)buffer, ((char *)buffer) + size, + &state.inputs(graph_index, point_index, field, input_idx, 0)); + point_input_ready++; + + printf("RAW handler timestep %d source %d dest %d input ready (after) %d\n", timestep, source_point, dest_point, point_input_ready); + + auto &point_timestep = state.timestep(graph_index, point_index); + if (point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + 
std::tuple(graph_index, point_index, point_timestep)); + } } static void WAR_handler(gex_Token_t token, @@ -303,8 +243,22 @@ static void WAR_handler(gex_Token_t token, long last_field = (timestep + state.num_fields - 1) % state.num_fields; - auto &point_remote_input_empty = state.remote_input_empty[graph_index][point_index][last_field]; + auto &point_remote_input_empty = state.remote_input_empty(graph_index, point_index, last_field); point_remote_input_empty++; + + printf("WAR handler timestep %d point %d remote input empty (after) %d\n", timestep, point, point_remote_input_empty); + + // FIXME: I think there are some conditions under which we need this + // (if a WAR handler comes very late) but if you just do it blindly + // you end up with double triggers. + + // Need to check that timestep == next_field_timestep, or something like that. + + // auto &point_timestep = state.timestep(graph_index, point_index); + // if (point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + // state.task_ready_queue.push_back( + // std::tuple(graph_index, point_index, point_timestep)); + // } } const int N_HANDLERS = 2; @@ -355,271 +309,291 @@ int main(int argc, char *argv[]) state.graphs = app.graphs; - state.timesteps.resize(app.graphs.size()); - state.inputs.resize(app.graphs.size()); - state.input_ready.resize(app.graphs.size()); - state.input_consumed.resize(app.graphs.size()); - state.remote_input_empty.resize(app.graphs.size()); - state.input_ptr.resize(app.graphs.size()); - state.input_bytes.resize(app.graphs.size()); - state.outputs.resize(app.graphs.size()); - state.output_empty.resize(app.graphs.size()); - state.scratch.resize(app.graphs.size()); - state.dependencies.resize(app.graphs.size()); - state.reverse_dependencies.resize(app.graphs.size()); - - std::vector > graph_rank_by_point(app.graphs.size()); + long max_timesteps = 0; + long max_points = 0; + long max_dsets = 0; + long max_deps = 0; + size_t max_output_bytes = 0; 
+ size_t max_scratch_bytes = 0; + long expected_tasks = 0; + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; - std::vector > sends_raw; - std::vector > sends_war; + max_timesteps = std::max(max_timesteps, graph.timesteps); - double elapsed_time = 0.0; - for (int iter = 0; iter < 2; ++iter) { - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - long n_points = last_point - first_point + 1; + max_points = std::max(max_points, n_points); - long max_deps = 0; - for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { - for (long point = first_point; point <= last_point; ++point) { - long deps = 0; - for (auto interval : graph.dependencies(dset, point)) { - deps += interval.second - interval.first + 1; - } - max_deps = std::max(max_deps, deps); - } - } + max_output_bytes = std::max(max_output_bytes, graph.output_bytes_per_task); + max_scratch_bytes = std::max(max_scratch_bytes, graph.scratch_bytes_per_task); + + max_dsets = std::max(max_dsets, graph.max_dependence_sets()); - // Initialize data structures. 
- graph_rank_by_point[graph.graph_index].resize(graph.max_width); - for (gex_Rank_t r = 0; r < n_ranks; ++r) { - long r_first_point = r * graph.max_width / n_ranks; - long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; - for (long p = r_first_point; p <= r_last_point; ++p) { - graph_rank_by_point[graph.graph_index][p] = r; + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + for (long point = first_point; point <= last_point; ++point) { + long deps = 0; + for (auto interval : graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; } + max_deps = std::max(max_deps, deps); } + } - auto &timesteps = state.timesteps[graph.graph_index]; - auto &inputs = state.inputs[graph.graph_index]; - auto &input_ready = state.input_ready[graph.graph_index]; - auto &input_consumed = state.input_consumed[graph.graph_index]; - auto &remote_input_empty = state.remote_input_empty[graph.graph_index]; - auto &input_ptr = state.input_ptr[graph.graph_index]; - auto &input_bytes = state.input_bytes[graph.graph_index]; - auto &outputs = state.outputs[graph.graph_index]; - auto &output_empty = state.output_empty[graph.graph_index]; - auto &scratch = state.scratch[graph.graph_index]; - - timesteps.resize(n_points); - inputs.resize(n_points); - input_ready.resize(n_points); - input_consumed.resize(n_points); - remote_input_empty.resize(n_points); - input_ptr.resize(n_points); - input_bytes.resize(n_points); - outputs.resize(n_points); - output_empty.resize(n_points); - scratch.resize(n_points); + expected_tasks += n_points; + } - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + size_t n_graphs = app.graphs.size(); + + state.timestep.resize( {n_graphs, (size_t)max_points}); + state.inputs.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps, max_output_bytes}); + state.input_ready.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + 
state.input_consumed.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.remote_input_empty.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.input_ptr.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps}); + state.input_bytes.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps}); + state.outputs.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, max_output_bytes}); + state.output_empty.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.n_raw_in.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_raw_out.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_war_in.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_war_out.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.scratch.resize( {n_graphs, (size_t)max_points, max_scratch_bytes}); + state.dependencies.resize( {n_graphs, (size_t)max_dsets, (size_t)max_points}); + state.reverse_dependencies.resize({n_graphs, (size_t)max_dsets, (size_t)max_points}); + + std::vector > rank_by_point(app.graphs.size()); + + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; - auto &point_timestep = timesteps[point_index]; - point_timestep = 0; - - auto &point_inputs = inputs[point_index]; - auto &point_input_ready = input_ready[point_index]; - auto &point_input_consumed = input_consumed[point_index]; - auto &point_remote_input_empty = remote_input_empty[point_index]; - auto &point_input_ptr = input_ptr[point_index]; - auto &point_input_bytes = input_bytes[point_index]; - auto &point_outputs = outputs[point_index]; - auto &point_output_empty = output_empty[point_index]; - - point_inputs.resize(state.num_fields); - point_input_ready.resize(state.num_fields); 
- point_input_consumed.resize(state.num_fields); - point_remote_input_empty.resize(state.num_fields); - point_input_ptr.resize(state.num_fields); - point_input_bytes.resize(state.num_fields); - point_outputs.resize(state.num_fields); - point_output_empty.resize(state.num_fields); - - for (long field = 0; field < state.num_fields; ++field) { - auto &field_inputs = point_inputs[field]; - auto &field_input_ptr = point_input_ptr[field]; - auto &field_input_bytes = point_input_bytes[field]; - - field_inputs.resize(max_deps); - field_input_ptr.resize(max_deps); - field_input_bytes.resize(max_deps); - - for (long dep = 0; dep < max_deps; ++dep) { - field_inputs[dep].resize(graph.output_bytes_per_task); - field_input_ptr[dep] = field_inputs[dep].data(); - field_input_bytes[dep] = field_inputs[dep].size(); - } + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; - auto &field_input_ready = point_input_ready[field]; - field_input_ready = 0; + for (long field = 0; field < state.num_fields; ++field) { + for (long dep = 0; dep < max_deps; ++dep) { + state.input_ptr(graph.graph_index, point_index, field, dep) = &state.inputs(graph.graph_index, point_index, field, dep, 0); + state.input_bytes(graph.graph_index, point_index, field, dep) = graph.output_bytes_per_task; + } + } - auto &field_input_consumed = point_input_consumed[field]; - field_input_consumed = 0; + for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); - auto &field_remote_input_empty = point_remote_input_empty[field]; - field_remote_input_empty = 0; + long next_offset = graph.offset_at_timestep(timestep+1); + long next_width = graph.width_at_timestep(timestep+1); - auto &field_outputs = point_outputs[field]; - field_outputs.resize(graph.output_bytes_per_task); + long last_field_offset = graph.offset_at_timestep(timestep - 
state.num_fields + 1); + long last_field_width = graph.width_at_timestep(timestep - state.num_fields + 1); - auto &field_output_empty = point_output_empty[field]; - field_output_empty = 1; - } + long next_field_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(timestep + state.num_fields - 1); - auto &point_scratch = scratch[point_index]; - point_scratch.resize(graph.scratch_bytes_per_task); - TaskGraph::prepare_scratch(point_scratch.data(), point_scratch.size()); - } - - // Cache dependencies. - auto &dependencies = state.dependencies[graph.graph_index]; - auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; + long dset = graph.dependence_set_at_timestep(timestep); + long next_dset = graph.dependence_set_at_timestep(timestep+1); + long last_field_dset = graph.dependence_set_at_timestep(timestep - state.num_fields + 1); + long next_field_dset = graph.dependence_set_at_timestep(timestep + state.num_fields - 1); - dependencies.resize(graph.max_dependence_sets()); - reverse_dependencies.resize(graph.max_dependence_sets()); + long raw_in = 0; + for (auto interval : graph.dependencies(dset, point)) { + long first_dep = clamp(interval.first, last_offset, last_offset + last_width); + long last_dep = clamp(interval.second + 1, last_offset, last_offset + last_width); + assert(first_dep <= last_dep); + raw_in += last_dep - first_dep; + } + state.n_raw_in(graph.graph_index, point_index, timestep) = raw_in; + + long raw_out = 0; + for (auto interval : graph.reverse_dependencies(next_dset, point)) { + long first_dep = clamp(interval.first, next_offset, next_offset + next_width); + long last_dep = clamp(interval.second + 1, next_offset, next_offset + next_width); + assert(first_dep <= last_dep); + raw_out += last_dep - first_dep; + } + state.n_raw_out(graph.graph_index, point_index, timestep) = raw_out; + + long war_in = 0; + for (auto interval : graph.reverse_dependencies(last_field_dset, 
point)) { + long first_dep = clamp(interval.first, last_field_offset, last_field_offset + last_field_width); + long last_dep = clamp(interval.second + 1, last_field_offset, last_field_offset + last_field_width); + assert(first_dep <= last_dep); + war_in += last_dep - first_dep; + } + state.n_war_in(graph.graph_index, point_index, timestep) = war_in; + + long war_out = 0; + for (auto interval : graph.dependencies(dset, point)) { + long first_dep = clamp(interval.first, next_field_offset, next_field_offset + next_field_width); + long last_dep = clamp(interval.second + 1, next_field_offset, next_field_offset + next_field_width); + assert(first_dep <= last_dep); + war_out += last_dep - first_dep; + } + state.n_war_out(graph.graph_index, point_index, timestep) = war_out; + } for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { - dependencies[dset].resize(n_points); - reverse_dependencies[dset].resize(n_points); + auto deps = graph.dependencies(dset, point); + auto rev_deps = graph.dependencies(dset, point); - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + state.dependencies(graph.graph_index, dset, point_index) = deps; + state.reverse_dependencies(graph.graph_index, dset, point_index) = rev_deps; + } - dependencies[dset][point_index] = graph.dependencies(dset, point); - reverse_dependencies[dset][point_index] = graph.reverse_dependencies(dset, point); - } + if (!state.scratch.empty()) + TaskGraph::prepare_scratch(&state.scratch(graph.graph_index, point_index, 0), graph.scratch_bytes_per_task); + } + + rank_by_point[graph.graph_index].resize(graph.max_width); + for (gex_Rank_t r = 0; r < n_ranks; ++r) { + long r_first_point = r * graph.max_width / n_ranks; + long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; + for (long p = r_first_point; p <= r_last_point; ++p) { + rank_by_point[graph.graph_index][p] = r; } } + } + + std::vector > send_queue; + + std::vector > sends_raw; + std::vector > 
sends_war; + + std::vector > task_ready_queue_local; + + double elapsed_time = 0.0; + for (int iter = 0; iter < 2; ++iter) { + std::fill(state.timestep.begin(), state.timestep.end(), 0); + std::fill(state.input_ready.begin(), state.input_ready.end(), 0); + std::fill(state.input_consumed.begin(), state.input_consumed.end(), 0); + std::fill(state.remote_input_empty.begin(), state.remote_input_empty.end(), 0); + std::fill(state.output_empty.begin(), state.output_empty.end(), 1); gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); double start_time = Timer::get_cur_time(); - while (!is_complete()) { - // Run any ready tasks. - { - AutoLock guard(state.lock); + // Advance and queue initial tasks. + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + advance_timestep(graph.graph_index, point, point_index); - long timestep = state.timesteps[graph.graph_index][point_index]; + auto &point_timestep = state.timestep(graph.graph_index, point_index); - if (timestep >= graph.timesteps) { - continue; - } + if (check_task_ready(graph.graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph.graph_index, point_index, point_timestep)); + } + } + } + + while (true) { + { + AutoLock guard(state.lock); + + // Check for completion. + if (state.complete == expected_tasks) + break; - check_and_run(graph.graph_index, point); + // Run any ready tasks. 
+ task_ready_queue_local.swap(state.task_ready_queue); + for (auto entry : task_ready_queue_local) { + long graph_index, point, timestep; + std::tie(graph_index, point, timestep) = entry; + + auto send = run_task_body(graph_index, point, timestep); + if (send.first >= 0 || send.second >= 0) { + send_queue.push_back(std::tuple(graph_index, point, send.first, send.second)); } } - } - // Send data for RAW dependencies - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + // Clear the task queue. + task_ready_queue_local.clear(); + } - auto &outputs = state.outputs[graph.graph_index]; + // Issue queued sends. + for (auto entry : send_queue) { + long graph_index, point, raw_timestep, war_timestep; + std::tie(graph_index, point, raw_timestep, war_timestep) = entry; - auto &reverse_dependencies = state.reverse_dependencies[graph.graph_index]; + printf("processing queued send for point %ld RAW timestep %ld WAR timestep %ld\n", point, raw_timestep, war_timestep); - auto &rank_by_point = graph_rank_by_point[graph.graph_index]; + auto &graph = app.graphs[graph_index]; - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; - long timestep = timestep_to_send(graph.graph_index, point).first; - if (timestep >= 0) { - long offset = graph.offset_at_timestep(timestep); - long width = graph.width_at_timestep(timestep); + long point_index = point - first_point; - long next_offset = graph.offset_at_timestep(timestep+1); - long next_width = graph.width_at_timestep(timestep+1); + // RAW dependencies: + if (raw_timestep >= 0) { + long offset = graph.offset_at_timestep(raw_timestep); + long width = graph.width_at_timestep(raw_timestep); - long field = timestep % state.num_fields; + long next_offset = 
graph.offset_at_timestep(raw_timestep+1); + long next_width = graph.width_at_timestep(raw_timestep+1); - long dset = graph.dependence_set_at_timestep(timestep + 1); + long field = raw_timestep % state.num_fields; - auto &point_output = outputs[point_index][field]; - auto &point_rev_deps = reverse_dependencies[dset][point_index]; + long dset = graph.dependence_set_at_timestep(raw_timestep + 1); - if (point >= offset && point < offset + width) { - for (auto interval : point_rev_deps) { - for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < next_offset || dep >= next_offset + next_width) { - continue; - } + auto &point_output = state.outputs(graph_index, point_index, field, 0); + auto &point_rev_deps = state.reverse_dependencies(graph_index, dset, point_index); - CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[dep], handlers[0].gex_index, - point_output.data(), point_output.size(), - GEX_EVENT_GROUP, 0, - (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, - (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); - sends_raw.push_back(std::tuple(graph.graph_index, timestep, point)); + if (point >= offset && point < offset + width) { + for (auto interval : point_rev_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < next_offset || dep >= next_offset + next_width) { + continue; } + + CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[graph_index][dep], handlers[0].gex_index, + &point_output, graph.output_bytes_per_task, + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)raw_timestep, + (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); + sends_raw.push_back(std::tuple(graph.graph_index, raw_timestep, point)); } } } } - } - // Send data for WAR dependencies - for (auto graph : app.graphs) { - long first_point = rank * graph.max_width / n_ranks; - long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + // WAR dependencies: + if (war_timestep >= 0) { + long offset = graph.offset_at_timestep(war_timestep); + 
long width = graph.width_at_timestep(war_timestep); - auto &dependencies = state.dependencies[graph.graph_index]; + long next_field_offset = graph.offset_at_timestep(war_timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(war_timestep + state.num_fields - 1); - auto &rank_by_point = graph_rank_by_point[graph.graph_index]; + long dset = graph.dependence_set_at_timestep(war_timestep); - for (long point = first_point; point <= last_point; ++point) { - long point_index = point - first_point; - - long timestep = timestep_to_send(graph.graph_index, point).second; - if (timestep >= 0) { - - long offset = graph.offset_at_timestep(timestep); - long width = graph.width_at_timestep(timestep); - - long next_field_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); - long next_field_width = graph.width_at_timestep(timestep + state.num_fields - 1); - - long dset = graph.dependence_set_at_timestep(timestep); + auto &point_deps = state.dependencies(graph_index, dset, point_index); - auto &point_deps = dependencies[dset][point_index]; - - if (point >= offset && point < offset + width) { - for (auto interval : point_deps) { - for (long dep = interval.first; dep <= interval.second; dep++) { - if (dep < next_field_offset || dep >= next_field_offset + next_field_width) { - continue; - } - - CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[dep], handlers[1].gex_index, 0, - (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)timestep, (gex_AM_Arg_t)dep)); - sends_war.push_back(std::tuple(graph.graph_index, timestep, point)); + if (point >= offset && point < offset + width) { + for (auto interval : point_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < next_field_offset || dep >= next_field_offset + next_field_width) { + continue; } + + CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[graph_index][dep], handlers[1].gex_index, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)war_timestep, (gex_AM_Arg_t)dep)); + 
sends_war.push_back(std::tuple(graph.graph_index, war_timestep, point)); } } } } } + send_queue.clear(); // Wait for local completion so it's safe to override output buffers. gex_NBI_Wait(GEX_EC_LC, 0); @@ -643,7 +617,7 @@ int main(int argc, char *argv[]) long last_field = (timestep + state.num_fields - 1) % state.num_fields; long field = timestep % state.num_fields; - auto &point_output_empty = state.output_empty[graph_index][point_index][field]; + auto &point_output_empty = state.output_empty(graph_index, point_index, field); point_output_empty = 1; } for (auto &send : sends_war) { @@ -661,7 +635,7 @@ int main(int argc, char *argv[]) long last_field = (timestep + state.num_fields - 1) % state.num_fields; - auto &point_input_consumed = state.input_consumed[graph_index][point_index][last_field]; + auto &point_input_consumed = state.input_consumed(graph_index, point_index, last_field); point_input_consumed = 0; } } @@ -683,4 +657,7 @@ int main(int argc, char *argv[]) } gex_HSL_Destroy(&state.lock); + + return 0; } + From e78c8a35f32abc2081777294c84487dc2228f589 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 10:38:43 -0800 Subject: [PATCH 27/40] Checkpoint working code. 
--- gasnet/seq.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 3299720f..a1e9265c 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -127,7 +127,6 @@ static std::pair run_task_body(long graph_index, long point, long ti long point_index = point - first_point; auto &point_timestep = state.timestep(graph_index, point_index); - printf("sanity check timestep given %ld actual %ld\n", timestep, point_timestep); assert(point_timestep == timestep); long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; @@ -436,7 +435,7 @@ int main(int argc, char *argv[]) for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { auto deps = graph.dependencies(dset, point); - auto rev_deps = graph.dependencies(dset, point); + auto rev_deps = graph.reverse_dependencies(dset, point); state.dependencies(graph.graph_index, dset, point_index) = deps; state.reverse_dependencies(graph.graph_index, dset, point_index) = rev_deps; @@ -465,6 +464,7 @@ int main(int argc, char *argv[]) double elapsed_time = 0.0; for (int iter = 0; iter < 2; ++iter) { + state.complete = 0; std::fill(state.timestep.begin(), state.timestep.end(), 0); std::fill(state.input_ready.begin(), state.input_ready.end(), 0); std::fill(state.input_consumed.begin(), state.input_consumed.end(), 0); @@ -488,7 +488,9 @@ int main(int argc, char *argv[]) auto &point_timestep = state.timestep(graph.graph_index, point_index); - if (check_task_ready(graph.graph_index, point_index, point_timestep)) { + if (point_timestep >= graph.timesteps) { + ++state.complete; + } else if (check_task_ready(graph.graph_index, point_index, point_timestep)) { state.task_ready_queue.push_back( std::tuple(graph.graph_index, point_index, point_timestep)); } @@ -548,13 +550,18 @@ int main(int argc, char *argv[]) auto &point_output = state.outputs(graph_index, point_index, field, 0); auto &point_rev_deps = state.reverse_dependencies(graph_index, dset, point_index); + 
printf("check for RAW send point %ld offset %ld width %ld\n", point, offset, width); + if (point >= offset && point < offset + width) { for (auto interval : point_rev_deps) { + printf(" got interval %ld %ld\n", interval.first, interval.second); for (long dep = interval.first; dep <= interval.second; dep++) { + printf(" check for RAW send dep %ld next_offset %ld next_width %ld\n", dep, next_offset, next_width); if (dep < next_offset || dep >= next_offset + next_width) { continue; } + printf("send RAW source %ld dest %ld timestep %ld\n", point, dep, raw_timestep); CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[graph_index][dep], handlers[0].gex_index, &point_output, graph.output_bytes_per_task, GEX_EVENT_GROUP, 0, From 420c2549b902956736de4560fee14d63524a0e83 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 11:20:13 -0800 Subject: [PATCH 28/40] Checkpoint with more fixes, non-deterministic freeze on DOM multi-node. --- gasnet/seq.cc | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index a1e9265c..70ec8775 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -75,7 +75,7 @@ T clamp(T value, T low, T high) { } // IMPORTANT: must be called with state lock held -static bool check_task_ready(long graph_index, long point_index, long timestep) { +static bool check_task_ready(long graph_index, long point, long point_index, long timestep) { auto &graph = state.graphs[graph_index]; long last_field = (timestep + state.num_fields - 1) % state.num_fields; @@ -89,8 +89,11 @@ static bool check_task_ready(long graph_index, long point_index, long timestep) const auto point_n_war_in = state.n_war_in(graph_index, point_index, timestep); bool result = timestep < graph.timesteps && point_input_ready == point_n_raw_in && point_remote_input_empty == point_n_war_in && point_output_empty == 1 && point_input_consumed == 0; - printf("checking timestep %ld point_index %ld RAW in %d (of %d) 
WAR in %d (of %d) output empty %d input consumed %d result %d\n", - timestep, point_index, + auto rank = state.rank; + auto n_ranks = state.n_ranks; + printf("[rank %d/%d] checking timestep %ld point %ld point_index %ld RAW in %d (of %d) WAR in %d (of %d) output empty %d input consumed %d result %d\n", + rank, n_ranks, + timestep, point, point_index, point_input_ready, point_n_raw_in, point_remote_input_empty, point_n_war_in, point_output_empty, point_input_consumed, result); @@ -126,6 +129,8 @@ static std::pair run_task_body(long graph_index, long point, long ti long point_index = point - first_point; + printf("running task timestep %ld point %ld\n", timestep, point); + auto &point_timestep = state.timestep(graph_index, point_index); assert(point_timestep == timestep); @@ -144,8 +149,6 @@ static std::pair run_task_body(long graph_index, long point, long ti const auto point_n_raw_out = state.n_raw_out(graph_index, point_index, timestep); const auto point_n_war_out = state.n_war_out(graph_index, point_index, timestep); - printf("running task timestep %ld point %ld\n", timestep, point); - graph.execute_point(point_timestep, point, &point_output, graph.output_bytes_per_task, point_input_ptr, point_input_bytes, point_n_raw_in, @@ -160,9 +163,9 @@ static std::pair run_task_body(long graph_index, long point, long ti advance_timestep(graph_index, point, point_index); if (point_timestep >= graph.timesteps) { ++state.complete; - } else if (check_task_ready(graph_index, point_index, point_timestep)) { + } else if (check_task_ready(graph_index, point, point_index, point_timestep)) { state.task_ready_queue.push_back( - std::tuple(graph_index, point_index, point_timestep)); + std::tuple(graph_index, point, point_timestep)); } long send_raw = point_output_empty == 0 && timestep < graph.timesteps-1 ? 
timestep : -1; @@ -220,9 +223,10 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, printf("RAW handler timestep %d source %d dest %d input ready (after) %d\n", timestep, source_point, dest_point, point_input_ready); auto &point_timestep = state.timestep(graph_index, point_index); - if (point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + if (timestep + 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point, point_index, point_timestep)) { + printf(" queueing task timestep %ld point %ld from RAW handler\n", point_timestep, point); state.task_ready_queue.push_back( - std::tuple(graph_index, point_index, point_timestep)); + std::tuple(graph_index, point, point_timestep)); } } @@ -254,9 +258,9 @@ static void WAR_handler(gex_Token_t token, // Need to check that timestep == next_field_timestep, or something like that. // auto &point_timestep = state.timestep(graph_index, point_index); - // if (point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + // if (point_timestep < graph.timesteps && check_task_ready(graph_index, point, point_index, point_timestep)) { // state.task_ready_queue.push_back( - // std::tuple(graph_index, point_index, point_timestep)); + // std::tuple(graph_index, point, point_timestep)); // } } @@ -293,6 +297,9 @@ int main(int argc, char *argv[]) state.rank = rank; state.n_ranks = n_ranks; + // printf("[rank %d/%d] PID %d sleeping for 30 seconds...\n", rank, n_ranks, getpid()); + // sleep(30); + // uintptr_t max_size = 0; // gasnet_getMaxLocalSegmentSize(); // don't need this with AM Medium // gex_Segment_t segment; // CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); @@ -463,7 +470,7 @@ int main(int argc, char *argv[]) std::vector > task_ready_queue_local; double elapsed_time = 0.0; - for (int iter = 0; iter < 2; ++iter) { + for (int iter = 0; iter < 1; ++iter) { state.complete = 0; 
std::fill(state.timestep.begin(), state.timestep.end(), 0); std::fill(state.input_ready.begin(), state.input_ready.end(), 0); @@ -490,9 +497,9 @@ int main(int argc, char *argv[]) if (point_timestep >= graph.timesteps) { ++state.complete; - } else if (check_task_ready(graph.graph_index, point_index, point_timestep)) { + } else if (check_task_ready(graph.graph_index, point, point_index, point_timestep)) { state.task_ready_queue.push_back( - std::tuple(graph.graph_index, point_index, point_timestep)); + std::tuple(graph.graph_index, point, point_timestep)); } } } From 164c91c0e0a51e36c86b43805ecb8d86afad870c Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 15:15:51 -0800 Subject: [PATCH 29/40] Fix for queueing on WAR dependencies. --- gasnet/seq.cc | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 70ec8775..0a297810 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -251,17 +251,12 @@ static void WAR_handler(gex_Token_t token, printf("WAR handler timestep %d point %d remote input empty (after) %d\n", timestep, point, point_remote_input_empty); - // FIXME: I think there are some conditions under which we need this - // (if a WAR handler comes very late) but if you just do it blindly - // you end up with double triggers. - - // Need to check that timestep == next_field_timestep, or something like that. 
- - // auto &point_timestep = state.timestep(graph_index, point_index); - // if (point_timestep < graph.timesteps && check_task_ready(graph_index, point, point_index, point_timestep)) { - // state.task_ready_queue.push_back( - // std::tuple(graph_index, point, point_timestep)); - // } + auto &point_timestep = state.timestep(graph_index, point_index); + if (timestep + state.num_fields - 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point, point_index, point_timestep)) { + printf(" queueing task timestep %ld point %d from WAR handler\n", point_timestep, point); + state.task_ready_queue.push_back( + std::tuple(graph_index, point, point_timestep)); + } } const int N_HANDLERS = 2; From 05a58caf075fe5929a87a04abaff5639349815c3 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 15:20:49 -0800 Subject: [PATCH 30/40] Cleanup. --- gasnet/seq.cc | 42 ++++++------------------------------------ 1 file changed, 6 insertions(+), 36 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 0a297810..bac81b1d 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -75,7 +75,7 @@ T clamp(T value, T low, T high) { } // IMPORTANT: must be called with state lock held -static bool check_task_ready(long graph_index, long point, long point_index, long timestep) { +static bool check_task_ready(long graph_index, long point_index, long timestep) { auto &graph = state.graphs[graph_index]; long last_field = (timestep + state.num_fields - 1) % state.num_fields; @@ -88,16 +88,7 @@ static bool check_task_ready(long graph_index, long point, long point_index, lon const auto point_n_raw_in = state.n_raw_in(graph_index, point_index, timestep); const auto point_n_war_in = state.n_war_in(graph_index, point_index, timestep); - bool result = timestep < graph.timesteps && point_input_ready == point_n_raw_in && point_remote_input_empty == point_n_war_in && point_output_empty == 1 && point_input_consumed == 0; - auto rank = state.rank; - auto 
n_ranks = state.n_ranks; - printf("[rank %d/%d] checking timestep %ld point %ld point_index %ld RAW in %d (of %d) WAR in %d (of %d) output empty %d input consumed %d result %d\n", - rank, n_ranks, - timestep, point, point_index, - point_input_ready, point_n_raw_in, - point_remote_input_empty, point_n_war_in, - point_output_empty, point_input_consumed, result); - return result; + return timestep < graph.timesteps && point_input_ready == point_n_raw_in && point_remote_input_empty == point_n_war_in && point_output_empty == 1 && point_input_consumed == 0; } // IMPORTANT: must be called with state lock held @@ -112,9 +103,6 @@ static void advance_timestep(long graph_index, long point, long point_index) { if (point >= offset && point < offset + width) break; - - printf("advance skipping timestep %ld point %ld\n", point_timestep, point); - } } @@ -129,8 +117,6 @@ static std::pair run_task_body(long graph_index, long point, long ti long point_index = point - first_point; - printf("running task timestep %ld point %ld\n", timestep, point); - auto &point_timestep = state.timestep(graph_index, point_index); assert(point_timestep == timestep); @@ -163,7 +149,7 @@ static std::pair run_task_body(long graph_index, long point, long ti advance_timestep(graph_index, point, point_index); if (point_timestep >= graph.timesteps) { ++state.complete; - } else if (check_task_ready(graph_index, point, point_index, point_timestep)) { + } else if (check_task_ready(graph_index, point_index, point_timestep)) { state.task_ready_queue.push_back( std::tuple(graph_index, point, point_timestep)); } @@ -220,11 +206,8 @@ static void RAW_handler(gex_Token_t token, void *buffer, size_t size, point_input_ready++; - printf("RAW handler timestep %d source %d dest %d input ready (after) %d\n", timestep, source_point, dest_point, point_input_ready); - auto &point_timestep = state.timestep(graph_index, point_index); - if (timestep + 1 == point_timestep && point_timestep < graph.timesteps && 
check_task_ready(graph_index, point, point_index, point_timestep)) { - printf(" queueing task timestep %ld point %ld from RAW handler\n", point_timestep, point); + if (timestep + 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { state.task_ready_queue.push_back( std::tuple(graph_index, point, point_timestep)); } @@ -249,11 +232,8 @@ static void WAR_handler(gex_Token_t token, auto &point_remote_input_empty = state.remote_input_empty(graph_index, point_index, last_field); point_remote_input_empty++; - printf("WAR handler timestep %d point %d remote input empty (after) %d\n", timestep, point, point_remote_input_empty); - auto &point_timestep = state.timestep(graph_index, point_index); - if (timestep + state.num_fields - 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point, point_index, point_timestep)) { - printf(" queueing task timestep %ld point %d from WAR handler\n", point_timestep, point); + if (timestep + state.num_fields - 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { state.task_ready_queue.push_back( std::tuple(graph_index, point, point_timestep)); } @@ -292,9 +272,6 @@ int main(int argc, char *argv[]) state.rank = rank; state.n_ranks = n_ranks; - // printf("[rank %d/%d] PID %d sleeping for 30 seconds...\n", rank, n_ranks, getpid()); - // sleep(30); - // uintptr_t max_size = 0; // gasnet_getMaxLocalSegmentSize(); // don't need this with AM Medium // gex_Segment_t segment; // CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); @@ -492,7 +469,7 @@ int main(int argc, char *argv[]) if (point_timestep >= graph.timesteps) { ++state.complete; - } else if (check_task_ready(graph.graph_index, point, point_index, point_timestep)) { + } else if (check_task_ready(graph.graph_index, point_index, point_timestep)) { state.task_ready_queue.push_back( std::tuple(graph.graph_index, point, 
point_timestep)); } @@ -528,8 +505,6 @@ int main(int argc, char *argv[]) long graph_index, point, raw_timestep, war_timestep; std::tie(graph_index, point, raw_timestep, war_timestep) = entry; - printf("processing queued send for point %ld RAW timestep %ld WAR timestep %ld\n", point, raw_timestep, war_timestep); - auto &graph = app.graphs[graph_index]; long first_point = rank * graph.max_width / n_ranks; @@ -552,18 +527,13 @@ int main(int argc, char *argv[]) auto &point_output = state.outputs(graph_index, point_index, field, 0); auto &point_rev_deps = state.reverse_dependencies(graph_index, dset, point_index); - printf("check for RAW send point %ld offset %ld width %ld\n", point, offset, width); - if (point >= offset && point < offset + width) { for (auto interval : point_rev_deps) { - printf(" got interval %ld %ld\n", interval.first, interval.second); for (long dep = interval.first; dep <= interval.second; dep++) { - printf(" check for RAW send dep %ld next_offset %ld next_width %ld\n", dep, next_offset, next_width); if (dep < next_offset || dep >= next_offset + next_width) { continue; } - printf("send RAW source %ld dest %ld timestep %ld\n", point, dep, raw_timestep); CHECK_OK(gex_AM_RequestMedium(tm, rank_by_point[graph_index][dep], handlers[0].gex_index, &point_output, graph.output_bytes_per_task, GEX_EVENT_GROUP, 0, From 1b55805f23c3841c1e4ba32450611ace58a82e91 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 15:23:58 -0800 Subject: [PATCH 31/40] Avoid unnecessary work. 
--- gasnet/seq.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index bac81b1d..279f6984 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -528,6 +528,7 @@ int main(int argc, char *argv[]) auto &point_rev_deps = state.reverse_dependencies(graph_index, dset, point_index); if (point >= offset && point < offset + width) { + bool sent = false; for (auto interval : point_rev_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { if (dep < next_offset || dep >= next_offset + next_width) { @@ -539,9 +540,11 @@ int main(int argc, char *argv[]) GEX_EVENT_GROUP, 0, (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)raw_timestep, (gex_AM_Arg_t)point, (gex_AM_Arg_t)dep)); - sends_raw.push_back(std::tuple(graph.graph_index, raw_timestep, point)); + sent = true; } } + if (sent) + sends_raw.push_back(std::tuple(graph.graph_index, raw_timestep, point)); } } @@ -558,6 +561,7 @@ int main(int argc, char *argv[]) auto &point_deps = state.dependencies(graph_index, dset, point_index); if (point >= offset && point < offset + width) { + bool sent = false; for (auto interval : point_deps) { for (long dep = interval.first; dep <= interval.second; dep++) { if (dep < next_field_offset || dep >= next_field_offset + next_field_width) { @@ -566,9 +570,11 @@ int main(int argc, char *argv[]) CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[graph_index][dep], handlers[1].gex_index, 0, (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)war_timestep, (gex_AM_Arg_t)dep)); - sends_war.push_back(std::tuple(graph.graph_index, war_timestep, point)); + sent = true; } } + if (sent) + sends_war.push_back(std::tuple(graph.graph_index, war_timestep, point)); } } } From 98943e2ebf367b58ace1d6e2815f0a4f39b01308 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 15:31:19 -0800 Subject: [PATCH 32/40] Forgot to add file. 
--- gasnet/multi_dimensional_array.h | 194 +++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 gasnet/multi_dimensional_array.h diff --git a/gasnet/multi_dimensional_array.h b/gasnet/multi_dimensional_array.h new file mode 100644 index 00000000..bd633ab5 --- /dev/null +++ b/gasnet/multi_dimensional_array.h @@ -0,0 +1,194 @@ +/* Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MULTI_DIMENSIONAL_ARRAY_H +#define MULTI_DIMENSIONAL_ARRAY_H + +#include +#include + +#include +#include + +// #define BOUNDS_CHECKS +// #define MAIN + +template +class Array { +public: + Array() + : data(nullptr) + { + } + + Array(std::initializer_list ext) + { + assert(ext.size() == N); + std::copy(ext.begin(), ext.end(), extent); + + total_extent = 1; + for (size_t dim = 0; dim < N; dim++) { + total_extent *= extent[dim]; + } + data = new T[total_extent]; + } + + Array(Array &array) + : extent(array.extent) + , total_extent(array.total_extent) + { + data = new T[total_extent]; + std::copy(array.begin(), array.end(), begin()); + } + + ~Array() + { + if (data) + delete [] data; + } + + Array & operator=(const Array &array) { + if (data) + delete [] data; + + extent = array.extent; + total_extent = array.total_extent; + std::copy(array.begin(), array.end(), begin()); + + return *this; + } + + void resize(std::initializer_list ext) + { + if (data) + delete [] data; + + assert(ext.size() == N); + 
std::copy(ext.begin(), ext.end(), extent); + + total_extent = 1; + for (size_t dim = 0; dim < N; dim++) { + total_extent *= extent[dim]; + } + data = new T[total_extent]; + } + + size_t size() const + { + return total_extent; + } + + bool empty() const + { + return size() == 0; + } + + T * begin() + { + return data; + } + + T * end() + { + return data + total_extent; + } + + template ::type = 0> + T & operator() (IDX... idx) + { +#ifdef BOUNDS_CHECKS + check(idx...); +#endif + size_t lin = linearize(idx...); +#ifdef BOUNDS_CHECKS + assert(lin < total_extent); +#endif + return data[lin]; + } + +private: + template + void check(FIRST first) { + const size_t DIM = N-1; + + assert((size_t) first >= 0 && (size_t) first < extent[DIM]); + } + + template + void check(FIRST first, REST... rest) { + const size_t DIM = N - sizeof...(REST) - 1; + + assert((size_t) first >= 0 && (size_t) first < extent[DIM]); + + check(rest...); + } + + template + size_t linearize(size_t acc, FIRST first) { + const size_t DIM = N-1; + + return first + acc * extent[DIM]; + } + + template + size_t linearize(size_t acc, FIRST first, REST... 
rest) { + const size_t DIM = N - sizeof...(REST) - 1; + + return linearize(first + acc * extent[DIM], rest...); + } + + size_t extent[N]; + size_t total_extent; + T *data; +}; + +#ifdef MAIN +int main() { + Array<5, int> a({2, 3, 4, 5, 6}); + Array<3, float> b({1, 2, 3}); + Array<2, double> c({1, 2}); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 4; k++) { + for (int l = 0; l < 5; l++) { + for (int m = 0; m < 6; m++) { + a(i, j, k, l, m) = i*j*k*l*m; + } + } + } + } + } + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 4; k++) { + for (int l = 0; l < 5; l++) { + for (int m = 0; m < 6; m++) { + assert(a(i, j, k, l, m) == i*j*k*l*m); + } + } + } + } + } + + // varies most quickly in the first dimension + assert(&a(0, 0, 0, 0, 1) - &a(0, 0, 0, 0, 0) == 1); + + return 0; +} +#endif + +#endif // MULTI_DIMENSIONAL_ARRAY_H From 4a3632c94276e8c0a2a3376b770053af1ecffd37 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Thu, 5 Mar 2020 15:48:02 -0800 Subject: [PATCH 33/40] Run two iters. --- gasnet/seq.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 279f6984..1e597912 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -442,7 +442,7 @@ int main(int argc, char *argv[]) std::vector > task_ready_queue_local; double elapsed_time = 0.0; - for (int iter = 0; iter < 1; ++iter) { + for (int iter = 0; iter < 2; ++iter) { state.complete = 0; std::fill(state.timestep.begin(), state.timestep.end(), 0); std::fill(state.input_ready.begin(), state.input_ready.end(), 0); From c262a516e08f3d56203067c33d3ba9beb8ba21ee Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Fri, 6 Mar 2020 11:18:17 -0800 Subject: [PATCH 34/40] Set default on Crays back to static linking. 
--- build_all.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/build_all.sh b/build_all.sh index c3004a59..7e6b3498 100755 --- a/build_all.sh +++ b/build_all.sh @@ -19,6 +19,11 @@ else fi THREADS=${THREADS:-$DEFAULT_THREADS} +# On Cray machines, default to static build. (Cori switched this +# default from static to dynamic in the January 2020 maintenance +# cycle, but we want to stick with static builds.) +export CRAYPE_LINK_TYPE=static + make -C core clean make -C core -j$THREADS From 4a48afeb840a72307593d35b318c4bd18b011465 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 9 Mar 2020 14:39:42 -0700 Subject: [PATCH 35/40] Flush when printing report. --- core/core.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/core.cc b/core/core.cc index a13d7202..37515952 100644 --- a/core/core.cc +++ b/core/core.cc @@ -1186,4 +1186,6 @@ void App::report_timing(double elapsed_seconds) const #ifdef DEBUG_CORE printf("Task Graph Execution Mask %llx\n", has_executed_graph.load()); #endif + + fflush(stdout); } From 8cc0bc6b7158db786665720ad9dc0135f6f34497 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 9 Mar 2020 14:39:58 -0700 Subject: [PATCH 36/40] Barrier on GASNet exit. --- gasnet/seq.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 1e597912..ddeb363b 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -643,6 +643,9 @@ int main(int argc, char *argv[]) gex_HSL_Destroy(&state.lock); + // Barrier to make sure report gets flushed before nodes exit. + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + return 0; } From c4040879b816bbeebf5e3de5fff7d6d77921f801 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Mon, 9 Mar 2020 16:56:42 -0700 Subject: [PATCH 37/40] Initial work on AM Long implementation. 
--- gasnet/Makefile | 2 +- gasnet/multi_dimensional_array.h | 96 ++++- gasnet/seq.cc | 3 +- gasnet/seq_long.cc | 687 +++++++++++++++++++++++++++++++ test_all.sh | 10 +- 5 files changed, 770 insertions(+), 28 deletions(-) create mode 100644 gasnet/seq_long.cc diff --git a/gasnet/Makefile b/gasnet/Makefile index df9619d6..d07a5d3d 100644 --- a/gasnet/Makefile +++ b/gasnet/Makefile @@ -18,7 +18,7 @@ endif include ../core/make_blas.mk -BIN := seq +BIN := seq seq_long HEADERS := multi_dimensional_array.h diff --git a/gasnet/multi_dimensional_array.h b/gasnet/multi_dimensional_array.h index bd633ab5..d9bd9ed1 100644 --- a/gasnet/multi_dimensional_array.h +++ b/gasnet/multi_dimensional_array.h @@ -38,51 +38,79 @@ class Array { assert(ext.size() == N); std::copy(ext.begin(), ext.end(), extent); - total_extent = 1; - for (size_t dim = 0; dim < N; dim++) { - total_extent *= extent[dim]; - } + compute_total_extent(); data = new T[total_extent]; + owned = true; } - Array(Array &array) - : extent(array.extent) - , total_extent(array.total_extent) + Array(T *ptr, std::initializer_list ext) { - data = new T[total_extent]; - std::copy(array.begin(), array.end(), begin()); + assert(ext.size() == N); + std::copy(ext.begin(), ext.end(), extent); + + compute_total_extent(); + data = ptr; + owned = false; } - ~Array() + Array(const Array &array) { - if (data) - delete [] data; + std::copy(array.extent, array.extent+N, extent); + total_extent = array.total_extent; + + if (array.owned) { + data = new T[total_extent]; + std::copy(array.begin(), array.end(), begin()); + } else { + data = array.data; + } + owned = array.owned; } Array & operator=(const Array &array) { - if (data) - delete [] data; + destroy(); extent = array.extent; total_extent = array.total_extent; - std::copy(array.begin(), array.end(), begin()); + + if (array.owned) { + data = new T[total_extent]; + std::copy(array.begin(), array.end(), begin()); + } else { + data = array.data; + } + owned = array.owned; return *this; } 
+ ~Array() + { + destroy(); + } + void resize(std::initializer_list ext) { - if (data) - delete [] data; + destroy(); assert(ext.size() == N); std::copy(ext.begin(), ext.end(), extent); - total_extent = 1; - for (size_t dim = 0; dim < N; dim++) { - total_extent *= extent[dim]; - } + compute_total_extent(); data = new T[total_extent]; + owned = true; + } + + void resize(T *ptr, std::initializer_list ext) + { + destroy(); + + assert(ext.size() == N); + std::copy(ext.begin(), ext.end(), extent); + + compute_total_extent(); + data = ptr; + owned = false; } size_t size() const @@ -100,11 +128,21 @@ class Array { return data; } + const T * begin() const + { + return data; + } + T * end() { return data + total_extent; } + const T * end() const + { + return data + total_extent; + } + template ::type = 0> T & operator() (IDX... idx) { @@ -118,6 +156,21 @@ class Array { return data[lin]; } +private: + void compute_total_extent() + { + total_extent = 1; + for (size_t dim = 0; dim < N; dim++) { + total_extent *= extent[dim]; + } + } + + void destroy() + { + if (data && owned) + delete [] data; + } + private: template void check(FIRST first) { @@ -152,6 +205,7 @@ class Array { size_t extent[N]; size_t total_extent; T *data; + bool owned; }; #ifdef MAIN diff --git a/gasnet/seq.cc b/gasnet/seq.cc index ddeb363b..4021930b 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -161,7 +161,7 @@ static std::pair run_task_body(long graph_index, long point, long ti } static void RAW_handler(gex_Token_t token, void *buffer, size_t size, - gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t source_point, gex_AM_Arg_t dest_point) + gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t source_point, gex_AM_Arg_t dest_point) { AutoLock guard(state.lock); @@ -648,4 +648,3 @@ int main(int argc, char *argv[]) return 0; } - diff --git a/gasnet/seq_long.cc b/gasnet/seq_long.cc new file mode 100644 index 00000000..d68913e7 --- /dev/null +++ b/gasnet/seq_long.cc @@ -0,0 +1,687 @@ +/* 
Copyright 2020 Stanford University + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include "core.h" +#include "timer.h" + +#include "multi_dimensional_array.h" + +#include "gasnetex.h" + +#define CHECK_OK(x) assert((x) == GASNET_OK); + +class AutoLock { +public: + AutoLock(gex_HSL_t &lock_) : lock(&lock_) { + gex_HSL_Lock(lock); + } + ~AutoLock() { + gex_HSL_Unlock(lock); + } + +private: + gex_HSL_t *lock; +}; + +struct RankState { + gex_HSL_t lock; + gex_Rank_t rank; + gex_Rank_t n_ranks; + long num_fields; + long complete; + std::vector graphs; + std::vector > task_ready_queue; + Array<5, char> inputs; + Array<4, char> outputs; + std::vector > remote_inputs; + std::vector > remote_outputs; + + Array<2, long> timestep; + Array<3, int> input_ready; + Array<3, int> input_consumed; + Array<3, int> remote_input_empty; + Array<4, const char *> input_ptr; + Array<4, size_t> input_bytes; + Array<3, int> output_empty; + Array<3, int> n_raw_in; + Array<3, int> n_raw_out; + Array<3, int> n_war_in; + Array<3, int> n_war_out; + Array<3, char> scratch; + Array<3, std::vector > > dependencies; + Array<3, std::vector > > reverse_dependencies; +}; + +RankState state; + +template +T clamp(T value, T low, T high) { + return std::min(std::max(value, low), high); +} + +// IMPORTANT: must be called with state lock held +static bool check_task_ready(long graph_index, long point_index, long timestep) { + auto &graph = 
state.graphs[graph_index]; + + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long field = timestep % state.num_fields; + + const auto point_input_ready = state.input_ready(graph_index, point_index, last_field); + const auto point_input_consumed = state.input_consumed(graph_index, point_index, last_field); + const auto point_remote_input_empty = state.remote_input_empty(graph_index, point_index, field); + const auto point_output_empty = state.output_empty(graph_index, point_index, field); + const auto point_n_raw_in = state.n_raw_in(graph_index, point_index, timestep); + const auto point_n_war_in = state.n_war_in(graph_index, point_index, timestep); + + return timestep < graph.timesteps && point_input_ready == point_n_raw_in && point_remote_input_empty == point_n_war_in && point_output_empty == 1 && point_input_consumed == 0; +} + +// IMPORTANT: must be called with state lock held +static void advance_timestep(long graph_index, long point, long point_index) { + auto &graph = state.graphs[graph_index]; + + auto &point_timestep = state.timestep(graph_index, point_index); + + for (; point_timestep < graph.timesteps; ++point_timestep) { + long offset = graph.offset_at_timestep(point_timestep); + long width = graph.width_at_timestep(point_timestep); + + if (point >= offset && point < offset + width) + break; + } +} + +// IMPORTANT: must be called with state lock held +static std::pair run_task_body(long graph_index, long point, long timestep) { + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + auto &point_timestep = state.timestep(graph_index, point_index); + assert(point_timestep == timestep); + + long last_field = (point_timestep + state.num_fields - 1) % state.num_fields; + long field = point_timestep % state.num_fields; + + 
auto &point_input_ready = state.input_ready(graph_index, point_index, last_field); + auto &point_input_consumed = state.input_consumed(graph_index, point_index, last_field); + auto &point_remote_input_empty = state.remote_input_empty(graph_index, point_index, field); + auto point_input_ptr = state.input_ptr.empty() ? nullptr : &state.input_ptr(graph_index, point_index, last_field, 0); + auto point_input_bytes = state.input_bytes.empty() ? nullptr : &state.input_bytes(graph_index, point_index, last_field, 0); + auto &point_output = state.outputs(graph_index, point_index, field, 0); + auto &point_output_empty = state.output_empty(graph_index, point_index, field); + auto point_scratch = state.scratch.empty() ? nullptr : &state.scratch(graph_index, point_index, 0); + const auto point_n_raw_in = state.n_raw_in(graph_index, point_index, timestep); + const auto point_n_raw_out = state.n_raw_out(graph_index, point_index, timestep); + const auto point_n_war_out = state.n_war_out(graph_index, point_index, timestep); + + graph.execute_point(point_timestep, point, + &point_output, graph.output_bytes_per_task, + point_input_ptr, point_input_bytes, point_n_raw_in, + point_scratch, graph.scratch_bytes_per_task); + + point_input_ready = 0; + point_remote_input_empty = 0; + point_input_consumed = point_n_war_out != 0; + point_output_empty = point_n_raw_out == 0; + + ++point_timestep; + advance_timestep(graph_index, point, point_index); + if (point_timestep >= graph.timesteps) { + ++state.complete; + } else if (check_task_ready(graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph_index, point, point_timestep)); + } + + long send_raw = point_output_empty == 0 && timestep < graph.timesteps-1 ? timestep : -1; + long send_war = point_input_consumed == 1 && timestep < graph.timesteps-1 ? 
timestep : -1; + + return std::pair(send_raw, send_war); +} + +static void RAW_handler(gex_Token_t token, void *buffer, size_t size, + gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t dest_point) +{ + AutoLock guard(state.lock); + + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point = dest_point; + long point_index = point - first_point; + + long field = timestep % state.num_fields; + + auto &point_input_ready = state.input_ready(graph_index, point_index, field); + + point_input_ready++; + + auto &point_timestep = state.timestep(graph_index, point_index); + if (timestep + 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph_index, point, point_timestep)); + } +} + +static void WAR_handler(gex_Token_t token, + gex_AM_Arg_t graph_index, gex_AM_Arg_t timestep, gex_AM_Arg_t point) +{ + AutoLock guard(state.lock); + + auto rank = state.rank; + auto n_ranks = state.n_ranks; + + auto &graph = state.graphs[graph_index]; + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + + auto &point_remote_input_empty = state.remote_input_empty(graph_index, point_index, last_field); + point_remote_input_empty++; + + auto &point_timestep = state.timestep(graph_index, point_index); + if (timestep + state.num_fields - 1 == point_timestep && point_timestep < graph.timesteps && check_task_ready(graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph_index, point, point_timestep)); + } +} + +const int N_HANDLERS = 2; + +gex_AM_Entry_t 
handlers[N_HANDLERS] = { + gex_AM_Entry_t { + .gex_index = 0, + .gex_fnptr = (void (*)())RAW_handler, + .gex_flags = GEX_FLAG_AM_LONG | GEX_FLAG_AM_REQUEST, + .gex_nargs = 3, + .gex_cdata = NULL, + .gex_name = "RAW handler", + }, + gex_AM_Entry_t { + .gex_index = 0, + .gex_fnptr = (void (*)())WAR_handler, + .gex_flags = GEX_FLAG_AM_SHORT | GEX_FLAG_AM_REQUEST, + .gex_nargs = 3, + .gex_cdata = NULL, + .gex_name = "WAR handler", + }, +}; + /* Entry point. Initializes the GASNet-EX client, attaches a segment and registers the active message handlers, sizes and fills the global per-rank state for every graph in the App, then runs the task graphs inside a timed loop, issuing Long (RAW) and Short (WAR) active message requests to the ranks that own dependent points. Rank 0 reports the elapsed time. Always returns 0. */ +int main(int argc, char *argv[]) +{ + gex_Client_t client; + gex_EP_t ep; + gex_TM_t tm; + CHECK_OK(gex_Client_Init(&client, &ep, &tm, "main", &argc, &argv, 0)); + + gex_Rank_t rank = gex_TM_QueryRank(tm); + gex_Rank_t n_ranks = gex_TM_QuerySize(tm); + state.rank = rank; + state.n_ranks = n_ranks; + + /* Attach a segment of the maximum size GASNet reports; the input and output buffers are placed inside it further down. */ uintptr_t max_size = gasnet_getMaxLocalSegmentSize(); + gex_Segment_t segment; + CHECK_OK(gex_Segment_Attach(&segment, tm, max_size)); + + CHECK_OK(gex_EP_RegisterHandlers(ep, handlers, N_HANDLERS)); + + App app(argc, argv); + if (rank == 0) app.display(); + + gex_HSL_Init(&state.lock); + + state.num_fields = 5; + + state.graphs = app.graphs; + + long max_timesteps = 0; + long max_points = 0; + long max_dsets = 0; + long max_deps = 0; + size_t max_output_bytes = 0; + size_t max_scratch_bytes = 0; + long expected_tasks = 0; + /* First pass over the graphs: compute the maxima used to size the state arrays, and add the number of points owned by this rank to expected_tasks. Each rank owns the block of points [first_point, last_point]. */ for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + max_timesteps = std::max(max_timesteps, graph.timesteps); + + max_points = std::max(max_points, n_points); + + max_output_bytes = std::max(max_output_bytes, graph.output_bytes_per_task); + max_scratch_bytes = std::max(max_scratch_bytes, graph.scratch_bytes_per_task); + + max_dsets = std::max(max_dsets, graph.max_dependence_sets()); + + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + for (long point = first_point; point <= last_point; ++point) { + long deps = 0; + for (auto interval : 
graph.dependencies(dset, point)) { + deps += interval.second - interval.first + 1; + } + max_deps = std::max(max_deps, deps); + } + } + + expected_tasks += n_points; + } + + size_t n_graphs = app.graphs.size(); + + size_t total_input_bytes = n_graphs * max_points * state.num_fields * max_deps * max_output_bytes; + size_t total_output_bytes = n_graphs * max_points * state.num_fields * max_output_bytes; + + assert(total_input_bytes + total_output_bytes < max_size); + + /* Segment layout: the input buffers start at the beginning of the segment and the output buffers follow at offset total_input_bytes. */ void *segment_start = gex_Segment_QueryAddr(segment); + + char *input_addr = (char *)segment_start; + char *output_addr = (char *)segment_start + total_input_bytes; + + state.inputs.resize(input_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps, max_output_bytes}); + state.outputs.resize(output_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, max_output_bytes}); + + state.remote_inputs.resize(n_ranks); + state.remote_outputs.resize(n_ranks); + + for (size_t other_rank = 0; other_rank < n_ranks; ++other_rank) { + void *other_segment_start; + void *local_other_segment_start; + size_t other_size; + /* Query the bounds of the segment owned by other_rank; its start address is the base of the remote buffer views built below. */ CHECK_OK(gex_Segment_QueryBound(tm, + other_rank, + &other_segment_start, + &local_other_segment_start, + &other_size)); + + char *other_input_addr = (char *)other_segment_start; + char *other_output_addr = (char *)other_segment_start + total_input_bytes; + + state.remote_inputs[other_rank].resize(other_input_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps, max_output_bytes}); + state.remote_outputs[other_rank].resize(other_output_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, max_output_bytes}); + } + + state.timestep.resize( {n_graphs, (size_t)max_points}); + state.input_ready.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.input_consumed.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.remote_input_empty.resize( {n_graphs, (size_t)max_points, 
(size_t)state.num_fields}); + state.input_ptr.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps}); + state.input_bytes.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps}); + state.output_empty.resize( {n_graphs, (size_t)max_points, (size_t)state.num_fields}); + state.n_raw_in.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_raw_out.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_war_in.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.n_war_out.resize( {n_graphs, (size_t)max_points, (size_t)max_timesteps}); + state.scratch.resize( {n_graphs, (size_t)max_points, max_scratch_bytes}); + state.dependencies.resize( {n_graphs, (size_t)max_dsets, (size_t)max_points}); + state.reverse_dependencies.resize({n_graphs, (size_t)max_dsets, (size_t)max_points}); + + std::vector > rank_by_point(app.graphs.size()); + + for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + /* Per-point setup for the points owned by this rank: input pointers and sizes, the per-timestep n_raw_in/n_raw_out/n_war_in/n_war_out counts, cached dependence intervals, and scratch. */ for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + for (long field = 0; field < state.num_fields; ++field) { + for (long dep = 0; dep < max_deps; ++dep) { + /* Each input slot points into the segment-backed inputs array and is sized to output_bytes_per_task of this graph. */ state.input_ptr(graph.graph_index, point_index, field, dep) = &state.inputs(graph.graph_index, point_index, field, dep, 0); + state.input_bytes(graph.graph_index, point_index, field, dep) = graph.output_bytes_per_task; + } + } + + for (long timestep = 0; timestep < graph.timesteps; ++timestep) { + long last_offset = graph.offset_at_timestep(timestep-1); + long last_width = graph.width_at_timestep(timestep-1); + + long next_offset = graph.offset_at_timestep(timestep+1); + long next_width = graph.width_at_timestep(timestep+1); + + long last_field_offset = graph.offset_at_timestep(timestep - state.num_fields + 1); + long 
last_field_width = graph.width_at_timestep(timestep - state.num_fields + 1); + + long next_field_offset = graph.offset_at_timestep(timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(timestep + state.num_fields - 1); + + long dset = graph.dependence_set_at_timestep(timestep); + long next_dset = graph.dependence_set_at_timestep(timestep+1); + long last_field_dset = graph.dependence_set_at_timestep(timestep - state.num_fields + 1); + long next_field_dset = graph.dependence_set_at_timestep(timestep + state.num_fields - 1); + + /* Each of the four counts below sums the lengths of the dependence intervals after clamping them to the half-open range [offset, offset + width) of the relevant neighboring timestep. */ long raw_in = 0; + for (auto interval : graph.dependencies(dset, point)) { + long first_dep = clamp(interval.first, last_offset, last_offset + last_width); + long last_dep = clamp(interval.second + 1, last_offset, last_offset + last_width); + assert(first_dep <= last_dep); + raw_in += last_dep - first_dep; + } + state.n_raw_in(graph.graph_index, point_index, timestep) = raw_in; + + long raw_out = 0; + for (auto interval : graph.reverse_dependencies(next_dset, point)) { + long first_dep = clamp(interval.first, next_offset, next_offset + next_width); + long last_dep = clamp(interval.second + 1, next_offset, next_offset + next_width); + assert(first_dep <= last_dep); + raw_out += last_dep - first_dep; + } + state.n_raw_out(graph.graph_index, point_index, timestep) = raw_out; + + long war_in = 0; + for (auto interval : graph.reverse_dependencies(last_field_dset, point)) { + long first_dep = clamp(interval.first, last_field_offset, last_field_offset + last_field_width); + long last_dep = clamp(interval.second + 1, last_field_offset, last_field_offset + last_field_width); + assert(first_dep <= last_dep); + war_in += last_dep - first_dep; + } + state.n_war_in(graph.graph_index, point_index, timestep) = war_in; + + long war_out = 0; + for (auto interval : graph.dependencies(dset, point)) { + long first_dep = clamp(interval.first, next_field_offset, next_field_offset + next_field_width); + long last_dep = 
clamp(interval.second + 1, next_field_offset, next_field_offset + next_field_width); + assert(first_dep <= last_dep); + war_out += last_dep - first_dep; + } + state.n_war_out(graph.graph_index, point_index, timestep) = war_out; + } + + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + auto deps = graph.dependencies(dset, point); + auto rev_deps = graph.reverse_dependencies(dset, point); + + state.dependencies(graph.graph_index, dset, point_index) = deps; + state.reverse_dependencies(graph.graph_index, dset, point_index) = rev_deps; + } + + if (!state.scratch.empty()) + TaskGraph::prepare_scratch(&state.scratch(graph.graph_index, point_index, 0), graph.scratch_bytes_per_task); + } + + /* Map every point of the graph to the rank that owns it, using the same block partition formula as first_point/last_point. */ rank_by_point[graph.graph_index].resize(graph.max_width); + for (gex_Rank_t r = 0; r < n_ranks; ++r) { + long r_first_point = r * graph.max_width / n_ranks; + long r_last_point = (r + 1) * graph.max_width / n_ranks - 1; + for (long p = r_first_point; p <= r_last_point; ++p) { + rank_by_point[graph.graph_index][p] = r; + } + } + } + + std::vector > send_queue; + + std::vector > sends_raw; + std::vector > sends_war; + + std::vector > task_ready_queue_local; + + double elapsed_time = 0.0; + for (int iter = 0; iter < 2; ++iter) { + state.complete = 0; + std::fill(state.timestep.begin(), state.timestep.end(), 0); + std::fill(state.input_ready.begin(), state.input_ready.end(), 0); + std::fill(state.input_consumed.begin(), state.input_consumed.end(), 0); + std::fill(state.remote_input_empty.begin(), state.remote_input_empty.end(), 0); + std::fill(state.output_empty.begin(), state.output_empty.end(), 1); + + /* Synchronize all ranks before reading the start time. */ gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + double start_time = Timer::get_cur_time(); + + // Advance and queue initial tasks. 
+ for (auto graph : app.graphs) { + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + long n_points = last_point - first_point + 1; + + for (long point = first_point; point <= last_point; ++point) { + long point_index = point - first_point; + + advance_timestep(graph.graph_index, point, point_index); + + auto &point_timestep = state.timestep(graph.graph_index, point_index); + + if (point_timestep >= graph.timesteps) { + ++state.complete; + } else if (check_task_ready(graph.graph_index, point_index, point_timestep)) { + state.task_ready_queue.push_back( + std::tuple(graph.graph_index, point, point_timestep)); + } + } + } + + /* Progress loop: run ready tasks while holding state.lock, issue the resulting sends after releasing it, update the buffer flags, and poll the network. The loop exits once state.complete equals expected_tasks. */ while (true) { + { + AutoLock guard(state.lock); + + // Check for completion. + if (state.complete == expected_tasks) + break; + + // Run any ready tasks. + task_ready_queue_local.swap(state.task_ready_queue); + for (auto entry : task_ready_queue_local) { + long graph_index, point, timestep; + std::tie(graph_index, point, timestep) = entry; + + auto send = run_task_body(graph_index, point, timestep); + if (send.first >= 0 || send.second >= 0) { + send_queue.push_back(std::tuple(graph_index, point, send.first, send.second)); + } + } + + // Clear the task queue. + task_ready_queue_local.clear(); + } + + // Issue queued sends. 
+ for (auto entry : send_queue) { + long graph_index, point, raw_timestep, war_timestep; + std::tie(graph_index, point, raw_timestep, war_timestep) = entry; + + auto &graph = app.graphs[graph_index]; + + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + // RAW dependencies: + if (raw_timestep >= 0) { + long offset = graph.offset_at_timestep(raw_timestep); + long width = graph.width_at_timestep(raw_timestep); + + long next_offset = graph.offset_at_timestep(raw_timestep+1); + long next_width = graph.width_at_timestep(raw_timestep+1); + + long field = raw_timestep % state.num_fields; + + long dset = graph.dependence_set_at_timestep(raw_timestep + 1); + + auto &point_output = state.outputs(graph_index, point_index, field, 0); + auto &point_rev_deps = state.reverse_dependencies(graph_index, dset, point_index); + + if (point >= offset && point < offset + width) { + bool sent = false; + for (auto interval : point_rev_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < next_offset || dep >= next_offset + next_width) { + continue; + } + + int other_rank = rank_by_point[graph_index][dep]; + + long other_first_point = other_rank * graph.max_width / n_ranks; + long other_last_point = (other_rank + 1) * graph.max_width / n_ranks - 1; + + long other_point = dep; + long other_point_index = other_point - other_first_point; + + auto &other_point_deps = state.dependencies(graph_index, dset, other_point_index); + + long input_idx = 0; + for (auto interval : other_point_deps) { + long first_dep = clamp(interval.first, offset, offset + width); + long last_dep = clamp(interval.second + 1, offset, offset + width); + assert(first_dep <= last_dep); + if (first_dep <= point && point <= last_dep) { + first_dep = std::min(first_dep, point); + last_dep = std::min(last_dep, point); + } + input_idx += last_dep - first_dep; + if (first_dep <= point 
&& point <= last_dep) { + break; + } + } + + /* input_idx now counts the clamped dependencies of the receiving point that come before this point; it selects the destination slot in the remote input array of the receiver. */ auto &other_point_input = state.remote_inputs[other_rank](graph_index, other_point_index, field, input_idx, 0); + + CHECK_OK(gex_AM_RequestLong(tm, other_rank, handlers[0].gex_index, + &point_output, graph.output_bytes_per_task, + &other_point_input, + GEX_EVENT_GROUP, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)raw_timestep, + (gex_AM_Arg_t)dep)); + sent = true; + } + } + if (sent) + sends_raw.push_back(std::tuple(graph.graph_index, raw_timestep, point)); + } + } + + // WAR dependencies: + if (war_timestep >= 0) { + long offset = graph.offset_at_timestep(war_timestep); + long width = graph.width_at_timestep(war_timestep); + + long next_field_offset = graph.offset_at_timestep(war_timestep + state.num_fields - 1); + long next_field_width = graph.width_at_timestep(war_timestep + state.num_fields - 1); + + long dset = graph.dependence_set_at_timestep(war_timestep); + + auto &point_deps = state.dependencies(graph_index, dset, point_index); + + if (point >= offset && point < offset + width) { + bool sent = false; + for (auto interval : point_deps) { + for (long dep = interval.first; dep <= interval.second; dep++) { + if (dep < next_field_offset || dep >= next_field_offset + next_field_width) { + continue; + } + + /* Short request without payload; the arguments are the graph index, the WAR timestep and the dependency point. */ CHECK_OK(gex_AM_RequestShort(tm, rank_by_point[graph_index][dep], handlers[1].gex_index, 0, + (gex_AM_Arg_t)graph.graph_index, (gex_AM_Arg_t)war_timestep, (gex_AM_Arg_t)dep)); + sent = true; + } + } + if (sent) + sends_war.push_back(std::tuple(graph.graph_index, war_timestep, point)); + } + } + } + send_queue.clear(); + + // Wait for local completion so it's safe to overwrite output buffers. + gex_NBI_Wait(GEX_EC_LC, 0); + + // Mark readiness of the output buffers. 
+ { + AutoLock guard(state.lock); + /* For each RAW send, set output_empty to 1 for the field timestep % num_fields of the sending point. */ for (auto &send : sends_raw) { + long graph_index; + long timestep; + long point; + std::tie(graph_index, timestep, point) = send; + + auto &graph = state.graphs[graph_index]; + + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + long field = timestep % state.num_fields; + + auto &point_output_empty = state.output_empty(graph_index, point_index, field); + point_output_empty = 1; + } + /* For each WAR send, reset input_consumed to 0 for the field (timestep + num_fields - 1) % num_fields of the sending point. */ for (auto &send : sends_war) { + long graph_index; + long timestep; + long point; + std::tie(graph_index, timestep, point) = send; + + auto &graph = state.graphs[graph_index]; + + long first_point = rank * graph.max_width / n_ranks; + long last_point = (rank + 1) * graph.max_width / n_ranks - 1; + + long point_index = point - first_point; + + long last_field = (timestep + state.num_fields - 1) % state.num_fields; + + auto &point_input_consumed = state.input_consumed(graph_index, point_index, last_field); + point_input_consumed = 0; + } + } + sends_raw.clear(); + sends_war.clear(); + + // Poll the network to make sure we're making progress. + CHECK_OK(gasnet_AMPoll()); + } + + gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + double stop_time = Timer::get_cur_time(); + /* Overwritten on every iteration, so only the time of the last iteration is reported. */ elapsed_time = stop_time - start_time; + } + + if (rank == 0) { + app.report_timing(elapsed_time); + } + + gex_HSL_Destroy(&state.lock); + + // Barrier to make sure report gets flushed before nodes exit. 
+ gex_Event_Wait(gex_Coll_BarrierNB(tm, 0)); + + return 0; +} diff --git a/test_all.sh b/test_all.sh index 69b2e6af..e27416a7 100755 --- a/test_all.sh +++ b/test_all.sh @@ -74,10 +74,12 @@ fi if [[ $TASKBENCH_USE_GASNET -eq 1 ]]; then for t in "${extended_types[@]}"; do for k in "${kernels[@]}"; do - mpirun -np 1 ./gasnet/seq -steps $steps -type $t $k - mpirun -np 2 ./gasnet/seq -steps $steps -type $t $k - mpirun -np 4 ./gasnet/seq -steps $steps -type $t $k - mpirun -np 4 ./gasnet/seq -steps $steps -type $t $k -and -steps $steps -type $t $k + for binary in seq seq_long; do + mpirun -np 1 ./gasnet/$binary -steps $steps -type $t $k + mpirun -np 2 ./gasnet/$binary -steps $steps -type $t $k + mpirun -np 4 ./gasnet/$binary -steps $steps -type $t $k + mpirun -np 4 ./gasnet/$binary -steps $steps -type $t $k -and -steps $steps -type $t $k + done done done fi From 7686b259ae7171637bc69b034a61a5646748b002 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 10 Mar 2020 11:33:46 -0700 Subject: [PATCH 38/40] Fixes for AM Long implementation. 
--- gasnet/seq_long.cc | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/gasnet/seq_long.cc b/gasnet/seq_long.cc index d68913e7..f7e15cee 100644 --- a/gasnet/seq_long.cc +++ b/gasnet/seq_long.cc @@ -52,7 +52,6 @@ struct RankState { Array<5, char> inputs; Array<4, char> outputs; std::vector > remote_inputs; - std::vector > remote_outputs; Array<2, long> timestep; Array<3, int> input_ready; @@ -68,6 +67,7 @@ struct RankState { Array<3, char> scratch; Array<3, std::vector > > dependencies; Array<3, std::vector > > reverse_dependencies; + Array<3, std::vector > > remote_dependencies; }; RankState state; @@ -267,6 +267,7 @@ int main(int argc, char *argv[]) long max_timesteps = 0; long max_points = 0; + long max_width = 0; long max_dsets = 0; long max_deps = 0; size_t max_output_bytes = 0; @@ -281,6 +282,8 @@ int main(int argc, char *argv[]) max_points = std::max(max_points, n_points); + max_width = std::max(max_width, graph.max_width); + max_output_bytes = std::max(max_output_bytes, graph.output_bytes_per_task); max_scratch_bytes = std::max(max_scratch_bytes, graph.scratch_bytes_per_task); @@ -315,7 +318,6 @@ int main(int argc, char *argv[]) state.outputs.resize(output_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, max_output_bytes}); state.remote_inputs.resize(n_ranks); - state.remote_outputs.resize(n_ranks); for (size_t other_rank = 0; other_rank < n_ranks; ++other_rank) { void *other_segment_start; @@ -328,10 +330,28 @@ int main(int argc, char *argv[]) &other_size)); char *other_input_addr = (char *)other_segment_start; - char *other_output_addr = (char *)other_segment_start + total_input_bytes; - state.remote_inputs[other_rank].resize(other_input_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, (size_t)max_deps, max_output_bytes}); - state.remote_outputs[other_rank].resize(other_output_addr, {n_graphs, (size_t)max_points, (size_t)state.num_fields, max_output_bytes}); + long 
other_max_points = 0; + long other_max_deps = 0; + for (auto graph : app.graphs) { + long other_first_point = other_rank * graph.max_width / n_ranks; + long other_last_point = (other_rank + 1) * graph.max_width / n_ranks - 1; + long other_n_points = other_last_point - other_first_point + 1; + + other_max_points = std::max(other_max_points, other_n_points); + + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + for (long other_point = other_first_point; other_point <= other_last_point; ++other_point) { + long deps = 0; + for (auto interval : graph.dependencies(dset, other_point)) { + deps += interval.second - interval.first + 1; + } + other_max_deps = std::max(other_max_deps, deps); + } + } + } + + state.remote_inputs[other_rank].resize(other_input_addr, {n_graphs, (size_t)other_max_points, (size_t)state.num_fields, (size_t)other_max_deps, max_output_bytes}); } state.timestep.resize( {n_graphs, (size_t)max_points}); @@ -348,6 +368,7 @@ int main(int argc, char *argv[]) state.scratch.resize( {n_graphs, (size_t)max_points, max_scratch_bytes}); state.dependencies.resize( {n_graphs, (size_t)max_dsets, (size_t)max_points}); state.reverse_dependencies.resize({n_graphs, (size_t)max_dsets, (size_t)max_points}); + state.remote_dependencies.resize( {n_graphs, (size_t)max_dsets, (size_t)max_width}); std::vector > rank_by_point(app.graphs.size()); @@ -443,6 +464,16 @@ int main(int argc, char *argv[]) } } + for (auto graph : app.graphs) { + for (long point = 0; point < graph.max_width; ++point) { + for (long dset = 0; dset < graph.max_dependence_sets(); ++dset) { + auto deps = graph.dependencies(dset, point); + + state.remote_dependencies(graph.graph_index, dset, point) = deps; + } + } + } + std::vector > send_queue; std::vector > sends_raw; @@ -552,7 +583,7 @@ int main(int argc, char *argv[]) long other_point = dep; long other_point_index = other_point - other_first_point; - auto &other_point_deps = state.dependencies(graph_index, dset, other_point_index); + 
auto &other_point_deps = state.remote_dependencies(graph_index, dset, other_point); long input_idx = 0; for (auto interval : other_point_deps) { From cc3a28a34934e1f0e79b28003a67de2c7156be97 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Tue, 10 Mar 2020 11:34:38 -0700 Subject: [PATCH 39/40] Update .gitignore. --- gasnet/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/gasnet/.gitignore b/gasnet/.gitignore index 9403f506..b0eeffc9 100644 --- a/gasnet/.gitignore +++ b/gasnet/.gitignore @@ -1 +1,2 @@ /seq +/seq_long From 690ab323b495f04b8380fc5f0692258f135d40f3 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Fri, 20 Mar 2020 14:03:28 -0700 Subject: [PATCH 40/40] Workaround for low probability freeze in GASNet. --- gasnet/seq.cc | 4 +++- gasnet/seq_long.cc | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/gasnet/seq.cc b/gasnet/seq.cc index 4021930b..2bb2ba82 100644 --- a/gasnet/seq.cc +++ b/gasnet/seq.cc @@ -442,7 +442,9 @@ int main(int argc, char *argv[]) std::vector > task_ready_queue_local; double elapsed_time = 0.0; - for (int iter = 0; iter < 2; ++iter) { + // FIXME: This can't be set to 2 because it seems to result in a low + // probability (< 1%) hang + for (int iter = 0; iter < 1; ++iter) { state.complete = 0; std::fill(state.timestep.begin(), state.timestep.end(), 0); std::fill(state.input_ready.begin(), state.input_ready.end(), 0); diff --git a/gasnet/seq_long.cc b/gasnet/seq_long.cc index f7e15cee..4c8dcfc9 100644 --- a/gasnet/seq_long.cc +++ b/gasnet/seq_long.cc @@ -482,7 +482,9 @@ int main(int argc, char *argv[]) std::vector > task_ready_queue_local; double elapsed_time = 0.0; - for (int iter = 0; iter < 2; ++iter) { + // FIXME: This can't be set to 2 because it seems to result in a low + // probability (< 1%) hang + for (int iter = 0; iter < 1; ++iter) { state.complete = 0; std::fill(state.timestep.begin(), state.timestep.end(), 0); std::fill(state.input_ready.begin(), state.input_ready.end(), 0);