diff --git a/.cargo/config.toml b/.cargo/config.toml index 3c801ff8f56..585b069ccf2 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,84 +1,7 @@ # Zebra cargo configuration -# Flags that apply to all Zebra crates and configurations -[target.'cfg(all())'] -rustflags = [ - # Zebra standard lints for Rust 1.65+ - - # High-risk code - "-Dunsafe_code", - "-Dnon_ascii_idents", - - # Potential bugs - # - # If we deny these lints, we could be excluded from Crater builds: - # https://www.reddit.com/r/rust/comments/f5xpib/psa_denywarnings_is_actively_harmful/ - - # Compatibility - "-Wrust_2021_compatibility", - "-Wnonstandard_style", - "-Wfuture_incompatible", - - # Async code - "-Wclippy::await_holding_lock", - "-Wclippy::await_holding_refcell_ref", - - # Pointers - "-Wclippy::cast_ptr_alignment", - "-Wclippy::fn_to_numeric_cast_any", - - # Integers - "-Wclippy::checked_conversions", - "-Wclippy::implicit_saturating_sub", - "-Wclippy::invalid_upcast_comparisons", - "-Wclippy::range_minus_one", - "-Wclippy::range_plus_one", - "-Wclippy::unnecessary_cast", - - # Incomplete code - "-Wclippy::dbg_macro", - "-Wclippy::todo", - - # Manual debugging output. - # Use tracing::trace!() or tracing::debug!() instead. - "-Wclippy::print_stdout", - "-Wclippy::print_stderr", - "-Wclippy::dbg_macro", - - # Code styles we want to accept - "-Aclippy::try_err", - - # Panics - "-Wclippy::fallible_impl_from", - "-Wclippy::unwrap_in_result", - - # Documentation - "-Wmissing_docs", - - # TODOs: - # Fix this lint eventually. - "-Aclippy::result_large_err", - - # `cargo fix` might help do these fixes, - # or add a config.toml to sub-directories which should allow these lints, - # or try allowing the lint in the specific module (lib.rs doesn't seem to work in some cases) - # - # lint configs that don't work: - # - allowing these lints in lib.rs (command-line warn overrides allow in lib.rs?) - # - adding a [target.'cfg(not(test))'] rustflags config (it runs on test code anyway) - - # fix code that triggers these lints, - # or disable the lint for that code (or for all test code) - # - #"-Wclippy::cast_lossless", # 30 non-test warnings, a few test warnings - #"-Wclippy::cast_possible_truncation", # 40 non-test warnings, 20 test warnings - #"-Wclippy::cast_possible_wrap", # 13 test warnings (fixed outside tests) - #"-Wclippy::cast_precision_loss", # 25 non-test warnings, 10 test warnings - #"-Wclippy::cast_sign_loss", # 6 non-test warnings, 15 test warnings - - # fix hidden lifetime parameters - #"-Wrust_2018_idioms", -] +[alias] +xtask = "run --package xtask --" [build] rustdocflags = [ @@ -91,3 +14,7 @@ rustdocflags = [ [env] RUST_BACKTRACE="1" + +[profile.profiling] +inherits = "release" +debug = true diff --git a/.codespellrc b/.codespellrc index 8c06f3917e1..6358b6ae3cc 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,4 +1,4 @@ [codespell] -ignore-words-list = crate,Sur,inout,Groth,groth,re-use,abl, +ignore-words-list = crate,Sur,inout,Groth,groth,re-use,abl,zcash,zcashd,zebrad,zebra,utxo,utxos,nullifier,nullifiers,sapling,orchard,sprout,backpressure,proptest,thiserror,rocksdb,libsecp,fullnode,peerset,threadsafe,unrepresentable,getblocktemplate,syncer,Actix,Akka,mermaid,println,eprintln,usize,nocapture,Parallelizable,invis,UTXO,Zcash,Zaino,Zallet,librustzcash,Mainnet,Testnet,mainnet,testnet,idents,reentrancy,serializable,deserializable,deserialization,zkSNARK,zkSNARKs,lightwalletd,statics,ser,endianity,aranges,daa exclude-file = book/mermaid.min.js -skip = ./zebra-rpc/qa/rpc-tests,./supply-chain +skip = ./zebra-rpc/qa/rpc-tests,./supply-chain,./target diff --git a/.config/nextest.toml b/.config/nextest.toml index f8b1b63abcf..bd02dbc58a2 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -30,13 +30,25 @@ filter = "not test(=trusted_chain_sync_handles_forks_correctly) and not test(=de # TODO: We need a better test architecture to run all non-stateful [profile.all-tests] failure-output = "immediate" -default-filter = "not test(check_no_git_dependencies) and not test(=fully_synced_rpc_z_getsubtreesbyindex_snapshot_test) and not test(=lwd_rpc_test) and not test(=lwd_rpc_send_tx) and not test(=lwd_grpc_wallet) and not test(=lwd_integration) and not test(=lwd_sync_full) and not test(=lwd_sync_update) and not test(=lightwalletd_test_suite) and not test(=rpc_get_block_template) and not test(=rpc_submit_block) and not test(=get_peer_info) and not test(~generate_checkpoints_) and not test(=sync_update_mainnet) and not test(=activate_mempool_mainnet)" +default-filter = "not test(check_no_git_dependencies) and not test(=fully_synced_rpc_z_getsubtreesbyindex_snapshot_test) and not test(=lwd_rpc_test) and not test(=lwd_rpc_send_tx) and not test(=lwd_grpc_wallet) and not test(=lwd_integration) and not test(=lwd_sync_full) and not test(=lwd_sync_update) and not test(=lightwalletd_test_suite) and not test(=rpc_get_block_template) and not test(=rpc_submit_block) and not test(=get_peer_info) and not test(~generate_checkpoints_) and not test(=sync_update_mainnet) and not test(=activate_mempool_mainnet) and not test(=sync_large_checkpoints_mempool_mainnet) and not test(cluster_forms_native_two_node_mesh) and not test(=pruned_storage_mode_prunes_during_regtest_sync)" + +# --- Full Tests profile --- +# Broader coverage for `ironwood-main`, scheduled, manual, and merge-queue runs. +# This preserves the previous `all-tests` selection while the PR lane stays fast. +[profile.full-tests] +failure-output = "immediate" +default-filter = "not test(check_no_git_dependencies) and not test(=fully_synced_rpc_z_getsubtreesbyindex_snapshot_test) and not test(=lwd_rpc_test) and not test(=lwd_rpc_send_tx) and not test(=lwd_grpc_wallet) and not test(=lwd_integration) and not test(=lwd_sync_full) and not test(=lwd_sync_update) and not test(=lightwalletd_test_suite) and not test(=rpc_get_block_template) and not test(=rpc_submit_block) and not test(=get_peer_info) and not test(~generate_checkpoints_) and not test(=sync_update_mainnet) and not test(=activate_mempool_mainnet) and not test(=sync_large_checkpoints_mempool_mainnet) and not test(cluster_forms_native_two_node_mesh)" # --- Individual Test Profiles --- [profile.check-no-git-dependencies] default-filter = 'test(check_no_git_dependencies)' +[profile.zakura-integration] +retries = 2 +slow-timeout = { period = "15m", terminate-after = 1 } +default-filter = 'package(zebra-network) and test(~zakura)' + [profile.sync-large-checkpoints-empty] slow-timeout = { period = "60m", terminate-after = 2 } default-filter = 'package(zebrad) and test(=sync_large_checkpoints_empty)' @@ -74,6 +86,22 @@ default-filter = 'package(zebrad) and test(=sync_past_mandatory_checkpoint_mainn slow-timeout = { period = "60m", terminate-after = 2 } default-filter = 'package(zebrad) and test(=sync_past_mandatory_checkpoint_testnet)' +[profile.sync-range-pre-nu62] +# A healthy 5k-window sync is ~3-5 min. Terminate a stalled (broken) sync at ~20 min so a +# consensus regression fails fast and cleanly (a real test timeout) instead of waiting out the +# job-level timeout. Generous margin over the healthy duration to avoid flakes on a slow droplet. +slow-timeout = { period = "5m", terminate-after = 4 } +success-output = "immediate" +default-filter = 'package(zebrad) and test(=sync_range_pre_nu62)' + +[profile.sync-range-post-nu62] +# A healthy 5k-window sync is ~3-5 min. Terminate a stalled (broken) sync at ~20 min so a +# consensus regression fails fast and cleanly (a real test timeout) instead of waiting out the +# job-level timeout. Generous margin over the healthy duration to avoid flakes on a slow droplet. +slow-timeout = { period = "5m", terminate-after = 4 } +success-output = "immediate" +default-filter = 'package(zebrad) and test(=sync_range_post_nu62)' + [profile.generate-checkpoints-mainnet] slow-timeout = { period = "90m", terminate-after = 1 } success-output = "immediate" diff --git a/.cursor/skills/zakura-trace-plots/SKILL.md b/.cursor/skills/zakura-trace-plots/SKILL.md new file mode 100644 index 00000000000..24d1e9fa1c0 --- /dev/null +++ b/.cursor/skills/zakura-trace-plots/SKILL.md @@ -0,0 +1,67 @@ +--- +name: zakura-trace-plots +description: Generate metrics-aware plots and summaries from Zebra Zakura perf trace directories. Use when the user asks to plot or analyze Zakura traces, block_sync.jsonl, commit_state.jsonl, feedrun CSVs, applying/reorder/stalls, HoL stalls, throughput, or commit metrics from a trace_dir. +--- + +# Zakura Trace Plots + +## Quick Start + +When the user gives a Zakura `trace_dir`, generate plots with: + +```bash +python3 .cursor/skills/zakura-trace-plots/scripts/plot_zakura_traces.py TRACE_DIR --out-dir perf-artifacts +``` + +If the matching feed-run CSV is not auto-detected, pass it explicitly: + +```bash +python3 .cursor/skills/zakura-trace-plots/scripts/plot_zakura_traces.py TRACE_DIR --csv /root/wal-bench/feedrun-r1.csv --out-dir perf-artifacts +``` + +## What To Plot + +Default outputs: + +- `*-time-apply-reorder-stalls-bps.svg` +- `*-height-apply-reorder-stalls-bps.svg` +- `*-summary.txt` + +Use the time plot for stall diagnosis. Height plots collapse zero-progress stalls onto one x-position. + +## Metrics Awareness + +Use `block_sync.jsonl` as the source of truth for: + +- `applying` +- `reorder` +- `unsubmitted_applying_count` +- `submitted_applies` +- byte counters like `applying_buffered_bytes` and `retained_pipeline_wire_bytes` +- floor-gap states such as `outstanding`, `queued`, and `in_flight_without_outstanding` + +Use the CSV only for sampled node metrics: + +- `blk_s`, recomputed from finalized height deltas while skipping startup `0 -> snapshot` jumps +- commit phase counters like `sur`, `ar`, `bp`, `bc` +- `cpu_cores`, peers, and other sampler-only columns + +Do not trust the CSV `reorder` column unless verified against `block_sync.jsonl`; in prior runs it was effectively zero while trace reorder was thousands. + +## Interpreting Output + +HoL/body-floor stall signature: + +- `applying` collapses near zero +- `reorder` grows +- `blk_s` drops near zero +- trace floor-gap state is usually `outstanding` + +Commit/memory-pressure signature: + +- `applying` and submitted applies stay high +- `reorder` stays low +- `spent_utxo_reads` or other commit phases spike +- host memory pressure or retained pipeline bytes rise + +When summarizing, report the generated file paths and the strongest signature, not every metric. diff --git a/.cursor/skills/zakura-trace-plots/scripts/plot_zakura_traces.py b/.cursor/skills/zakura-trace-plots/scripts/plot_zakura_traces.py new file mode 100755 index 00000000000..2e9e3ffd30d --- /dev/null +++ b/.cursor/skills/zakura-trace-plots/scripts/plot_zakura_traces.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +"""Generate metrics-aware SVG plots from Zebra Zakura trace directories.""" + +from __future__ import annotations + +import argparse +import bisect +import csv +import html +import json +import math +from collections import Counter +from pathlib import Path + + +def number(value: object) -> float: + try: + return float(value or 0) + except (TypeError, ValueError): + return 0.0 + + +def nice_max(values: list[float]) -> float: + values = [value for value in values if math.isfinite(value)] + maximum = max(values) if values else 1 + if maximum <= 0: + return 1 + exponent = math.floor(math.log10(maximum)) + base = 10**exponent + for multiplier in (1, 2, 5, 10): + if maximum <= multiplier * base: + return multiplier * base + return 10 * base + + +def infer_label(trace_dir: Path) -> str: + name = trace_dir.name + if name.startswith("feedrun-") and name.endswith("-traces"): + return name.removeprefix("feedrun-").removesuffix("-traces") + return name.removesuffix("-traces") + + +def find_csv(trace_dir: Path, label: str, explicit: str | None) -> Path | None: + if explicit: + path = Path(explicit) + return path if path.exists() else None + + candidates = [ + trace_dir.parent / f"feedrun-{label}.csv", + Path("/root/wal-bench") / f"feedrun-{label}.csv", + Path.cwd() / f"feedrun-{label}.csv", + ] + return next((candidate for candidate in candidates if candidate.exists()), None) + + +def load_csv_throughput(csv_path: Path | None) -> tuple[list[tuple[float, float, float]], list[dict[str, str]]]: + if csv_path is None: + return [], [] + + rows = list(csv.DictReader(csv_path.open())) + throughput: list[tuple[float, float, float]] = [] + previous: tuple[float, float] | None = None + + for row in rows: + elapsed = number(row.get("elapsed")) + height = number(row.get("height")) + if height <= 0: + previous = None + continue + + if previous is not None: + dt = elapsed - previous[0] + dh = height - previous[1] + if dt > 0 and dh >= 0: + throughput.append((elapsed, height, dh / dt)) + + previous = (elapsed, height) + + return throughput, rows + + +def load_trace_states(trace_dir: Path) -> list[tuple[float, float, float, float, float, float, float, str]]: + path = trace_dir / "block_sync.jsonl" + if not path.exists(): + raise SystemExit(f"missing block_sync.jsonl in {trace_dir}") + + states: list[tuple[float, float, float, float, float, float, float, str]] = [] + first_ts: float | None = None + + with path.open() as trace: + for line in trace: + try: + row = json.loads(line) + except json.JSONDecodeError: + continue + + if row.get("event") != "block_sync_state": + continue + + ts = number(row.get("ts")) / 1_000_000.0 + if first_ts is None: + first_ts = ts + + elapsed = ts - first_ts + height = number(row.get("verified_block_tip")) + if height <= 0: + continue + + applying = number(row.get("applying")) + reorder = number(row.get("reorder")) + # Floor HoL: verifier/apply has almost nothing, but later bodies are buffered. + hol_stall = 1.0 if applying <= 10 and reorder >= 100 else 0.0 + applying_bytes = number(row.get("applying_buffered_bytes")) + retained_bytes = number(row.get("retained_pipeline_wire_bytes")) + floor_state = str(row.get("floor_gap_state") or "") + states.append((elapsed, height, applying, reorder, hol_stall, applying_bytes, retained_bytes, floor_state)) + + return states + + +def downsample(points: list[tuple], max_points: int = 1400) -> list[tuple]: + if len(points) <= max_points: + return points + + bucket_size = max(1, len(points) // max_points) + sampled = [] + for offset in range(0, len(points), bucket_size): + bucket = points[offset : offset + bucket_size] + sampled.append(bucket[0]) + peak_reorder = max(bucket, key=lambda point: point[3]) + if peak_reorder is not bucket[0]: + sampled.append(peak_reorder) + return sampled + + +def throughput_at_time(throughput: list[tuple[float, float, float]], elapsed: float) -> float: + if not throughput: + return 0.0 + + times = [row[0] for row in throughput] + index = bisect.bisect_right(times, elapsed) - 1 + index = max(0, index) + return throughput[index][2] + + +def make_svg(points: list[tuple], x_get, xlabel: str, title: str, out_path: Path) -> None: + if not points: + raise SystemExit("no points to plot") + + xs = [x_get(point) for point in points] + x_min = min(xs) + x_max = max(xs) + if x_max <= x_min: + x_max = x_min + 1 + + series = [ + ("Applying", [point[1] for point in points], "#1f77b4"), + ("Reorder", [point[2] for point in points], "#ff7f0e"), + ("Stalls", [point[3] for point in points], "#9467bd"), + ("Blocks/s", [point[4] for point in points], "#2ca02c"), + ] + + width, height = 1450, 980 + left, right, top, bottom = 105, 35, 65, 65 + gap = 30 + panel_height = (height - top - bottom - gap * (len(series) - 1)) / len(series) + plot_width = width - left - right + + def sx(value: float) -> float: + return left + (value - x_min) / (x_max - x_min) * plot_width + + parts = [ + f'', + '', + "", + f'{html.escape(title)}', + f'points={len(points)} x range {x_min:.0f}-{x_max:.0f}', + ] + + xticks = [x_min + (x_max - x_min) * index / 5 for index in range(6)] + + for index, (name, values, color) in enumerate(series): + y0 = top + index * (panel_height + gap) + ymax = nice_max(values) + parts.append(f'') + parts.append(f'{html.escape(name)}') + + for tick_index in range(5): + value = ymax * tick_index / 4 + y = y0 + panel_height - (value / ymax) * panel_height + label = f"{value:.0f}" if ymax >= 10 else f"{value:.2g}" + parts.append(f'') + parts.append(f'{label}') + + for xtick in xticks: + x = sx(xtick) + parts.append(f'') + + if name == "Stalls": + for point, value in zip(points, values): + if value >= 0.5: + x = sx(x_get(point)) + parts.append(f'') + + coords = [] + for point, value in zip(points, values): + x = sx(x_get(point)) + y = y0 + panel_height - (value / ymax) * panel_height + coords.append(f"{x:.1f},{y:.1f}") + parts.append(f'') + parts.append(f'') + parts.append(f'') + + last_y = top + (len(series) - 1) * (panel_height + gap) + panel_height + for xtick in xticks: + parts.append(f'{xtick:.0f}') + parts.append(f'{html.escape(xlabel)}') + parts.append("") + + out_path.write_text("\n".join(parts)) + + +def write_summary( + out_path: Path, + trace_dir: Path, + csv_path: Path | None, + states: list[tuple[float, float, float, float, float, float, float, str]], + csv_rows: list[dict[str, str]], +) -> None: + floor_states = Counter(state[7] for state in states if state[7]) + hol_samples = sum(1 for state in states if state[4] >= 0.5) + max_reorder = max((state[3] for state in states), default=0) + max_applying = max((state[2] for state in states), default=0) + max_retained_gb = max((state[6] for state in states), default=0) / 1e9 + + lines = [ + f"trace_dir: {trace_dir}", + f"csv: {csv_path or '(not found)'}", + f"trace_samples: {len(states)}", + f"height_range: {min((s[1] for s in states), default=0):.0f}-{max((s[1] for s in states), default=0):.0f}", + f"max_applying: {max_applying:.0f}", + f"max_reorder: {max_reorder:.0f}", + f"hol_stall_samples: {hol_samples}", + f"max_retained_pipeline_wire_gb: {max_retained_gb:.2f}", + f"floor_gap_states: {floor_states.most_common(8)}", + ] + + if csv_rows: + nonzero = [row for row in csv_rows if number(row.get("height")) > 0] + if nonzero: + lines.append(f"csv_last_height: {number(nonzero[-1].get('height')):.0f}") + lines.append(f"csv_last_blk_s: {number(nonzero[-1].get('blk_s')):.1f}") + + out_path.write_text("\n".join(lines) + "\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("trace_dir", help="Directory containing block_sync.jsonl and related Zakura traces") + parser.add_argument("--csv", help="Optional feed_run CSV. Auto-detected when omitted.") + parser.add_argument("--out-dir", default="perf-artifacts", help="Directory for generated artifacts") + parser.add_argument("--prefix", help="Output filename prefix. Defaults to the trace label.") + args = parser.parse_args() + + trace_dir = Path(args.trace_dir).expanduser().resolve() + label = infer_label(trace_dir) + csv_path = find_csv(trace_dir, label, args.csv) + output_dir = Path(args.out_dir).expanduser().resolve() + output_dir.mkdir(parents=True, exist_ok=True) + prefix = args.prefix or label + + throughput, csv_rows = load_csv_throughput(csv_path) + states = load_trace_states(trace_dir) + sampled = downsample(states) + points = [ + (elapsed, applying, reorder, stall, throughput_at_time(throughput, elapsed), height) + for elapsed, height, applying, reorder, stall, _applying_bytes, _retained_bytes, _floor_state in sampled + ] + height_points = sorted(points, key=lambda point: (point[5], point[0])) + + height_svg = output_dir / f"{prefix}-height-apply-reorder-stalls-bps.svg" + time_svg = output_dir / f"{prefix}-time-apply-reorder-stalls-bps.svg" + summary = output_dir / f"{prefix}-summary.txt" + + make_svg(height_points, lambda point: point[5], "Verified/finalized height", "", height_svg) + make_svg(points, lambda point: point[0], "Elapsed seconds", "", time_svg) + write_summary(summary, trace_dir, csv_path, states, csv_rows) + + print(height_svg) + print(time_svg) + print(summary) + + +if __name__ == "__main__": + main() diff --git a/.cursor/skills/zakura-trace-zip/SKILL.md b/.cursor/skills/zakura-trace-zip/SKILL.md new file mode 100644 index 00000000000..8800213bb99 --- /dev/null +++ b/.cursor/skills/zakura-trace-zip/SKILL.md @@ -0,0 +1,49 @@ +--- +name: zakura-trace-zip +description: Create shareable zip archives from Zebra Zakura perf trace directories. Use when the user asks to zip, package, archive, share, or export a trace_dir such as feedrun-*-traces or Zakura JSONL traces. +--- + +# Zakura Trace Zip + +## Quick Start + +When the user gives a Zakura `trace_dir`, create a shareable archive with: + +```bash +python3 .cursor/skills/zakura-trace-zip/scripts/zip_zakura_traces.py TRACE_DIR --out-dir perf-artifacts +``` + +Default output: + +```text +perf-artifacts/.zip +``` + +The zip preserves the trace directory as the top-level folder inside the archive, matching archives like `perf-artifacts/feedrun-r1-traces.zip`. + +## Options + +Use `--output PATH` to choose the exact zip path: + +```bash +python3 .cursor/skills/zakura-trace-zip/scripts/zip_zakura_traces.py /mnt/roman-dev-2-data/feedrun-r1-traces --output perf-artifacts/feedrun-r1-traces.zip +``` + +Use `--force` to overwrite an existing archive. + +Use `--include-related` to include nearby run files when they exist: + +- `/root/wal-bench/feedrun-