From 9ab87cb5d7ad0a2089f596c35c10aa726fe60212 Mon Sep 17 00:00:00 2001
From: j-rafique
Date: Mon, 4 May 2026 15:09:37 +0000
Subject: [PATCH] feat(self_healing): add LEP-6 chain-driven heal-op dispatch runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the gonode-era peer-watchlist self-healing with a chain-mediated
LEP-6 §18-§22 (Workstream C) implementation. Healer reconstructs locally and
STAGES (no KAD publish), verifiers fetch reconstructed bytes from the assigned
healer over a streaming gRPC RPC (§19 healer-served path) and hash-compare
against op.ResultHash, then publish to KAD only after chain VERIFIED quorum.

Three-phase flow

Phase 1 — RECONSTRUCT (no publish)
  cascade.RecoveryReseed(PersistArtifacts=false, StagingDir) → download
  remaining symbols → RaptorQ-decode → verify file hash against
  Action.DataHash → re-encode → stage symbols+idFiles+layout+reconstructed.bin
  to ~/.supernode/heal-staging//. Submit MsgClaimHealComplete{HealManifestHash};
  chain transitions SCHEDULED → HEALER_REPORTED, sets
  op.ResultHash = HealManifestHash.

Phase 2 — VERIFY (§19 healer-served path)
  Verifier opens supernode.SelfHealingService/ServeReconstructedArtefacts on
  the assigned healer (op.HealerSupernodeAccount), streams the reconstructed
  bytes, computes BLAKE3 base64 (= Action.DataHash recipe via
  cascadekit.ComputeBlake3DataHashB64), compares against op.ResultHash (NOT
  Action.DataHash — chain enforces at
  lumera/x/audit/v1/keeper/msg_storage_truth.go:291), and submits
  MsgSubmitHealVerification{verified, hash}. Chain quorum n/2+1. (See the
  verifier sketch after the resolved-decisions list below.)

Phase 3 — PUBLISH (only on VERIFIED)
  Finalizer polls heal_claims_submitted (Opt 2b per-op poll, folded into
  single tick loop alongside healer + verifier dispatch), reads op.Status,
  calls cascade.PublishStagedArtefacts on VERIFIED (same storeArtefacts path
  as register/upload), deletes staging on FAILED/EXPIRED. Chain may reschedule
  a different healer on EXPIRED.

Crash-recovery / restart-safety

Submit-then-persist ordering: SQLite dedup row is written ONLY after chain has
accepted the tx. A failed submit (mempool, signing, chain reject) leaves no
row and staging is removed, so the next tick can retry cleanly. If chain
accepted a prior submit but the supernode crashed before persisting, the next
tick's resubmit fails with "does not accept healer completion claim" and
reconcileExistingClaim re-fetches the heal-op, confirms chain ResultHash
equals our manifest, and persists the dedup row so finalizer takes over. (See
the claim-ordering sketch after the resolved-decisions list below.)

Negative-attestation hash: chain rejects empty VerificationHash even on
verified=false (msg_storage_truth.go:271-273). Verifier synthesizes a
deterministic non-empty placeholder (sha256("lep6:negative-attestation:"+reason)
base64) on fetch_failed and hash_compute_failed paths. Chain only validates
VerificationHash content for positive votes (msg_storage_truth.go:288-294), so
any non-empty value is well-formed for negatives.

Components added

supernode/self_healing/
  service.go   Single tick loop; mode gate (UNSPECIFIED skips); healer
               dispatch; verifier dispatch; finalizer poll; sync.Map in-flight
               + buffered semaphores (reconstructs=2, verifications=4,
               publishes=2).
  healer.go    Phase 1: submit-then-persist ordering; reconcileExistingClaim
               handles post-crash recovery when chain accepted a prior submit.
  verifier.go  Phase 2: fetch from assigned healer, retry with exponential
               backoff (3 attempts), submit verified=false with non-empty
               placeholder hash on persistent fetch failure; positive-path
               hash compares against op.ResultHash; reconciles chain-side
               "verification already submitted" idempotency.
  finalizer.go Phase 3: VERIFIED → publish + cleanup; FAILED/EXPIRED → cleanup
               only; transient states no-op.
  peer_client.go
               secureVerifierFetcher dials via the same secure-rpc / lumeraid
               stack the legacy storage_challenge loop uses.

supernode/transport/grpc/self_healing/handler.go
  Streaming ServeReconstructedArtefacts RPC. DefaultCallerIdentityResolver
  pulls verifier identity from the secure-rpc (Lumera ALTS) handshake via
  pkg/reachability.GrpcRemoteIdentityAndAddr — production wiring uses this so
  req.VerifierAccount is never trusted alone. Authorizes caller ∈
  op.VerifierSupernodeAccounts AND identity == op.HealerSupernodeAccount;
  refuses with FailedPrecondition when not the assigned healer and
  PermissionDenied for unassigned callers. 1 MiB chunks.

proto/supernode/self_healing.proto
  SelfHealingService { ServeReconstructedArtefacts streams chunks }.

Makefile
  gen-supernode wires it; gen/supernode/self_healing*.pb.go regenerated.

supernode/cascade/reseed.go
  Split RecoveryReseed: PersistArtifacts=true (legacy/republish) vs
  PersistArtifacts=false (LEP-6 stage-only). Adds stageArtefacts +
  PublishStagedArtefacts. Stages reconstructed file bytes and a JSON manifest
  the §19 transport reads.

supernode/cascade/staged.go
  ReadStagedHealOp helper used by the transport handler.

supernode/cascade/interfaces.go
  CascadeTask interface gains RecoveryReseed + PublishStagedArtefacts so
  self_healing depends only on the factory abstraction.

pkg/storage/queries/self_healing_lep6.go
  Tables heal_claims_submitted (PK heal_op_id) and
  heal_verifications_submitted (PK (heal_op_id, verifier_account)) for restart
  dedup. Typed sentinel errors ErrLEP6ClaimAlreadyRecorded /
  ErrLEP6VerificationAlreadyRecorded. Migrations wired in OpenHistoryDB.

pkg/storage/queries/local.go
  LocalStoreInterface embeds LEP6HealQueries.

supernode/config/config.go
  SelfHealingConfig YAML block (enabled, poll_interval_ms, max_concurrent_*,
  staging_dir, verifier_fetch_timeout_ms, verifier_fetch_attempts). Default
  disabled until activation.

supernode/cmd/start.go
  Constructs selfHealingService.Service + selfHealingRPC.Server (with
  DefaultCallerIdentityResolver) when SelfHealingConfig.Enabled, registers
  SelfHealingService_ServiceDesc on the gRPC server, appends the runner to the
  lifecycle services list. Reuses cService (cascade factory) and historyStore.

Tests (16 mandatory; all PASS)

supernode/self_healing/service_test.go
   1. TestVerifier_ReadsOpResultHashForComparison (R-bug pin)
   2. TestVerifier_HashMismatchProducesVerifiedFalse
  2b. TestVerifier_FetchFailureSubmitsNonEmptyHash (BLOCKER pin)
   3. TestVerifier_FetchesFromAssignedHealerOnly (§19 gate)
   6. TestHealer_FailedSubmitDoesNotPersistDedupRow (ordering)
  6b. TestHealer_ReconcilesExistingChainClaimAfterCrash (recovery)
   7. TestHealer_RaptorQReconstructionFailureSkipsClaim (Scenario C1)
   8. TestFinalizer_VerifiedTriggersPublishToKAD (Scenario A)
   9. TestFinalizer_FailedSkipsPublish_DeletesStaging (Scenario B)
  10. TestFinalizer_ExpiredSkipsPublish_DeletesStaging (Scenario C2)
  11. TestService_NoRoleSkipsOp
  12. TestService_UnspecifiedModeSkipsEntirely (mode gate)
  13. TestService_FinalStateOpsIgnored
  14. TestDedup_RestartDoesNotResubmit (3-layer dedup)

supernode/transport/grpc/self_healing/handler_test.go
   4. TestServeReconstructedArtefacts_AuthorizesOnlyAssignedVerifiers
   5. TestServeReconstructedArtefacts_RejectsUnassignedCaller (also covers
      non-assigned-healer FailedPrecondition refusal)

pkg/storage/queries/self_healing_lep6_test.go
  TestLEP6_HealClaim_RoundTripAndDedup
  TestLEP6_HealVerification_PerVerifierDedup

Validation

  go test ./supernode/self_healing/...                  PASS (2.66s)
  go test ./supernode/transport/grpc/self_healing/...   PASS (0.09s)
  go test ./supernode/cascade/...                       PASS (0.09s)
  go test ./pkg/storage/queries/...                     PASS (0.20s)
  go test ./pkg/storagechallenge/... ./supernode/storage_challenge \
          ./supernode/host_reporter ./pkg/lumera/modules/audit \
          ./pkg/lumera/modules/audit_msg                PASS
  go vet (touched + all transitively reachable pkgs)    PASS
  go build (targeted)                                   PASS

  (full repo go build fails only on pre-existing
  github.com/kolesa-team/go-webp libwebp-dev system-header issue; unrelated to
  this change.)

Resolved decisions applied

✓ Branch base: PR-3 tip f79f88f, NOT self-healing-improvements (single
  chain-driven service per Bilal direction; legacy 3-way Request/Verify/Commit
  RPC discarded).
✓ Verifier compares against op.ResultHash (chain msg_storage_truth.go:291).
  Pinned by TestVerifier_ReadsOpResultHashForComparison.
✓ Hash recipe = cascadekit.ComputeBlake3DataHashB64 (= Action.DataHash
  recipe). Same recipe healer + verifier + chain enforce.
✓ KAD publish AFTER chain VERIFIED (§19 healer-served-path gate); staging
  directory is the only authority before quorum.
✓ Finalizer mechanism: Opt 2b (per-op GetHealOp poll, folded into single tick
  loop) — no Tendermint WS, no monotonic-growth poll.
✓ Concurrency default: semaphore=2 reconstructs (RaptorQ RAM-aware), 4
  verifications, 2 publishes.
✓ Mode gate: UNSPECIFIED skips dispatcher entirely (Service.tick early-return;
  verified by TestService_UnspecifiedModeSkipsEntirely).
✓ Three-layer dedup: sync.Map + bounded semaphores + SQLite
  (heal_claims_submitted + heal_verifications_submitted).
✓ Submit-then-persist ordering with reconcile path for crash recovery.
✓ Non-empty placeholder VerificationHash on negative attestations (chain
  rejects empty regardless of verified bool).
✓ Caller authentication via secure-rpc / Lumera ALTS handshake at transport
  layer; req.VerifierAccount never trusted alone in production.
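
Illustrative verifier sketch (Phase 2). A minimal sketch of the two pinned
behaviours — compare against op.ResultHash, and never submit an empty hash on
a negative vote. fetch and hashB64 stand in for the real peer client and
cascadekit.ComputeBlake3DataHashB64, and HealOp is trimmed to the fields the
decision needs; this is not the code in verifier.go.

    package lep6sketch

    import (
        "context"
        "crypto/sha256"
        "encoding/base64"
    )

    // HealOp carries only the fields this sketch needs (illustrative).
    type HealOp struct {
        ID         uint64
        ResultHash string // chain-side copy of the healer's HealManifestHash
    }

    // negativeAttestationHash builds the deterministic non-empty placeholder
    // the chain requires even when verified=false.
    func negativeAttestationHash(reason string) string {
        sum := sha256.Sum256([]byte("lep6:negative-attestation:" + reason))
        return base64.StdEncoding.EncodeToString(sum[:])
    }

    // verifyHealOp fetches the reconstructed bytes from the assigned healer,
    // hashes them with the Action.DataHash recipe, and compares against
    // op.ResultHash (NOT Action.DataHash).
    func verifyHealOp(ctx context.Context, op HealOp,
        fetch func(context.Context, HealOp) ([]byte, error),
        hashB64 func([]byte) string) (verified bool, verificationHash string) {

        data, err := fetch(ctx, op)
        if err != nil {
            return false, negativeAttestationHash("fetch_failed")
        }
        got := hashB64(data)
        return got == op.ResultHash, got
    }

Any non-empty value is well-formed for a negative vote, so the placeholder
only needs to be deterministic; the positive path must carry the exact BLAKE3
base64 the chain compares at msg_storage_truth.go:291.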
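
Illustrative claim-ordering sketch (Phase 1 submit-then-persist with the
crash-recovery reconcile). submit and chainResultHash are placeholders for the
real tx submission and heal-op re-fetch; only RecordHealClaim matches the
LEP6HealQueries signature added in this diff. A sketch of the ordering, not
the code in healer.go.

    package lep6sketch

    import (
        "context"
        "errors"
        "os"
        "strings"
    )

    // claimStore is the slice of LEP6HealQueries this sketch needs.
    type claimStore interface {
        RecordHealClaim(ctx context.Context, healOpID uint64, ticketID, manifestHash, stagingDir string) error
    }

    // claimAndPersist submits MsgClaimHealComplete first and writes the SQLite
    // dedup row only after the chain accepts the tx. A failed submit leaves no
    // row and removes staging so the next tick retries cleanly; if the chain
    // already holds our claim (pre-crash submit), reconcile by checking the
    // chain-side ResultHash against our manifest before persisting.
    func claimAndPersist(ctx context.Context, store claimStore,
        opID uint64, ticketID, manifestHash, stagingDir string,
        submit func(context.Context) error,
        chainResultHash func(context.Context) (string, error)) error {

        if err := submit(ctx); err != nil {
            if !strings.Contains(err.Error(), "does not accept healer completion claim") {
                _ = os.RemoveAll(stagingDir) // no dedup row written; retry next tick
                return err
            }
            rh, rerr := chainResultHash(ctx)
            if rerr != nil || rh != manifestHash {
                return errors.Join(err, rerr)
            }
            // Chain already accepted our claim: fall through and persist the
            // dedup row so the finalizer takes over.
        }
        return store.RecordHealClaim(ctx, opID, ticketID, manifestHash, stagingDir)
    }

This keeps the dedup row a strict subset of chain-accepted claims, which is
what TestHealer_FailedSubmitDoesNotPersistDedupRow and
TestHealer_ReconcilesExistingChainClaimAfterCrash pin.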
Plan: docs/plans/LEP6_PR4_EXECUTION_PLAN.md --- Makefile | 2 +- gen/supernode/self_healing.pb.go | 265 +++++++ gen/supernode/self_healing.swagger.json | 68 ++ gen/supernode/self_healing_grpc.pb.go | 166 +++++ pkg/netutil/hostport.go | 76 ++ pkg/netutil/hostport_test.go | 39 + pkg/storage/queries/local.go | 1 + pkg/storage/queries/self_healing_lep6.go | 186 +++++ pkg/storage/queries/self_healing_lep6_test.go | 88 +++ pkg/storage/queries/sqlite.go | 8 + proto/supernode/self_healing.proto | 50 ++ supernode/cascade/interfaces.go | 7 + supernode/cascade/reseed.go | 204 +++++- supernode/cascade/staged.go | 42 ++ supernode/cmd/start.go | 64 +- supernode/config/config.go | 29 + supernode/self_healing/cascade_fake_test.go | 114 +++ supernode/self_healing/finalizer.go | 116 +++ supernode/self_healing/healer.go | 163 +++++ supernode/self_healing/lumera_test.go | 46 ++ supernode/self_healing/mocks_test.go | 169 +++++ supernode/self_healing/peer_client.go | 121 ++++ supernode/self_healing/service.go | 442 ++++++++++++ supernode/self_healing/service_test.go | 668 ++++++++++++++++++ supernode/self_healing/verifier.go | 190 +++++ .../storage_challenge/lep6_client_factory.go | 3 +- supernode/storage_challenge/service.go | 62 +- .../transport/grpc/self_healing/handler.go | 206 ++++++ .../grpc/self_healing/handler_test.go | 277 ++++++++ .../grpc/self_healing/helpers_test.go | 38 + 30 files changed, 3839 insertions(+), 71 deletions(-) create mode 100644 gen/supernode/self_healing.pb.go create mode 100644 gen/supernode/self_healing.swagger.json create mode 100644 gen/supernode/self_healing_grpc.pb.go create mode 100644 pkg/netutil/hostport.go create mode 100644 pkg/netutil/hostport_test.go create mode 100644 pkg/storage/queries/self_healing_lep6.go create mode 100644 pkg/storage/queries/self_healing_lep6_test.go create mode 100644 proto/supernode/self_healing.proto create mode 100644 supernode/cascade/staged.go create mode 100644 supernode/self_healing/cascade_fake_test.go create mode 100644 supernode/self_healing/finalizer.go create mode 100644 supernode/self_healing/healer.go create mode 100644 supernode/self_healing/lumera_test.go create mode 100644 supernode/self_healing/mocks_test.go create mode 100644 supernode/self_healing/peer_client.go create mode 100644 supernode/self_healing/service.go create mode 100644 supernode/self_healing/service_test.go create mode 100644 supernode/self_healing/verifier.go create mode 100644 supernode/transport/grpc/self_healing/handler.go create mode 100644 supernode/transport/grpc/self_healing/handler_test.go create mode 100644 supernode/transport/grpc/self_healing/helpers_test.go diff --git a/Makefile b/Makefile index 52d9589e..9445e724 100644 --- a/Makefile +++ b/Makefile @@ -152,7 +152,7 @@ gen-supernode: --grpc-gateway_out=gen \ --grpc-gateway_opt=paths=source_relative \ --openapiv2_out=gen \ - proto/supernode/service.proto proto/supernode/status.proto proto/supernode/storage_challenge.proto + proto/supernode/service.proto proto/supernode/status.proto proto/supernode/storage_challenge.proto proto/supernode/self_healing.proto # Define the paths SUPERNODE_SRC=supernode/main.go diff --git a/gen/supernode/self_healing.pb.go b/gen/supernode/self_healing.pb.go new file mode 100644 index 00000000..59ae049c --- /dev/null +++ b/gen/supernode/self_healing.pb.go @@ -0,0 +1,265 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. 
+// versions: +// protoc-gen-go v1.34.2 +// protoc v4.25.1 +// source: supernode/self_healing.proto + +package supernode + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type ServeReconstructedArtefactsRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // heal_op_id identifies the heal-op the caller is verifying. Server uses it + // to look up the staging dir and authorize the caller against + // op.VerifierSupernodeAccounts. + HealOpId uint64 `protobuf:"varint,1,opt,name=heal_op_id,json=healOpId,proto3" json:"heal_op_id,omitempty"` + // verifier_account is the caller's chain-side supernode account address. + // Server cross-checks against authenticated grpc identity AND against + // op.VerifierSupernodeAccounts. + VerifierAccount string `protobuf:"bytes,2,opt,name=verifier_account,json=verifierAccount,proto3" json:"verifier_account,omitempty"` +} + +func (x *ServeReconstructedArtefactsRequest) Reset() { + *x = ServeReconstructedArtefactsRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_supernode_self_healing_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ServeReconstructedArtefactsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ServeReconstructedArtefactsRequest) ProtoMessage() {} + +func (x *ServeReconstructedArtefactsRequest) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ServeReconstructedArtefactsRequest.ProtoReflect.Descriptor instead. +func (*ServeReconstructedArtefactsRequest) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{0} +} + +func (x *ServeReconstructedArtefactsRequest) GetHealOpId() uint64 { + if x != nil { + return x.HealOpId + } + return 0 +} + +func (x *ServeReconstructedArtefactsRequest) GetVerifierAccount() string { + if x != nil { + return x.VerifierAccount + } + return "" +} + +type ServeReconstructedArtefactsResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // chunk is a contiguous slice of the reconstructed file bytes. Chunks are + // ordered; concatenating chunks across the stream yields the original file + // whose BLAKE3 must equal op.ResultHash (= action.DataHash recipe). + Chunk []byte `protobuf:"bytes,1,opt,name=chunk,proto3" json:"chunk,omitempty"` + // total_size is the full file size; populated on the first message and + // optionally repeated. Allows clients to pre-allocate buffers. + TotalSize uint64 `protobuf:"varint,2,opt,name=total_size,json=totalSize,proto3" json:"total_size,omitempty"` + // is_last indicates this message carries the final chunk. 
+ IsLast bool `protobuf:"varint,3,opt,name=is_last,json=isLast,proto3" json:"is_last,omitempty"` +} + +func (x *ServeReconstructedArtefactsResponse) Reset() { + *x = ServeReconstructedArtefactsResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_supernode_self_healing_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ServeReconstructedArtefactsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ServeReconstructedArtefactsResponse) ProtoMessage() {} + +func (x *ServeReconstructedArtefactsResponse) ProtoReflect() protoreflect.Message { + mi := &file_supernode_self_healing_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ServeReconstructedArtefactsResponse.ProtoReflect.Descriptor instead. +func (*ServeReconstructedArtefactsResponse) Descriptor() ([]byte, []int) { + return file_supernode_self_healing_proto_rawDescGZIP(), []int{1} +} + +func (x *ServeReconstructedArtefactsResponse) GetChunk() []byte { + if x != nil { + return x.Chunk + } + return nil +} + +func (x *ServeReconstructedArtefactsResponse) GetTotalSize() uint64 { + if x != nil { + return x.TotalSize + } + return 0 +} + +func (x *ServeReconstructedArtefactsResponse) GetIsLast() bool { + if x != nil { + return x.IsLast + } + return false +} + +var File_supernode_self_healing_proto protoreflect.FileDescriptor + +var file_supernode_self_healing_proto_rawDesc = []byte{ + 0x0a, 0x1c, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x73, 0x65, 0x6c, 0x66, + 0x5f, 0x68, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x09, + 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x22, 0x6d, 0x0a, 0x22, 0x53, 0x65, 0x72, + 0x76, 0x65, 0x52, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x65, 0x64, 0x41, + 0x72, 0x74, 0x65, 0x66, 0x61, 0x63, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, + 0x1c, 0x0a, 0x0a, 0x68, 0x65, 0x61, 0x6c, 0x5f, 0x6f, 0x70, 0x5f, 0x69, 0x64, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x04, 0x52, 0x08, 0x68, 0x65, 0x61, 0x6c, 0x4f, 0x70, 0x49, 0x64, 0x12, 0x29, 0x0a, + 0x10, 0x76, 0x65, 0x72, 0x69, 0x66, 0x69, 0x65, 0x72, 0x5f, 0x61, 0x63, 0x63, 0x6f, 0x75, 0x6e, + 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x76, 0x65, 0x72, 0x69, 0x66, 0x69, 0x65, + 0x72, 0x41, 0x63, 0x63, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x73, 0x0a, 0x23, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x52, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x65, 0x64, 0x41, 0x72, + 0x74, 0x65, 0x66, 0x61, 0x63, 0x74, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, + 0x14, 0x0a, 0x05, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x05, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x74, 0x6f, 0x74, 0x61, 0x6c, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x04, 0x52, 0x09, 0x74, 0x6f, 0x74, 0x61, 0x6c, + 0x53, 0x69, 0x7a, 0x65, 0x12, 0x17, 0x0a, 0x07, 0x69, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x69, 0x73, 0x4c, 0x61, 0x73, 0x74, 0x32, 0x97, 0x01, + 0x0a, 0x12, 0x53, 0x65, 0x6c, 0x66, 0x48, 0x65, 0x61, 0x6c, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x72, + 0x76, 0x69, 0x63, 0x65, 0x12, 0x80, 0x01, 0x0a, 0x1b, 0x53, 0x65, 0x72, 0x76, 0x65, 0x52, 0x65, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 
0x63, 0x74, 0x65, 0x64, 0x41, 0x72, 0x74, 0x65, 0x66, + 0x61, 0x63, 0x74, 0x73, 0x12, 0x2d, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, + 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x52, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, + 0x74, 0x65, 0x64, 0x41, 0x72, 0x74, 0x65, 0x66, 0x61, 0x63, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x1a, 0x2e, 0x2e, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2e, + 0x53, 0x65, 0x72, 0x76, 0x65, 0x52, 0x65, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, + 0x65, 0x64, 0x41, 0x72, 0x74, 0x65, 0x66, 0x61, 0x63, 0x74, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x22, 0x00, 0x30, 0x01, 0x42, 0x36, 0x5a, 0x34, 0x67, 0x69, 0x74, 0x68, 0x75, + 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x4c, 0x75, 0x6d, 0x65, 0x72, 0x61, 0x50, 0x72, 0x6f, 0x74, + 0x6f, 0x63, 0x6f, 0x6c, 0x2f, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x2f, 0x76, + 0x32, 0x2f, 0x67, 0x65, 0x6e, 0x2f, 0x73, 0x75, 0x70, 0x65, 0x72, 0x6e, 0x6f, 0x64, 0x65, 0x62, + 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_supernode_self_healing_proto_rawDescOnce sync.Once + file_supernode_self_healing_proto_rawDescData = file_supernode_self_healing_proto_rawDesc +) + +func file_supernode_self_healing_proto_rawDescGZIP() []byte { + file_supernode_self_healing_proto_rawDescOnce.Do(func() { + file_supernode_self_healing_proto_rawDescData = protoimpl.X.CompressGZIP(file_supernode_self_healing_proto_rawDescData) + }) + return file_supernode_self_healing_proto_rawDescData +} + +var file_supernode_self_healing_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_supernode_self_healing_proto_goTypes = []any{ + (*ServeReconstructedArtefactsRequest)(nil), // 0: supernode.ServeReconstructedArtefactsRequest + (*ServeReconstructedArtefactsResponse)(nil), // 1: supernode.ServeReconstructedArtefactsResponse +} +var file_supernode_self_healing_proto_depIdxs = []int32{ + 0, // 0: supernode.SelfHealingService.ServeReconstructedArtefacts:input_type -> supernode.ServeReconstructedArtefactsRequest + 1, // 1: supernode.SelfHealingService.ServeReconstructedArtefacts:output_type -> supernode.ServeReconstructedArtefactsResponse + 1, // [1:2] is the sub-list for method output_type + 0, // [0:1] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_supernode_self_healing_proto_init() } +func file_supernode_self_healing_proto_init() { + if File_supernode_self_healing_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_supernode_self_healing_proto_msgTypes[0].Exporter = func(v any, i int) any { + switch v := v.(*ServeReconstructedArtefactsRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_supernode_self_healing_proto_msgTypes[1].Exporter = func(v any, i int) any { + switch v := v.(*ServeReconstructedArtefactsResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_supernode_self_healing_proto_rawDesc, + NumEnums: 0, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_supernode_self_healing_proto_goTypes, + 
DependencyIndexes: file_supernode_self_healing_proto_depIdxs, + MessageInfos: file_supernode_self_healing_proto_msgTypes, + }.Build() + File_supernode_self_healing_proto = out.File + file_supernode_self_healing_proto_rawDesc = nil + file_supernode_self_healing_proto_goTypes = nil + file_supernode_self_healing_proto_depIdxs = nil +} diff --git a/gen/supernode/self_healing.swagger.json b/gen/supernode/self_healing.swagger.json new file mode 100644 index 00000000..41f787b6 --- /dev/null +++ b/gen/supernode/self_healing.swagger.json @@ -0,0 +1,68 @@ +{ + "swagger": "2.0", + "info": { + "title": "supernode/self_healing.proto", + "version": "version not set" + }, + "tags": [ + { + "name": "SelfHealingService" + } + ], + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "paths": {}, + "definitions": { + "protobufAny": { + "type": "object", + "properties": { + "@type": { + "type": "string" + } + }, + "additionalProperties": {} + }, + "rpcStatus": { + "type": "object", + "properties": { + "code": { + "type": "integer", + "format": "int32" + }, + "message": { + "type": "string" + }, + "details": { + "type": "array", + "items": { + "type": "object", + "$ref": "#/definitions/protobufAny" + } + } + } + }, + "supernodeServeReconstructedArtefactsResponse": { + "type": "object", + "properties": { + "chunk": { + "type": "string", + "format": "byte", + "description": "chunk is a contiguous slice of the reconstructed file bytes. Chunks are\nordered; concatenating chunks across the stream yields the original file\nwhose BLAKE3 must equal op.ResultHash (= action.DataHash recipe)." + }, + "totalSize": { + "type": "string", + "format": "uint64", + "description": "total_size is the full file size; populated on the first message and\noptionally repeated. Allows clients to pre-allocate buffers." + }, + "isLast": { + "type": "boolean", + "description": "is_last indicates this message carries the final chunk." + } + } + } + } +} diff --git a/gen/supernode/self_healing_grpc.pb.go b/gen/supernode/self_healing_grpc.pb.go new file mode 100644 index 00000000..759116bb --- /dev/null +++ b/gen/supernode/self_healing_grpc.pb.go @@ -0,0 +1,166 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.5.1 +// - protoc v4.25.1 +// source: supernode/self_healing.proto + +package supernode + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.64.0 or later. +const _ = grpc.SupportPackageIsVersion9 + +const ( + SelfHealingService_ServeReconstructedArtefacts_FullMethodName = "/supernode.SelfHealingService/ServeReconstructedArtefacts" +) + +// SelfHealingServiceClient is the client API for SelfHealingService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +// +// SelfHealingService — LEP-6 §19 healer-served path. +// +// LEP-6 prescribes a strict three-phase heal flow: +// +// Phase 1 (RECONSTRUCT) — assigned healer reconstructs the file locally and +// submits MsgClaimHealComplete{HealManifestHash}. Artefacts are STAGED +// locally; they are NOT yet published to KAD. 
+// Phase 2 (VERIFY) — assigned verifiers MUST fetch the reconstructed bytes +// directly from the assigned healer (this RPC), not from KAD, because the +// healer-served path is the only authority before chain VERIFIED. +// Phase 3 (PUBLISH) — only after chain reaches VERIFIED quorum the healer's +// finalizer publishes staged artefacts to KAD via the same store path +// register/upload uses. +// +// This service exposes only the verifier-side fetch — chain coordinates role +// assignment and quorum, so the legacy peer Request/Verify/Commit RPCs are +// gone in the LEP-6 model. +type SelfHealingServiceClient interface { + // ServeReconstructedArtefacts streams the reconstructed file bytes to an + // authorized verifier. The handler MUST verify caller ∈ + // op.VerifierSupernodeAccounts before serving any bytes. + ServeReconstructedArtefacts(ctx context.Context, in *ServeReconstructedArtefactsRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[ServeReconstructedArtefactsResponse], error) +} + +type selfHealingServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewSelfHealingServiceClient(cc grpc.ClientConnInterface) SelfHealingServiceClient { + return &selfHealingServiceClient{cc} +} + +func (c *selfHealingServiceClient) ServeReconstructedArtefacts(ctx context.Context, in *ServeReconstructedArtefactsRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[ServeReconstructedArtefactsResponse], error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + stream, err := c.cc.NewStream(ctx, &SelfHealingService_ServiceDesc.Streams[0], SelfHealingService_ServeReconstructedArtefacts_FullMethodName, cOpts...) + if err != nil { + return nil, err + } + x := &grpc.GenericClientStream[ServeReconstructedArtefactsRequest, ServeReconstructedArtefactsResponse]{ClientStream: stream} + if err := x.ClientStream.SendMsg(in); err != nil { + return nil, err + } + if err := x.ClientStream.CloseSend(); err != nil { + return nil, err + } + return x, nil +} + +// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name. +type SelfHealingService_ServeReconstructedArtefactsClient = grpc.ServerStreamingClient[ServeReconstructedArtefactsResponse] + +// SelfHealingServiceServer is the server API for SelfHealingService service. +// All implementations must embed UnimplementedSelfHealingServiceServer +// for forward compatibility. +// +// SelfHealingService — LEP-6 §19 healer-served path. +// +// LEP-6 prescribes a strict three-phase heal flow: +// +// Phase 1 (RECONSTRUCT) — assigned healer reconstructs the file locally and +// submits MsgClaimHealComplete{HealManifestHash}. Artefacts are STAGED +// locally; they are NOT yet published to KAD. +// Phase 2 (VERIFY) — assigned verifiers MUST fetch the reconstructed bytes +// directly from the assigned healer (this RPC), not from KAD, because the +// healer-served path is the only authority before chain VERIFIED. +// Phase 3 (PUBLISH) — only after chain reaches VERIFIED quorum the healer's +// finalizer publishes staged artefacts to KAD via the same store path +// register/upload uses. +// +// This service exposes only the verifier-side fetch — chain coordinates role +// assignment and quorum, so the legacy peer Request/Verify/Commit RPCs are +// gone in the LEP-6 model. +type SelfHealingServiceServer interface { + // ServeReconstructedArtefacts streams the reconstructed file bytes to an + // authorized verifier. 
The handler MUST verify caller ∈ + // op.VerifierSupernodeAccounts before serving any bytes. + ServeReconstructedArtefacts(*ServeReconstructedArtefactsRequest, grpc.ServerStreamingServer[ServeReconstructedArtefactsResponse]) error + mustEmbedUnimplementedSelfHealingServiceServer() +} + +// UnimplementedSelfHealingServiceServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. +type UnimplementedSelfHealingServiceServer struct{} + +func (UnimplementedSelfHealingServiceServer) ServeReconstructedArtefacts(*ServeReconstructedArtefactsRequest, grpc.ServerStreamingServer[ServeReconstructedArtefactsResponse]) error { + return status.Errorf(codes.Unimplemented, "method ServeReconstructedArtefacts not implemented") +} +func (UnimplementedSelfHealingServiceServer) mustEmbedUnimplementedSelfHealingServiceServer() {} +func (UnimplementedSelfHealingServiceServer) testEmbeddedByValue() {} + +// UnsafeSelfHealingServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to SelfHealingServiceServer will +// result in compilation errors. +type UnsafeSelfHealingServiceServer interface { + mustEmbedUnimplementedSelfHealingServiceServer() +} + +func RegisterSelfHealingServiceServer(s grpc.ServiceRegistrar, srv SelfHealingServiceServer) { + // If the following call pancis, it indicates UnimplementedSelfHealingServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&SelfHealingService_ServiceDesc, srv) +} + +func _SelfHealingService_ServeReconstructedArtefacts_Handler(srv interface{}, stream grpc.ServerStream) error { + m := new(ServeReconstructedArtefactsRequest) + if err := stream.RecvMsg(m); err != nil { + return err + } + return srv.(SelfHealingServiceServer).ServeReconstructedArtefacts(m, &grpc.GenericServerStream[ServeReconstructedArtefactsRequest, ServeReconstructedArtefactsResponse]{ServerStream: stream}) +} + +// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name. +type SelfHealingService_ServeReconstructedArtefactsServer = grpc.ServerStreamingServer[ServeReconstructedArtefactsResponse] + +// SelfHealingService_ServiceDesc is the grpc.ServiceDesc for SelfHealingService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var SelfHealingService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "supernode.SelfHealingService", + HandlerType: (*SelfHealingServiceServer)(nil), + Methods: []grpc.MethodDesc{}, + Streams: []grpc.StreamDesc{ + { + StreamName: "ServeReconstructedArtefacts", + Handler: _SelfHealingService_ServeReconstructedArtefacts_Handler, + ServerStreams: true, + }, + }, + Metadata: "supernode/self_healing.proto", +} diff --git a/pkg/netutil/hostport.go b/pkg/netutil/hostport.go new file mode 100644 index 00000000..0945103b --- /dev/null +++ b/pkg/netutil/hostport.go @@ -0,0 +1,76 @@ +package netutil + +import ( + "net" + "net/url" + "strconv" + "strings" +) + +// ParseHostAndPort parses a raw host/address into host and port. 
+// +// Accepted inputs include: +// - "host" (uses defaultPort) +// - "host:1234" +// - "scheme://host:1234/path" (uses URL host portion) +// - "[2001:db8::1]:1234" +// - "[2001:db8::1]" (uses defaultPort) +// - "fe80::1%eth0" (IPv6 literal with zone, uses defaultPort) +// +// If a port is present but invalid, the parser falls back to defaultPort for +// compatibility with the existing storage-challenge address parser. +func ParseHostAndPort(address string, defaultPort int) (host string, port int, ok bool) { + address = strings.TrimSpace(address) + if address == "" { + return "", 0, false + } + + // If it looks like a URL, parse and use the host[:port] portion. + if u, err := url.Parse(address); err == nil && u.Host != "" { + address = u.Host + } + + if h, p, err := net.SplitHostPort(address); err == nil { + h = strings.TrimSpace(h) + if h == "" { + return "", 0, false + } + if n, err := strconv.Atoi(p); err == nil && n > 0 && n <= 65535 { + return h, n, true + } + return h, defaultPort, true + } + + // No port present. Treat it as a raw host if it is plausibly valid; otherwise fail. + host = strings.TrimSpace(address) + if host == "" { + return "", 0, false + } + + // Accept bracketed IPv6 literal without a port (e.g. "[2001:db8::1]") by stripping brackets. + if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") && strings.Count(host, "]") == 1 { + host = strings.TrimPrefix(strings.TrimSuffix(host, "]"), "[") + host = strings.TrimSpace(host) + if host == "" { + return "", 0, false + } + } + + // Reject obviously malformed inputs (paths, fragments, userinfo, whitespace, or stray brackets). + if strings.ContainsAny(host, " \t\r\n/\\?#@[]") { + return "", 0, false + } + + // If it contains ':' it must be a valid IPv6 literal (optionally with a zone, e.g. "fe80::1%eth0"). 
+ if strings.Contains(host, ":") { + ipPart := host + if i := strings.IndexByte(ipPart, '%'); i >= 0 { + ipPart = ipPart[:i] + } + if net.ParseIP(ipPart) == nil { + return "", 0, false + } + } + + return host, defaultPort, true +} diff --git a/pkg/netutil/hostport_test.go b/pkg/netutil/hostport_test.go new file mode 100644 index 00000000..ec288ac6 --- /dev/null +++ b/pkg/netutil/hostport_test.go @@ -0,0 +1,39 @@ +package netutil + +import "testing" + +func TestParseHostAndPort(t *testing.T) { + tests := []struct { + name string + address string + defaultPort int + wantHost string + wantPort int + wantOK bool + }{ + {name: "host without port", address: "sn.example.com", defaultPort: 9090, wantHost: "sn.example.com", wantPort: 9090, wantOK: true}, + {name: "host with port", address: "sn.example.com:1234", defaultPort: 9090, wantHost: "sn.example.com", wantPort: 1234, wantOK: true}, + {name: "url host portion", address: "grpc://sn.example.com:2345/path", defaultPort: 9090, wantHost: "sn.example.com", wantPort: 2345, wantOK: true}, + {name: "bracketed ipv6 with port", address: "[2001:db8::1]:3456", defaultPort: 9090, wantHost: "2001:db8::1", wantPort: 3456, wantOK: true}, + {name: "bracketed ipv6 without port", address: "[2001:db8::1]", defaultPort: 9090, wantHost: "2001:db8::1", wantPort: 9090, wantOK: true}, + {name: "ipv6 with zone", address: "fe80::1%eth0", defaultPort: 9090, wantHost: "fe80::1%eth0", wantPort: 9090, wantOK: true}, + {name: "invalid port falls back", address: "sn.example.com:notaport", defaultPort: 9090, wantHost: "sn.example.com", wantPort: 9090, wantOK: true}, + {name: "empty", address: " ", defaultPort: 9090, wantOK: false}, + {name: "path rejected", address: "sn.example.com/path", defaultPort: 9090, wantOK: false}, + {name: "userinfo rejected", address: "user@sn.example.com", defaultPort: 9090, wantOK: false}, + {name: "stray bracket rejected", address: "sn.example.com]", defaultPort: 9090, wantOK: false}, + {name: "malformed ipv6 rejected", address: "2001:db8:::bad", defaultPort: 9090, wantOK: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotHost, gotPort, gotOK := ParseHostAndPort(tt.address, tt.defaultPort) + if gotOK != tt.wantOK { + t.Fatalf("ok = %v, want %v", gotOK, tt.wantOK) + } + if gotHost != tt.wantHost || gotPort != tt.wantPort { + t.Fatalf("ParseHostAndPort() = (%q, %d, %v), want (%q, %d, %v)", gotHost, gotPort, gotOK, tt.wantHost, tt.wantPort, tt.wantOK) + } + }) + } +} diff --git a/pkg/storage/queries/local.go b/pkg/storage/queries/local.go index e677de76..f7fa5275 100644 --- a/pkg/storage/queries/local.go +++ b/pkg/storage/queries/local.go @@ -13,4 +13,5 @@ type LocalStoreInterface interface { StorageChallengeQueries PingHistoryQueries HealthCheckChallengeQueries + LEP6HealQueries } diff --git a/pkg/storage/queries/self_healing_lep6.go b/pkg/storage/queries/self_healing_lep6.go new file mode 100644 index 00000000..87d8765e --- /dev/null +++ b/pkg/storage/queries/self_healing_lep6.go @@ -0,0 +1,186 @@ +package queries + +import ( + "context" + "database/sql" + "errors" + "strings" + "time" +) + +// LEP6HealQueries — restart-safe dedup tables for the LEP-6 self-healing +// runtime. The LEP-6 dispatcher is chain-driven (poll heal-ops, role-decide +// from HealerSupernodeAccount / VerifierSupernodeAccounts), so a process +// restart that lost in-flight sync.Map entries could otherwise re-submit a +// claim or verification the chain has already accepted. 
Both tables are +// keyed so every (heal_op_id) or (heal_op_id, verifier) is permitted exactly +// once. +type LEP6HealQueries interface { + // RecordHealClaim persists a successfully-submitted MsgClaimHealComplete + // for restart-time dedup. Returns ErrLEP6ClaimAlreadyRecorded if the + // heal_op_id row already exists (idempotent on retry). + RecordHealClaim(ctx context.Context, healOpID uint64, ticketID, manifestHash, stagingDir string) error + // HasHealClaim reports whether RecordHealClaim has been called for this + // heal_op_id. Used by the dispatcher to skip submission on restart. + HasHealClaim(ctx context.Context, healOpID uint64) (bool, error) + // GetHealClaim returns the persisted claim row (or sql.ErrNoRows). The + // finalizer reads staging_dir from this row when promoting a heal-op + // from HEALER_REPORTED to VERIFIED → publish. + GetHealClaim(ctx context.Context, healOpID uint64) (HealClaimRecord, error) + // ListHealClaims returns every persisted claim — used by the finalizer + // to enumerate staging entries on a fresh tick or after restart. + ListHealClaims(ctx context.Context) ([]HealClaimRecord, error) + // DeleteHealClaim removes the row after the finalizer has published or + // discarded the staging dir. + DeleteHealClaim(ctx context.Context, healOpID uint64) error + + // RecordHealVerification persists a successfully-submitted + // MsgSubmitHealVerification for restart-time dedup. Returns + // ErrLEP6VerificationAlreadyRecorded if the (heal_op_id, verifier_account) + // pair already exists. + RecordHealVerification(ctx context.Context, healOpID uint64, verifierAccount string, verified bool, verificationHash string) error + // HasHealVerification reports whether the (heal_op_id, verifier_account) + // row exists. Verifier dispatch uses this to skip resubmission on + // restart. + HasHealVerification(ctx context.Context, healOpID uint64, verifierAccount string) (bool, error) +} + +// HealClaimRecord is the row shape for heal_claims_submitted. +type HealClaimRecord struct { + HealOpID uint64 + TicketID string + ManifestHash string + StagingDir string + SubmittedAt int64 +} + +// ErrLEP6ClaimAlreadyRecorded is returned by RecordHealClaim when the +// heal_op_id has already been persisted. +var ErrLEP6ClaimAlreadyRecorded = errors.New("lep6: heal claim already recorded") + +// ErrLEP6VerificationAlreadyRecorded is returned by RecordHealVerification +// when (heal_op_id, verifier_account) is already persisted. +var ErrLEP6VerificationAlreadyRecorded = errors.New("lep6: heal verification already recorded") + +const createHealClaimsSubmitted = ` +CREATE TABLE IF NOT EXISTS heal_claims_submitted ( + heal_op_id INTEGER PRIMARY KEY, + ticket_id TEXT NOT NULL, + manifest_hash TEXT NOT NULL, + staging_dir TEXT NOT NULL, + submitted_at INTEGER NOT NULL +);` + +const createHealVerificationsSubmitted = ` +CREATE TABLE IF NOT EXISTS heal_verifications_submitted ( + heal_op_id INTEGER NOT NULL, + verifier_account TEXT NOT NULL, + verified INTEGER NOT NULL, + verification_hash TEXT NOT NULL, + submitted_at INTEGER NOT NULL, + PRIMARY KEY (heal_op_id, verifier_account) +);` + +// RecordHealClaim — see LEP6HealQueries.RecordHealClaim. 
+func (s *SQLiteStore) RecordHealClaim(ctx context.Context, healOpID uint64, ticketID, manifestHash, stagingDir string) error { + const stmt = `INSERT INTO heal_claims_submitted (heal_op_id, ticket_id, manifest_hash, staging_dir, submitted_at) VALUES (?, ?, ?, ?, ?)` + _, err := s.db.ExecContext(ctx, stmt, healOpID, ticketID, manifestHash, stagingDir, time.Now().Unix()) + if err != nil { + if isSQLiteUniqueViolation(err) { + return ErrLEP6ClaimAlreadyRecorded + } + return err + } + return nil +} + +// HasHealClaim — see LEP6HealQueries.HasHealClaim. +func (s *SQLiteStore) HasHealClaim(ctx context.Context, healOpID uint64) (bool, error) { + const stmt = `SELECT 1 FROM heal_claims_submitted WHERE heal_op_id = ? LIMIT 1` + var x int + err := s.db.QueryRowContext(ctx, stmt, healOpID).Scan(&x) + if errors.Is(err, sql.ErrNoRows) { + return false, nil + } + if err != nil { + return false, err + } + return true, nil +} + +// GetHealClaim — see LEP6HealQueries.GetHealClaim. +func (s *SQLiteStore) GetHealClaim(ctx context.Context, healOpID uint64) (HealClaimRecord, error) { + const stmt = `SELECT heal_op_id, ticket_id, manifest_hash, staging_dir, submitted_at FROM heal_claims_submitted WHERE heal_op_id = ?` + var r HealClaimRecord + err := s.db.QueryRowContext(ctx, stmt, healOpID).Scan(&r.HealOpID, &r.TicketID, &r.ManifestHash, &r.StagingDir, &r.SubmittedAt) + return r, err +} + +// ListHealClaims — see LEP6HealQueries.ListHealClaims. +func (s *SQLiteStore) ListHealClaims(ctx context.Context) ([]HealClaimRecord, error) { + const stmt = `SELECT heal_op_id, ticket_id, manifest_hash, staging_dir, submitted_at FROM heal_claims_submitted ORDER BY heal_op_id ASC` + rows, err := s.db.QueryContext(ctx, stmt) + if err != nil { + return nil, err + } + defer rows.Close() + out := make([]HealClaimRecord, 0) + for rows.Next() { + var r HealClaimRecord + if err := rows.Scan(&r.HealOpID, &r.TicketID, &r.ManifestHash, &r.StagingDir, &r.SubmittedAt); err != nil { + return nil, err + } + out = append(out, r) + } + return out, rows.Err() +} + +// DeleteHealClaim — see LEP6HealQueries.DeleteHealClaim. +func (s *SQLiteStore) DeleteHealClaim(ctx context.Context, healOpID uint64) error { + const stmt = `DELETE FROM heal_claims_submitted WHERE heal_op_id = ?` + _, err := s.db.ExecContext(ctx, stmt, healOpID) + return err +} + +// RecordHealVerification — see LEP6HealQueries.RecordHealVerification. +func (s *SQLiteStore) RecordHealVerification(ctx context.Context, healOpID uint64, verifierAccount string, verified bool, verificationHash string) error { + const stmt = `INSERT INTO heal_verifications_submitted (heal_op_id, verifier_account, verified, verification_hash, submitted_at) VALUES (?, ?, ?, ?, ?)` + verifiedInt := 0 + if verified { + verifiedInt = 1 + } + _, err := s.db.ExecContext(ctx, stmt, healOpID, verifierAccount, verifiedInt, verificationHash, time.Now().Unix()) + if err != nil { + if isSQLiteUniqueViolation(err) { + return ErrLEP6VerificationAlreadyRecorded + } + return err + } + return nil +} + +// HasHealVerification — see LEP6HealQueries.HasHealVerification. +func (s *SQLiteStore) HasHealVerification(ctx context.Context, healOpID uint64, verifierAccount string) (bool, error) { + const stmt = `SELECT 1 FROM heal_verifications_submitted WHERE heal_op_id = ? AND verifier_account = ? 
LIMIT 1` + var x int + err := s.db.QueryRowContext(ctx, stmt, healOpID, verifierAccount).Scan(&x) + if errors.Is(err, sql.ErrNoRows) { + return false, nil + } + if err != nil { + return false, err + } + return true, nil +} + +// isSQLiteUniqueViolation matches both the sqlite3 driver's typed error and +// the textual surface ("UNIQUE constraint failed") so the dedup helpers stay +// portable against driver changes. +func isSQLiteUniqueViolation(err error) bool { + if err == nil { + return false + } + msg := err.Error() + return strings.Contains(msg, "UNIQUE constraint failed") || + strings.Contains(msg, "PRIMARY KEY must be unique") +} diff --git a/pkg/storage/queries/self_healing_lep6_test.go b/pkg/storage/queries/self_healing_lep6_test.go new file mode 100644 index 00000000..6dbe8a4d --- /dev/null +++ b/pkg/storage/queries/self_healing_lep6_test.go @@ -0,0 +1,88 @@ +package queries + +import ( + "context" + "errors" + "path/filepath" + "testing" + + _ "github.com/mattn/go-sqlite3" + "github.com/jmoiron/sqlx" +) + +func newTestStore(t *testing.T) *SQLiteStore { + t.Helper() + dbFile := filepath.Join(t.TempDir(), "history.db") + db, err := sqlx.Connect("sqlite3", dbFile) + if err != nil { + t.Fatalf("connect: %v", err) + } + t.Cleanup(func() { _ = db.Close() }) + for _, stmt := range []string{createHealClaimsSubmitted, createHealVerificationsSubmitted} { + if _, err := db.Exec(stmt); err != nil { + t.Fatalf("exec migration: %v", err) + } + } + return &SQLiteStore{db: db} +} + +func TestLEP6_HealClaim_RoundTripAndDedup(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + + if has, err := s.HasHealClaim(ctx, 42); err != nil || has { + t.Fatalf("HasHealClaim before insert: has=%v err=%v", has, err) + } + if err := s.RecordHealClaim(ctx, 42, "ticket-x", "manifest-h", "/tmp/staging/42"); err != nil { + t.Fatalf("RecordHealClaim: %v", err) + } + // Restart-safety: second insert must be rejected with the typed error. + err := s.RecordHealClaim(ctx, 42, "ticket-x", "manifest-h", "/tmp/staging/42") + if !errors.Is(err, ErrLEP6ClaimAlreadyRecorded) { + t.Fatalf("expected ErrLEP6ClaimAlreadyRecorded on duplicate, got %v", err) + } + if has, err := s.HasHealClaim(ctx, 42); err != nil || !has { + t.Fatalf("HasHealClaim after insert: has=%v err=%v", has, err) + } + rec, err := s.GetHealClaim(ctx, 42) + if err != nil { + t.Fatalf("GetHealClaim: %v", err) + } + if rec.HealOpID != 42 || rec.TicketID != "ticket-x" || rec.ManifestHash != "manifest-h" || rec.StagingDir != "/tmp/staging/42" { + t.Fatalf("GetHealClaim mismatch: %+v", rec) + } + all, err := s.ListHealClaims(ctx) + if err != nil || len(all) != 1 { + t.Fatalf("ListHealClaims: %v %d", err, len(all)) + } + if err := s.DeleteHealClaim(ctx, 42); err != nil { + t.Fatalf("DeleteHealClaim: %v", err) + } + if has, err := s.HasHealClaim(ctx, 42); err != nil || has { + t.Fatalf("HasHealClaim after delete: has=%v err=%v", has, err) + } +} + +func TestLEP6_HealVerification_PerVerifierDedup(t *testing.T) { + s := newTestStore(t) + ctx := context.Background() + + if err := s.RecordHealVerification(ctx, 7, "sn-a", true, "hash-a"); err != nil { + t.Fatalf("record A: %v", err) + } + // Same heal_op, different verifier — must succeed. + if err := s.RecordHealVerification(ctx, 7, "sn-b", false, "hash-b"); err != nil { + t.Fatalf("record B: %v", err) + } + // Same (op, verifier) — must dedup. 
+ err := s.RecordHealVerification(ctx, 7, "sn-a", true, "hash-a") + if !errors.Is(err, ErrLEP6VerificationAlreadyRecorded) { + t.Fatalf("expected dedup error, got %v", err) + } + if has, err := s.HasHealVerification(ctx, 7, "sn-a"); err != nil || !has { + t.Fatalf("HasHealVerification(sn-a): has=%v err=%v", has, err) + } + if has, err := s.HasHealVerification(ctx, 7, "sn-c"); err != nil || has { + t.Fatalf("HasHealVerification(sn-c) should be false: has=%v err=%v", has, err) + } +} diff --git a/pkg/storage/queries/sqlite.go b/pkg/storage/queries/sqlite.go index dfdd90cd..35b5ef3d 100644 --- a/pkg/storage/queries/sqlite.go +++ b/pkg/storage/queries/sqlite.go @@ -388,6 +388,14 @@ func OpenHistoryDB() (LocalStoreInterface, error) { return nil, fmt.Errorf("cannot create table(s): %w", err) } + if _, err := db.Exec(createHealClaimsSubmitted); err != nil { + return nil, fmt.Errorf("cannot create heal_claims_submitted: %w", err) + } + + if _, err := db.Exec(createHealVerificationsSubmitted); err != nil { + return nil, fmt.Errorf("cannot create heal_verifications_submitted: %w", err) + } + _, _ = db.Exec(alterTaskHistory) _, _ = db.Exec(alterTablePingHistory) diff --git a/proto/supernode/self_healing.proto b/proto/supernode/self_healing.proto new file mode 100644 index 00000000..9af6373a --- /dev/null +++ b/proto/supernode/self_healing.proto @@ -0,0 +1,50 @@ +syntax = "proto3"; +package supernode; +option go_package = "github.com/LumeraProtocol/supernode/v2/gen/supernode"; + +// SelfHealingService — LEP-6 §19 healer-served path. +// +// LEP-6 prescribes a strict three-phase heal flow: +// Phase 1 (RECONSTRUCT) — assigned healer reconstructs the file locally and +// submits MsgClaimHealComplete{HealManifestHash}. Artefacts are STAGED +// locally; they are NOT yet published to KAD. +// Phase 2 (VERIFY) — assigned verifiers MUST fetch the reconstructed bytes +// directly from the assigned healer (this RPC), not from KAD, because the +// healer-served path is the only authority before chain VERIFIED. +// Phase 3 (PUBLISH) — only after chain reaches VERIFIED quorum the healer's +// finalizer publishes staged artefacts to KAD via the same store path +// register/upload uses. +// +// This service exposes only the verifier-side fetch — chain coordinates role +// assignment and quorum, so the legacy peer Request/Verify/Commit RPCs are +// gone in the LEP-6 model. +service SelfHealingService { + // ServeReconstructedArtefacts streams the reconstructed file bytes to an + // authorized verifier. The handler MUST verify caller ∈ + // op.VerifierSupernodeAccounts before serving any bytes. + rpc ServeReconstructedArtefacts(ServeReconstructedArtefactsRequest) + returns (stream ServeReconstructedArtefactsResponse) {} +} + +message ServeReconstructedArtefactsRequest { + // heal_op_id identifies the heal-op the caller is verifying. Server uses it + // to look up the staging dir and authorize the caller against + // op.VerifierSupernodeAccounts. + uint64 heal_op_id = 1; + // verifier_account is the caller's chain-side supernode account address. + // Server cross-checks against authenticated grpc identity AND against + // op.VerifierSupernodeAccounts. + string verifier_account = 2; +} + +message ServeReconstructedArtefactsResponse { + // chunk is a contiguous slice of the reconstructed file bytes. Chunks are + // ordered; concatenating chunks across the stream yields the original file + // whose BLAKE3 must equal op.ResultHash (= action.DataHash recipe). 
+ bytes chunk = 1; + // total_size is the full file size; populated on the first message and + // optionally repeated. Allows clients to pre-allocate buffers. + uint64 total_size = 2; + // is_last indicates this message carries the final chunk. + bool is_last = 3; +} diff --git a/supernode/cascade/interfaces.go b/supernode/cascade/interfaces.go index 5a4d0d4e..7069b4e3 100644 --- a/supernode/cascade/interfaces.go +++ b/supernode/cascade/interfaces.go @@ -16,4 +16,11 @@ type CascadeTask interface { Register(ctx context.Context, req *RegisterRequest, send func(resp *RegisterResponse) error) error Download(ctx context.Context, req *DownloadRequest, send func(resp *DownloadResponse) error) error CleanupDownload(ctx context.Context, tmpDir string) error + + // LEP-6 healer entrypoints. Surface RecoveryReseed and the staged-publish + // promotion so the self_healing service can consume the cascade pipeline + // through CascadeServiceFactory without depending on the concrete + // *CascadeRegistrationTask. + RecoveryReseed(ctx context.Context, req *RecoveryReseedRequest) (*RecoveryReseedResult, error) + PublishStagedArtefacts(ctx context.Context, stagingDir string) error } diff --git a/supernode/cascade/reseed.go b/supernode/cascade/reseed.go index 3cdd9a26..5343b9fb 100644 --- a/supernode/cascade/reseed.go +++ b/supernode/cascade/reseed.go @@ -2,7 +2,11 @@ package cascade import ( "context" + "encoding/base64" + "encoding/json" "fmt" + "os" + "path/filepath" "sort" "strings" @@ -12,8 +16,16 @@ import ( "github.com/LumeraProtocol/supernode/v2/pkg/utils" ) +// RecoveryReseedRequest carries the inputs for an end-to-end LEP-6 heal +// reconstruction. When PersistArtifacts is true (legacy / register-equivalent +// behavior) the rebuilt artefacts are stored to KAD via the same store path +// register/upload uses. When PersistArtifacts is false (LEP-6 §19 healer-served +// path) the artefacts are STAGED to StagingDir and not published; a later +// PublishStagedArtefacts call performs the KAD store after chain VERIFIED. type RecoveryReseedRequest struct { - ActionID string + ActionID string + PersistArtifacts bool // false = stage only (LEP-6 default); true = publish to KAD + StagingDir string // required when PersistArtifacts=false } type RecoveryReseedResult struct { @@ -31,11 +43,44 @@ type RecoveryReseedResult struct { LayoutFilesGenerated int IDFilesGenerated int SymbolsGenerated int + // StagingDir is set when artefacts were staged rather than published. + StagingDir string + // ReconstructedFilePath is the local path of the decoded original file. + // Caller is responsible for cleanup; on staged paths this is informational. + ReconstructedFilePath string + // ReconstructedHashB64 is the base64-encoded BLAKE3 of the reconstructed + // file (= action.DataHash recipe; LEP-6 HealManifestHash). + ReconstructedHashB64 string } +// stagedManifest is the on-disk descriptor written into a heal-op staging dir +// so a later PublishStagedArtefacts() call can reconstruct the storeArtefacts +// inputs without re-running download/decode/encode. 
+type stagedManifest struct { + ActionID string `json:"action_id"` + Layout codec.Layout `json:"layout"` + IDFiles []string `json:"id_files"` // base64 of idFile bytes + SymbolKeys []string `json:"symbol_keys"` // ordered, deduped + SymbolsDir string `json:"symbols_dir"` // absolute path inside StagingDir/symbols + ReconstructedRel string `json:"reconstructed_rel"`// staging-dir-relative path of the reconstructed file + ManifestHashB64 string `json:"manifest_hash_b64"`// = action.DataHash recipe; HealManifestHash +} + +const stagedManifestFilename = "manifest.json" +const stagedSymbolsDirname = "symbols" +const stagedIDFilesDirname = "id_files" +const stagedReconstructedFilename = "reconstructed.bin" + // RecoveryReseed decodes an existing action, re-encodes the reconstructed file, -// regenerates RQ artefacts with the action's original RQ params, and stores -// them via the same store path used by register. +// regenerates RQ artefacts with the action's original RQ params, and either +// stages them to disk (LEP-6 healer flow, PersistArtifacts=false) or stores +// them via the same store path used by register (legacy / republish flow, +// PersistArtifacts=true). +// +// LEP-6 §19 mandates the healer-served path: heal-op artefacts MUST NOT enter +// KAD until the chain has reached VERIFIED quorum, otherwise verifiers could +// fetch from KAD before the healer's hash is attested. PR-4 finalizer calls +// PublishStagedArtefacts only after observing op.Status == VERIFIED. func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *RecoveryReseedRequest) (*RecoveryReseedResult, error) { if req == nil { return nil, fmt.Errorf("missing request") @@ -44,9 +89,12 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re if actionID == "" { return nil, fmt.Errorf("missing action_id") } + if !req.PersistArtifacts && strings.TrimSpace(req.StagingDir) == "" { + return nil, fmt.Errorf("staging_dir required when persist_artifacts=false") + } task.taskID = actionID - fields := logtrace.Fields{logtrace.FieldMethod: "RecoveryReseed", logtrace.FieldActionID: actionID} + fields := logtrace.Fields{logtrace.FieldMethod: "RecoveryReseed", logtrace.FieldActionID: actionID, "persist_artifacts": req.PersistArtifacts} action, err := task.fetchAction(ctx, actionID, fields) if err != nil { @@ -115,6 +163,11 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re return result, task.wrapErr(ctx, "decoded file hash does not match action metadata", err, fields) } result.DataHashVerified = true + result.ReconstructedFilePath = decodeFilePath + // HealManifestHash = base64(BLAKE3(reconstructed_file)) — same recipe as + // Action.DataHash (cascadekit.ComputeBlake3DataHashB64). meta.DataHash is + // already that exact string, and VerifyB64DataHash above proved equality. 
+ result.ReconstructedHashB64 = strings.TrimSpace(meta.DataHash) encodeResult, err := task.encodeInput(ctx, actionID, decodeFilePath, fields) if err != nil { @@ -128,8 +181,16 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re if err != nil { return result, err } - if err := task.storeArtefacts(ctx, action.ActionID, idFiles, encodeResult.SymbolsDir, encodeResult.Layout, fields); err != nil { - return result, err + + if req.PersistArtifacts { + if err := task.storeArtefacts(ctx, action.ActionID, idFiles, encodeResult.SymbolsDir, encodeResult.Layout, fields); err != nil { + return result, err + } + } else { + if err := task.stageArtefacts(ctx, req.StagingDir, action.ActionID, idFiles, encodeResult.SymbolsDir, encodeResult.Layout, decodeFilePath, result.ReconstructedHashB64, fields); err != nil { + return result, err + } + result.StagingDir = req.StagingDir } result.IndexIDs = indexIDs @@ -143,6 +204,110 @@ func (task *CascadeRegistrationTask) RecoveryReseed(ctx context.Context, req *Re return result, nil } +// stageArtefacts copies the encoded symbols + idFiles + layout + the +// reconstructed file into stagingDir, writing a manifest the finalizer reads +// when publishing and the §19 transport reads when serving verifiers. +// stagingDir is the per-heal-op directory (e.g. +// ~/.supernode/heal-staging//). +func (task *CascadeRegistrationTask) stageArtefacts(ctx context.Context, stagingDir, actionID string, idFiles [][]byte, symbolsDir string, layout codec.Layout, reconstructedFilePath, manifestHashB64 string, f logtrace.Fields) error { + if f == nil { + f = logtrace.Fields{} + } + lf := logtrace.Fields{logtrace.FieldActionID: actionID, logtrace.FieldTaskID: task.taskID, "staging_dir": stagingDir, "id_files_count": len(idFiles)} + for k, v := range f { + lf[k] = v + } + if err := os.MkdirAll(stagingDir, 0o700); err != nil { + return task.wrapErr(ctx, "failed to create staging dir", err, lf) + } + stagedSymbols := filepath.Join(stagingDir, stagedSymbolsDirname) + if err := os.MkdirAll(stagedSymbols, 0o700); err != nil { + return task.wrapErr(ctx, "failed to create staged symbols dir", err, lf) + } + if err := copyDirContents(symbolsDir, stagedSymbols); err != nil { + return task.wrapErr(ctx, "failed to copy symbols into staging dir", err, lf) + } + stagedIDDir := filepath.Join(stagingDir, stagedIDFilesDirname) + if err := os.MkdirAll(stagedIDDir, 0o700); err != nil { + return task.wrapErr(ctx, "failed to create staged id_files dir", err, lf) + } + idFilesEncoded := make([]string, 0, len(idFiles)) + for i, b := range idFiles { + // Persist raw bytes for fidelity; encode to base64 in manifest for + // portability across filesystems / observation. + path := filepath.Join(stagedIDDir, fmt.Sprintf("idfile_%05d.bin", i)) + if err := os.WriteFile(path, b, 0o600); err != nil { + return task.wrapErr(ctx, "failed to write staged id file", err, lf) + } + idFilesEncoded = append(idFilesEncoded, base64.StdEncoding.EncodeToString(b)) + } + manifest := stagedManifest{ + ActionID: actionID, + Layout: layout, + IDFiles: idFilesEncoded, + SymbolKeys: symbolIDsFromLayout(layout), + SymbolsDir: stagedSymbols, + ReconstructedRel: stagedReconstructedFilename, + ManifestHashB64: manifestHashB64, + } + // Stage the reconstructed file bytes so the §19 healer-served-path + // transport can stream them to verifiers without re-running download + + // decode. 
+ if strings.TrimSpace(reconstructedFilePath) != "" { + src, err := os.ReadFile(reconstructedFilePath) + if err != nil { + return task.wrapErr(ctx, "failed to read reconstructed file for staging", err, lf) + } + if err := os.WriteFile(filepath.Join(stagingDir, stagedReconstructedFilename), src, 0o600); err != nil { + return task.wrapErr(ctx, "failed to stage reconstructed file", err, lf) + } + } + manifestPath := filepath.Join(stagingDir, stagedManifestFilename) + mb, err := json.Marshal(manifest) + if err != nil { + return task.wrapErr(ctx, "failed to marshal staged manifest", err, lf) + } + if err := os.WriteFile(manifestPath, mb, 0o600); err != nil { + return task.wrapErr(ctx, "failed to write staged manifest", err, lf) + } + logtrace.Info(ctx, "stage: artefacts staged", lf) + return nil +} + +// PublishStagedArtefacts reads a stagingDir produced by stageArtefacts and +// performs the KAD store via the same store path register/upload uses. Called +// by the LEP-6 finalizer after the chain reports HealOp.Status == VERIFIED. +func (task *CascadeRegistrationTask) PublishStagedArtefacts(ctx context.Context, stagingDir string) error { + stagingDir = strings.TrimSpace(stagingDir) + if stagingDir == "" { + return fmt.Errorf("missing staging_dir") + } + manifestPath := filepath.Join(stagingDir, stagedManifestFilename) + mb, err := os.ReadFile(manifestPath) + if err != nil { + return fmt.Errorf("read staged manifest: %w", err) + } + var manifest stagedManifest + if err := json.Unmarshal(mb, &manifest); err != nil { + return fmt.Errorf("parse staged manifest: %w", err) + } + idFiles := make([][]byte, 0, len(manifest.IDFiles)) + for i, enc := range manifest.IDFiles { + b, err := base64.StdEncoding.DecodeString(enc) + if err != nil { + return fmt.Errorf("decode id_file[%d]: %w", i, err) + } + idFiles = append(idFiles, b) + } + task.taskID = manifest.ActionID + fields := logtrace.Fields{ + logtrace.FieldMethod: "PublishStagedArtefacts", + logtrace.FieldActionID: manifest.ActionID, + "staging_dir": stagingDir, + } + return task.storeArtefacts(ctx, manifest.ActionID, idFiles, manifest.SymbolsDir, manifest.Layout, fields) +} + func symbolIDsFromLayout(layout codec.Layout) []string { seen := make(map[string]struct{}, 1024) for _, block := range layout.Blocks { @@ -161,3 +326,30 @@ func symbolIDsFromLayout(layout codec.Layout) []string { sort.Strings(out) return out } + +func copyDirContents(srcDir, dstDir string) error { + entries, err := os.ReadDir(srcDir) + if err != nil { + return err + } + for _, e := range entries { + if e.IsDir() { + // symbols layout is flat; recurse defensively + if err := os.MkdirAll(filepath.Join(dstDir, e.Name()), 0o700); err != nil { + return err + } + if err := copyDirContents(filepath.Join(srcDir, e.Name()), filepath.Join(dstDir, e.Name())); err != nil { + return err + } + continue + } + b, err := os.ReadFile(filepath.Join(srcDir, e.Name())) + if err != nil { + return err + } + if err := os.WriteFile(filepath.Join(dstDir, e.Name()), b, 0o600); err != nil { + return err + } + } + return nil +} diff --git a/supernode/cascade/staged.go b/supernode/cascade/staged.go new file mode 100644 index 00000000..05d5f6c7 --- /dev/null +++ b/supernode/cascade/staged.go @@ -0,0 +1,42 @@ +package cascade + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" +) + +// StagedHealOpInfo is the public projection of stagedManifest used by the +// LEP-6 §19 healer-served transport (supernode/transport/grpc/self_healing). 
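+//
+// A minimal consumer sketch (hedged; the staging path and error handling are
+// illustrative only):
+//
+//	info, err := cascade.ReadStagedHealOp("/home/sn/.supernode/heal-staging/42")
+//	if errors.Is(err, os.ErrNotExist) {
+//		// not yet staged — transport responds NotFound to the verifier
+//	}
+//	_ = info.ReconstructedFilePath // bytes streamed to verifiers by the handler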
+type StagedHealOpInfo struct { + ActionID string + ReconstructedFilePath string + ManifestHashB64 string +} + +// ReadStagedHealOp loads the manifest from a heal-op staging directory and +// returns the absolute reconstructed-file path the §19 transport streams to +// verifiers, plus the manifest hash for cross-checks. Returns os.ErrNotExist +// (wrapped) when the staging dir or its manifest is missing — caller may +// treat that as "not yet staged" and respond NotFound to the gRPC client. +func ReadStagedHealOp(stagingDir string) (StagedHealOpInfo, error) { + manifestPath := filepath.Join(stagingDir, stagedManifestFilename) + mb, err := os.ReadFile(manifestPath) + if err != nil { + return StagedHealOpInfo{}, fmt.Errorf("read staged manifest %q: %w", manifestPath, err) + } + var m stagedManifest + if err := json.Unmarshal(mb, &m); err != nil { + return StagedHealOpInfo{}, fmt.Errorf("parse staged manifest %q: %w", manifestPath, err) + } + rel := m.ReconstructedRel + if rel == "" { + rel = stagedReconstructedFilename + } + return StagedHealOpInfo{ + ActionID: m.ActionID, + ReconstructedFilePath: filepath.Join(stagingDir, rel), + ManifestHashB64: m.ManifestHashB64, + }, nil +} diff --git a/supernode/cmd/start.go b/supernode/cmd/start.go index 2fe062b9..b0ac611f 100644 --- a/supernode/cmd/start.go +++ b/supernode/cmd/start.go @@ -24,6 +24,7 @@ import ( cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" "github.com/LumeraProtocol/supernode/v2/supernode/config" hostReporterService "github.com/LumeraProtocol/supernode/v2/supernode/host_reporter" + selfHealingService "github.com/LumeraProtocol/supernode/v2/supernode/self_healing" statusService "github.com/LumeraProtocol/supernode/v2/supernode/status" storageChallengeService "github.com/LumeraProtocol/supernode/v2/supernode/storage_challenge" // Legacy supernode metrics reporter (MsgReportSupernodeMetrics) has been superseded by @@ -31,6 +32,7 @@ import ( // supernodeMetrics "github.com/LumeraProtocol/supernode/v2/supernode/supernode_metrics" "github.com/LumeraProtocol/supernode/v2/supernode/transport/gateway" cascadeRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/cascade" + selfHealingRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/self_healing" server "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/status" storageChallengeRPC "github.com/LumeraProtocol/supernode/v2/supernode/transport/grpc/storage_challenge" "github.com/LumeraProtocol/supernode/v2/supernode/verifier" @@ -258,7 +260,62 @@ The supernode will connect to the Lumera network and begin participating in the // Create supernode server supernodeServer := server.NewSupernodeServer(statusSvc) + // LEP-6 self-healing runtime (chain-driven heal-op dispatch). + // The dispatcher polls audit heal-ops and runs healer/verifier/ + // finalizer roles based on chain assignment. The §19 transport + // server lets verifiers fetch reconstructed bytes from the + // assigned healer before chain VERIFIED quorum. 
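+	//
+	// The YAML block consumed below looks roughly like this (illustrative
+	// values; every field except enabled falls back to a default):
+	//
+	//	self_healing:
+	//	  enabled: true
+	//	  poll_interval_ms: 30000
+	//	  max_concurrent_reconstructs: 2
+	//	  max_concurrent_verifications: 4
+	//	  max_concurrent_publishes: 2
+	//	  staging_dir: ~/.supernode/heal-staging
+	//	  verifier_fetch_timeout_ms: 60000
+	//	  verifier_fetch_attempts: 3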
+ var selfHealingRunner *selfHealingService.Service + var selfHealingServer *selfHealingRPC.Server + if appConfig.SelfHealingConfig.Enabled { + pollInterval := time.Duration(appConfig.SelfHealingConfig.PollIntervalMs) * time.Millisecond + fetchTimeout := time.Duration(appConfig.SelfHealingConfig.VerifierFetchTimeoutMs) * time.Millisecond + shCfg := selfHealingService.Config{ + Enabled: true, + PollInterval: pollInterval, + MaxConcurrentReconstructs: appConfig.SelfHealingConfig.MaxConcurrentReconstructs, + MaxConcurrentVerifications: appConfig.SelfHealingConfig.MaxConcurrentVerifications, + MaxConcurrentPublishes: appConfig.SelfHealingConfig.MaxConcurrentPublishes, + StagingRoot: appConfig.SelfHealingConfig.StagingDir, + VerifierFetchTimeout: fetchTimeout, + VerifierFetchAttempts: appConfig.SelfHealingConfig.VerifierFetchAttempts, + KeyName: appConfig.SupernodeConfig.KeyName, + } + fetcher := selfHealingService.NewSecureVerifierFetcher(lumeraClient, kr, appConfig.SupernodeConfig.Identity, appConfig.SupernodeConfig.Port) + selfHealingRunner, err = selfHealingService.New( + appConfig.SupernodeConfig.Identity, + shCfg, + lumeraClient, + historyStore, + cService, + fetcher, + ) + if err != nil { + logtrace.Fatal(ctx, "Failed to initialize self-healing runner", logtrace.Fields{"error": err.Error()}) + } + selfHealingServer, err = selfHealingRPC.NewServer( + appConfig.SupernodeConfig.Identity, + shCfg.StagingRoot, + lumeraClient, + selfHealingRPC.DefaultCallerIdentityResolver(), + ) + if err != nil { + logtrace.Fatal(ctx, "Failed to initialize self-healing transport", logtrace.Fields{"error": err.Error()}) + } + } + // Create gRPC server (explicit args, no config struct) + grpcServices := []grpcserver.ServiceDesc{ + {Desc: &pbcascade.CascadeService_ServiceDesc, Service: cascadeActionServer}, + {Desc: &pbsupernode.SupernodeService_ServiceDesc, Service: supernodeServer}, + {Desc: &pbsupernode.StorageChallengeService_ServiceDesc, Service: storageChallengeServer}, + } + if selfHealingServer != nil { + grpcServices = append(grpcServices, grpcserver.ServiceDesc{ + Desc: &pbsupernode.SelfHealingService_ServiceDesc, + Service: selfHealingServer, + }) + } grpcServer, err := server.New( appConfig.SupernodeConfig.Identity, appConfig.SupernodeConfig.Host, @@ -266,9 +323,7 @@ The supernode will connect to the Lumera network and begin participating in the "service", kr, lumeraClient, - grpcserver.ServiceDesc{Desc: &pbcascade.CascadeService_ServiceDesc, Service: cascadeActionServer}, - grpcserver.ServiceDesc{Desc: &pbsupernode.SupernodeService_ServiceDesc, Service: supernodeServer}, - grpcserver.ServiceDesc{Desc: &pbsupernode.StorageChallengeService_ServiceDesc, Service: storageChallengeServer}, + grpcServices..., ) if err != nil { logtrace.Fatal(ctx, "Failed to create gRPC server", logtrace.Fields{"error": err.Error()}) @@ -301,6 +356,9 @@ The supernode will connect to the Lumera network and begin participating in the if storageChallengeRunner != nil { services = append(services, storageChallengeRunner) } + if selfHealingRunner != nil { + services = append(services, selfHealingRunner) + } servicesErr <- RunServices(ctx, services...) 
}() diff --git a/supernode/config/config.go b/supernode/config/config.go index 4a3722e7..619bdfed 100644 --- a/supernode/config/config.go +++ b/supernode/config/config.go @@ -92,6 +92,34 @@ type StorageChallengeLEP6Config struct { RecipientReadTimeout time.Duration `yaml:"recipient_read_timeout,omitempty"` } +// SelfHealingConfig configures the LEP-6 chain-driven self-healing runtime +// (supernode/self_healing). Mode gating is also enforced at runtime via +// the chain's StorageTruthEnforcementMode param — UNSPECIFIED skips the +// dispatcher regardless of Enabled. +type SelfHealingConfig struct { + // Enabled toggles the dispatcher and the §19 transport server. Default + // false until activation rollout (PR-6). + Enabled bool `yaml:"enabled"` + // PollIntervalMs is the dispatcher tick cadence (default 30000). + PollIntervalMs int `yaml:"poll_interval_ms,omitempty"` + // MaxConcurrentReconstructs bounds RaptorQ reseeds (RAM-heavy). + // Default 2. + MaxConcurrentReconstructs int `yaml:"max_concurrent_reconstructs,omitempty"` + // MaxConcurrentVerifications bounds verifier fetch+hash workers. + // Default 4. + MaxConcurrentVerifications int `yaml:"max_concurrent_verifications,omitempty"` + // MaxConcurrentPublishes bounds publish-to-KAD workers. Default 2. + MaxConcurrentPublishes int `yaml:"max_concurrent_publishes,omitempty"` + // StagingDir is the local staging root (default ~/.supernode/heal-staging). + StagingDir string `yaml:"staging_dir,omitempty"` + // VerifierFetchTimeoutMs caps a single ServeReconstructedArtefacts + // stream from healer (default 60000). + VerifierFetchTimeoutMs int `yaml:"verifier_fetch_timeout_ms,omitempty"` + // VerifierFetchAttempts bounds retries when fetching from healer + // (default 3). + VerifierFetchAttempts int `yaml:"verifier_fetch_attempts,omitempty"` +} + type Config struct { SupernodeConfig `yaml:"supernode"` KeyringConfig `yaml:"keyring"` @@ -99,6 +127,7 @@ type Config struct { LumeraClientConfig `yaml:"lumera"` RaptorQConfig `yaml:"raptorq"` StorageChallengeConfig `yaml:"storage_challenge"` + SelfHealingConfig `yaml:"self_healing"` // Store base directory (not from YAML) BaseDir string `yaml:"-"` diff --git a/supernode/self_healing/cascade_fake_test.go b/supernode/self_healing/cascade_fake_test.go new file mode 100644 index 00000000..267ea095 --- /dev/null +++ b/supernode/self_healing/cascade_fake_test.go @@ -0,0 +1,114 @@ +package self_healing + +import ( + "context" + "errors" + "os" + "path/filepath" + "sync" + "sync/atomic" + + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" +) + +// fakeCascadeFactory.NewCascadeRegistrationTask returns a programmable +// fakeCascadeTask. The healer flow exercises only RecoveryReseed; the +// finalizer only PublishStagedArtefacts. Other methods panic — a regression +// that calls Register/Download in the heal path is loud. 
+type fakeCascadeFactory struct { + mu sync.Mutex + reseedFn func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) + publishFn func(ctx context.Context, stagingDir string) error + publishCalls atomic.Int64 + reseedCalls atomic.Int64 + lastPublishedDir atomic.Value // string +} + +func newFakeCascadeFactory() *fakeCascadeFactory { + f := &fakeCascadeFactory{} + f.lastPublishedDir.Store("") + return f +} + +func (f *fakeCascadeFactory) NewCascadeRegistrationTask() cascadeService.CascadeTask { + return &fakeCascadeTask{f: f} +} + +type fakeCascadeTask struct { + f *fakeCascadeFactory +} + +func (t *fakeCascadeTask) Register(ctx context.Context, req *cascadeService.RegisterRequest, send func(resp *cascadeService.RegisterResponse) error) error { + panic("self_healing test: cascade Register must not be called") +} +func (t *fakeCascadeTask) Download(ctx context.Context, req *cascadeService.DownloadRequest, send func(resp *cascadeService.DownloadResponse) error) error { + panic("self_healing test: cascade Download must not be called") +} +func (t *fakeCascadeTask) CleanupDownload(ctx context.Context, tmpDir string) error { return nil } + +func (t *fakeCascadeTask) RecoveryReseed(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + t.f.reseedCalls.Add(1) + t.f.mu.Lock() + fn := t.f.reseedFn + t.f.mu.Unlock() + if fn == nil { + return nil, errors.New("fakeCascade: no reseedFn configured") + } + return fn(ctx, req) +} + +func (t *fakeCascadeTask) PublishStagedArtefacts(ctx context.Context, stagingDir string) error { + t.f.publishCalls.Add(1) + t.f.lastPublishedDir.Store(stagingDir) + t.f.mu.Lock() + fn := t.f.publishFn + t.f.mu.Unlock() + if fn == nil { + return nil + } + return fn(ctx, stagingDir) +} + +// makeStagingDir creates an empty staging dir + minimal manifest+reconstructed +// file pair the §19 transport expects. Useful for finalizer tests that don't +// drive the full RecoveryReseed. +func makeStagingDir(t testing_T, root string, opID uint64, hashB64 string, body []byte) string { + dir := filepath.Join(root, itoa(opID)) + mustMkdir(t, dir) + mustMkdir(t, filepath.Join(dir, "symbols")) + mustWrite(t, filepath.Join(dir, "reconstructed.bin"), body) + manifest := []byte(`{"action_id":"ticket-` + itoa(opID) + `","layout":{"blocks":[]},"id_files":[],"symbol_keys":[],"symbols_dir":"` + filepath.Join(dir, "symbols") + `","reconstructed_rel":"reconstructed.bin","manifest_hash_b64":"` + hashB64 + `"}`) + mustWrite(t, filepath.Join(dir, "manifest.json"), manifest) + return dir +} + +// minimal testing.T-like surface so test helpers can be reused without +// importing testing.B. +type testing_T interface { + Helper() + Fatalf(format string, args ...interface{}) +} + +func mustMkdir(t testing_T, p string) { + if err := os.MkdirAll(p, 0o700); err != nil { + t.Helper() + t.Fatalf("mkdir %q: %v", p, err) + } +} +func mustWrite(t testing_T, p string, b []byte) { + if err := os.WriteFile(p, b, 0o600); err != nil { + t.Helper() + t.Fatalf("write %q: %v", p, err) + } +} +func itoa(u uint64) string { + if u == 0 { + return "0" + } + digits := []byte{} + for u > 0 { + digits = append([]byte{byte('0' + u%10)}, digits...) 
+ u /= 10 + } + return string(digits) +} diff --git a/supernode/self_healing/finalizer.go b/supernode/self_healing/finalizer.go new file mode 100644 index 00000000..d86d8171 --- /dev/null +++ b/supernode/self_healing/finalizer.go @@ -0,0 +1,116 @@ +package self_healing + +import ( + "context" + "fmt" + "os" + "strings" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" +) + +// finalizeClaim runs LEP-6 §19 Phase 3 for one persisted heal-op claim. +// +// Possible chain states for a claim row whose heal_op_id is queried: +// - SCHEDULED / IN_PROGRESS — chain has not yet recorded the healer's +// claim. Treat as transient; do nothing this tick. +// - HEALER_REPORTED — claim recorded but quorum not yet reached. No-op. +// - VERIFIED — quorum reached; publish staging dir to KAD via +// cascadeService.PublishStagedArtefacts, then delete the dir + the +// dedup row. +// - FAILED — verifiers rejected the claim or the chain finalized +// negatively. Delete staging dir + dedup row; do NOT publish (Scenario +// B). Chain has already applied §20 penalties. +// - EXPIRED — deadline passed before quorum (Scenario C, late-detected). +// Same handling as FAILED on the supernode side. +// - GetHealOp errors with not-found — treat as EXPIRED (chain may have +// pruned), delete staging. +func (s *Service) finalizeClaim(ctx context.Context, claim queries.HealClaimRecord) error { + resp, err := s.lumera.Audit().GetHealOp(ctx, claim.HealOpID) + if err != nil { + if isChainHealOpNotFound(err) { + logtrace.Warn(ctx, "self_healing(LEP-6): heal-op not found on chain; cleaning abandoned claim", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": claim.HealOpID, + "staging_dir": claim.StagingDir, + }) + return s.cleanupClaim(ctx, claim, audittypes.HealOpStatus_HEAL_OP_STATUS_EXPIRED) + } + // Defensive: don't blow away local state on transient query errors. + // A persistent error is logged by the caller; row will be retried + // next tick. + return fmt.Errorf("get heal op: %w", err) + } + if resp == nil { + return fmt.Errorf("nil heal op response") + } + op := resp.HealOp + switch op.Status { + case audittypes.HealOpStatus_HEAL_OP_STATUS_VERIFIED: + return s.publishStagingDir(ctx, claim) + case audittypes.HealOpStatus_HEAL_OP_STATUS_FAILED, + audittypes.HealOpStatus_HEAL_OP_STATUS_EXPIRED: + return s.cleanupClaim(ctx, claim, op.Status) + default: + // SCHEDULED / IN_PROGRESS / HEALER_REPORTED — quorum pending. + return nil + } +} + +func (s *Service) publishStagingDir(ctx context.Context, claim queries.HealClaimRecord) error { + if err := s.semPublish.Acquire(ctx, 1); err != nil { + return err + } + defer s.semPublish.Release(1) + + task := s.cascadeFactory.NewCascadeRegistrationTask() + if err := task.PublishStagedArtefacts(ctx, claim.StagingDir); err != nil { + // Leave row + staging in place; next tick retries publish. Chain + // has already recorded VERIFIED so no on-chain work pending. 
+ return fmt.Errorf("publish staged artefacts: %w", err) + } + if err := os.RemoveAll(claim.StagingDir); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): staging cleanup after publish failed", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": claim.HealOpID, + "staging_dir": claim.StagingDir, + }) + } + if err := s.store.DeleteHealClaim(ctx, claim.HealOpID); err != nil { + return fmt.Errorf("delete heal claim row: %w", err) + } + logtrace.Info(ctx, "self_healing(LEP-6): published staged artefacts to KAD", logtrace.Fields{ + "heal_op_id": claim.HealOpID, + "ticket_id": claim.TicketID, + "staging_dir": claim.StagingDir, + }) + return nil +} + +func (s *Service) cleanupClaim(ctx context.Context, claim queries.HealClaimRecord, status audittypes.HealOpStatus) error { + if err := os.RemoveAll(claim.StagingDir); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): staging cleanup failed", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": claim.HealOpID, + "status": status.String(), + }) + } + if err := s.store.DeleteHealClaim(ctx, claim.HealOpID); err != nil { + return fmt.Errorf("delete heal claim row: %w", err) + } + logtrace.Info(ctx, "self_healing(LEP-6): claim cleaned up (no publish)", logtrace.Fields{ + "heal_op_id": claim.HealOpID, + "status": status.String(), + }) + return nil +} + +func isChainHealOpNotFound(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "not found") || strings.Contains(msg, "not_found") +} diff --git a/supernode/self_healing/healer.go b/supernode/self_healing/healer.go new file mode 100644 index 00000000..7fb6e7f1 --- /dev/null +++ b/supernode/self_healing/healer.go @@ -0,0 +1,163 @@ +package self_healing + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "strings" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" +) + +// reconstructAndClaim runs LEP-6 §19 Phase 1 for one heal-op. +// +// Steps: +// +// 1. Acquire semReconstruct (RAM cap; RaptorQ is heavy). +// 2. cascadeService.RecoveryReseed(PersistArtifacts=false, StagingDir=…) — +// reconstructs the file, verifies hash against Action.DataHash, +// regenerates RQ artefacts, STAGES to disk. NO KAD publish. +// 3. Submit MsgClaimHealComplete{HealManifestHash} FIRST. Submit-then- +// persist ordering: if submit fails (mempool, signing, chain-rejected) +// no SQLite row is left; next tick retries cleanly. +// 4. On chain acceptance, persist (heal_op_id, ticket_id, manifest_hash, +// staging_dir) to heal_claims_submitted so finalizer can drive the op. +// +// Crash-recovery path: if submit succeeded but persist crashed, the next +// tick's dispatchHealerOps sees the chain has moved past SCHEDULED (or +// the resubmit fails with "does not accept healer completion claim"). We +// reconcile via reconcileExistingClaim — query GetHealOp; if status ∈ +// {HEALER_REPORTED, VERIFIED, FAILED, EXPIRED} and ResultHash matches +// the manifest we just rebuilt, persist the dedup row and let finalizer +// take over. 
+func (s *Service) reconstructAndClaim(ctx context.Context, op audittypes.HealOp) error { + if err := s.semReconstruct.Acquire(ctx, 1); err != nil { + return err + } + defer s.semReconstruct.Release(1) + + stagingDir := filepath.Join(s.cfg.StagingRoot, fmt.Sprintf("%d", op.HealOpId)) + if err := os.MkdirAll(stagingDir, 0o700); err != nil { + return fmt.Errorf("mkdir staging: %w", err) + } + + task := s.cascadeFactory.NewCascadeRegistrationTask() + res, err := task.RecoveryReseed(ctx, &cascadeService.RecoveryReseedRequest{ + ActionID: op.TicketId, + PersistArtifacts: false, + StagingDir: stagingDir, + }) + if err != nil { + // Reconstruction failed (Scenario C). Per LEP-6, healer simply does + // not submit ClaimHealComplete; chain will EXPIRE the op at deadline. + // Clean staging dir; nothing to publish. + _ = os.RemoveAll(stagingDir) + return fmt.Errorf("recovery reseed: %w", err) + } + if !res.DataHashVerified { + _ = os.RemoveAll(stagingDir) + return fmt.Errorf("data hash not verified") + } + manifestHash := strings.TrimSpace(res.ReconstructedHashB64) + if manifestHash == "" { + _ = os.RemoveAll(stagingDir) + return fmt.Errorf("empty manifest hash") + } + + // Submit FIRST — let chain be the source of truth. Only persist on + // chain acceptance. + if _, err := s.lumera.AuditMsg().ClaimHealComplete(ctx, op.HealOpId, op.TicketId, manifestHash, ""); err != nil { + // If the chain rejected because the op already moved past SCHEDULED + // (a prior submit that we lost the response for), reconcile. + if isChainHealOpInvalidState(err) { + if recErr := s.reconcileExistingClaim(ctx, op, manifestHash, stagingDir); recErr != nil { + _ = os.RemoveAll(stagingDir) + return fmt.Errorf("submit failed (%v) and reconcile failed: %w", err, recErr) + } + return nil + } + _ = os.RemoveAll(stagingDir) + return fmt.Errorf("submit claim: %w", err) + } + + if err := s.store.RecordHealClaim(ctx, op.HealOpId, op.TicketId, manifestHash, stagingDir); err != nil { + if errors.Is(err, queries.ErrLEP6ClaimAlreadyRecorded) { + // Concurrent tick beat us; staging on disk matches. + return nil + } + // Persist failed but chain accepted — we'll see the row missing + // next tick; reconcileExistingClaim will fix it on retry. + return fmt.Errorf("record heal claim (chain accepted): %w", err) + } + logtrace.Info(ctx, "self_healing(LEP-6): claim submitted", logtrace.Fields{ + "heal_op_id": op.HealOpId, + "ticket_id": op.TicketId, + "manifest_h": manifestHash, + "staging_dir": stagingDir, + }) + return nil +} + +// reconcileExistingClaim handles the post-crash case where the chain has +// advanced past SCHEDULED (i.e. our prior submit was accepted but we lost +// the response or crashed before persisting). We re-fetch the op, confirm +// the recorded ResultHash matches the manifest we just rebuilt, and then +// persist the dedup row so the finalizer takes over. +// +// If the chain ResultHash differs, the staged data is irrelevant (a +// previous run produced different bytes — file changed underneath, or +// non-determinism slipped in). Drop staging, do nothing — let the heal-op +// run its course on chain. 
+func (s *Service) reconcileExistingClaim(ctx context.Context, op audittypes.HealOp, manifestHash, stagingDir string) error { + resp, err := s.lumera.Audit().GetHealOp(ctx, op.HealOpId) + if err != nil { + return fmt.Errorf("get heal op: %w", err) + } + if resp == nil { + return fmt.Errorf("nil heal op response") + } + chainOp := resp.HealOp + if chainOp.ResultHash != manifestHash { + // Different manifest on chain → our staged bytes don't match what + // chain expects. Discard staging and let the existing chain op + // finish without our involvement. + logtrace.Warn(ctx, "self_healing(LEP-6): chain ResultHash differs from current manifest; abandoning staging", logtrace.Fields{ + "heal_op_id": op.HealOpId, + "chain_hash": chainOp.ResultHash, + "current_hash": manifestHash, + "staging_dir": stagingDir, + "chain_status": chainOp.Status.String(), + }) + _ = os.RemoveAll(stagingDir) + return nil + } + // Manifest matches — persist dedup row (no-op if already present) so + // finalizer can publish on VERIFIED. + if err := s.store.RecordHealClaim(ctx, op.HealOpId, op.TicketId, manifestHash, stagingDir); err != nil && !errors.Is(err, queries.ErrLEP6ClaimAlreadyRecorded) { + return fmt.Errorf("record reconciled claim: %w", err) + } + logtrace.Info(ctx, "self_healing(LEP-6): reconciled existing chain claim", logtrace.Fields{ + "heal_op_id": op.HealOpId, + "chain_status": chainOp.Status.String(), + "manifest_h": manifestHash, + }) + return nil +} + +// isChainHealOpInvalidState detects the chain's wrapped +// ErrHealOpInvalidState surface for "status does not accept healer +// completion claim" — meaning the op has already moved past SCHEDULED. +// String-matched because audittypes errors are wrapped and we want to be +// resilient to both go-error chain lookups and any client-side wrapping. +func isChainHealOpInvalidState(err error) bool { + if err == nil { + return false + } + msg := err.Error() + return strings.Contains(msg, "does not accept healer completion claim") +} diff --git a/supernode/self_healing/lumera_test.go b/supernode/self_healing/lumera_test.go new file mode 100644 index 00000000..47c8284c --- /dev/null +++ b/supernode/self_healing/lumera_test.go @@ -0,0 +1,46 @@ +package self_healing + +import ( + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action_msg" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/audit" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/audit_msg" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/auth" + bankmod "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/bank" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/node" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/supernode_msg" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/tx" + "github.com/LumeraProtocol/supernode/v2/pkg/testutil" +) + +// fakeLumera satisfies lumera.Client by composing per-test programmable +// audit modules with the existing testutil.MockLumeraClient stubs for the +// other modules. The dispatcher only touches Audit() and AuditMsg(); the +// other methods are present solely to satisfy the interface contract. 
+type fakeLumera struct { + audit audit.Module + auditMsg audit_msg.Module + other lumera.Client // testutil mock; supplies stub for non-audit modules +} + +func newFakeLumera(a audit.Module, am audit_msg.Module) lumera.Client { + c, err := testutil.NewMockLumeraClient(nil, nil) + if err != nil { + panic(err) + } + return &fakeLumera{audit: a, auditMsg: am, other: c} +} + +func (f *fakeLumera) Auth() auth.Module { return f.other.Auth() } +func (f *fakeLumera) Action() action.Module { return f.other.Action() } +func (f *fakeLumera) ActionMsg() action_msg.Module { return f.other.ActionMsg() } +func (f *fakeLumera) Audit() audit.Module { return f.audit } +func (f *fakeLumera) AuditMsg() audit_msg.Module { return f.auditMsg } +func (f *fakeLumera) SuperNode() supernode.Module { return f.other.SuperNode() } +func (f *fakeLumera) SuperNodeMsg() supernode_msg.Module { return f.other.SuperNodeMsg() } +func (f *fakeLumera) Bank() bankmod.Module { return f.other.Bank() } +func (f *fakeLumera) Tx() tx.Module { return f.other.Tx() } +func (f *fakeLumera) Node() node.Module { return f.other.Node() } +func (f *fakeLumera) Close() error { return nil } diff --git a/supernode/self_healing/mocks_test.go b/supernode/self_healing/mocks_test.go new file mode 100644 index 00000000..25814b70 --- /dev/null +++ b/supernode/self_healing/mocks_test.go @@ -0,0 +1,169 @@ +package self_healing + +import ( + "context" + "errors" + "sync" + "sync/atomic" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + query "github.com/cosmos/cosmos-sdk/types/query" + sdktx "github.com/cosmos/cosmos-sdk/types/tx" +) + +// programmableAudit is a per-test programmable audit module. The dispatcher +// reads only GetParams, GetHealOp, and GetHealOpsByStatus, so other methods +// are unused and may be left zero. 
+type programmableAudit struct { + mu sync.Mutex + params audittypes.Params + opsByStatus map[audittypes.HealOpStatus][]audittypes.HealOp + opsByID map[uint64]audittypes.HealOp + getOpErr error +} + +func newProgrammableAudit(mode audittypes.StorageTruthEnforcementMode) *programmableAudit { + return &programmableAudit{ + params: audittypes.Params{ + StorageTruthEnforcementMode: mode, + }, + opsByStatus: map[audittypes.HealOpStatus][]audittypes.HealOp{}, + opsByID: map[uint64]audittypes.HealOp{}, + } +} + +func (p *programmableAudit) put(op audittypes.HealOp) { + p.mu.Lock() + defer p.mu.Unlock() + p.opsByID[op.HealOpId] = op + p.opsByStatus[op.Status] = append(p.opsByStatus[op.Status], op) +} + +func (p *programmableAudit) setStatus(opID uint64, st audittypes.HealOpStatus) { + p.mu.Lock() + defer p.mu.Unlock() + op := p.opsByID[opID] + op.Status = st + p.opsByID[opID] = op +} + +func (p *programmableAudit) GetParams(ctx context.Context) (*audittypes.QueryParamsResponse, error) { + p.mu.Lock() + defer p.mu.Unlock() + return &audittypes.QueryParamsResponse{Params: p.params}, nil +} +func (p *programmableAudit) GetHealOp(ctx context.Context, healOpID uint64) (*audittypes.QueryHealOpResponse, error) { + p.mu.Lock() + defer p.mu.Unlock() + if p.getOpErr != nil { + return nil, p.getOpErr + } + op, ok := p.opsByID[healOpID] + if !ok { + return nil, errors.New("not found") + } + return &audittypes.QueryHealOpResponse{HealOp: op}, nil +} +func (p *programmableAudit) GetHealOpsByStatus(ctx context.Context, status audittypes.HealOpStatus, pagination *query.PageRequest) (*audittypes.QueryHealOpsByStatusResponse, error) { + p.mu.Lock() + defer p.mu.Unlock() + out := make([]audittypes.HealOp, 0, len(p.opsByStatus[status])) + for _, op := range p.opsByStatus[status] { + out = append(out, op) + } + return &audittypes.QueryHealOpsByStatusResponse{HealOps: out}, nil +} +func (p *programmableAudit) GetHealOpsByTicket(ctx context.Context, ticketID string, pagination *query.PageRequest) (*audittypes.QueryHealOpsByTicketResponse, error) { + return &audittypes.QueryHealOpsByTicketResponse{}, nil +} +func (p *programmableAudit) GetEpochAnchor(ctx context.Context, epochID uint64) (*audittypes.QueryEpochAnchorResponse, error) { + return &audittypes.QueryEpochAnchorResponse{}, nil +} +func (p *programmableAudit) GetCurrentEpochAnchor(ctx context.Context) (*audittypes.QueryCurrentEpochAnchorResponse, error) { + return &audittypes.QueryCurrentEpochAnchorResponse{}, nil +} +func (p *programmableAudit) GetCurrentEpoch(ctx context.Context) (*audittypes.QueryCurrentEpochResponse, error) { + return &audittypes.QueryCurrentEpochResponse{}, nil +} +func (p *programmableAudit) GetAssignedTargets(ctx context.Context, supernodeAccount string, epochID uint64) (*audittypes.QueryAssignedTargetsResponse, error) { + return &audittypes.QueryAssignedTargetsResponse{}, nil +} +func (p *programmableAudit) GetEpochReport(ctx context.Context, epochID uint64, supernodeAccount string) (*audittypes.QueryEpochReportResponse, error) { + return &audittypes.QueryEpochReportResponse{}, nil +} +func (p *programmableAudit) GetNodeSuspicionState(ctx context.Context, supernodeAccount string) (*audittypes.QueryNodeSuspicionStateResponse, error) { + return &audittypes.QueryNodeSuspicionStateResponse{}, nil +} +func (p *programmableAudit) GetReporterReliabilityState(ctx context.Context, reporterAccount string) (*audittypes.QueryReporterReliabilityStateResponse, error) { + return &audittypes.QueryReporterReliabilityStateResponse{}, nil +} +func (p 
*programmableAudit) GetTicketDeteriorationState(ctx context.Context, ticketID string) (*audittypes.QueryTicketDeteriorationStateResponse, error) { + return &audittypes.QueryTicketDeteriorationStateResponse{}, nil +} + +// programmableAuditMsg captures every claim/verification call so tests can +// assert on the exact arguments the dispatcher used (e.g. that +// VerificationHash matches op.ResultHash and never Action.DataHash). +type programmableAuditMsg struct { + mu sync.Mutex + claimCalls []claimCall + verificationCalls []verificationCall + claimErr error + verificationErr error + claimsCount atomic.Int64 + verificationsCount atomic.Int64 +} + +type claimCall struct { + HealOpID uint64 + TicketID string + HealManifestHash string + Details string +} + +type verificationCall struct { + HealOpID uint64 + Verified bool + VerificationHash string + Details string +} + +func newProgrammableAuditMsg() *programmableAuditMsg { return &programmableAuditMsg{} } + +func (p *programmableAuditMsg) ClaimHealComplete(ctx context.Context, healOpID uint64, ticketID, healManifestHash, details string) (*sdktx.BroadcastTxResponse, error) { + p.mu.Lock() + defer p.mu.Unlock() + if p.claimErr != nil { + return nil, p.claimErr + } + p.claimCalls = append(p.claimCalls, claimCall{healOpID, ticketID, healManifestHash, details}) + p.claimsCount.Add(1) + return &sdktx.BroadcastTxResponse{}, nil +} +func (p *programmableAuditMsg) SubmitHealVerification(ctx context.Context, healOpID uint64, verified bool, verificationHash, details string) (*sdktx.BroadcastTxResponse, error) { + p.mu.Lock() + defer p.mu.Unlock() + if p.verificationErr != nil { + return nil, p.verificationErr + } + p.verificationCalls = append(p.verificationCalls, verificationCall{healOpID, verified, verificationHash, details}) + p.verificationsCount.Add(1) + return &sdktx.BroadcastTxResponse{}, nil +} +func (p *programmableAuditMsg) SubmitEvidence(ctx context.Context, subjectAddress string, evidenceType audittypes.EvidenceType, actionID string, metadataJSON string) (*sdktx.BroadcastTxResponse, error) { + return &sdktx.BroadcastTxResponse{}, nil +} +func (p *programmableAuditMsg) SubmitEpochReport(ctx context.Context, epochID uint64, hostReport audittypes.HostReport, storageChallengeObservations []*audittypes.StorageChallengeObservation, storageProofResults []*audittypes.StorageProofResult) (*sdktx.BroadcastTxResponse, error) { + return &sdktx.BroadcastTxResponse{}, nil +} +func (p *programmableAuditMsg) SubmitStorageRecheckEvidence(ctx context.Context, epochID uint64, challengedSupernodeAccount string, ticketID string, challengedResultTranscriptHash string, recheckTranscriptHash string, recheckResultClass audittypes.StorageProofResultClass, details string) (*sdktx.BroadcastTxResponse, error) { + return &sdktx.BroadcastTxResponse{}, nil +} + +func (p *programmableAuditMsg) snapshot() ([]claimCall, []verificationCall) { + p.mu.Lock() + defer p.mu.Unlock() + c := append([]claimCall(nil), p.claimCalls...) + v := append([]verificationCall(nil), p.verificationCalls...) 
+ return c, v +} diff --git a/supernode/self_healing/peer_client.go b/supernode/self_healing/peer_client.go new file mode 100644 index 00000000..936c2aa9 --- /dev/null +++ b/supernode/self_healing/peer_client.go @@ -0,0 +1,121 @@ +package self_healing + +import ( + "context" + "fmt" + "io" + "net" + "strconv" + "strings" + "sync" + + "github.com/LumeraProtocol/lumera/x/lumeraid/securekeyx" + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/net/credentials" + grpcclient "github.com/LumeraProtocol/supernode/v2/pkg/net/grpc/client" + "github.com/LumeraProtocol/supernode/v2/pkg/netutil" + "github.com/cosmos/cosmos-sdk/crypto/keyring" +) + +// secureVerifierFetcher implements VerifierFetcher by dialing the assigned +// healer over the same secure-rpc / lumeraid stack the legacy +// storage_challenge loop uses. +type secureVerifierFetcher struct { + lumera lumera.Client + kr keyring.Keyring + self string + defaultPort uint16 + + mu sync.Mutex + grpcClient *grpcclient.Client + grpcOpts *grpcclient.ClientOptions +} + +// NewSecureVerifierFetcher constructs the production-grade VerifierFetcher +// for the LEP-6 §19 healer-served path. self is the local supernode +// identity; defaultPort is the supernode gRPC port to fall back to when the +// chain-registered address omits a port. +func NewSecureVerifierFetcher(client lumera.Client, kr keyring.Keyring, self string, defaultPort uint16) VerifierFetcher { + return &secureVerifierFetcher{ + lumera: client, + kr: kr, + self: strings.TrimSpace(self), + defaultPort: defaultPort, + } +} + +func (f *secureVerifierFetcher) ensureClient() error { + f.mu.Lock() + defer f.mu.Unlock() + + if f.grpcClient != nil { + return nil + } + validator := lumera.NewSecureKeyExchangeValidator(f.lumera) + creds, err := credentials.NewClientCreds(&credentials.ClientOptions{ + CommonOptions: credentials.CommonOptions{ + Keyring: f.kr, + LocalIdentity: f.self, + PeerType: securekeyx.Supernode, + Validator: validator, + }, + }) + if err != nil { + return fmt.Errorf("create secure gRPC client creds: %w", err) + } + f.grpcClient = grpcclient.NewClient(creds) + f.grpcOpts = grpcclient.DefaultClientOptions() + f.grpcOpts.EnableRetries = false // verifier orchestrates retries itself + return nil +} + +// FetchReconstructed dials healerAccount and streams the reconstructed +// bytes for healOpID, returning the concatenated payload. 
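+// The dial target is formed as "<healerAccount>@<host:port>", with the address
+// resolved from the chain's supernode registry and the configured default port
+// used when the registered address omits one — so, purely as an illustration,
+// something like "lumera1abc...@203.0.113.7:4444".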
+func (f *secureVerifierFetcher) FetchReconstructed(ctx context.Context, healOpID uint64, healerAccount, verifierAccount string) ([]byte, error) { + if err := f.ensureClient(); err != nil { + return nil, err + } + info, err := f.lumera.SuperNode().GetSupernodeWithLatestAddress(ctx, healerAccount) + if err != nil || info == nil { + return nil, fmt.Errorf("resolve healer %q: %w", healerAccount, err) + } + raw := strings.TrimSpace(info.LatestAddress) + if raw == "" { + return nil, fmt.Errorf("no address for healer %q", healerAccount) + } + host, port, ok := netutil.ParseHostAndPort(raw, int(f.defaultPort)) + if !ok || strings.TrimSpace(host) == "" { + return nil, fmt.Errorf("invalid healer address %q", raw) + } + addr := net.JoinHostPort(strings.TrimSpace(host), strconv.Itoa(port)) + conn, err := f.grpcClient.Connect(ctx, fmt.Sprintf("%s@%s", strings.TrimSpace(healerAccount), addr), f.grpcOpts) + if err != nil { + return nil, fmt.Errorf("dial healer %q: %w", healerAccount, err) + } + defer conn.Close() + client := supernode.NewSelfHealingServiceClient(conn) + stream, err := client.ServeReconstructedArtefacts(ctx, &supernode.ServeReconstructedArtefactsRequest{ + HealOpId: healOpID, + VerifierAccount: verifierAccount, + }) + if err != nil { + return nil, fmt.Errorf("open serve stream: %w", err) + } + var buf []byte + for { + msg, err := stream.Recv() + if err == io.EOF { + return buf, nil + } + if err != nil { + return nil, fmt.Errorf("recv: %w", err) + } + buf = append(buf, msg.Chunk...) + if msg.IsLast { + // Drain any trailer. + _, _ = stream.Recv() + return buf, nil + } + } +} diff --git a/supernode/self_healing/service.go b/supernode/self_healing/service.go new file mode 100644 index 00000000..73106770 --- /dev/null +++ b/supernode/self_healing/service.go @@ -0,0 +1,442 @@ +// Package self_healing implements the LEP-6 chain-driven heal-op runtime. +// +// # Architecture +// +// LEP-6 §18-§22 (Workstream C) replaces the gonode-era peer-watchlist self- +// healing with a chain-mediated three-phase flow. The chain (lumera/x/audit) +// owns role assignment via HealOp.HealerSupernodeAccount + .VerifierSupernode +// Accounts, and quorum via MsgClaimHealComplete + MsgSubmitHealVerification +// (n/2+1 positive verifications). The supernode side is purely an executor: +// +// Phase 1 — RECONSTRUCT (no publish) +// Healer fetches symbols from KAD, RaptorQ-decodes, verifies hash against +// Action.DataHash, re-encodes, STAGES to local disk, then submits +// MsgClaimHealComplete{HealManifestHash}. The reconstructed file MUST NOT +// enter KAD before chain VERIFIED — §19 healer-served path. +// +// Phase 2 — VERIFY +// Each verifier fetches the reconstructed bytes from the assigned healer +// via supernode.SelfHealingService/ServeReconstructedArtefacts, hashes +// them with cascadekit.ComputeBlake3DataHashB64 (= Action.DataHash recipe), +// compares against op.ResultHash (NOT Action.DataHash — chain-side +// enforcement at lumera/x/audit/v1/keeper/msg_storage_truth.go:291), and +// submits MsgSubmitHealVerification{verified, hash}. The "compare against +// op.ResultHash" choice is the v3-plan landmine pinned by +// TestVerifier_ComparesAgainstOpResultHash. +// +// Phase 3 — PUBLISH (only on VERIFIED) +// Healer's finalizer polls staging entries, calls +// cascadeService.PublishStagedArtefacts on op.Status == VERIFIED, then +// deletes the staging dir. On FAILED / EXPIRED, the staging dir is +// deleted with no publish — chain may reschedule with a different healer. 
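+//
+// A non-normative end-to-end sketch of the happy path from the healer's and a
+// verifier's point of view. Names such as task, client, fetcher, dir, and self
+// are stand-ins; error handling, role checks, and dedup are elided:
+//
+//	// healer (Phase 1)
+//	res, _ := task.RecoveryReseed(ctx, &cascade.RecoveryReseedRequest{
+//		ActionID: op.TicketId, PersistArtifacts: false, StagingDir: dir,
+//	})
+//	client.AuditMsg().ClaimHealComplete(ctx, op.HealOpId, op.TicketId, res.ReconstructedHashB64, "")
+//
+//	// verifier (Phase 2)
+//	body, _ := fetcher.FetchReconstructed(ctx, op.HealOpId, op.HealerSupernodeAccount, self)
+//	h, _ := cascadekit.ComputeBlake3DataHashB64(body)
+//	client.AuditMsg().SubmitHealVerification(ctx, op.HealOpId, h == op.ResultHash, h, "")
+//
+//	// healer's finalizer (Phase 3), once the chain reports VERIFIED
+//	task.PublishStagedArtefacts(ctx, dir)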
+// +// # Concurrency +// +// Three-layer dedup so a process restart can never double-submit: +// 1. sync.Map keyed on (heal_op_id, role) for in-flight locking. +// 2. Buffered semaphore (default 2) capping concurrent RaptorQ reseeds — +// reseed is RAM-heavy. Verification semaphore default 4, publish 2. +// 3. SQLite tables heal_claims_submitted + heal_verifications_submitted +// (pkg/storage/queries/self_healing_lep6.go) for restart dedup. +// +// # Mode gate +// +// When params.StorageTruthEnforcementMode == UNSPECIFIED the chain creates +// no heal-ops, so the dispatcher early-returns from Service.tick. The check +// also serves as a final supernode-side guard. +package self_healing + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "time" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" + "golang.org/x/sync/semaphore" +) + +// Defaults captured here for clarity at the boundary; Config exposes overrides. +const ( + defaultPollInterval = 30 * time.Second + defaultMaxConcurrentReconstructs = 2 + defaultMaxConcurrentVerifications = 4 + defaultMaxConcurrentPublishes = 2 + defaultStagingRoot = "heal-staging" + defaultVerifierFetchTimeout = 60 * time.Second + defaultVerifierFetchAttempts = 3 + defaultVerifierBackoffBase = 2 * time.Second +) + +// Config captures supernode-binary-owned tunables for the LEP-6 heal runtime. +type Config struct { + // Enabled toggles the entire dispatcher. Independent of the chain mode + // gate; if Enabled=false the service never runs even when chain mode is + // FULL. Used for staged rollouts. + Enabled bool + PollInterval time.Duration + MaxConcurrentReconstructs int + MaxConcurrentVerifications int + MaxConcurrentPublishes int + + // StagingRoot is the local directory under which per-heal-op staging + // dirs are created. Default: ~/.supernode/heal-staging/. + StagingRoot string + + // VerifierFetchTimeout / VerifierFetchAttempts / VerifierBackoffBase + // shape the retry policy verifiers use when fetching from the assigned + // healer. After exhausting attempts, verifier submits verified=false + // with reason "fetch_failed". + VerifierFetchTimeout time.Duration + VerifierFetchAttempts int + VerifierBackoffBase time.Duration + + // KeyName is the supernode's keyring key used to sign claim/verification + // txs. Must match the on-chain HealerSupernodeAccount / + // VerifierSupernodeAccount. 
+ KeyName string +} + +func (c Config) withDefaults() Config { + if c.PollInterval <= 0 { + c.PollInterval = defaultPollInterval + } + if c.MaxConcurrentReconstructs <= 0 { + c.MaxConcurrentReconstructs = defaultMaxConcurrentReconstructs + } + if c.MaxConcurrentVerifications <= 0 { + c.MaxConcurrentVerifications = defaultMaxConcurrentVerifications + } + if c.MaxConcurrentPublishes <= 0 { + c.MaxConcurrentPublishes = defaultMaxConcurrentPublishes + } + if strings.TrimSpace(c.StagingRoot) == "" { + home, err := os.UserHomeDir() + if err == nil { + c.StagingRoot = filepath.Join(home, ".supernode", defaultStagingRoot) + } else { + c.StagingRoot = filepath.Join(os.TempDir(), defaultStagingRoot) + } + } + if c.VerifierFetchTimeout <= 0 { + c.VerifierFetchTimeout = defaultVerifierFetchTimeout + } + if c.VerifierFetchAttempts <= 0 { + c.VerifierFetchAttempts = defaultVerifierFetchAttempts + } + if c.VerifierBackoffBase <= 0 { + c.VerifierBackoffBase = defaultVerifierBackoffBase + } + return c +} + +// VerifierFetcher abstracts the verifier→healer transport. Real +// implementation is grpc-based (peer_client.go); tests inject in-memory +// fakes that don't need a listening server. +type VerifierFetcher interface { + // FetchReconstructed retrieves the reconstructed file bytes from the + // healer assigned to healOpID. Implementations are responsible for + // dialing the healer's grpc endpoint (resolved from the supernode + // registry) and authenticating as verifierAccount. + FetchReconstructed(ctx context.Context, healOpID uint64, healerAccount, verifierAccount string) ([]byte, error) +} + +// Service is the single LEP-6 heal-op dispatcher. One instance per +// supernode binary. +type Service struct { + cfg Config + identity string + + lumera lumera.Client + store queries.LocalStoreInterface + cascadeFactory cascadeService.CascadeServiceFactory + fetcher VerifierFetcher + + // In-flight dedup. Key: opRoleKey(healOpID, role). Value: struct{}. + inFlight sync.Map + + // Per-role concurrency caps. + semReconstruct *semaphore.Weighted + semVerify *semaphore.Weighted + semPublish *semaphore.Weighted +} + +const ( + roleHealer = "healer" + roleVerifier = "verifier" + rolePublisher = "publisher" +) + +func opRoleKey(healOpID uint64, role string) string { + return fmt.Sprintf("%d/%s", healOpID, role) +} + +// New constructs a Service. fetcher may be nil if Config.Enabled is false +// (constructor still validates required deps so misconfig is caught early). 
+func New( + identity string, + cfg Config, + lumeraClient lumera.Client, + store queries.LocalStoreInterface, + cascadeFactory cascadeService.CascadeServiceFactory, + fetcher VerifierFetcher, +) (*Service, error) { + identity = strings.TrimSpace(identity) + if identity == "" { + return nil, fmt.Errorf("identity is empty") + } + if lumeraClient == nil || lumeraClient.Audit() == nil || lumeraClient.AuditMsg() == nil { + return nil, fmt.Errorf("lumera client missing required audit modules") + } + if store == nil { + return nil, fmt.Errorf("local store is nil") + } + if cascadeFactory == nil { + return nil, fmt.Errorf("cascade service factory is nil") + } + cfg = cfg.withDefaults() + if err := os.MkdirAll(cfg.StagingRoot, 0o700); err != nil { + return nil, fmt.Errorf("create staging root %q: %w", cfg.StagingRoot, err) + } + return &Service{ + cfg: cfg, + identity: identity, + lumera: lumeraClient, + store: store, + cascadeFactory: cascadeFactory, + fetcher: fetcher, + semReconstruct: semaphore.NewWeighted(int64(cfg.MaxConcurrentReconstructs)), + semVerify: semaphore.NewWeighted(int64(cfg.MaxConcurrentVerifications)), + semPublish: semaphore.NewWeighted(int64(cfg.MaxConcurrentPublishes)), + }, nil +} + +// Run blocks until ctx is cancelled, ticking every cfg.PollInterval. +// Tick steps (single mechanism per LEP-6 plan §C.4 finalizer Opt-2b decision): +// +// 1. Mode gate: query audit params; if UNSPECIFIED, skip everything. +// 2. Healer dispatch: GetHealOpsByStatus(SCHEDULED), filter by +// HealerSupernodeAccount==identity, run reconstructHealOp() bounded by +// semReconstruct. +// 3. Verifier dispatch: GetHealOpsByStatus(HEALER_REPORTED), filter by +// identity ∈ VerifierSupernodeAccounts, run verifyHealOp() bounded by +// semVerify. +// 4. Finalizer (Opt 2b per-op poll): for each row in heal_claims_submitted, +// GetHealOp(opID) and act on Status (VERIFIED → publish, FAILED/EXPIRED +// → cleanup). +// +// Final-state ops are excluded by status filter, so a misordered tick is +// idempotent (sync.Map dedup + sqlite dedup catch any race). +func (s *Service) Run(ctx context.Context) error { + if !s.cfg.Enabled { + logtrace.Info(ctx, "self_healing(LEP-6): disabled in config; not starting", logtrace.Fields{}) + return nil + } + logtrace.Info(ctx, "self_healing(LEP-6): start", logtrace.Fields{ + "identity": s.identity, + "poll_interval": s.cfg.PollInterval.String(), + "max_concurrent_reconstructs": s.cfg.MaxConcurrentReconstructs, + "max_concurrent_verifications": s.cfg.MaxConcurrentVerifications, + "max_concurrent_publishes": s.cfg.MaxConcurrentPublishes, + "staging_root": s.cfg.StagingRoot, + }) + t := time.NewTicker(s.cfg.PollInterval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-t.C: + if err := s.tick(ctx); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): tick error", logtrace.Fields{logtrace.FieldError: err.Error()}) + } + } + } +} + +// tick performs one dispatch cycle. Exposed for tests. 
+func (s *Service) tick(ctx context.Context) error { + skip, err := s.modeGate(ctx) + if err != nil { + return fmt.Errorf("mode gate: %w", err) + } + if skip { + return nil + } + if err := s.dispatchHealerOps(ctx); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): dispatch healer ops", logtrace.Fields{logtrace.FieldError: err.Error()}) + } + if err := s.dispatchVerifierOps(ctx); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): dispatch verifier ops", logtrace.Fields{logtrace.FieldError: err.Error()}) + } + if err := s.dispatchFinalizer(ctx); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): dispatch finalizer", logtrace.Fields{logtrace.FieldError: err.Error()}) + } + return nil +} + +// modeGate returns (skip=true) when the chain enforcement mode is +// UNSPECIFIED. Heal-ops only exist in SHADOW/SOFT/FULL. +func (s *Service) modeGate(ctx context.Context) (bool, error) { + resp, err := s.lumera.Audit().GetParams(ctx) + if err != nil { + return false, err + } + mode := resp.Params.StorageTruthEnforcementMode + if mode == audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_UNSPECIFIED { + return true, nil + } + return false, nil +} + +// dispatchHealerOps: pulls SCHEDULED ops where I'm the assigned healer and +// kicks off reconstruction via the healer goroutine pool. +func (s *Service) dispatchHealerOps(ctx context.Context) error { + ops, err := s.listOps(ctx, audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED) + if err != nil { + return err + } + for i := range ops { + op := ops[i] + if op.HealerSupernodeAccount != s.identity { + continue + } + if isFinalStatus(op.Status) { + continue + } + key := opRoleKey(op.HealOpId, roleHealer) + if _, loaded := s.inFlight.LoadOrStore(key, struct{}{}); loaded { + continue + } + // Restart-time dedup: if a row already exists in heal_claims_submitted + // the chain has accepted the claim — switch to publisher / leave to + // finalizer. + has, err := s.store.HasHealClaim(ctx, op.HealOpId) + if err != nil { + s.inFlight.Delete(key) + logtrace.Warn(ctx, "self_healing(LEP-6): HasHealClaim", logtrace.Fields{logtrace.FieldError: err.Error(), "heal_op_id": op.HealOpId}) + continue + } + if has { + s.inFlight.Delete(key) + continue + } + go func(op audittypes.HealOp, key string) { + defer s.inFlight.Delete(key) + if err := s.reconstructAndClaim(ctx, op); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): reconstructAndClaim", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": op.HealOpId, + "ticket_id": op.TicketId, + }) + } + }(op, key) + } + return nil +} + +// dispatchVerifierOps: pulls HEALER_REPORTED ops where I'm an assigned +// verifier and kicks off verification. 
+func (s *Service) dispatchVerifierOps(ctx context.Context) error { + ops, err := s.listOps(ctx, audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED) + if err != nil { + return err + } + for i := range ops { + op := ops[i] + if !accountInList(s.identity, op.VerifierSupernodeAccounts) { + continue + } + if isFinalStatus(op.Status) { + continue + } + key := opRoleKey(op.HealOpId, roleVerifier) + if _, loaded := s.inFlight.LoadOrStore(key, struct{}{}); loaded { + continue + } + has, err := s.store.HasHealVerification(ctx, op.HealOpId, s.identity) + if err != nil { + s.inFlight.Delete(key) + logtrace.Warn(ctx, "self_healing(LEP-6): HasHealVerification", logtrace.Fields{logtrace.FieldError: err.Error(), "heal_op_id": op.HealOpId}) + continue + } + if has { + s.inFlight.Delete(key) + continue + } + go func(op audittypes.HealOp, key string) { + defer s.inFlight.Delete(key) + if err := s.verifyAndSubmit(ctx, op); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): verifyAndSubmit", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": op.HealOpId, + }) + } + }(op, key) + } + return nil +} + +// dispatchFinalizer: for each persisted heal_claims_submitted row, look up +// the on-chain status and either publish (VERIFIED) or cleanup +// (FAILED/EXPIRED). SCHEDULED / HEALER_REPORTED / IN_PROGRESS are no-ops. +func (s *Service) dispatchFinalizer(ctx context.Context) error { + claims, err := s.store.ListHealClaims(ctx) + if err != nil { + return err + } + for _, claim := range claims { + key := opRoleKey(claim.HealOpID, rolePublisher) + if _, loaded := s.inFlight.LoadOrStore(key, struct{}{}); loaded { + continue + } + go func(claim queries.HealClaimRecord, key string) { + defer s.inFlight.Delete(key) + if err := s.finalizeClaim(ctx, claim); err != nil { + logtrace.Warn(ctx, "self_healing(LEP-6): finalizeClaim", logtrace.Fields{ + logtrace.FieldError: err.Error(), + "heal_op_id": claim.HealOpID, + }) + } + }(claim, key) + } + return nil +} + +// listOps wraps the paginated audit query. Returns a flattened slice. +func (s *Service) listOps(ctx context.Context, status audittypes.HealOpStatus) ([]audittypes.HealOp, error) { + resp, err := s.lumera.Audit().GetHealOpsByStatus(ctx, status, nil) + if err != nil { + return nil, err + } + if resp == nil { + return nil, nil + } + return resp.HealOps, nil +} + +func accountInList(account string, list []string) bool { + for _, a := range list { + if a == account { + return true + } + } + return false +} + +func isFinalStatus(s audittypes.HealOpStatus) bool { + switch s { + case audittypes.HealOpStatus_HEAL_OP_STATUS_VERIFIED, + audittypes.HealOpStatus_HEAL_OP_STATUS_FAILED, + audittypes.HealOpStatus_HEAL_OP_STATUS_EXPIRED: + return true + } + return false +} diff --git a/supernode/self_healing/service_test.go b/supernode/self_healing/service_test.go new file mode 100644 index 00000000..6559bcd6 --- /dev/null +++ b/supernode/self_healing/service_test.go @@ -0,0 +1,668 @@ +package self_healing + +import ( + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" +) + +// helper builds a Service + its hooks for testing. Returns Service plus the +// programmable mocks so individual tests can drive scenarios. 
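+// Tests drive svc.tick() directly rather than Run(), so no ticker or poll
+// interval is involved; goroutine completion is observed with the waitFor*
+// helpers at the bottom of this file (or a short sleep where the assertion
+// is that nothing happened).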
+type harness struct { + svc *Service + audit *programmableAudit + auditMsg *programmableAuditMsg + cascade *fakeCascadeFactory + store queries.LocalStoreInterface + stagingRoot string + identity string +} + +func newHarness(t *testing.T, identity string, mode audittypes.StorageTruthEnforcementMode) *harness { + t.Helper() + a := newProgrammableAudit(mode) + am := newProgrammableAuditMsg() + cf := newFakeCascadeFactory() + store := newTestStore(t) + root := filepath.Join(t.TempDir(), "heal-staging") + cfg := Config{ + Enabled: true, + PollInterval: time.Second, + MaxConcurrentReconstructs: 2, + MaxConcurrentVerifications: 4, + MaxConcurrentPublishes: 2, + StagingRoot: root, + VerifierFetchAttempts: 2, + VerifierFetchTimeout: time.Second, + VerifierBackoffBase: 10 * time.Millisecond, + KeyName: "test", + } + svc, err := New(identity, cfg, newFakeLumera(a, am), store, cf, &fakeFetcher{}) + if err != nil { + t.Fatalf("New: %v", err) + } + return &harness{svc: svc, audit: a, auditMsg: am, cascade: cf, store: store, stagingRoot: root, identity: identity} +} + +// newTestStore mirrors the test helper in pkg/storage/queries; we re-create +// it here so this package's tests don't depend on internal sqlite test +// scaffolding. +func newTestStore(t *testing.T) queries.LocalStoreInterface { + // Reuse the public OpenHistoryDB by setting HOME to a tempdir so the + // resolved ~/.supernode/history.db lives there. + t.Helper() + tmp := t.TempDir() + old := os.Getenv("HOME") + if err := os.Setenv("HOME", tmp); err != nil { + t.Fatalf("setenv: %v", err) + } + t.Cleanup(func() { _ = os.Setenv("HOME", old) }) + store, err := queries.OpenHistoryDB() + if err != nil { + t.Fatalf("OpenHistoryDB: %v", err) + } + t.Cleanup(func() { store.CloseHistoryDB(context.Background()) }) + return store +} + +// fakeFetcher returns a configurable response. Configure per-test by +// reassigning .body / .err. +type fakeFetcher struct { + body []byte + err error +} + +func (f *fakeFetcher) FetchReconstructed(ctx context.Context, healOpID uint64, healerAccount, verifierAccount string) ([]byte, error) { + if f.err != nil { + return nil, f.err + } + return append([]byte(nil), f.body...), nil +} + +// hashOf returns the action.DataHash recipe (BLAKE3 base64) of body. Used as +// the expected op.ResultHash in verifier tests. +func hashOf(t *testing.T, body []byte) string { + t.Helper() + h, err := cascadekit.ComputeBlake3DataHashB64(body) + if err != nil { + t.Fatalf("hash: %v", err) + } + return h +} + +// --------------------------------------------------------------------------- +// Test 1 — TestVerifier_ReadsOpResultHashForComparison (R-bug regression). +// --------------------------------------------------------------------------- +// +// Spec: verifier MUST submit verified=true only when its computed hash +// equals op.ResultHash (chain enforcement at msg_storage_truth.go:291). +// The supernode does not read Action.DataHash anywhere in the heal flow, +// so the regression surface is "do we read op.ResultHash and compare +// against THAT?". This test gives the verifier a body whose hash matches +// op.ResultHash and asserts verified=true with VerificationHash equal to +// the computed hash. A regression that hard-coded a constant or pulled +// from a different field would fail this test. 
+func TestVerifier_ReadsOpResultHashForComparison(t *testing.T) { + h := newHarness(t, "sn-verifier", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + + body := []byte("recovered-bytes-OK") + // The whole point of the R-bug pin: op.ResultHash is what the healer + // reported; verifier must compare against THIS. + h.audit.put(audittypes.HealOp{ + HealOpId: 10, + TicketId: "ticket-x", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-healer", + VerifierSupernodeAccounts: []string{"sn-verifier"}, + ResultHash: hashOf(t, body), + }) + h.svc.fetcher = &fakeFetcher{body: body} + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForVerifications(t, h.auditMsg, 1) + _, vc := h.auditMsg.snapshot() + if len(vc) != 1 { + t.Fatalf("expected 1 verification call, got %d", len(vc)) + } + if !vc[0].Verified { + t.Fatalf("expected verified=true (computed==op.ResultHash); details=%q", vc[0].Details) + } + if vc[0].VerificationHash != hashOf(t, body) { + t.Fatalf("VerificationHash should equal computed hash; got %q want %q", vc[0].VerificationHash, hashOf(t, body)) + } +} + +// --------------------------------------------------------------------------- +// Test 2 — TestVerifier_HashMismatchProducesVerifiedFalse. +// --------------------------------------------------------------------------- +func TestVerifier_HashMismatchProducesVerifiedFalse(t *testing.T) { + h := newHarness(t, "sn-verifier", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + wantBody := []byte("expected-body") + gotBody := []byte("tampered-body") + h.audit.put(audittypes.HealOp{ + HealOpId: 11, + TicketId: "ticket-y", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-healer", + VerifierSupernodeAccounts: []string{"sn-verifier"}, + ResultHash: hashOf(t, wantBody), + }) + h.svc.fetcher = &fakeFetcher{body: gotBody} + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForVerifications(t, h.auditMsg, 1) + _, vc := h.auditMsg.snapshot() + if vc[0].Verified { + t.Fatalf("expected verified=false on hash mismatch") + } + if !strings.Contains(vc[0].Details, "hash_mismatch") { + t.Fatalf("expected details to mention hash_mismatch, got %q", vc[0].Details) + } + if vc[0].VerificationHash == "" { + t.Fatalf("VerificationHash must be non-empty even on negative votes (chain rejects empty)") + } +} + +// --------------------------------------------------------------------------- +// Test 2b — TestVerifier_FetchFailureSubmitsNonEmptyHash. +// --------------------------------------------------------------------------- +// +// BLOCKER fix regression: chain rejects empty VerificationHash even on +// verified=false (msg_storage_truth.go:271-273). When the verifier can't +// reach the healer, it MUST synthesize a non-empty placeholder hash so the +// negative attestation is well-formed. 
+func TestVerifier_FetchFailureSubmitsNonEmptyHash(t *testing.T) { + h := newHarness(t, "sn-verifier", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + h.audit.put(audittypes.HealOp{ + HealOpId: 13, + TicketId: "ticket-fetch-fail", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-unreachable-healer", + VerifierSupernodeAccounts: []string{"sn-verifier"}, + ResultHash: hashOf(t, []byte("expected")), + }) + h.svc.fetcher = &fakeFetcher{err: errors.New("connection refused")} + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForVerifications(t, h.auditMsg, 1) + _, vc := h.auditMsg.snapshot() + if vc[0].Verified { + t.Fatalf("expected verified=false on fetch failure") + } + if vc[0].VerificationHash == "" { + t.Fatalf("BLOCKER regression: VerificationHash must be non-empty (chain rejects empty for both positive and negative)") + } + if !strings.Contains(vc[0].Details, "fetch_failed") { + t.Fatalf("details should record reason; got %q", vc[0].Details) + } +} + +// --------------------------------------------------------------------------- +// Test 3 — TestVerifier_FetchesFromAssignedHealerOnly (§19 gate). +// --------------------------------------------------------------------------- +// +// Verifier passes (op.HealerSupernodeAccount, identity) to the fetcher and +// nothing else. Verifier must never address an arbitrary peer or KAD. +func TestVerifier_FetchesFromAssignedHealerOnly(t *testing.T) { + h := newHarness(t, "sn-verifier", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + body := []byte("payload") + h.audit.put(audittypes.HealOp{ + HealOpId: 12, + TicketId: "ticket-z", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-healer-7", + VerifierSupernodeAccounts: []string{"sn-verifier", "sn-other"}, + ResultHash: hashOf(t, body), + }) + rec := &recordingFetcher{body: body} + h.svc.fetcher = rec + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForVerifications(t, h.auditMsg, 1) + if rec.lastHealer != "sn-healer-7" { + t.Fatalf("verifier addressed wrong healer: got %q want sn-healer-7", rec.lastHealer) + } + if rec.lastVerifier != "sn-verifier" { + t.Fatalf("verifier identity not propagated: got %q", rec.lastVerifier) + } + if rec.calls != 1 { + t.Fatalf("expected exactly 1 fetch call, got %d", rec.calls) + } +} + +type recordingFetcher struct { + body []byte + lastHealer string + lastVerifier string + calls int +} + +func (r *recordingFetcher) FetchReconstructed(ctx context.Context, healOpID uint64, healerAccount, verifierAccount string) ([]byte, error) { + r.lastHealer = healerAccount + r.lastVerifier = verifierAccount + r.calls++ + return append([]byte(nil), r.body...), nil +} + +// --------------------------------------------------------------------------- +// Tests 4 + 5 — transport handler authorization. +// --------------------------------------------------------------------------- +// Implemented in handler_test.go (transport package). + +// --------------------------------------------------------------------------- +// Test 6 — TestHealer_FailedSubmitDoesNotPersistDedupRow. +// --------------------------------------------------------------------------- +// +// Crash-recovery contract: SubmitClaim is the source of truth — only when +// the chain has accepted the claim is the SQLite dedup row written. 
A +// failed submit (mempool full, signing error, chain reject) leaves NO row, +// so the next tick can retry cleanly. Reverse ordering would strand the +// op forever on flaky submits, so this test pins the ordering. +// +// Companion: when chain has already accepted a prior submit but the +// supernode crashed before persisting, reconcileExistingClaim queries +// GetHealOp on resubmit-error and persists the row when ResultHash matches. +// That recovery path is exercised separately. +func TestHealer_FailedSubmitDoesNotPersistDedupRow(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + body := []byte("recovered-payload") + wantHash := hashOf(t, body) + h.cascade.reseedFn = func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + // Simulate stageArtefacts side-effect: write reconstructed file + + // minimal manifest under StagingDir. + _ = makeStagingDir(t, h.stagingRoot, 20, wantHash, body) + return &cascadeService.RecoveryReseedResult{ + ActionID: req.ActionID, + DataHashVerified: true, + ReconstructedHashB64: wantHash, + StagingDir: req.StagingDir, + }, nil + } + // Simulate a non-state-error submit failure (e.g. mempool full). + h.auditMsg.claimErr = errors.New("simulated mempool full") + h.audit.put(audittypes.HealOp{ + HealOpId: 20, + TicketId: "ticket-q", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, + HealerSupernodeAccount: "sn-healer", + ResultHash: "", + }) + _ = h.svc.tick(context.Background()) + // Wait for the goroutine to finish. + time.Sleep(200 * time.Millisecond) + // No row should have been written (chain didn't accept). + has, _ := h.store.HasHealClaim(context.Background(), 20) + if has { + t.Fatalf("dedup row must NOT exist when chain submit failed; row found") + } + // Staging dir should be cleaned up so the next tick starts fresh. + stagingDir := filepath.Join(h.stagingRoot, "20") + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging dir should be removed on submit failure; stat err=%v", err) + } +} + +// --------------------------------------------------------------------------- +// Test 6b — TestHealer_ReconcilesExistingChainClaimAfterCrash. +// --------------------------------------------------------------------------- +// +// Crash-recovery: prior submit succeeded but supernode crashed before +// persisting. Resubmit returns "does not accept healer completion claim" +// (chain advanced past SCHEDULED). reconcileExistingClaim must: +// - re-fetch the heal-op +// - confirm chain ResultHash equals our manifest +// - persist the dedup row so finalizer can take over +func TestHealer_ReconcilesExistingChainClaimAfterCrash(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + body := []byte("recovered-payload-22") + wantHash := hashOf(t, body) + h.cascade.reseedFn = func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + _ = makeStagingDir(t, h.stagingRoot, 22, wantHash, body) + return &cascadeService.RecoveryReseedResult{ + ActionID: req.ActionID, + DataHashVerified: true, + ReconstructedHashB64: wantHash, + StagingDir: req.StagingDir, + }, nil + } + // Simulate chain having already accepted a previous submit. 
+ h.auditMsg.claimErr = errors.New("rpc error: code = Unknown desc = heal op status HEAL_OP_STATUS_HEALER_REPORTED does not accept healer completion claim") + // Heal-op is in HEALER_REPORTED with our manifest hash. + h.audit.put(audittypes.HealOp{ + HealOpId: 22, + TicketId: "ticket-r", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-healer", + ResultHash: wantHash, + }) + // Note: dispatchHealerOps filters on SCHEDULED, so we drive the + // reconcile path directly via reconstructAndClaim. + op := audittypes.HealOp{ + HealOpId: 22, + TicketId: "ticket-r", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, // healer's local view + HealerSupernodeAccount: "sn-healer", + } + if err := h.svc.reconstructAndClaim(context.Background(), op); err != nil { + t.Fatalf("reconstructAndClaim: %v", err) + } + has, _ := h.store.HasHealClaim(context.Background(), 22) + if !has { + t.Fatalf("reconcile must persist dedup row when chain ResultHash matches manifest") + } +} + +func TestHealer_ReconcileHashMismatchCleansStagingWithoutPersisting(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + body := []byte("recovered-payload-23") + wantHash := hashOf(t, body) + stagingDir := makeStagingDir(t, h.stagingRoot, 23, wantHash, body) + h.audit.put(audittypes.HealOp{ + HealOpId: 23, + TicketId: "ticket-s", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-healer", + ResultHash: "different-manifest", + }) + if err := h.svc.reconcileExistingClaim(context.Background(), audittypes.HealOp{HealOpId: 23, TicketId: "ticket-s"}, wantHash, stagingDir); err != nil { + t.Fatalf("reconcileExistingClaim: %v", err) + } + has, _ := h.store.HasHealClaim(context.Background(), 23) + if has { + t.Fatalf("hash mismatch must not persist dedup row") + } + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging dir should be removed on hash mismatch; stat err=%v", err) + } +} + +// --------------------------------------------------------------------------- +// Test 7 — TestHealer_RaptorQReconstructionFailureSkipsClaim (Scenario C1). +// --------------------------------------------------------------------------- +func TestHealer_RaptorQReconstructionFailureSkipsClaim(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + h.cascade.reseedFn = func(ctx context.Context, req *cascadeService.RecoveryReseedRequest) (*cascadeService.RecoveryReseedResult, error) { + return nil, errors.New("RaptorQ decode failed: insufficient symbols") + } + h.audit.put(audittypes.HealOp{ + HealOpId: 21, + TicketId: "ticket-broken", + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, + HealerSupernodeAccount: "sn-healer", + }) + _ = h.svc.tick(context.Background()) + // Sleep briefly to let the goroutine run. + time.Sleep(200 * time.Millisecond) + if h.auditMsg.claimsCount.Load() != 0 { + t.Fatalf("expected zero claim submissions; got %d", h.auditMsg.claimsCount.Load()) + } + has, _ := h.store.HasHealClaim(context.Background(), 21) + if has { + t.Fatalf("no row should be persisted on reconstruction failure") + } +} + +// --------------------------------------------------------------------------- +// Test 8 — TestFinalizer_VerifiedTriggersPublishToKAD. 
+// --------------------------------------------------------------------------- +func TestFinalizer_VerifiedTriggersPublishToKAD(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + hash := hashOf(t, []byte("body")) + stagingDir := makeStagingDir(t, h.stagingRoot, 30, hash, []byte("body")) + // Pre-seed the dedup row. + if err := h.store.RecordHealClaim(context.Background(), 30, "ticket-30", hash, stagingDir); err != nil { + t.Fatalf("seed claim: %v", err) + } + h.audit.put(audittypes.HealOp{HealOpId: 30, TicketId: "ticket-30", Status: audittypes.HealOpStatus_HEAL_OP_STATUS_VERIFIED, ResultHash: hash}) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForCondition(t, 2*time.Second, func() bool { + return h.cascade.publishCalls.Load() == 1 + }) + if got := h.cascade.lastPublishedDir.Load().(string); got != stagingDir { + t.Fatalf("published wrong dir: got %q want %q", got, stagingDir) + } + // Row must be deleted after successful publish. + has, _ := h.store.HasHealClaim(context.Background(), 30) + if has { + t.Fatalf("dedup row should be deleted after publish") + } + // Staging dir cleaned. + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging dir should be removed after publish; stat err=%v", err) + } +} + +// --------------------------------------------------------------------------- +// Test 9 — TestFinalizer_FailedSkipsPublish_DeletesStaging. +// --------------------------------------------------------------------------- +func TestFinalizer_FailedSkipsPublish_DeletesStaging(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + hash := hashOf(t, []byte("x")) + stagingDir := makeStagingDir(t, h.stagingRoot, 31, hash, []byte("x")) + if err := h.store.RecordHealClaim(context.Background(), 31, "ticket-31", hash, stagingDir); err != nil { + t.Fatalf("seed: %v", err) + } + h.audit.put(audittypes.HealOp{HealOpId: 31, TicketId: "ticket-31", Status: audittypes.HealOpStatus_HEAL_OP_STATUS_FAILED}) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForCondition(t, 2*time.Second, func() bool { + has, _ := h.store.HasHealClaim(context.Background(), 31) + return !has + }) + if h.cascade.publishCalls.Load() != 0 { + t.Fatalf("publish must not be called on FAILED") + } + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging should be removed on FAILED") + } +} + +// --------------------------------------------------------------------------- +// Test 10 — TestFinalizer_ExpiredSkipsPublish_DeletesStaging. 
+// --------------------------------------------------------------------------- +func TestFinalizer_ExpiredSkipsPublish_DeletesStaging(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + hash := hashOf(t, []byte("y")) + stagingDir := makeStagingDir(t, h.stagingRoot, 32, hash, []byte("y")) + if err := h.store.RecordHealClaim(context.Background(), 32, "ticket-32", hash, stagingDir); err != nil { + t.Fatalf("seed: %v", err) + } + h.audit.put(audittypes.HealOp{HealOpId: 32, Status: audittypes.HealOpStatus_HEAL_OP_STATUS_EXPIRED}) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForCondition(t, 2*time.Second, func() bool { + has, _ := h.store.HasHealClaim(context.Background(), 32) + return !has + }) + if h.cascade.publishCalls.Load() != 0 { + t.Fatalf("publish must not be called on EXPIRED") + } + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging should be removed on EXPIRED") + } +} + +func TestFinalizer_NotFoundCleansClaimAndStaging(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + hash := hashOf(t, []byte("pruned")) + stagingDir := makeStagingDir(t, h.stagingRoot, 33, hash, []byte("pruned")) + if err := h.store.RecordHealClaim(context.Background(), 33, "ticket-33", hash, stagingDir); err != nil { + t.Fatalf("seed: %v", err) + } + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + waitForCondition(t, 2*time.Second, func() bool { + has, _ := h.store.HasHealClaim(context.Background(), 33) + return !has + }) + if h.cascade.publishCalls.Load() != 0 { + t.Fatalf("publish must not be called when chain heal-op is not found") + } + if _, err := os.Stat(stagingDir); !os.IsNotExist(err) { + t.Fatalf("staging should be removed when heal-op is not found") + } +} + +// --------------------------------------------------------------------------- +// Test 11 — TestService_NoRoleSkipsOp. +// --------------------------------------------------------------------------- +func TestService_NoRoleSkipsOp(t *testing.T) { + h := newHarness(t, "sn-bystander", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + h.audit.put(audittypes.HealOp{ + HealOpId: 40, + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, + HealerSupernodeAccount: "sn-other-healer", + VerifierSupernodeAccounts: []string{"sn-v1", "sn-v2"}, + }) + h.audit.put(audittypes.HealOp{ + HealOpId: 41, + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-other-healer", + VerifierSupernodeAccounts: []string{"sn-v1", "sn-v2"}, + ResultHash: "any", + }) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + time.Sleep(150 * time.Millisecond) + if h.cascade.reseedCalls.Load() != 0 { + t.Fatalf("non-assigned supernode must not reconstruct") + } + if h.auditMsg.claimsCount.Load() != 0 || h.auditMsg.verificationsCount.Load() != 0 { + t.Fatalf("no tx should be submitted by non-assigned supernode") + } +} + +// --------------------------------------------------------------------------- +// Test 12 — TestService_UnspecifiedModeSkipsEntirely. 
+// --------------------------------------------------------------------------- +func TestService_UnspecifiedModeSkipsEntirely(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_UNSPECIFIED) + // Even ops we'd otherwise be assigned to. + h.audit.put(audittypes.HealOp{ + HealOpId: 50, + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, + HealerSupernodeAccount: "sn-healer", + }) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + time.Sleep(150 * time.Millisecond) + if h.cascade.reseedCalls.Load() != 0 { + t.Fatalf("UNSPECIFIED mode must skip dispatcher entirely") + } +} + +// --------------------------------------------------------------------------- +// Test 13 — TestService_FinalStateOpsIgnored. +// --------------------------------------------------------------------------- +func TestService_FinalStateOpsIgnored(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + // Even with sn-healer assigned, VERIFIED/FAILED/EXPIRED are filtered out + // at the dispatcher level (status != SCHEDULED, status != HEALER_REPORTED). + h.audit.put(audittypes.HealOp{HealOpId: 60, Status: audittypes.HealOpStatus_HEAL_OP_STATUS_VERIFIED, HealerSupernodeAccount: "sn-healer"}) + h.audit.put(audittypes.HealOp{HealOpId: 61, Status: audittypes.HealOpStatus_HEAL_OP_STATUS_FAILED, HealerSupernodeAccount: "sn-healer"}) + h.audit.put(audittypes.HealOp{HealOpId: 62, Status: audittypes.HealOpStatus_HEAL_OP_STATUS_EXPIRED, HealerSupernodeAccount: "sn-healer"}) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + time.Sleep(150 * time.Millisecond) + if h.cascade.reseedCalls.Load() != 0 { + t.Fatalf("final-state ops must not trigger reconstruction") + } + if h.auditMsg.claimsCount.Load() != 0 { + t.Fatalf("no claim submissions for final-state ops") + } +} + +// --------------------------------------------------------------------------- +// Test 14 — TestDedup_RestartDoesNotResubmit. +// --------------------------------------------------------------------------- +func TestDedup_RestartDoesNotResubmit(t *testing.T) { + h := newHarness(t, "sn-healer", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + hash := hashOf(t, []byte("body")) + stagingDir := makeStagingDir(t, h.stagingRoot, 70, hash, []byte("body")) + // Simulate a prior tick that already persisted + submitted. + if err := h.store.RecordHealClaim(context.Background(), 70, "ticket-70", hash, stagingDir); err != nil { + t.Fatalf("seed: %v", err) + } + // New tick sees op in SCHEDULED (chain hasn't seen the tx in the simulator, + // but supernode dedup must short-circuit). 
+ h.audit.put(audittypes.HealOp{HealOpId: 70, TicketId: "ticket-70", Status: audittypes.HealOpStatus_HEAL_OP_STATUS_SCHEDULED, HealerSupernodeAccount: "sn-healer"}) + if err := h.svc.tick(context.Background()); err != nil { + t.Fatalf("tick: %v", err) + } + time.Sleep(150 * time.Millisecond) + if h.cascade.reseedCalls.Load() != 0 { + t.Fatalf("restart must NOT re-run RaptorQ for an already-claimed op") + } + if h.auditMsg.claimsCount.Load() != 0 { + t.Fatalf("restart must NOT resubmit claim tx") + } + // And same property for verifier dedup: + hv := newHarness(t, "sn-verifier", audittypes.StorageTruthEnforcementMode_STORAGE_TRUTH_ENFORCEMENT_MODE_FULL) + if err := hv.store.RecordHealVerification(context.Background(), 71, "sn-verifier", true, hash); err != nil { + t.Fatalf("seed verification: %v", err) + } + hv.audit.put(audittypes.HealOp{ + HealOpId: 71, + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + HealerSupernodeAccount: "sn-h", + VerifierSupernodeAccounts: []string{"sn-verifier"}, + ResultHash: hash, + }) + hv.svc.fetcher = &fakeFetcher{body: []byte("body")} + if err := hv.svc.tick(context.Background()); err != nil { + t.Fatalf("tick verifier: %v", err) + } + time.Sleep(150 * time.Millisecond) + if hv.auditMsg.verificationsCount.Load() != 0 { + t.Fatalf("restart must NOT resubmit verification tx") + } +} + +// --------------------------------------------------------------------------- +// helpers +// --------------------------------------------------------------------------- + +func waitForVerifications(t *testing.T, am *programmableAuditMsg, want int64) { + t.Helper() + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if am.verificationsCount.Load() >= want { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("timeout waiting for %d verifications; got %d", want, am.verificationsCount.Load()) +} + +func waitForCondition(t *testing.T, timeout time.Duration, cond func() bool) { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if cond() { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("timeout waiting for condition") +} diff --git a/supernode/self_healing/verifier.go b/supernode/self_healing/verifier.go new file mode 100644 index 00000000..b8b407a3 --- /dev/null +++ b/supernode/self_healing/verifier.go @@ -0,0 +1,190 @@ +package self_healing + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "errors" + "fmt" + "strings" + "time" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" +) + +// verifyAndSubmit runs LEP-6 §19 Phase 2 for one heal-op. +// +// Critical correctness rules +// +// 1. The verifier MUST fetch from the assigned healer (op.HealerSupernode +// Account), not from KAD. KAD is empty during HEALER_REPORTED — the +// healer publishes only after VERIFIED — so reading from KAD would +// loop on miss. More importantly, the §19 healer-served path is the +// only authority before chain quorum. +// +// 2. The verifier MUST compare its computed hash against op.ResultHash +// (set by the chain from the healer's HealManifestHash), NOT against +// Action.DataHash. The chain enforces this at +// lumera/x/audit/v1/keeper/msg_storage_truth.go:291. A verifier that +// submits VerificationHash != op.ResultHash with verified=true is +// rejected by the chain. 
Pinned by TestVerifier_ReadsOpResultHashForComparison. +// +// 3. On fetch failure after VerifierFetchAttempts retries the verifier +// submits verified=false. The chain rejects empty VerificationHash even +// for negatives (msg_storage_truth.go:271-273), so we synthesize a +// non-empty deterministic placeholder hash — for negative attestations +// the chain only validates equality when `req.Verified == true` +// (msg_storage_truth.go:288-294), so any non-empty value is accepted. +// +// 4. Persist-AFTER-submit ordering: SQLite dedup row is written ONLY after +// the chain accepted the tx. A failed submit therefore leaves no row, +// letting the next tick retry. Reverse ordering would strand the op +// forever on flaky submits. +func (s *Service) verifyAndSubmit(ctx context.Context, op audittypes.HealOp) error { + if err := s.semVerify.Acquire(ctx, 1); err != nil { + return err + } + defer s.semVerify.Release(1) + + expectedHash := strings.TrimSpace(op.ResultHash) + if expectedHash == "" { + return fmt.Errorf("op.ResultHash empty (op not in HEALER_REPORTED?)") + } + + bytesGot, fetchErr := s.fetchFromHealerWithRetry(ctx, op) + if fetchErr != nil { + // Submit negative verification with a non-empty placeholder hash — + // chain rejects empty VerificationHash even for negative votes. + details := fmt.Sprintf("fetch_failed:%v", fetchErr) + if err := s.submitNegativeWithReason(ctx, op.HealOpId, details); err != nil { + return fmt.Errorf("fetch %v; submit-negative %w", fetchErr, err) + } + logtrace.Warn(ctx, "self_healing(LEP-6): verifier submitted negative due to fetch failure", logtrace.Fields{ + "heal_op_id": op.HealOpId, + logtrace.FieldError: fetchErr.Error(), + }) + return nil + } + + computedHash, hashErr := cascadekit.ComputeBlake3DataHashB64(bytesGot) + if hashErr != nil { + details := fmt.Sprintf("hash_compute_failed:%v", hashErr) + if err := s.submitNegativeWithReason(ctx, op.HealOpId, details); err != nil { + return fmt.Errorf("hash %v; submit-negative %w", hashErr, err) + } + return nil + } + verified := computedHash == expectedHash + details := "" + if !verified { + details = "hash_mismatch" + } + // Positive: chain validates VerificationHash == op.ResultHash. Negative: + // chain accepts any non-empty hash. Send computedHash either way so audit + // trails always carry the verifier's own observation. + if err := s.submitVerification(ctx, op.HealOpId, verified, computedHash, details); err != nil { + return fmt.Errorf("submit verification: %w", err) + } + logtrace.Info(ctx, "self_healing(LEP-6): verification submitted", logtrace.Fields{ + "heal_op_id": op.HealOpId, + "verified": verified, + "expected_h": expectedHash, + "computed_h": computedHash, + "bytes_length": len(bytesGot), + }) + return nil +} + +// submitNegativeWithReason synthesizes a deterministic non-empty placeholder +// hash from the failure reason and submits a negative verification. Chain +// only validates VerificationHash content for positive votes +// (msg_storage_truth.go:288-294), so any non-empty value is well-formed. +func (s *Service) submitNegativeWithReason(ctx context.Context, healOpID uint64, reason string) error { + placeholder := negativeAttestationHash(reason) + return s.submitVerification(ctx, healOpID, false, placeholder, reason) +} + +// negativeAttestationHash returns a stable non-empty base64 hash derived +// from `reason` so audit trails can correlate identical failure modes. 
+// Format matches the action.DataHash recipe (32-byte digest, base64) so +// downstream consumers don't have to special-case width. +func negativeAttestationHash(reason string) string { + sum := sha256.Sum256([]byte("lep6:negative-attestation:" + reason)) + return base64.StdEncoding.EncodeToString(sum[:]) +} + +// submitVerification submits MsgSubmitHealVerification THEN persists the +// SQLite dedup row only on successful chain acceptance. +// +// Idempotency on retry: if the chain has already recorded a verification +// from this verifier (for instance, a previous tick's submit succeeded but +// the supernode crashed before persisting), it returns ErrHealVerification +// Exists. We treat that as success and persist the row so the next tick +// stops retrying. +func (s *Service) submitVerification(ctx context.Context, healOpID uint64, verified bool, hash, details string) error { + resp, err := s.lumera.AuditMsg().SubmitHealVerification(ctx, healOpID, verified, hash, details) + if err != nil { + // If the chain already has a verification from us (prior submit + // succeeded but persist crashed), reconcile by persisting the + // dedup row now. + if isChainVerificationAlreadyExists(err) { + if persistErr := s.store.RecordHealVerification(ctx, healOpID, s.identity, verified, hash); persistErr != nil && !errors.Is(persistErr, queries.ErrLEP6VerificationAlreadyRecorded) { + return fmt.Errorf("reconcile dedup row: %w", persistErr) + } + return nil + } + return err + } + _ = resp + // Chain accepted — persist for restart dedup. If row already exists + // (in-flight retry beat us), it's a no-op. + if err := s.store.RecordHealVerification(ctx, healOpID, s.identity, verified, hash); err != nil { + if errors.Is(err, queries.ErrLEP6VerificationAlreadyRecorded) { + return nil + } + return fmt.Errorf("record heal verification: %w", err) + } + return nil +} + +// isChainVerificationAlreadyExists detects the chain's +// ErrHealVerificationExists wrapped string. We can't import the chain's +// errors package here without cycling through audittypes, but the wrapped +// message is stable. +func isChainVerificationAlreadyExists(err error) bool { + if err == nil { + return false + } + return strings.Contains(err.Error(), "verification already submitted by creator") +} + +// fetchFromHealerWithRetry is the §19 healer-served-path GET with bounded +// exponential backoff. Returns the reconstructed file bytes (concatenated +// from chunks if chunked). 
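+// For example (illustrative numbers, not defaults): with
+// VerifierFetchAttempts=3 and VerifierBackoffBase=100ms, a persistently
+// failing healer is tried at roughly t=0, t=100ms and t=300ms (the delay
+// before retry i+1 is VerifierBackoffBase * 2^i), after which the last
+// error is returned and verifyAndSubmit files the negative attestation.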
+func (s *Service) fetchFromHealerWithRetry(ctx context.Context, op audittypes.HealOp) ([]byte, error) { + if s.fetcher == nil { + return nil, fmt.Errorf("verifier fetcher is nil") + } + var lastErr error + for attempt := 0; attempt < s.cfg.VerifierFetchAttempts; attempt++ { + fetchCtx, cancel := context.WithTimeout(ctx, s.cfg.VerifierFetchTimeout) + bytesGot, err := s.fetcher.FetchReconstructed(fetchCtx, op.HealOpId, op.HealerSupernodeAccount, s.identity) + cancel() + if err == nil { + return bytesGot, nil + } + lastErr = err + if attempt+1 < s.cfg.VerifierFetchAttempts { + delay := s.cfg.VerifierBackoffBase * (1 << attempt) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + } + } + } + return nil, lastErr +} diff --git a/supernode/storage_challenge/lep6_client_factory.go b/supernode/storage_challenge/lep6_client_factory.go index 61821a7e..12f30521 100644 --- a/supernode/storage_challenge/lep6_client_factory.go +++ b/supernode/storage_challenge/lep6_client_factory.go @@ -13,6 +13,7 @@ import ( "github.com/LumeraProtocol/supernode/v2/pkg/lumera" "github.com/LumeraProtocol/supernode/v2/pkg/net/credentials" grpcclient "github.com/LumeraProtocol/supernode/v2/pkg/net/grpc/client" + "github.com/LumeraProtocol/supernode/v2/pkg/netutil" "github.com/cosmos/cosmos-sdk/crypto/keyring" "google.golang.org/grpc" ) @@ -85,7 +86,7 @@ func (f *secureSupernodeClientFactory) Dial(ctx context.Context, target string) if raw == "" { return nil, fmt.Errorf("no address for target %q", target) } - host, port, ok := parseHostAndPort(raw, int(f.defaultPort)) + host, port, ok := netutil.ParseHostAndPort(raw, int(f.defaultPort)) if !ok || strings.TrimSpace(host) == "" { return nil, fmt.Errorf("invalid address %q for target %q", raw, target) } diff --git a/supernode/storage_challenge/service.go b/supernode/storage_challenge/service.go index 5f3b7e06..467c0734 100644 --- a/supernode/storage_challenge/service.go +++ b/supernode/storage_challenge/service.go @@ -6,7 +6,6 @@ import ( "encoding/json" "fmt" "net" - "net/url" "sort" "strconv" "strings" @@ -20,6 +19,7 @@ import ( "github.com/LumeraProtocol/supernode/v2/pkg/lumera" "github.com/LumeraProtocol/supernode/v2/pkg/net/credentials" grpcclient "github.com/LumeraProtocol/supernode/v2/pkg/net/grpc/client" + "github.com/LumeraProtocol/supernode/v2/pkg/netutil" "github.com/LumeraProtocol/supernode/v2/pkg/storage/queries" "github.com/LumeraProtocol/supernode/v2/pkg/storagechallenge/deterministic" "github.com/cosmos/cosmos-sdk/crypto/keyring" @@ -514,71 +514,13 @@ func (s *Service) supernodeGRPCAddr(ctx context.Context, supernodeAccount string // both forms: // - "host" -> use our configured default gRPC port // - "host:port" -> use the stored port as the dial target - host, port, ok := parseHostAndPort(raw, int(s.grpcPort)) + host, port, ok := netutil.ParseHostAndPort(raw, int(s.grpcPort)) if !ok || strings.TrimSpace(host) == "" { return "", fmt.Errorf("invalid supernode address for %s: %q", supernodeAccount, raw) } return net.JoinHostPort(strings.TrimSpace(host), strconv.Itoa(port)), nil } -// parseHostAndPort parses a "host" or "host:port" string and returns a host and port. -// If a port is not present, defaultPort is returned. If a port is present but invalid, -func parseHostAndPort(address string, defaultPort int) (host string, port int, ok bool) { - address = strings.TrimSpace(address) - if address == "" { - return "", 0, false - } - - // If it looks like a URL, parse and use the host[:port] portion. 
- if u, err := url.Parse(address); err == nil && u.Host != "" { - address = u.Host - } - - if h, p, err := net.SplitHostPort(address); err == nil { - h = strings.TrimSpace(h) - if h == "" { - return "", 0, false - } - if n, err := strconv.Atoi(p); err == nil && n > 0 && n <= 65535 { - return h, n, true - } - return h, defaultPort, true - } - - // No port present. Treat it as a raw host if it is plausibly valid; otherwise fail. - host = strings.TrimSpace(address) - if host == "" { - return "", 0, false - } - - // Accept bracketed IPv6 literal without a port (e.g. "[2001:db8::1]") by stripping brackets. - if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") && strings.Count(host, "]") == 1 { - host = strings.TrimPrefix(strings.TrimSuffix(host, "]"), "[") - host = strings.TrimSpace(host) - if host == "" { - return "", 0, false - } - } - - // Reject obviously malformed inputs (paths, fragments, userinfo, whitespace, or stray brackets). - if strings.ContainsAny(host, " \t\r\n/\\?#@[]") { - return "", 0, false - } - - // If it contains ':' it must be a valid IPv6 literal (optionally with a zone, e.g. "fe80::1%eth0"). - if strings.Contains(host, ":") { - ipPart := host - if i := strings.IndexByte(ipPart, '%'); i >= 0 { - ipPart = ipPart[:i] - } - if net.ParseIP(ipPart) == nil { - return "", 0, false - } - } - - return host, defaultPort, true -} - func (s *Service) callGetSliceProof(ctx context.Context, remoteIdentity string, address string, req *supernode.GetSliceProofRequest, timeout time.Duration) (*supernode.GetSliceProofResponse, error) { cctx, cancel := context.WithTimeout(ctx, timeout) defer cancel() diff --git a/supernode/transport/grpc/self_healing/handler.go b/supernode/transport/grpc/self_healing/handler.go new file mode 100644 index 00000000..d1714dfa --- /dev/null +++ b/supernode/transport/grpc/self_healing/handler.go @@ -0,0 +1,206 @@ +// Package self_healing implements the §19 healer-served-path transport. +// +// LEP-6 §19 requires verifiers to fetch reconstructed bytes directly from +// the assigned healer (NOT from KAD), because before chain VERIFIED no copy +// is yet in KAD and the healer is the only authority. This handler exposes +// the verifier-side fetch as a streaming gRPC RPC, gated on caller ∈ +// op.VerifierSupernodeAccounts. +package self_healing + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/logtrace" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/reachability" + cascadeService "github.com/LumeraProtocol/supernode/v2/supernode/cascade" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +const ( + // streamChunkBytes is the chunk size used by ServeReconstructedArtefacts. + // Tuned for grpc max message default (4 MiB) — chunks are 1 MiB so + // a 100 MiB file streams in ~100 messages. + streamChunkBytes = 1 << 20 +) + +// CallerIdentityResolver returns the authenticated chain-side supernode +// account address of the gRPC caller. The production resolver pulls it +// from the secure-rpc / lumeraid handshake the storage_challenge handler +// uses (pkg/reachability.GrpcRemoteIdentityAndAddr). +type CallerIdentityResolver func(ctx context.Context) (string, error) + +// DefaultCallerIdentityResolver returns a resolver backed by the secure-rpc +// (Lumera ALTS) handshake. 
The returned identity is the verifier's +// chain-side supernode account; if the inbound connection is NOT secure-rpc +// the resolver returns an error so the handler refuses to serve. +func DefaultCallerIdentityResolver() CallerIdentityResolver { + return func(ctx context.Context) (string, error) { + identity, _ := reachability.GrpcRemoteIdentityAndAddr(ctx) + identity = strings.TrimSpace(identity) + if identity == "" { + return "", errors.New("caller identity unavailable: secure-rpc / ALTS handshake required") + } + return identity, nil + } +} + +// Server implements supernode.SelfHealingServiceServer for the LEP-6 §19 +// healer-served path. One instance per supernode binary; runs alongside the +// dispatcher Service in self_healing.Service. +type Server struct { + supernode.UnimplementedSelfHealingServiceServer + + identity string + stagingRoot string + lumera lumera.Client + resolveCaller CallerIdentityResolver +} + +// NewServer constructs the §19 transport handler. +// +// resolveCaller authenticates the gRPC peer. Pass DefaultCallerIdentity +// Resolver() in production — it pulls the identity from the secure-rpc +// (Lumera ALTS) handshake. Tests may pass a stub or nil; nil falls back to +// trusting `req.VerifierAccount` (NOT secure — only for unit tests where +// no transport stack is wired up). +func NewServer(identity, stagingRoot string, lumeraClient lumera.Client, resolveCaller CallerIdentityResolver) (*Server, error) { + identity = strings.TrimSpace(identity) + if identity == "" { + return nil, fmt.Errorf("identity is empty") + } + if lumeraClient == nil || lumeraClient.Audit() == nil { + return nil, fmt.Errorf("lumera client missing audit module") + } + if strings.TrimSpace(stagingRoot) == "" { + return nil, fmt.Errorf("staging root is empty") + } + return &Server{ + identity: identity, + stagingRoot: stagingRoot, + lumera: lumeraClient, + resolveCaller: resolveCaller, + }, nil +} + +// ServeReconstructedArtefacts streams the reconstructed file bytes for one +// heal-op to an authorized verifier. +// +// Authorization (§19): caller must be a member of +// op.VerifierSupernodeAccounts. Caller account is preferentially read from +// CallerIdentityResolver (authenticated transport identity); req.Verifier +// Account is used only as a fallback for tests where no resolver was +// configured — production paths MUST use DefaultCallerIdentityResolver(). +func (s *Server) ServeReconstructedArtefacts(req *supernode.ServeReconstructedArtefactsRequest, stream supernode.SelfHealingService_ServeReconstructedArtefactsServer) error { + if req == nil || req.HealOpId == 0 { + return status.Error(codes.InvalidArgument, "missing heal_op_id") + } + ctx := stream.Context() + + // Resolve caller identity. If a resolver is configured (production), + // the resolver's verdict wins over req.VerifierAccount — never trust + // the request payload alone. + var caller string + if s.resolveCaller != nil { + auth, err := s.resolveCaller(ctx) + if err != nil { + return status.Errorf(codes.Unauthenticated, "resolve caller: %v", err) + } + caller = strings.TrimSpace(auth) + } else { + caller = strings.TrimSpace(req.VerifierAccount) + } + if caller == "" { + return status.Error(codes.Unauthenticated, "caller identity unknown") + } + + // Authorize against on-chain heal-op. 
+ resp, err := s.lumera.Audit().GetHealOp(ctx, req.HealOpId) + if err != nil { + return status.Errorf(codes.NotFound, "heal op %d: %v", req.HealOpId, err) + } + if resp == nil { + return status.Errorf(codes.NotFound, "heal op %d not found", req.HealOpId) + } + op := resp.HealOp + if op.HealerSupernodeAccount != s.identity { + // Not the assigned healer for this op — refuse to serve so verifiers + // don't accidentally consult a non-authoritative supernode. + return status.Error(codes.FailedPrecondition, "this supernode is not the assigned healer for this heal op") + } + authorized := false + for _, v := range op.VerifierSupernodeAccounts { + if v == caller { + authorized = true + break + } + } + if !authorized { + return status.Errorf(codes.PermissionDenied, "caller %q not in verifier set", caller) + } + + // Resolve staging dir + reconstructed file. + stagingDir := filepath.Join(s.stagingRoot, fmt.Sprintf("%d", req.HealOpId)) + info, err := cascadeService.ReadStagedHealOp(stagingDir) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return status.Errorf(codes.NotFound, "no staged heal-op %d", req.HealOpId) + } + return status.Errorf(codes.Internal, "read staged heal op: %v", err) + } + + f, err := os.Open(info.ReconstructedFilePath) + if err != nil { + return status.Errorf(codes.Internal, "open staged file: %v", err) + } + defer f.Close() + st, err := f.Stat() + if err != nil { + return status.Errorf(codes.Internal, "stat staged file: %v", err) + } + totalSize := uint64(st.Size()) + + logtrace.Info(ctx, "self_healing(LEP-6): serving reconstructed artefacts", logtrace.Fields{ + "heal_op_id": req.HealOpId, + "caller": caller, + "size": totalSize, + }) + + buf := make([]byte, streamChunkBytes) + first := true + var sent uint64 + for { + n, rerr := f.Read(buf) + if n > 0 { + sent += uint64(n) + out := &supernode.ServeReconstructedArtefactsResponse{ + Chunk: append([]byte(nil), buf[:n]...), + IsLast: false, + } + if first { + out.TotalSize = totalSize + first = false + } + if rerr == io.EOF || sent == totalSize { + out.IsLast = true + } + if err := stream.Send(out); err != nil { + return err + } + } + if rerr == io.EOF { + return nil + } + if rerr != nil { + return status.Errorf(codes.Internal, "read staged file: %v", rerr) + } + } +} diff --git a/supernode/transport/grpc/self_healing/handler_test.go b/supernode/transport/grpc/self_healing/handler_test.go new file mode 100644 index 00000000..12adf91d --- /dev/null +++ b/supernode/transport/grpc/self_healing/handler_test.go @@ -0,0 +1,277 @@ +package self_healing + +import ( + "context" + "errors" + "io" + "net" + "os" + "path/filepath" + "sync" + "testing" + + audittypes "github.com/LumeraProtocol/lumera/x/audit/v1/types" + "github.com/LumeraProtocol/supernode/v2/gen/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/cascadekit" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/action_msg" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/audit" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/audit_msg" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/auth" + bankmod "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/bank" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/node" + supernodeMod "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/supernode" + "github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/supernode_msg" + 
"github.com/LumeraProtocol/supernode/v2/pkg/lumera/modules/tx" + "github.com/LumeraProtocol/supernode/v2/pkg/testutil" + query "github.com/cosmos/cosmos-sdk/types/query" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/grpc/test/bufconn" +) + +// --------------------------------------------------------------------------- +// Test 4 — TestServeReconstructedArtefacts_AuthorizesOnlyAssignedVerifiers. +// --------------------------------------------------------------------------- +func TestServeReconstructedArtefacts_AuthorizesOnlyAssignedVerifiers(t *testing.T) { + srv, cleanup, _ := newHandlerHarness(t, "sn-healer", &handlerOp{ + HealOpId: 100, + HealerSupernodeAccount: "sn-healer", + VerifierSupernodeAccounts: []string{"sn-v1", "sn-v2"}, + }, []byte("payload-bytes")) + defer cleanup() + + body, err := callServe(t, srv, &supernode.ServeReconstructedArtefactsRequest{ + HealOpId: 100, + VerifierAccount: "sn-v1", + }) + if err != nil { + t.Fatalf("authorized verifier should succeed: %v", err) + } + if string(body) != "payload-bytes" { + t.Fatalf("unexpected body: %q", string(body)) + } +} + +// --------------------------------------------------------------------------- +// Test 5 — TestServeReconstructedArtefacts_RejectsUnassignedCaller. +// --------------------------------------------------------------------------- +func TestServeReconstructedArtefacts_RejectsUnassignedCaller(t *testing.T) { + srv, cleanup, _ := newHandlerHarness(t, "sn-healer", &handlerOp{ + HealOpId: 101, + HealerSupernodeAccount: "sn-healer", + VerifierSupernodeAccounts: []string{"sn-v1", "sn-v2"}, + }, []byte("p")) + defer cleanup() + + _, err := callServe(t, srv, &supernode.ServeReconstructedArtefactsRequest{ + HealOpId: 101, + VerifierAccount: "sn-attacker", + }) + if err == nil { + t.Fatalf("unauthorized caller must be rejected") + } + st, _ := status.FromError(err) + if st.Code() != codes.PermissionDenied { + t.Fatalf("expected PermissionDenied, got %v: %v", st.Code(), err) + } + + // Also: a different supernode that isn't even the assigned healer should + // refuse to serve regardless of caller. 
+ wrongHealerSrv, wrongCleanup, _ := newHandlerHarness(t, "sn-not-healer", &handlerOp{ + HealOpId: 102, + HealerSupernodeAccount: "sn-real-healer", + VerifierSupernodeAccounts: []string{"sn-v1"}, + }, []byte("p")) + defer wrongCleanup() + _, err = callServe(t, wrongHealerSrv, &supernode.ServeReconstructedArtefactsRequest{ + HealOpId: 102, + VerifierAccount: "sn-v1", + }) + if err == nil { + t.Fatalf("non-assigned-healer must refuse to serve") + } + st, _ = status.FromError(err) + if st.Code() != codes.FailedPrecondition { + t.Fatalf("expected FailedPrecondition, got %v: %v", st.Code(), err) + } +} + +// --------------------------------------------------------------------------- +// handler harness +// --------------------------------------------------------------------------- + +type handlerOp struct { + HealOpId uint64 + HealerSupernodeAccount string + VerifierSupernodeAccounts []string +} + +func newHandlerHarness(t *testing.T, identity string, op *handlerOp, body []byte) (*Server, func(), string) { + t.Helper() + root := filepath.Join(t.TempDir(), "heal-staging") + if err := os.MkdirAll(root, 0o700); err != nil { + t.Fatalf("mkdir: %v", err) + } + hash, err := cascadekit.ComputeBlake3DataHashB64(body) + if err != nil { + t.Fatalf("hash: %v", err) + } + dir := makeStagingDir(t, root, op.HealOpId, hash, body) + + a := &handlerStubAudit{op: audittypes.HealOp{ + HealOpId: op.HealOpId, + HealerSupernodeAccount: op.HealerSupernodeAccount, + VerifierSupernodeAccounts: op.VerifierSupernodeAccounts, + Status: audittypes.HealOpStatus_HEAL_OP_STATUS_HEALER_REPORTED, + ResultHash: hash, + }} + srv, err := NewServer(identity, root, &handlerLumera{audit: a}, nil) + if err != nil { + t.Fatalf("NewServer: %v", err) + } + cleanup := func() { _ = os.RemoveAll(dir) } + return srv, cleanup, hash +} + +// callServe dials the server through bufconn and consumes the stream. +func callServe(t *testing.T, srv *Server, req *supernode.ServeReconstructedArtefactsRequest) ([]byte, error) { + t.Helper() + listener := bufconn.Listen(1 << 16) + gs := grpc.NewServer() + supernode.RegisterSelfHealingServiceServer(gs, srv) + go func() { _ = gs.Serve(listener) }() + defer gs.Stop() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + conn, err := grpc.DialContext(ctx, "bufnet", + grpc.WithContextDialer(func(ctx context.Context, _ string) (net.Conn, error) { return listener.DialContext(ctx) }), + grpc.WithInsecure(), + ) + if err != nil { + return nil, err + } + defer conn.Close() + c := supernode.NewSelfHealingServiceClient(conn) + stream, err := c.ServeReconstructedArtefacts(ctx, req) + if err != nil { + return nil, err + } + var buf []byte + for { + msg, err := stream.Recv() + if errors.Is(err, io.EOF) { + return buf, nil + } + if err != nil { + return nil, err + } + buf = append(buf, msg.Chunk...) + if msg.IsLast { + // Drain to surface a trailing status (if any). + _, _ = stream.Recv() + return buf, nil + } + } +} + +// handlerLumera is a minimal lumera.Client for the transport handler tests +// — only Audit() is consulted. 
+
+// handlerLumera is a minimal lumera.Client for the transport handler tests
+// — only Audit() is consulted.
+type handlerLumera struct {
+	mu       sync.Mutex
+	audit    audit.Module
+	stubsRef lumera.Client
+}
+
+func (h *handlerLumera) Auth() auth.Module {
+	h.ensureStubs()
+	return h.stubsRef.Auth()
+}
+func (h *handlerLumera) Action() action.Module {
+	h.ensureStubs()
+	return h.stubsRef.Action()
+}
+func (h *handlerLumera) ActionMsg() action_msg.Module {
+	h.ensureStubs()
+	return h.stubsRef.ActionMsg()
+}
+func (h *handlerLumera) Audit() audit.Module { return h.audit }
+func (h *handlerLumera) AuditMsg() audit_msg.Module {
+	h.ensureStubs()
+	return h.stubsRef.AuditMsg()
+}
+func (h *handlerLumera) SuperNode() supernodeMod.Module {
+	h.ensureStubs()
+	return h.stubsRef.SuperNode()
+}
+func (h *handlerLumera) SuperNodeMsg() supernode_msg.Module {
+	h.ensureStubs()
+	return h.stubsRef.SuperNodeMsg()
+}
+func (h *handlerLumera) Bank() bankmod.Module {
+	h.ensureStubs()
+	return h.stubsRef.Bank()
+}
+func (h *handlerLumera) Tx() tx.Module {
+	h.ensureStubs()
+	return h.stubsRef.Tx()
+}
+func (h *handlerLumera) Node() node.Module {
+	h.ensureStubs()
+	return h.stubsRef.Node()
+}
+func (h *handlerLumera) Close() error { return nil }
+
+func (h *handlerLumera) ensureStubs() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+	if h.stubsRef == nil {
+		c, err := testutil.NewMockLumeraClient(nil, nil)
+		if err != nil {
+			panic(err)
+		}
+		h.stubsRef = c
+	}
+}
+
+type handlerStubAudit struct{ op audittypes.HealOp }
+
+func (h *handlerStubAudit) GetParams(ctx context.Context) (*audittypes.QueryParamsResponse, error) {
+	return &audittypes.QueryParamsResponse{}, nil
+}
+func (h *handlerStubAudit) GetEpochAnchor(ctx context.Context, epochID uint64) (*audittypes.QueryEpochAnchorResponse, error) {
+	return &audittypes.QueryEpochAnchorResponse{}, nil
+}
+func (h *handlerStubAudit) GetCurrentEpochAnchor(ctx context.Context) (*audittypes.QueryCurrentEpochAnchorResponse, error) {
+	return &audittypes.QueryCurrentEpochAnchorResponse{}, nil
+}
+func (h *handlerStubAudit) GetCurrentEpoch(ctx context.Context) (*audittypes.QueryCurrentEpochResponse, error) {
+	return &audittypes.QueryCurrentEpochResponse{}, nil
+}
+func (h *handlerStubAudit) GetAssignedTargets(ctx context.Context, supernodeAccount string, epochID uint64) (*audittypes.QueryAssignedTargetsResponse, error) {
+	return &audittypes.QueryAssignedTargetsResponse{}, nil
+}
+func (h *handlerStubAudit) GetEpochReport(ctx context.Context, epochID uint64, supernodeAccount string) (*audittypes.QueryEpochReportResponse, error) {
+	return &audittypes.QueryEpochReportResponse{}, nil
+}
+func (h *handlerStubAudit) GetNodeSuspicionState(ctx context.Context, supernodeAccount string) (*audittypes.QueryNodeSuspicionStateResponse, error) {
+	return &audittypes.QueryNodeSuspicionStateResponse{}, nil
+}
+func (h *handlerStubAudit) GetReporterReliabilityState(ctx context.Context, reporterAccount string) (*audittypes.QueryReporterReliabilityStateResponse, error) {
+	return &audittypes.QueryReporterReliabilityStateResponse{}, nil
+}
+func (h *handlerStubAudit) GetTicketDeteriorationState(ctx context.Context, ticketID string) (*audittypes.QueryTicketDeteriorationStateResponse, error) {
+	return &audittypes.QueryTicketDeteriorationStateResponse{}, nil
+}
+func (h *handlerStubAudit) GetHealOp(ctx context.Context, healOpID uint64) (*audittypes.QueryHealOpResponse, error) {
+	if healOpID != h.op.HealOpId {
+		return nil, errors.New("not found")
+	}
+	return &audittypes.QueryHealOpResponse{HealOp: h.op}, nil
+}
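+// The remaining list-style queries are not exercised by these handler tests;
+// they return empty responses only to satisfy the audit.Module interface.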
+func (h *handlerStubAudit) GetHealOpsByStatus(ctx context.Context, status audittypes.HealOpStatus, pagination *query.PageRequest) (*audittypes.QueryHealOpsByStatusResponse, error) {
+	return &audittypes.QueryHealOpsByStatusResponse{}, nil
+}
+func (h *handlerStubAudit) GetHealOpsByTicket(ctx context.Context, ticketID string, pagination *query.PageRequest) (*audittypes.QueryHealOpsByTicketResponse, error) {
+	return &audittypes.QueryHealOpsByTicketResponse{}, nil
+}
diff --git a/supernode/transport/grpc/self_healing/helpers_test.go b/supernode/transport/grpc/self_healing/helpers_test.go
new file mode 100644
index 00000000..0d1ac770
--- /dev/null
+++ b/supernode/transport/grpc/self_healing/helpers_test.go
@@ -0,0 +1,38 @@
+package self_healing
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// makeStagingDir creates a minimal heal-op staging dir matching the layout
+// produced by cascade.stageArtefacts: manifest.json + reconstructed.bin +
+// empty symbols/ subdir. Returns the absolute staging dir path.
+func makeStagingDir(t *testing.T, root string, opID uint64, hashB64 string, body []byte) string {
+	t.Helper()
+	dir := filepath.Join(root, itoa(opID))
+	if err := os.MkdirAll(filepath.Join(dir, "symbols"), 0o700); err != nil {
+		t.Fatalf("mkdir staging: %v", err)
+	}
+	if err := os.WriteFile(filepath.Join(dir, "reconstructed.bin"), body, 0o600); err != nil {
+		t.Fatalf("write reconstructed: %v", err)
+	}
+	manifest := []byte(`{"action_id":"ticket-` + itoa(opID) + `","layout":{"blocks":[]},"id_files":[],"symbol_keys":[],"symbols_dir":"` + filepath.Join(dir, "symbols") + `","reconstructed_rel":"reconstructed.bin","manifest_hash_b64":"` + hashB64 + `"}`)
+	if err := os.WriteFile(filepath.Join(dir, "manifest.json"), manifest, 0o600); err != nil {
+		t.Fatalf("write manifest: %v", err)
+	}
+	return dir
+}
+
+func itoa(u uint64) string {
+	if u == 0 {
+		return "0"
+	}
+	digits := []byte{}
+	for u > 0 {
+		digits = append([]byte{byte('0' + u%10)}, digits...)
+		u /= 10
+	}
+	return string(digits)
+}