From c7a5d04ee71450dbbf66d56359bde1f11ca10347 Mon Sep 17 00:00:00 2001 From: tester Date: Sun, 17 May 2026 02:46:05 -0700 Subject: [PATCH] feat: add 'td explain' semantic diff explainer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new 'td explain' command that runs git diff (or reads one from stdin) and prints a human-readable, file-grouped summary of what changed, classifying each hunk into semantic categories: function add/remove, signature change, comment-only, test-only, import/dependency, config, formatting-only, control-flow, struct-change, etc. Implementation lives in a new internal/semdiff package so the unified-diff parser and heuristic classifier are independently unit-testable. The classifier is pure-heuristic and dependency-free — no LLM calls — so output is deterministic and runs offline. Flags: --staged diff staged changes --stdin read diff from stdin (hermetic, no git needed) --json machine-readable output Nightshift-Task: semantic-diff Nightshift-Ref: https://github.com/marcus/nightshift --- cmd/explain.go | 92 ++++++ cmd/explain_test.go | 77 +++++ internal/semdiff/classify.go | 466 ++++++++++++++++++++++++++++++ internal/semdiff/classify_test.go | 228 +++++++++++++++ internal/semdiff/parser.go | 130 +++++++++ internal/semdiff/render.go | 88 ++++++ 6 files changed, 1081 insertions(+) create mode 100644 cmd/explain.go create mode 100644 cmd/explain_test.go create mode 100644 internal/semdiff/classify.go create mode 100644 internal/semdiff/classify_test.go create mode 100644 internal/semdiff/parser.go create mode 100644 internal/semdiff/render.go diff --git a/cmd/explain.go b/cmd/explain.go new file mode 100644 index 00000000..9876987a --- /dev/null +++ b/cmd/explain.go @@ -0,0 +1,92 @@ +package cmd + +import ( + "bytes" + "fmt" + "io" + "os" + "os/exec" + "strings" + + "github.com/marcus/td/internal/semdiff" + "github.com/spf13/cobra" +) + +var ( + explainJSON bool + explainStaged bool + explainStdin bool + + // stdinReader is overridable in tests. + stdinReader io.Reader = os.Stdin +) + +var explainCmd = &cobra.Command{ + Use: "explain [] []", + Short: "Explain the semantic meaning of code changes", + Long: `Run a git diff (or read one from stdin) and print a human-readable +summary of what changed, grouped by file with semantic categories +(functions added/removed, signature changes, comment-only edits, tests, +imports, config, formatting-only, etc.). + +Examples: + td explain # diff HEAD vs working tree + td explain --staged # diff staged changes + td explain main HEAD # diff main..HEAD + git diff | td explain --stdin # explain a piped diff + td explain --json # machine-readable output`, + Args: cobra.MaximumNArgs(2), + RunE: runExplain, +} + +func runExplain(cmd *cobra.Command, args []string) error { + if explainStdin { + return runExplainWithReader(cmd, args, stdinReader) + } + out, err := collectGitDiff(args, explainStaged) + if err != nil { + return err + } + return runExplainWithReader(cmd, args, bytes.NewReader(out)) +} + +func runExplainWithReader(cmd *cobra.Command, _ []string, r io.Reader) error { + files, err := semdiff.Parse(r) + if err != nil { + return fmt.Errorf("parse diff: %w", err) + } + summary := semdiff.Classify(files) + semdiff.SortFiles(summary.Files) + if explainJSON { + return semdiff.RenderJSON(cmd.OutOrStdout(), summary) + } + return semdiff.RenderText(cmd.OutOrStdout(), summary) +} + +func collectGitDiff(args []string, staged bool) ([]byte, error) { + cmdArgs := []string{"diff", "--no-color"} + if staged { + cmdArgs = append(cmdArgs, "--cached") + } + cmdArgs = append(cmdArgs, args...) + + gitCmd := exec.Command("git", cmdArgs...) + var out, stderr bytes.Buffer + gitCmd.Stdout = &out + gitCmd.Stderr = &stderr + if err := gitCmd.Run(); err != nil { + msg := strings.TrimSpace(stderr.String()) + if msg == "" { + msg = err.Error() + } + return nil, fmt.Errorf("git diff failed: %s", msg) + } + return out.Bytes(), nil +} + +func init() { + rootCmd.AddCommand(explainCmd) + explainCmd.Flags().BoolVar(&explainJSON, "json", false, "Output machine-readable JSON") + explainCmd.Flags().BoolVar(&explainStaged, "staged", false, "Diff staged changes (git diff --cached)") + explainCmd.Flags().BoolVar(&explainStdin, "stdin", false, "Read unified diff from stdin instead of running git") +} diff --git a/cmd/explain_test.go b/cmd/explain_test.go new file mode 100644 index 00000000..eee6fd16 --- /dev/null +++ b/cmd/explain_test.go @@ -0,0 +1,77 @@ +package cmd + +import ( + "bytes" + "strings" + "testing" +) + +const fixtureDiff = `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,5 +1,7 @@ + package foo + +-func Greet(name string) string { ++func Greet(name, salutation string) string { + return "hi " + name + } ++ ++// new comment line +` + +func TestExplainStdinText(t *testing.T) { + // Reset flags between tests + explainJSON = false + explainStaged = false + explainStdin = true + t.Cleanup(func() { + explainJSON = false + explainStaged = false + explainStdin = false + }) + + buf := &bytes.Buffer{} + cmd := explainCmd + cmd.SetOut(buf) + + // Re-route stdin + origStdin := stdinReader + stdinReader = strings.NewReader(fixtureDiff) + t.Cleanup(func() { stdinReader = origStdin }) + + if err := runExplainWithReader(cmd, nil, stdinReader); err != nil { + t.Fatalf("explain failed: %v", err) + } + out := buf.String() + if !strings.Contains(out, "foo.go") { + t.Fatalf("expected output to mention foo.go, got %q", out) + } + if !strings.Contains(out, "signature-change") { + t.Fatalf("expected signature-change category, got %q", out) + } +} + +func TestExplainStdinJSON(t *testing.T) { + explainJSON = true + explainStdin = true + t.Cleanup(func() { + explainJSON = false + explainStdin = false + }) + + buf := &bytes.Buffer{} + cmd := explainCmd + cmd.SetOut(buf) + + if err := runExplainWithReader(cmd, nil, strings.NewReader(fixtureDiff)); err != nil { + t.Fatalf("explain failed: %v", err) + } + out := buf.String() + if !strings.HasPrefix(strings.TrimSpace(out), "{") { + t.Fatalf("expected JSON output, got %q", out) + } + if !strings.Contains(out, "\"signature-change\"") { + t.Fatalf("expected signature-change in JSON, got %q", out) + } +} diff --git a/internal/semdiff/classify.go b/internal/semdiff/classify.go new file mode 100644 index 00000000..a14289fc --- /dev/null +++ b/internal/semdiff/classify.go @@ -0,0 +1,466 @@ +package semdiff + +import ( + "path/filepath" + "regexp" + "sort" + "strings" +) + +// Category describes the semantic flavor of a single change. +type Category string + +const ( + CatFileAdded Category = "file-added" + CatFileRemoved Category = "file-removed" + CatFuncAdded Category = "function-added" + CatFuncRemoved Category = "function-removed" + CatSignatureChange Category = "signature-change" + CatCommentOnly Category = "comment-only" + CatTestChange Category = "test-change" + CatDependency Category = "dependency-change" + CatImport Category = "import-change" + CatConfig Category = "config-change" + CatFormatting Category = "formatting-only" + CatControlFlow Category = "control-flow" + CatStructChange Category = "struct-change" + CatGeneric Category = "code-change" +) + +// Change is a single classified observation within a file. +type Change struct { + Category Category `json:"category"` + Detail string `json:"detail,omitempty"` +} + +// FileSummary is the classifier's output for one file. +type FileSummary struct { + Path string `json:"path"` + Language string `json:"language,omitempty"` + IsNew bool `json:"is_new,omitempty"` + IsDel bool `json:"is_deleted,omitempty"` + Changes []Change `json:"changes"` +} + +// Summary aggregates all files and provides a top-line headline. +type Summary struct { + Headline string `json:"headline"` + Files []FileSummary `json:"files"` +} + +var ( + goFuncRe = regexp.MustCompile(`^func\s+(\([^)]*\)\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*\(`) + goCommentRe = regexp.MustCompile(`^\s*(//|/\*|\*|\*/)`) + goImportLineRe = regexp.MustCompile(`^\s*(import\b|"[^"]+")`) + goStructFieldRe = regexp.MustCompile(`^\s*[A-Za-z_][A-Za-z0-9_]*\s+[A-Za-z_*\[\]]`) + goControlRe = regexp.MustCompile(`^\s*(if|for|switch|select|return|else|case)\b`) +) + +// Classify walks each file diff and produces a Summary. +func Classify(files []FileDiff) Summary { + out := Summary{} + for _, f := range files { + fs := classifyFile(f) + out.Files = append(out.Files, fs) + } + out.Headline = buildHeadline(out.Files) + return out +} + +func classifyFile(f FileDiff) FileSummary { + fs := FileSummary{ + Path: f.Path(), + Language: detectLanguage(f.Path()), + IsNew: f.IsNew, + IsDel: f.IsDel, + } + + if f.IsNew { + fs.Changes = append(fs.Changes, Change{Category: CatFileAdded, Detail: "new file"}) + } + if f.IsDel { + fs.Changes = append(fs.Changes, Change{Category: CatFileRemoved, Detail: "file deleted"}) + } + + // File-level shortcuts: dependency / config files are uninteresting at the + // hunk level — the fact that they changed is itself the signal. + if isDependencyFile(fs.Path) { + if hasContent(f) { + fs.Changes = append(fs.Changes, Change{Category: CatDependency, Detail: filepath.Base(fs.Path) + " updated"}) + } + return fs + } + if isConfigFile(fs.Path) { + if hasContent(f) { + fs.Changes = append(fs.Changes, Change{Category: CatConfig, Detail: filepath.Base(fs.Path) + " updated"}) + } + return fs + } + + isTest := isTestFile(fs.Path) + isGo := fs.Language == "go" + + for _, h := range f.Hunks { + cats := classifyHunk(h, isGo) + if isTest { + // Promote a copy of the change so test edits are easy to spot, + // while preserving the underlying category for filtering. + for _, c := range cats { + c.Detail = strings.TrimSpace("test: " + c.Detail) + fs.Changes = append(fs.Changes, Change{Category: CatTestChange, Detail: c.Detail}) + } + continue + } + fs.Changes = append(fs.Changes, cats...) + } + + fs.Changes = dedupeChanges(fs.Changes) + return fs +} + +func classifyHunk(h Hunk, isGo bool) []Change { + if len(h.Added) == 0 && len(h.Removed) == 0 { + return nil + } + + // Comment-only: every changed line (added+removed) looks like a comment. + if isGo && allComments(h.Added) && allComments(h.Removed) { + return []Change{{Category: CatCommentOnly, Detail: "comment/doc updated"}} + } + + // Formatting-only: the *content* of added and removed lines (sans + // whitespace) is identical. + if formattingOnly(h.Added, h.Removed) { + return []Change{{Category: CatFormatting, Detail: "whitespace/format only"}} + } + + var changes []Change + + if isGo { + changes = append(changes, goFunctionChanges(h)...) + changes = append(changes, goImportChanges(h)...) + changes = append(changes, goStructChanges(h)...) + changes = append(changes, goControlFlowChanges(h)...) + } + + if len(changes) == 0 { + changes = append(changes, Change{Category: CatGeneric, Detail: hunkSizeDetail(h)}) + } + return changes +} + +func goFunctionChanges(h Hunk) []Change { + added := collectFuncs(h.Added) + removed := collectFuncs(h.Removed) + + var changes []Change + // Signature change: a func name appears in both added and removed but the + // full signature line differs. + for name, sigAdd := range added { + if sigRem, ok := removed[name]; ok { + if normalize(sigAdd) != normalize(sigRem) { + changes = append(changes, Change{ + Category: CatSignatureChange, + Detail: "signature changed: " + name, + }) + } + delete(added, name) + delete(removed, name) + } + } + for name := range added { + changes = append(changes, Change{Category: CatFuncAdded, Detail: "added func " + name}) + } + for name := range removed { + changes = append(changes, Change{Category: CatFuncRemoved, Detail: "removed func " + name}) + } + return changes +} + +func goImportChanges(h Hunk) []Change { + inImportBlock := false + for _, l := range h.Context { + if strings.Contains(l, "import (") || strings.TrimSpace(l) == "import (" { + inImportBlock = true + break + } + } + if !inImportBlock && !onlyImportLines(h.Added) && !onlyImportLines(h.Removed) { + return nil + } + if hasImportLine(h.Added) || hasImportLine(h.Removed) { + return []Change{{Category: CatImport, Detail: "import block updated"}} + } + return nil +} + +func hasImportLine(lines []string) bool { + for _, l := range lines { + ts := strings.TrimSpace(l) + if ts == "" { + continue + } + if strings.HasPrefix(ts, "import ") { + return true + } + if strings.HasPrefix(ts, "\"") && strings.HasSuffix(ts, "\"") { + return true + } + } + return false +} + +func goStructChanges(h Hunk) []Change { + addedFields := 0 + removedFields := 0 + for _, l := range h.Added { + if goStructFieldRe.MatchString(l) && !strings.Contains(l, "func ") { + addedFields++ + } + } + for _, l := range h.Removed { + if goStructFieldRe.MatchString(l) && !strings.Contains(l, "func ") { + removedFields++ + } + } + if !strings.Contains(strings.Join(h.Context, "\n"), "struct {") { + return nil + } + if addedFields == 0 && removedFields == 0 { + return nil + } + return []Change{{Category: CatStructChange, Detail: "struct fields modified"}} +} + +func goControlFlowChanges(h Hunk) []Change { + for _, l := range append(append([]string{}, h.Added...), h.Removed...) { + if goControlRe.MatchString(l) { + return []Change{{Category: CatControlFlow, Detail: "control flow updated"}} + } + } + return nil +} + +func collectFuncs(lines []string) map[string]string { + out := map[string]string{} + for _, l := range lines { + m := goFuncRe.FindStringSubmatch(strings.TrimSpace(l)) + if m == nil { + continue + } + out[m[2]] = strings.TrimSpace(l) + } + return out +} + +func allComments(lines []string) bool { + if len(lines) == 0 { + return true + } + for _, l := range lines { + ts := strings.TrimSpace(l) + if ts == "" { + continue + } + if !goCommentRe.MatchString(l) { + return false + } + } + return true +} + +func formattingOnly(added, removed []string) bool { + if len(added) == 0 || len(removed) == 0 { + return false + } + addJoined := normalize(strings.Join(added, "")) + remJoined := normalize(strings.Join(removed, "")) + return addJoined == remJoined +} + +func onlyImportLines(lines []string) bool { + any := false + for _, l := range lines { + ts := strings.TrimSpace(l) + if ts == "" || ts == "(" || ts == ")" { + continue + } + if !goImportLineRe.MatchString(ts) { + return false + } + any = true + } + return any +} + +func dedupeChanges(in []Change) []Change { + if len(in) <= 1 { + return in + } + seen := map[string]bool{} + var out []Change + for _, c := range in { + k := string(c.Category) + "|" + c.Detail + if seen[k] { + continue + } + seen[k] = true + out = append(out, c) + } + return out +} + +func normalize(s string) string { + return strings.Join(strings.Fields(s), " ") +} + +func hasContent(f FileDiff) bool { + for _, h := range f.Hunks { + if len(h.Added) > 0 || len(h.Removed) > 0 { + return true + } + } + return false +} + +func hunkSizeDetail(h Hunk) string { + return joinCount(len(h.Added), len(h.Removed)) +} + +func joinCount(add, rem int) string { + b := strings.Builder{} + b.WriteString("modified (") + if add > 0 { + b.WriteString("+") + b.WriteString(itoa(add)) + } + if add > 0 && rem > 0 { + b.WriteString(" / ") + } + if rem > 0 { + b.WriteString("-") + b.WriteString(itoa(rem)) + } + b.WriteString(" lines)") + return b.String() +} + +func itoa(n int) string { + if n == 0 { + return "0" + } + neg := n < 0 + if neg { + n = -n + } + var buf [20]byte + i := len(buf) + for n > 0 { + i-- + buf[i] = byte('0' + n%10) + n /= 10 + } + if neg { + i-- + buf[i] = '-' + } + return string(buf[i:]) +} + +func detectLanguage(path string) string { + switch strings.ToLower(filepath.Ext(path)) { + case ".go": + return "go" + case ".py": + return "python" + case ".ts", ".tsx": + return "typescript" + case ".js", ".jsx": + return "javascript" + case ".rs": + return "rust" + case ".md": + return "markdown" + } + return "" +} + +func isTestFile(path string) bool { + base := filepath.Base(path) + if strings.HasSuffix(base, "_test.go") { + return true + } + if strings.Contains(base, ".test.") { + return true + } + if strings.Contains(base, "spec.") { + return true + } + return false +} + +func isDependencyFile(path string) bool { + base := filepath.Base(path) + switch base { + case "go.mod", "go.sum", "package.json", "package-lock.json", "yarn.lock", + "Cargo.toml", "Cargo.lock", "requirements.txt", "Pipfile", "Pipfile.lock", + "pyproject.toml": + return true + } + return false +} + +func isConfigFile(path string) bool { + base := filepath.Base(path) + switch base { + case ".golangci.yml", ".golangci.yaml", ".editorconfig", ".gitignore", + ".dockerignore", "Dockerfile", "docker-compose.yml", "docker-compose.yaml", + "Makefile": + return true + } + switch strings.ToLower(filepath.Ext(base)) { + case ".yml", ".yaml", ".toml", ".ini": + return true + } + return false +} + +func buildHeadline(files []FileSummary) string { + if len(files) == 0 { + return "No changes detected." + } + var added, removed, modified int + for _, f := range files { + switch { + case f.IsNew: + added++ + case f.IsDel: + removed++ + default: + modified++ + } + } + parts := []string{} + if added > 0 { + parts = append(parts, plural(added, "new file", "new files")) + } + if modified > 0 { + parts = append(parts, plural(modified, "modified file", "modified files")) + } + if removed > 0 { + parts = append(parts, plural(removed, "removed file", "removed files")) + } + return strings.Join(parts, ", ") +} + +func plural(n int, one, many string) string { + if n == 1 { + return "1 " + one + } + return itoa(n) + " " + many +} + +// SortFiles sorts FileSummary slices alphabetically for stable output. +func SortFiles(files []FileSummary) { + sort.SliceStable(files, func(i, j int) bool { return files[i].Path < files[j].Path }) +} diff --git a/internal/semdiff/classify_test.go b/internal/semdiff/classify_test.go new file mode 100644 index 00000000..8b77cf24 --- /dev/null +++ b/internal/semdiff/classify_test.go @@ -0,0 +1,228 @@ +package semdiff + +import ( + "strings" + "testing" +) + +func parseAndClassify(t *testing.T, diff string) Summary { + t.Helper() + files, err := Parse(strings.NewReader(diff)) + if err != nil { + t.Fatalf("parse: %v", err) + } + return Classify(files) +} + +func hasCategory(s Summary, path string, cat Category) bool { + for _, f := range s.Files { + if f.Path != path { + continue + } + for _, c := range f.Changes { + if c.Category == cat { + return true + } + } + } + return false +} + +func TestCommentOnlyChange(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,3 +1,3 @@ + package foo +-// old comment ++// new comment +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatCommentOnly) { + t.Fatalf("expected comment-only, got %+v", s) + } +} + +func TestFunctionAdded(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,3 +1,6 @@ + package foo ++ ++func Greet(name string) string { ++ return "hi " + name ++} +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatFuncAdded) { + t.Fatalf("expected function-added, got %+v", s) + } +} + +func TestSignatureChange(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,4 +1,4 @@ + package foo +-func Greet(name string) string { ++func Greet(name, salutation string) string { + return "hi " + name + } +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatSignatureChange) { + t.Fatalf("expected signature-change, got %+v", s) + } +} + +func TestImportOnlyChange(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,5 +1,6 @@ + package foo + + import ( ++ "fmt" + "strings" + ) +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatImport) { + t.Fatalf("expected import-change, got %+v", s) + } +} + +func TestTestOnlyChange(t *testing.T) { + diff := `diff --git a/foo_test.go b/foo_test.go +--- a/foo_test.go ++++ b/foo_test.go +@@ -1,4 +1,5 @@ + package foo + + func TestThing(t *testing.T) { ++ t.Log("extra") + } +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo_test.go", CatTestChange) { + t.Fatalf("expected test-change, got %+v", s) + } +} + +func TestDependencyChange(t *testing.T) { + diff := `diff --git a/go.mod b/go.mod +--- a/go.mod ++++ b/go.mod +@@ -3,2 +3,3 @@ + require ( ++ example.com/new v1.0.0 + ) +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "go.mod", CatDependency) { + t.Fatalf("expected dependency, got %+v", s) + } +} + +func TestNewFile(t *testing.T) { + diff := `diff --git a/new.go b/new.go +new file mode 100644 +--- /dev/null ++++ b/new.go +@@ -0,0 +1,2 @@ ++package foo ++ +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "new.go", CatFileAdded) { + t.Fatalf("expected file-added, got %+v", s) + } +} + +func TestDeletedFile(t *testing.T) { + diff := `diff --git a/old.go b/old.go +deleted file mode 100644 +--- a/old.go ++++ /dev/null +@@ -1,2 +0,0 @@ +-package foo +- +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "old.go", CatFileRemoved) { + t.Fatalf("expected file-removed, got %+v", s) + } +} + +func TestMixedChange(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,10 +1,12 @@ + package foo + +-func Old() {} ++func New() { ++ if true { ++ return ++ } ++} +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatFuncAdded) { + t.Fatalf("expected function-added in mixed diff, got %+v", s) + } + if !hasCategory(s, "foo.go", CatFuncRemoved) { + t.Fatalf("expected function-removed in mixed diff, got %+v", s) + } +} + +func TestFormattingOnly(t *testing.T) { + diff := `diff --git a/foo.go b/foo.go +--- a/foo.go ++++ b/foo.go +@@ -1,2 +1,2 @@ +-x := 1 ++x := 1 +` + s := parseAndClassify(t, diff) + if !hasCategory(s, "foo.go", CatFormatting) { + t.Fatalf("expected formatting-only, got %+v", s) + } +} + +func TestHeadlineCounts(t *testing.T) { + diff := `diff --git a/a.go b/a.go +new file mode 100644 +--- /dev/null ++++ b/a.go +@@ -0,0 +1,1 @@ ++package a +diff --git a/b.go b/b.go +--- a/b.go ++++ b/b.go +@@ -1,1 +1,2 @@ + package b ++// added +` + s := parseAndClassify(t, diff) + if s.Headline == "" || s.Headline == "No changes detected." { + t.Fatalf("expected non-empty headline, got %q", s.Headline) + } + if !strings.Contains(s.Headline, "new file") { + t.Errorf("expected new-file count, got %q", s.Headline) + } + if !strings.Contains(s.Headline, "modified file") { + t.Errorf("expected modified-file count, got %q", s.Headline) + } +} + +func TestEmptyDiff(t *testing.T) { + s := parseAndClassify(t, "") + if s.Headline != "No changes detected." { + t.Fatalf("expected no-changes headline, got %q", s.Headline) + } +} diff --git a/internal/semdiff/parser.go b/internal/semdiff/parser.go new file mode 100644 index 00000000..c6e66bc1 --- /dev/null +++ b/internal/semdiff/parser.go @@ -0,0 +1,130 @@ +// Package semdiff parses unified-diff output and classifies hunks by their +// semantic meaning. It is intentionally heuristic and dependency-free so the +// classifier can run offline and remain deterministic for tests. +package semdiff + +import ( + "bufio" + "io" + "strings" +) + +// FileDiff represents a single file's changes in a unified diff. +type FileDiff struct { + OldPath string + NewPath string + IsNew bool + IsDel bool + Hunks []Hunk +} + +// Path returns the most-relevant path for the file diff. +func (f FileDiff) Path() string { + if f.NewPath != "" && f.NewPath != "/dev/null" { + return f.NewPath + } + return f.OldPath +} + +// Hunk is a single @@ ... @@ block. +type Hunk struct { + Header string + Added []string + Removed []string + Context []string + RawLines []string +} + +// Parse reads a unified diff (as produced by `git diff` with default options) +// from r and returns one FileDiff per file. +func Parse(r io.Reader) ([]FileDiff, error) { + scanner := bufio.NewScanner(r) + // Allow large lines (some generated files have very long lines). + scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) + + var files []FileDiff + var cur *FileDiff + var curHunk *Hunk + + flushHunk := func() { + if cur != nil && curHunk != nil { + cur.Hunks = append(cur.Hunks, *curHunk) + curHunk = nil + } + } + flushFile := func() { + flushHunk() + if cur != nil { + files = append(files, *cur) + cur = nil + } + } + + for scanner.Scan() { + line := scanner.Text() + switch { + case strings.HasPrefix(line, "diff --git "): + flushFile() + old, new := parseDiffHeader(line) + cur = &FileDiff{OldPath: old, NewPath: new} + case strings.HasPrefix(line, "new file mode"): + if cur != nil { + cur.IsNew = true + } + case strings.HasPrefix(line, "deleted file mode"): + if cur != nil { + cur.IsDel = true + } + case strings.HasPrefix(line, "--- "): + if cur != nil { + cur.OldPath = stripPathPrefix(strings.TrimPrefix(line, "--- ")) + } + case strings.HasPrefix(line, "+++ "): + if cur != nil { + cur.NewPath = stripPathPrefix(strings.TrimPrefix(line, "+++ ")) + } + case strings.HasPrefix(line, "@@"): + flushHunk() + curHunk = &Hunk{Header: line} + default: + if curHunk == nil || cur == nil { + continue + } + curHunk.RawLines = append(curHunk.RawLines, line) + if strings.HasPrefix(line, "+") && !strings.HasPrefix(line, "+++") { + curHunk.Added = append(curHunk.Added, line[1:]) + } else if strings.HasPrefix(line, "-") && !strings.HasPrefix(line, "---") { + curHunk.Removed = append(curHunk.Removed, line[1:]) + } else if strings.HasPrefix(line, " ") { + curHunk.Context = append(curHunk.Context, line[1:]) + } + } + } + flushFile() + if err := scanner.Err(); err != nil { + return files, err + } + return files, nil +} + +func parseDiffHeader(line string) (string, string) { + // Format: diff --git a/path b/path + parts := strings.SplitN(line, " ", 4) + if len(parts) < 4 { + return "", "" + } + return stripPathPrefix(parts[2]), stripPathPrefix(parts[3]) +} + +func stripPathPrefix(p string) string { + p = strings.TrimSpace(p) + switch { + case strings.HasPrefix(p, "a/"): + return p[2:] + case strings.HasPrefix(p, "b/"): + return p[2:] + case p == "/dev/null": + return p + } + return p +} diff --git a/internal/semdiff/render.go b/internal/semdiff/render.go new file mode 100644 index 00000000..f234c618 --- /dev/null +++ b/internal/semdiff/render.go @@ -0,0 +1,88 @@ +package semdiff + +import ( + "encoding/json" + "fmt" + "io" + "strings" +) + +// RenderText writes a human-readable, file-grouped summary to w. +func RenderText(w io.Writer, s Summary) error { + if s.Headline == "" { + s.Headline = "No changes detected." + } + if _, err := fmt.Fprintln(w, s.Headline); err != nil { + return err + } + if len(s.Files) == 0 { + return nil + } + if _, err := fmt.Fprintln(w); err != nil { + return err + } + for _, f := range s.Files { + marker := "M" + switch { + case f.IsNew: + marker = "A" + case f.IsDel: + marker = "D" + } + header := fmt.Sprintf("[%s] %s", marker, f.Path) + if f.Language != "" { + header += " (" + f.Language + ")" + } + if _, err := fmt.Fprintln(w, header); err != nil { + return err + } + if len(f.Changes) == 0 { + if _, err := fmt.Fprintln(w, " - no semantic changes detected"); err != nil { + return err + } + continue + } + seen := map[string]bool{} + for _, c := range f.Changes { + key := string(c.Category) + "|" + c.Detail + if seen[key] { + continue + } + seen[key] = true + line := fmt.Sprintf(" - %s", c.Category) + if c.Detail != "" { + line += ": " + c.Detail + } + if _, err := fmt.Fprintln(w, line); err != nil { + return err + } + } + } + return nil +} + +// RenderJSON writes a machine-readable summary. +func RenderJSON(w io.Writer, s Summary) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(s) +} + +// Diagnostics produces a one-line headline plus per-category counts. Useful for +// callers that want to embed a summary somewhere small. +func (s Summary) Diagnostics() string { + counts := map[Category]int{} + for _, f := range s.Files { + for _, c := range f.Changes { + counts[c.Category]++ + } + } + if len(counts) == 0 { + return s.Headline + } + parts := []string{s.Headline} + for cat, n := range counts { + parts = append(parts, fmt.Sprintf("%s=%d", cat, n)) + } + return strings.Join(parts, "; ") +}