From afc6127f7238f3604318b6785a727b5248d2d156 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:09:54 +0100
Subject: [PATCH 1/8] opt: cancel adjacent opposing simple commands

Adds optimise_program() called from both bfc and bfi after parsing.
The first pass (cancel_opposing) merges adjacent INC/DEC and RIGHT/LEFT
pairs, subtracting their counts and removing pairs that fully cancel.
Bracket jump indices are remapped after compaction.
---
 PLAN.md        | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 src/ir.c       | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ir.h       |  4 +++
 src/main_bfc.c |  1 +
 src/main_bfi.c |  1 +
 5 files changed, 158 insertions(+)
 create mode 100644 PLAN.md

diff --git a/PLAN.md b/PLAN.md
new file mode 100644
index 0000000..5ab17b7
--- /dev/null
+++ b/PLAN.md
@@ -0,0 +1,73 @@
+# Optimisation Plan
+
+Five optimisations for the produced LLVM IR, implemented as separate commits.
+
+## Status
+
+- [ ] 1. Cancel opposing simple commands
+- [ ] 2. `CMD_CLEAR` — zero-loop `[-]` → `store i8 0`
+- [ ] 3. `CMD_MULTIPLY` — multiply-loop `[->N*+<]` → multiply-add
+- [ ] 4. `dp` as `alloca` (enables `mem2reg`)
+- [ ] 5. LLVM pass pipeline gated on `-O`
+
+---
+
+## 1. Cancel opposing simple commands
+
+**Where**: `ir.c` — new `optimise_program()` called from both `main_bfc.c` and `main_bfi.c`.
+
+Peephole pass over the `cmds` array: adjacent INC/DEC or RIGHT/LEFT pairs subtract counts and, if they fully cancel, are removed. Re-computes bracket jump indices after compaction.
+
+**Example**: `+++--` → `CMD_SIMPLE_INC(1)` instead of `CMD_SIMPLE_INC(3), CMD_SIMPLE_DEC(2)`.
+
+Test changes: none (no existing test program uses cancellable patterns).
+
+---
+
+## 2. CMD_CLEAR — zero-loop detection
+
+**Where**: `ir.c` `optimise_program()`, `llvm.c`, `interp.c`.
+
+New `CMD_CLEAR` IR node. Pattern detected after cancellation pass: `[` + single INC or DEC body + `]`. Replaced with a single `store i8 0` in codegen, `ctx->data[ctx->dp] = 0` in interpreter.
+
+**Example**: `+++++[-]` → `CMD_SIMPLE_INC(5), CMD_CLEAR`.
+
+Test changes: update `test/test_simple_loop.filecheck` (program is `+++++[-]`).
+
+---
+
+## 3. CMD_MULTIPLY — multiply-loop detection
+
+**Where**: `ir.c` `optimise_program()`, `llvm.c`, `interp.c`.
+
+New `CMD_MULTIPLY` IR node with up to `MULTIPLY_MOVES_MAX` (8) offset/factor pairs. A loop body matches when: only `+`/`-`/`>`/`<` inside, net pointer movement is zero, loop counter cell has net delta −1. Each non-counter cell touched becomes a `{offset, factor}` move.
+
+Codegen: load counter, for each move `data[dp+offset] += counter * factor`, then `store i8 0` to counter cell. Interpreter: same arithmetic.
+
+**Example**: `[<+>-]` at dp=1 → `CMD_MULTIPLY {moves=[{offset=-1, factor=1}]}`.
+
+Test changes: add `test/res/multiply.b` and `test/test_multiply.filecheck`.
+
+---
+
+## 4. `dp` as `alloca`
+
+**Where**: `llvm.c` — `create_main_function()` creates `dp` as an alloca instead of a global.
+
+`dp` is removed from the global section and created with `LLVMBuildAlloca` in the entry block, immediately initialised to 0. The `LLVMValueRef ctx->dp` is still a pointer (alloca ptr vs global ptr) so all downstream load/store calls are unchanged.
+
+Without LLVM passes the IR still has explicit load/store; the benefit is unlocked in commit 5 when `mem2reg` promotes the alloca to a register.
+
+Test changes: update all FileCheck tests — remove `@dp = global i32 0` check, change `ptr @dp` references to `ptr %dp`.
+
+---
+
+## 5. LLVM pass pipeline gated on `-O`
+
+**Where**: `llvm.c`, `llvm.h`, `cmake/llvm.cmake`, `main_bfc.c`.
+
+Wires the already-parsed `--optimise`/`-O` flag from `main_bfc.c` through `generate(struct program *, bool optimise)`. When `optimise` is true, runs `"mem2reg,instcombine,simplifycfg,gvn"` via `LLVMRunPasses` (LLVM new pass manager, LLVM ≥ 14). Adds `passes` to `llvm_map_components_to_libnames` in cmake.
+
+`mem2reg` promotes the `dp` alloca to a register; `gvn` eliminates redundant loads of `@data` elements; `instcombine` and `simplifycfg` clean up the resulting IR.
+
+Test changes: none (FileCheck tests do not pass `-O`).
diff --git a/src/ir.c b/src/ir.c
index e50f828..49ca68f 100644
--- a/src/ir.c
+++ b/src/ir.c
@@ -222,6 +222,85 @@ char program_contains_output(struct program *program) {
         return 0;
 }
 
+static int are_opposing(enum cmd_type a, enum cmd_type b) {
+        return (a == CMD_SIMPLE_INC && b == CMD_SIMPLE_DEC) ||
+               (a == CMD_SIMPLE_DEC && b == CMD_SIMPLE_INC) ||
+               (a == CMD_SIMPLE_RIGHT && b == CMD_SIMPLE_LEFT) ||
+               (a == CMD_SIMPLE_LEFT && b == CMD_SIMPLE_RIGHT);
+}
+
+static enum cmd_type opposite_type(enum cmd_type t) {
+        switch (t) {
+        case CMD_SIMPLE_INC:
+                return CMD_SIMPLE_DEC;
+        case CMD_SIMPLE_DEC:
+                return CMD_SIMPLE_INC;
+        case CMD_SIMPLE_RIGHT:
+                return CMD_SIMPLE_LEFT;
+        case CMD_SIMPLE_LEFT:
+                return CMD_SIMPLE_RIGHT;
+        default:
+                return t;
+        }
+}
+
+static void cancel_opposing(struct program *program) {
+        struct cmd *new_cmds = malloc(program->length * sizeof(struct cmd));
+        size_t *old_to_new = malloc(program->length * sizeof(size_t));
+        size_t *new_to_old = malloc(program->length * sizeof(size_t));
+        if (!new_cmds || !old_to_new || !new_to_old) {
+                fprintf(stderr, "Memory allocation failed\n");
+                exit(1);
+        }
+        size_t new_len = 0;
+
+        for (size_t old = 0; old < program->length; old++) {
+                struct cmd curr = program->cmds[old];
+                if (new_len > 0) {
+                        struct cmd *prev = &new_cmds[new_len - 1];
+                        if (are_opposing(prev->type, curr.type)) {
+                                size_t pc = prev->value.simple_count;
+                                size_t cc = curr.value.simple_count;
+                                if (cc > pc) {
+                                        prev->type = opposite_type(prev->type);
+                                        prev->value.simple_count = cc - pc;
+                                } else if (cc < pc) {
+                                        prev->value.simple_count = pc - cc;
+                                } else {
+                                        old_to_new[new_to_old[new_len - 1]] =
+                                            SIZE_MAX;
+                                        new_len--;
+                                }
+                                old_to_new[old] = SIZE_MAX;
+                                continue;
+                        }
+                }
+                old_to_new[old] = new_len;
+                new_to_old[new_len] = old;
+                new_cmds[new_len++] = curr;
+        }
+
+        for (size_t i = 0; i < new_len; i++) {
+                if (new_cmds[i].type == CMD_JUMP_FORWARD ||
+                    new_cmds[i].type == CMD_JUMP_BACK) {
+                        size_t old_target =
+                            program->cmds[new_to_old[i]].value.jump_index;
+                        assert(old_to_new[old_target] != SIZE_MAX);
+                        new_cmds[i].value.jump_index = old_to_new[old_target];
+                }
+        }
+
+        free(program->cmds);
+        free(old_to_new);
+        free(new_to_old);
+        program->cmds = new_cmds;
+        program->length = new_len;
+}
+
+void optimise_program(struct program *program) {
+        cancel_opposing(program);
+}
+
 char program_contains_input(struct program *program) {
         for (size_t cmd_index = 0; cmd_index < program->length; cmd_index++) {
                 if (program->cmds[cmd_index].type == CMD_SIMPLE_INPUT) {
diff --git a/src/ir.h b/src/ir.h
index cfc502a..24f31e3 100644
--- a/src/ir.h
+++ b/src/ir.h
@@ -83,4 +83,8 @@ char program_contains_input(struct program *program);
 /// @return 1 if valid; otherwise 0.
 char program_is_valid(char *source_str);
 
+/// Apply IR-level optimisations to a parsed program in-place.
+/// @param program Program to optimise.
+void optimise_program(struct program *program);
+
 #endif
diff --git a/src/main_bfc.c b/src/main_bfc.c
index d382b26..f88d3de 100644
--- a/src/main_bfc.c
+++ b/src/main_bfc.c
@@ -72,6 +72,7 @@ int main(int argc, char **argv) {
         }
         struct program parsed_program = string_to_program(program_str);
         free(program_str);
+        optimise_program(&parsed_program);
         LLVMModuleRef module = generate(&parsed_program);
         char *err = NULL;
         LLVMPrintModuleToFile(module, "/dev/stdout", &err);
diff --git a/src/main_bfi.c b/src/main_bfi.c
index 640b0a7..99a13f8 100644
--- a/src/main_bfi.c
+++ b/src/main_bfi.c
@@ -67,6 +67,7 @@ int main(int argc, char **argv) {
         }
         struct program parsed_program = string_to_program(program_str);
         free(program_str);
+        optimise_program(&parsed_program);
         struct context_t ctx = init_context(parsed_program);
         while (!interp(&ctx, STDOUT_FILENO, STDIN_FILENO, byte_output)) {
         };

From 97b34fa4e3b7630f7e70f4658874251a41f4d290 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:12:00 +0100
Subject: [PATCH 2/8] opt: replace [-]/[+] loops with CMD_CLEAR (store i8 0)

Adds detect_clear_loops() pass: a loop whose body is a single INC or
DEC (any count) is replaced with the synthetic CMD_CLEAR node, emitting
a single store i8 0 in LLVM IR and a direct zero-assignment in the
interpreter. Updates test_simple_loop.filecheck to match.
---
 PLAN.md                         | 26 ++++++++++++++-
 src/interp.c                    |  5 +++
 src/ir.c                        | 57 +++++++++++++++++++++++++++++++++
 src/ir.h                        |  2 ++
 src/llvm.c                      |  9 ++++++
 test/test_simple_loop.filecheck | 29 ++++-------------
 6 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/PLAN.md b/PLAN.md
index 5ab17b7..ed825bd 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -4,11 +4,12 @@ Five optimisations for the produced LLVM IR, implemented as separate commits.
 
 ## Status
 
-- [ ] 1. Cancel opposing simple commands
+- [x] 1. Cancel opposing simple commands
 - [ ] 2. `CMD_CLEAR` — zero-loop `[-]` → `store i8 0`
 - [ ] 3. `CMD_MULTIPLY` — multiply-loop `[->N*+<]` → multiply-add
 - [ ] 4. `dp` as `alloca` (enables `mem2reg`)
 - [ ] 5. LLVM pass pipeline gated on `-O`
+- [ ] 6. Per-optimisation TOML config file
 
 ---
 
@@ -71,3 +72,26 @@ Wires the already-parsed `--optimise`/`-O` flag from `main_bfc.c` through `gener
 `mem2reg` promotes the `dp` alloca to a register; `gvn` eliminates redundant loads of `@data` elements; `instcombine` and `simplifycfg` clean up the resulting IR.
 
 Test changes: none (FileCheck tests do not pass `-O`).
+
+---
+
+## 6. Per-optimisation TOML config file
+
+**Where**: new `src/config.h` / `src/config.c`, updated `main_bfc.c`, updated `optimise_program()` and `generate()` signatures.
+
+A flat TOML file (default `bf.toml` in the current directory, overridable with `-c`/`--config`) controls each optimisation independently:
+
+```toml
+[optimisations]
+cancel_opposing = true
+clear_loop      = true
+multiply_loop   = true
+dp_alloca       = true
+llvm_passes     = false
+```
+
+`struct opt_config` holds a boolean for each flag; a minimal built-in parser handles `[section]` headers and `key = true/false` lines. Missing file → all optimisations enabled by default. The `-O` flag becomes a shorthand for enabling all flags.
+
+`optimise_program(struct program *, const struct opt_config *)` and `generate(struct program *, const struct opt_config *)` are updated to gate each pass on its flag.
+
+Test changes: add `test/res/bf.toml` with specific flags for FileCheck regression tests if needed.
diff --git a/src/interp.c b/src/interp.c
index 0dc58ef..0c61855 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -32,6 +32,8 @@ size_t abstract_to_concrete_pc(size_t abstract_pc, struct program *program) {
                 case CMD_JUMP_BACK:
                         concrete_pc++;
                         break;
+                case CMD_CLEAR:
+                        break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
                                 program->cmds[cmd_index].type);
@@ -171,6 +173,9 @@ int interp(struct context_t *ctx, int out_fd, int in_fd, bool byte_output) {
                         ctx->pc = current_cmd.value.jump_index;
                 }
                 break;
+        case CMD_CLEAR:
+                ctx->data[ctx->dp] = 0;
+                break;
         default:
                 fprintf(stderr, "Invalid character '%c'\n",
                         cmd_type_to_char(current_cmd.type));
diff --git a/src/ir.c b/src/ir.c
index 49ca68f..71b3e21 100644
--- a/src/ir.c
+++ b/src/ir.c
@@ -56,6 +56,8 @@ size_t program_str_length(struct program *program) {
                 case CMD_JUMP_BACK:
                         length++;
                         break;
+                case CMD_CLEAR:
+                        break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
                                 program->cmds[cmd_index].type);
@@ -203,6 +205,8 @@ char *program_to_string(struct program *program) {
                         out[str_index++] =
                             cmd_type_to_char(program->cmds[cmd_index].type);
                         break;
+                case CMD_CLEAR:
+                        break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
                                 program->cmds[cmd_index].type);
@@ -297,8 +301,78 @@ static void cancel_opposing(struct program *program) {
         program->length = new_len;
 }
 
+/// Replace clear loops with the synthetic CMD_CLEAR command.
+///
+/// A clear loop is `[`, a single INC or DEC run, then `]`. Only odd run
+/// lengths are rewritten. Cells are 8 bits wide and wrap, so an odd step
+/// reaches zero from every starting value, while an even step (e.g. `[--]`)
+/// never reaches zero from an odd value; that loop does not terminate and
+/// must not be turned into a store of zero.
+///
+/// Bracket jump indices of the surviving commands are remapped afterwards.
+/// @param program Program to rewrite; `cmds` is freed and replaced by a newly
+/// allocated array, and `length` is updated.
+static void detect_clear_loops(struct program *program) {
+        struct cmd *new_cmds = malloc(program->length * sizeof(struct cmd));
+        // old_to_new[i]: new index of old command i, SIZE_MAX if dropped.
+        size_t *old_to_new = malloc(program->length * sizeof(size_t));
+        // new_to_old[j]: old index that new command j was built from.
+        size_t *new_to_old = malloc(program->length * sizeof(size_t));
+        if (!new_cmds || !old_to_new || !new_to_old) {
+                fprintf(stderr, "Memory allocation failed\n");
+                exit(1);
+        }
+        size_t new_len = 0;
+
+        for (size_t old = 0; old < program->length;) {
+                struct cmd c = program->cmds[old];
+                if (c.type == CMD_JUMP_FORWARD && old + 2 < program->length) {
+                        struct cmd body = program->cmds[old + 1];
+                        struct cmd close = program->cmds[old + 2];
+                        if ((body.type == CMD_SIMPLE_INC ||
+                             body.type == CMD_SIMPLE_DEC) &&
+                            body.value.simple_count % 2 == 1 &&
+                            close.type == CMD_JUMP_BACK &&
+                            c.value.jump_index == old + 2) {
+                                // The three commands collapse onto one
+                                // CMD_CLEAR in the opening bracket's slot.
+                                old_to_new[old] = new_len;
+                                old_to_new[old + 1] = SIZE_MAX;
+                                old_to_new[old + 2] = SIZE_MAX;
+                                new_to_old[new_len] = old;
+                                new_cmds[new_len++] =
+                                    (struct cmd){.type = CMD_CLEAR};
+                                old += 3;
+                                continue;
+                        }
+                }
+                old_to_new[old] = new_len;
+                new_to_old[new_len] = old;
+                new_cmds[new_len++] = c;
+                old++;
+        }
+
+        // Copied brackets still hold pre-rewrite indices; translate them.
+        for (size_t i = 0; i < new_len; i++) {
+                if (new_cmds[i].type == CMD_JUMP_FORWARD ||
+                    new_cmds[i].type == CMD_JUMP_BACK) {
+                        size_t old_target =
+                            program->cmds[new_to_old[i]].value.jump_index;
+                        assert(old_to_new[old_target] != SIZE_MAX);
+                        new_cmds[i].value.jump_index = old_to_new[old_target];
+                }
+        }
+
+        free(program->cmds);
+        free(old_to_new);
+        free(new_to_old);
+        program->cmds = new_cmds;
+        program->length = new_len;
+}
+
 void optimise_program(struct program *program) {
         cancel_opposing(program);
+        detect_clear_loops(program);
 }
 
 char program_contains_input(struct program *program) {
diff --git a/src/ir.h b/src/ir.h
index 24f31e3..f21b25e 100644
--- a/src/ir.h
+++ b/src/ir.h
@@ -21,6 +21,8 @@ enum cmd_type {
         CMD_JUMP_FORWARD,
         /// `']'`: jump back if current cell is non-zero.
         CMD_JUMP_BACK,
+        /// Synthetic: set current cell to zero (replaces `[-]`/`[+]`).
+        CMD_CLEAR,
 };
 
 /// One compressed instruction in the internal Brainfuck IR.
diff --git a/src/llvm.c b/src/llvm.c
index d8e3f84..7b7f5df 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -184,6 +184,12 @@ void comma(struct llvm_context *ctx) {
         LLVMBuildStore(ctx->builder, char_value, data_ptr);
 }
 
+void clear(struct llvm_context *ctx) {
+        LLVMValueRef data_ptr = get_dataptr(ctx);
+        LLVMBuildStore(ctx->builder, LLVMConstInt(int8_type(ctx), 0, 0),
+                       data_ptr);
+}
+
 void left_bracket(struct llvm_context *ctx) {
         LLVMValueRef data_ptr = get_dataptr(ctx);
         LLVMValueRef current_value =
@@ -250,6 +256,9 @@ LLVMModuleRef generate(struct program *program) {
                 case CMD_JUMP_BACK:
                         right_bracket(&ctx);
                         break;
+                case CMD_CLEAR:
+                        clear(&ctx);
+                        break;
                 default:
                         fprintf(stderr, "Unsupported cmd_type '%c'\n",
                                 command.type);
diff --git a/test/test_simple_loop.filecheck b/test/test_simple_loop.filecheck
index cd01342..ed63c5b 100644
--- a/test/test_simple_loop.filecheck
+++ b/test/test_simple_loop.filecheck
@@ -1,7 +1,7 @@
 ; RUN: %bf %s.b --emit-llvm | FileCheck %s
 
-; Test brainfuck program with a simple loop
-; This should generate IR for adding 5 to a cell, then looping to decrement until zero
+; Test brainfuck program with a simple loop: +++++[-]
+; The [-] pattern is optimised to a single store of zero (CMD_CLEAR).
 
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
@@ -12,35 +12,18 @@
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
 
-; Initial addition of 5 to current cell
+; Initial addition of 5 to current cell (+++++).
 ; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP1]]
 ; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
 ; CHECK: %[[ADD:.*]] = add i8 %[[VAL1]], 5
 ; CHECK: store i8 %[[ADD]], ptr %[[PTR1]], align 1
 
-; Loop condition check - load current value and compare to zero
+; CMD_CLEAR: [-] collapses to a single store of zero.
 ; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
 ; CHECK: %[[PTR2:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
-; CHECK: %[[VAL2:.*]] = load i8, ptr %[[PTR2]], align 1
-; CHECK: %[[LOOPCOND:.*]] = icmp ne i8 %[[VAL2]], 0
-; CHECK: br i1 %[[LOOPCOND]], label %{{.*}}, label %exit
+; CHECK: store i8 0, ptr %[[PTR2]], align 1
 
-; Loop body - decrement current cell
-; CHECK: %[[DP3:.*]] = load i32, ptr @dp, align 4
-; CHECK: %[[PTR3:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP3]]
-; CHECK: %[[VAL3:.*]] = load i8, ptr %[[PTR3]], align 1
-; CHECK: %[[SUB:.*]] = sub i8 %[[VAL3]], 1
-; CHECK: store i8 %[[SUB]], ptr %[[PTR3]], align 1
-
-; Loop condition check again (end of loop body)
-; CHECK: %[[DP4:.*]] = load i32, ptr @dp, align 4
-; CHECK: %[[PTR4:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP4]]
-; CHECK: %[[VAL4:.*]] = load i8, ptr %[[PTR4]], align 1
-; CHECK: %[[LOOPCOND2:.*]] = icmp ne i8 %[[VAL4]], 0
-; CHECK: br i1 %[[LOOPCOND2]], label %{{.*}}, label %exit
-
-; Exit block
-; CHECK: exit:
+; Return statement
 ; CHECK: ret i32 0
 ; CHECK: }

From d3519e8fdaae97709bd710dd955b11734d57ac0e Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:14:02 +0100
Subject: [PATCH 3/8] opt: detect multiply-loops and replace with CMD_MULTIPLY
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds detect_multiply_loops() pass: a loop whose body contains only
+/-/</>  with net pointer delta 0 and loop-counter delta -1 is
replaced with CMD_MULTIPLY. Each non-counter cell touched becomes an
{offset, factor} pair. Codegen emits counter load, multiply-adds, then
store i8 0. Supports up to MULTIPLY_MOVES_MAX (8) target cells, offsets
within ±64. Adds test/res/multiply.b and test/test_multiply.filecheck.
---
 src/interp.c                 |  12 ++++
 src/ir.c                     | 118 +++++++++++++++++++++++++++++++++++
 src/ir.h                     |  18 ++++++
 src/llvm.c                   |  35 +++++++++++
 test/res/multiply.b          |   1 +
 test/test_multiply.filecheck |  43 +++++++++++++
 6 files changed, 227 insertions(+)
 create mode 100644 test/res/multiply.b
 create mode 100644 test/test_multiply.filecheck

diff --git a/src/interp.c b/src/interp.c
index 0c61855..d077755 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -33,6 +33,7 @@ size_t abstract_to_concrete_pc(size_t abstract_pc, struct program *program) {
                         concrete_pc++;
                         break;
                 case CMD_CLEAR:
+                case CMD_MULTIPLY:
                         break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
@@ -176,6 +177,27 @@ int interp(struct context_t *ctx, int out_fd, int in_fd, bool byte_output) {
         case CMD_CLEAR:
                 ctx->data[ctx->dp] = 0;
                 break;
+        case CMD_MULTIPLY:
+                // The loop this replaces never runs when its counter cell is
+                // already zero, so it never touches its target cells. Skip
+                // the moves in that case; otherwise data[dp + offset] would
+                // be read and written (possibly outside the tape, e.g.
+                // `[<+>-]` at dp == 0) for a loop that would not execute.
+                // NOTE(review): target is not range-checked when the counter
+                // is non-zero; confirm how '<'/'>' treat the tape edges.
+                if (ctx->data[ctx->dp] != 0) {
+                        for (size_t i = 0;
+                             i < current_cmd.value.multiply.n_moves; i++) {
+                                struct multiply_move move =
+                                    current_cmd.value.multiply.moves[i];
+                                int target = (int)ctx->dp + move.offset;
+                                // Only the low byte of factor is used.
+                                ctx->data[target] += ctx->data[ctx->dp] *
+                                                     (uint8_t)move.factor;
+                        }
+                }
+                ctx->data[ctx->dp] = 0;
+                break;
         default:
                 fprintf(stderr, "Invalid character '%c'\n",
                         cmd_type_to_char(current_cmd.type));
diff --git a/src/ir.c b/src/ir.c
index 71b3e21..7ca216d 100644
--- a/src/ir.c
+++ b/src/ir.c
@@ -57,6 +57,7 @@ size_t program_str_length(struct program *program) {
                         length++;
                         break;
                 case CMD_CLEAR:
+                case CMD_MULTIPLY:
                         break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
@@ -206,6 +207,7 @@ char *program_to_string(struct program *program) {
                             cmd_type_to_char(program->cmds[cmd_index].type);
                         break;
                 case CMD_CLEAR:
+                case CMD_MULTIPLY:
                         break;
                 default:
                         fprintf(stderr, "Unrecognised cmd_type '%c'\n",
@@ -353,9 +355,125 @@ static void detect_clear_loops(struct program *program) {
         program->length = new_len;
 }
 
+#define DELTA_RANGE 64
+
+static void detect_multiply_loops(struct program *program) {
+        struct cmd *new_cmds = malloc(program->length * sizeof(struct cmd));
+        size_t *old_to_new = malloc(program->length * sizeof(size_t));
+        size_t *new_to_old = malloc(program->length * sizeof(size_t));
+        if (!new_cmds || !old_to_new || !new_to_old) {
+                fprintf(stderr, "Memory allocation failed\n");
+                exit(1);
+        }
+        size_t new_len = 0;
+
+        for (size_t old = 0; old < program->length;) {
+                struct cmd c = program->cmds[old];
+                if (c.type == CMD_JUMP_FORWARD) {
+                        size_t close_idx = c.value.jump_index;
+                        int valid = 1;
+                        int dp_delta = 0;
+                        int deltas[2 * DELTA_RANGE + 1];
+                        memset(deltas, 0, sizeof(deltas));
+
+                        for (size_t k = old + 1; k < close_idx && valid; k++) {
+                                struct cmd bk = program->cmds[k];
+                                switch (bk.type) {
+                                case CMD_SIMPLE_RIGHT:
+                                        dp_delta += (int)bk.value.simple_count;
+                                        if (dp_delta > DELTA_RANGE ||
+                                            dp_delta < -DELTA_RANGE)
+                                                valid = 0;
+                                        break;
+                                case CMD_SIMPLE_LEFT:
+                                        dp_delta -= (int)bk.value.simple_count;
+                                        if (dp_delta > DELTA_RANGE ||
+                                            dp_delta < -DELTA_RANGE)
+                                                valid = 0;
+                                        break;
+                                case CMD_SIMPLE_INC:
+                                        deltas[dp_delta + DELTA_RANGE] +=
+                                            (int)bk.value.simple_count;
+                                        break;
+                                case CMD_SIMPLE_DEC:
+                                        deltas[dp_delta + DELTA_RANGE] -=
+                                            (int)bk.value.simple_count;
+                                        break;
+                                default:
+                                        valid = 0;
+                                        break;
+                                }
+                        }
+
+                        if (valid && dp_delta == 0 &&
+                            deltas[DELTA_RANGE] == -1) {
+                                struct multiply_move moves[MULTIPLY_MOVES_MAX];
+                                size_t n_moves = 0;
+                                int overflow = 0;
+                                for (int d = -DELTA_RANGE;
+                                     d <= DELTA_RANGE && !overflow; d++) {
+                                        if (d == 0 ||
+                                            deltas[d + DELTA_RANGE] == 0)
+                                                continue;
+                                        if (n_moves >= MULTIPLY_MOVES_MAX) {
+                                                overflow = 1;
+                                                break;
+                                        }
+                                        moves[n_moves++] =
+                                            (struct multiply_move){
+                                                .offset = d,
+                                                .factor =
+                                                    deltas[d + DELTA_RANGE]};
+                                }
+                                if (!overflow) {
+                                        for (size_t k = old; k <= close_idx;
+                                             k++) {
+                                                old_to_new[k] =
+                                                    (k == old) ? new_len
+                                                               : SIZE_MAX;
+                                        }
+                                        new_to_old[new_len] = old;
+                                        struct cmd mc = {
+                                            .type = CMD_MULTIPLY,
+                                            .value.multiply.n_moves = n_moves};
+                                        for (size_t i = 0; i < n_moves; i++)
+                                                mc.value.multiply.moves[i] =
+                                                    moves[i];
+                                        new_cmds[new_len++] = mc;
+                                        old = close_idx + 1;
+                                        continue;
+                                }
+                        }
+                }
+                old_to_new[old] = new_len;
+                new_to_old[new_len] = old;
+                new_cmds[new_len++] = c;
+                old++;
+        }
+
+        for (size_t i = 0; i < new_len; i++) {
+                if (new_cmds[i].type == CMD_JUMP_FORWARD ||
+                    new_cmds[i].type == CMD_JUMP_BACK) {
+                        size_t old_target =
+                            program->cmds[new_to_old[i]].value.jump_index;
+                        assert(old_to_new[old_target] != SIZE_MAX);
+                        new_cmds[i].value.jump_index = old_to_new[old_target];
+                }
+        }
+
+        free(program->cmds);
+        free(old_to_new);
+        free(new_to_old);
+        program->cmds = new_cmds;
+        program->length = new_len;
+}
+
+#undef DELTA_RANGE
+
 void optimise_program(struct program *program) {
         cancel_opposing(program);
         detect_clear_loops(program);
+        detect_multiply_loops(program);
 }
 
 char program_contains_input(struct program *program) {
diff --git a/src/ir.h b/src/ir.h
index f21b25e..7d02668 100644
--- a/src/ir.h
+++ b/src/ir.h
@@ -23,6 +23,19 @@ enum cmd_type {
         CMD_JUMP_BACK,
         /// Synthetic: set current cell to zero (replaces `[-]`/`[+]`).
         CMD_CLEAR,
+        /// Synthetic: multiply-add loop (replaces `[-offset1*factor1...]`).
+        CMD_MULTIPLY,
+};
+
+/// Maximum number of target cells in a CMD_MULTIPLY instruction.
+#define MULTIPLY_MOVES_MAX 8
+
+/// One (offset, factor) pair in a CMD_MULTIPLY instruction.
+struct multiply_move {
+        /// Cell offset from the current data pointer.
+        int offset;
+        /// Multiplier applied to the loop counter cell.
+        int factor;
 };
 
 /// One compressed instruction in the internal Brainfuck IR.
@@ -35,6 +48,11 @@ struct cmd {
                 size_t simple_count;
                 /// Matching bracket command index.
                 size_t jump_index;
+                /// Moves for CMD_MULTIPLY.
+                struct {
+                        struct multiply_move moves[MULTIPLY_MOVES_MAX];
+                        size_t n_moves;
+                } multiply;
         } value;
 };
 
diff --git a/src/llvm.c b/src/llvm.c
index 7b7f5df..384a840 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -184,6 +184,37 @@ void comma(struct llvm_context *ctx) {
         LLVMBuildStore(ctx->builder, char_value, data_ptr);
 }
 
+void multiply(struct llvm_context *ctx, struct multiply_move *moves,
+              size_t n_moves) {
+        LLVMValueRef counter_ptr = get_dataptr(ctx);
+        LLVMValueRef counter =
+            LLVMBuildLoad2(ctx->builder, int8_type(ctx), counter_ptr, "");
+        for (size_t i = 0; i < n_moves; i++) {
+                LLVMValueRef dp_value =
+                    LLVMBuildLoad2(ctx->builder, int32_type(ctx), ctx->dp, "");
+                LLVMValueRef offset =
+                    LLVMConstInt(int32_type(ctx), (unsigned long long)moves[i].offset, 1);
+                LLVMValueRef target_idx =
+                    LLVMBuildAdd(ctx->builder, dp_value, offset, "");
+                LLVMValueRef indices[] = {LLVMConstInt(int32_type(ctx), 0, 0),
+                                          target_idx};
+                LLVMValueRef target_ptr =
+                    LLVMBuildGEP2(ctx->builder, data_array_type(ctx),
+                                  ctx->data, indices, 2, "");
+                LLVMValueRef target =
+                    LLVMBuildLoad2(ctx->builder, int8_type(ctx), target_ptr, "");
+                LLVMValueRef factor =
+                    LLVMConstInt(int8_type(ctx), (unsigned long long)moves[i].factor, 1);
+                LLVMValueRef product =
+                    LLVMBuildMul(ctx->builder, counter, factor, "");
+                LLVMValueRef new_val =
+                    LLVMBuildAdd(ctx->builder, target, product, "");
+                LLVMBuildStore(ctx->builder, new_val, target_ptr);
+        }
+        LLVMBuildStore(ctx->builder, LLVMConstInt(int8_type(ctx), 0, 0),
+                       counter_ptr);
+}
+
 void clear(struct llvm_context *ctx) {
         LLVMValueRef data_ptr = get_dataptr(ctx);
         LLVMBuildStore(ctx->builder, LLVMConstInt(int8_type(ctx), 0, 0),
@@ -259,6 +290,10 @@ LLVMModuleRef generate(struct program *program) {
                 case CMD_CLEAR:
                         clear(&ctx);
                         break;
+                case CMD_MULTIPLY:
+                        multiply(&ctx, command.value.multiply.moves,
+                                 command.value.multiply.n_moves);
+                        break;
                 default:
                         fprintf(stderr, "Unsupported cmd_type '%c'\n",
                                 command.type);
diff --git a/test/res/multiply.b b/test/res/multiply.b
new file mode 100644
index 0000000..bfcc969
--- /dev/null
+++ b/test/res/multiply.b
@@ -0,0 +1 @@
+>+++++[<+>-]
\ No newline at end of file
diff --git a/test/test_multiply.filecheck b/test/test_multiply.filecheck
new file mode 100644
index 0000000..235130a
--- /dev/null
+++ b/test/test_multiply.filecheck
@@ -0,0 +1,43 @@
+; RUN: %bf %s.b --emit-llvm | FileCheck %s
+
+; Test multiply-loop optimisation: >+++++[<+>-]
+; Moves dp right, sets cell[1]=5, then [<+>-] adds cell[1] into cell[0]
+; and zeros cell[1]. Optimised to CMD_MULTIPLY {offset=-1, factor=1}.
+
+; CHECK: ; ModuleID = 'main'
+; CHECK: source_filename = "main"
+
+; CHECK: @dp = global i32 0
+; CHECK: @data = global [65536 x i8] zeroinitializer
+
+; CHECK: define i32 @main() {
+; CHECK: entry:
+
+; > (move right)
+; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[RIGHT:.*]] = add i32 %[[DP1]], 1
+; CHECK: store i32 %[[RIGHT]], ptr @dp, align 4
+
+; +++++ (set cell[1] = 5)
+; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
+; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
+; CHECK: %[[ADD:.*]] = add i8 %[[VAL1]], 5
+; CHECK: store i8 %[[ADD]], ptr %[[PTR1]], align 1
+
+; CMD_MULTIPLY {offset=-1, factor=1}: load counter (cell[1]),
+; compute cell[0] += counter * 1, zero cell[1].
+; CHECK: %[[DP3:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[CPTR:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP3]]
+; CHECK: %[[COUNTER:.*]] = load i8, ptr %[[CPTR]], align 1
+; CHECK: %[[DP4:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[TIDX:.*]] = add i32 %[[DP4]], -1
+; CHECK: %[[TPTR:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[TIDX]]
+; CHECK: %[[TVAL:.*]] = load i8, ptr %[[TPTR]], align 1
+; CHECK: %[[PROD:.*]] = mul i8 %[[COUNTER]], 1
+; CHECK: %[[NEW:.*]] = add i8 %[[TVAL]], %[[PROD]]
+; CHECK: store i8 %[[NEW]], ptr %[[TPTR]], align 1
+; CHECK: store i8 0, ptr %[[CPTR]], align 1
+
+; CHECK: ret i32 0
+; CHECK: }

From af59d879cd1d2bc6ca02d87fad6dbb5ed0d01876 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:17:48 +0100
Subject: [PATCH 4/8] opt: create dp as alloca rather than global

Removes the @dp global variable and replaces it with an alloca in the
main function entry block, initialised to 0. The LLVMValueRef ctx->dp
is still a pointer, so all load/store call sites are unchanged. The IR
still contains explicit dp loads and stores at this point; once the
mem2reg pass runs (wired up under -O in the next commit) the alloca is
promoted to an SSA register, removing all dp memory traffic. Updates
all FileCheck tests accordingly.
---
 src/llvm.c                       |  5 +++--
 test/test_hi.filecheck           | 14 ++++++++------
 test/test_multiply.filecheck     | 15 +++++++++------
 test/test_simple_echo.filecheck  | 10 ++++++----
 test/test_simple_loop.filecheck  |  9 ++++++---
 test/test_simple_no_io.filecheck | 19 +++++++++++--------
 6 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/src/llvm.c b/src/llvm.c
index 384a840..bf149f5 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -95,8 +95,6 @@ struct llvm_context create_module_preamble(struct program *program,
         if (program_contains_input(program)) {
                 create_getchar_declaration(&ctx);
         }
-        ctx.dp = LLVMAddGlobal(ctx.module, int32_type(&ctx), "dp");
-        LLVMSetInitializer(ctx.dp, LLVMConstNull(int32_type(&ctx)));
         ctx.data = LLVMAddGlobal(ctx.module, data_array_type(&ctx), "data");
         LLVMSetInitializer(ctx.data, LLVMConstNull(data_array_type(&ctx)));
         ctx.js = jump_stack_new();
@@ -115,6 +113,9 @@ void create_main_function(struct llvm_context *ctx) {
         LLVMBasicBlockRef entry_block =
             LLVMAppendBasicBlockInContext(ctx->context, ctx->main, "entry");
         LLVMPositionBuilderAtEnd(ctx->builder, entry_block);
+        ctx->dp = LLVMBuildAlloca(ctx->builder, int32_type(ctx), "dp");
+        LLVMBuildStore(ctx->builder,
+                       LLVMConstInt(int32_type(ctx), 0, 0), ctx->dp);
 }
 
 LLVMValueRef get_dataptr(struct llvm_context *ctx) {
diff --git a/test/test_hi.filecheck b/test/test_hi.filecheck
index ad965bc..b8f060c 100644
--- a/test/test_hi.filecheck
+++ b/test/test_hi.filecheck
@@ -1,42 +1,44 @@
 ; RUN: %bf %s.b --emit-llvm | FileCheck %s
 
 ; Test brainfuck program that outputs "Hi"
-; This should generate IR for setting up 'H' (72) and 'i' (105) and calling putchar
 
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
 
-; CHECK: @dp = global i32 0
+; dp is now an alloca (%dp) inside main, not the global @dp.
+; CHECK-NOT: @dp = global
 ; CHECK: @data = global [65536 x i8] zeroinitializer
 
 ; CHECK: declare i32 @putchar(i32)
 
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
+; CHECK: %dp = alloca i32, align 4
+; CHECK: store i32 0, ptr %dp, align 4
 
 ; First, build up to 'H' (ASCII 72)
-; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP1:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP1]]
 ; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
 ; CHECK: %[[ADD1:.*]] = add i8 %[[VAL1]], 72
 ; CHECK: store i8 %[[ADD1]], ptr %[[PTR1]], align 1
 
 ; Output 'H' with putchar
-; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP2:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR2:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
 ; CHECK: %[[VAL2:.*]] = load i8, ptr %[[PTR2]], align 1
 ; CHECK: %[[EXT1:.*]] = zext i8 %[[VAL2]] to i32
 ; CHECK: %[[CALL1:.*]] = call i32 @putchar(i32 %[[EXT1]])
 
 ; Add 33 more to get to 'i' (72 + 33 = 105)
-; CHECK: %[[DP3:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP3:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR3:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP3]]
 ; CHECK: %[[VAL3:.*]] = load i8, ptr %[[PTR3]], align 1
 ; CHECK: %[[ADD2:.*]] = add i8 %[[VAL3]], 33
 ; CHECK: store i8 %[[ADD2]], ptr %[[PTR3]], align 1
 
 ; Output 'i' with putchar
-; CHECK: %[[DP4:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP4:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR4:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP4]]
 ; CHECK: %[[VAL4:.*]] = load i8, ptr %[[PTR4]], align 1
 ; CHECK: %[[EXT2:.*]] = zext i8 %[[VAL4]] to i32
diff --git a/test/test_multiply.filecheck b/test/test_multiply.filecheck
index 235130a..1e70ca0 100644
--- a/test/test_multiply.filecheck
+++ b/test/test_multiply.filecheck
@@ -7,19 +7,22 @@
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
 
-; CHECK: @dp = global i32 0
+; dp is now an alloca (%dp) inside main, not the global @dp.
+; CHECK-NOT: @dp = global
 ; CHECK: @data = global [65536 x i8] zeroinitializer
 
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
+; CHECK: %dp = alloca i32, align 4
+; CHECK: store i32 0, ptr %dp, align 4
 
 ; > (move right)
-; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP1:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[RIGHT:.*]] = add i32 %[[DP1]], 1
-; CHECK: store i32 %[[RIGHT]], ptr @dp, align 4
+; CHECK: store i32 %[[RIGHT]], ptr %dp, align 4
 
 ; +++++ (set cell[1] = 5)
-; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP2:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
 ; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
 ; CHECK: %[[ADD:.*]] = add i8 %[[VAL1]], 5
@@ -27,10 +30,10 @@
 
 ; CMD_MULTIPLY {offset=-1, factor=1}: load counter (cell[1]),
 ; compute cell[0] += counter * 1, zero cell[1].
-; CHECK: %[[DP3:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP3:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[CPTR:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP3]]
 ; CHECK: %[[COUNTER:.*]] = load i8, ptr %[[CPTR]], align 1
-; CHECK: %[[DP4:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP4:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[TIDX:.*]] = add i32 %[[DP4]], -1
 ; CHECK: %[[TPTR:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[TIDX]]
 ; CHECK: %[[TVAL:.*]] = load i8, ptr %[[TPTR]], align 1
diff --git a/test/test_simple_echo.filecheck b/test/test_simple_echo.filecheck
index 40fb089..fde5073 100644
--- a/test/test_simple_echo.filecheck
+++ b/test/test_simple_echo.filecheck
@@ -1,12 +1,12 @@
 ; RUN: %bf %s.b --emit-llvm | FileCheck %s
 
 ; Test brainfuck program that echoes a character: ,.
-; This should generate IR for getchar() followed by putchar()
 
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
 
-; CHECK: @dp = global i32 0
+; dp is now an alloca (%dp) inside main, not the global @dp.
+; CHECK-NOT: @dp = global
 ; CHECK: @data = global [65536 x i8] zeroinitializer
 
 ; CHECK: declare i32 @putchar(i32)
@@ -15,16 +15,18 @@
 
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
+; CHECK: %dp = alloca i32, align 4
+; CHECK: store i32 0, ptr %dp, align 4
 
 ; Input operation (,) - read character with getchar
-; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP1:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP1]]
 ; CHECK: %[[GETCHAR:.*]] = call i32 @getchar()
 ; CHECK: %[[TRUNC:.*]] = trunc i32 %[[GETCHAR]] to i8
 ; CHECK: store i8 %[[TRUNC]], ptr %[[PTR1]], align 1
 
 ; Output operation (.) - write character with putchar
-; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP2:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR2:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
 ; CHECK: %[[VAL:.*]] = load i8, ptr %[[PTR2]], align 1
 ; CHECK: %[[EXT:.*]] = zext i8 %[[VAL]] to i32
diff --git a/test/test_simple_loop.filecheck b/test/test_simple_loop.filecheck
index ed63c5b..34479d5 100644
--- a/test/test_simple_loop.filecheck
+++ b/test/test_simple_loop.filecheck
@@ -6,21 +6,24 @@
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
 
-; CHECK: @dp = global i32 0
+; dp is now an alloca (%dp) inside main, not the global @dp.
+; CHECK-NOT: @dp = global
 ; CHECK: @data = global [65536 x i8] zeroinitializer
 
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
+; CHECK: %dp = alloca i32, align 4
+; CHECK: store i32 0, ptr %dp, align 4
 
 ; Initial addition of 5 to current cell (+++++).
-; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP1:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP1]]
 ; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
 ; CHECK: %[[ADD:.*]] = add i8 %[[VAL1]], 5
 ; CHECK: store i8 %[[ADD]], ptr %[[PTR1]], align 1
 
 ; CMD_CLEAR: [-] collapses to a single store of zero.
-; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP2:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR2:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP2]]
 ; CHECK: store i8 0, ptr %[[PTR2]], align 1
 
diff --git a/test/test_simple_no_io.filecheck b/test/test_simple_no_io.filecheck
index 28b7d14..b9cbb1f 100644
--- a/test/test_simple_no_io.filecheck
+++ b/test/test_simple_no_io.filecheck
@@ -3,38 +3,41 @@
 ; CHECK: ; ModuleID = 'main'
 ; CHECK: source_filename = "main"
 
-; CHECK: @dp = global i32 0
+; dp is now an alloca (%dp) inside main, not the global @dp.
+; CHECK-NOT: @dp = global
 ; CHECK: @data = global [65536 x i8] zeroinitializer
 
 ; CHECK: define i32 @main() {
 ; CHECK: entry:
+; CHECK: %dp = alloca i32, align 4
+; CHECK: store i32 0, ptr %dp, align 4
 
 ; First increment operation (++)
-; CHECK: %[[DP1:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP1:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR1:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP1]]
 ; CHECK: %[[VAL1:.*]] = load i8, ptr %[[PTR1]], align 1
 ; CHECK: %[[ADD1:.*]] = add i8 %[[VAL1]], 2
 ; CHECK: store i8 %[[ADD1]], ptr %[[PTR1]], align 1
 
 ; Move right operation (>)
-; CHECK: %[[DP2:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP2:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[RIGHT:.*]] = add i32 %[[DP2]], 1
-; CHECK: store i32 %[[RIGHT]], ptr @dp, align 4
+; CHECK: store i32 %[[RIGHT]], ptr %dp, align 4
 
 ; Decrement operation (--)
-; CHECK: %[[DP3:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP3:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR2:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP3]]
 ; CHECK: %[[VAL2:.*]] = load i8, ptr %[[PTR2]], align 1
 ; CHECK: %[[SUB:.*]] = sub i8 %[[VAL2]], 2
 ; CHECK: store i8 %[[SUB]], ptr %[[PTR2]], align 1
 
 ; Move left operation (<)
-; CHECK: %[[DP4:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP4:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[LEFT:.*]] = sub i32 %[[DP4]], 1
-; CHECK: store i32 %[[LEFT]], ptr @dp, align 4
+; CHECK: store i32 %[[LEFT]], ptr %dp, align 4
 
 ; Final increment operation (++)
-; CHECK: %[[DP5:.*]] = load i32, ptr @dp, align 4
+; CHECK: %[[DP5:.*]] = load i32, ptr %dp, align 4
 ; CHECK: %[[PTR3:.*]] = getelementptr [65536 x i8], ptr @data, i32 0, i32 %[[DP5]]
 ; CHECK: %[[VAL3:.*]] = load i8, ptr %[[PTR3]], align 1
 ; CHECK: %[[ADD2:.*]] = add i8 %[[VAL3]], 2

From 75e03155545817e6188b25652a51a3311f006412 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:19:47 +0100
Subject: [PATCH 5/8] opt: add LLVM pass pipeline gated on -O/--optimise

Wires the already-parsed --optimise flag from main_bfc.c through to
generate(program, optimise). When the flag is set, generate() runs the
pipeline "mem2reg,instcombine,simplifycfg,gvn" via LLVMRunPasses (new
pass manager, LLVM >= 14). mem2reg promotes the dp alloca to an SSA
register; gvn eliminates redundant loads; instcombine and simplifycfg
clean up the result. A pass error is printed to stderr and the module
is still returned. Adds the passes component to
llvm_map_components_to_libnames. The fuzz harness now also calls
optimise_program() and passes optimise=false to generate().
FileCheck tests are unaffected as they do not pass -O.
---
 cmake/llvm.cmake |  2 +-
 src/llvm.c       | 17 ++++++++++++++++-
 src/llvm.h       |  5 ++++-
 src/main_bfc.c   |  2 +-
 test/main_fuzz.c |  3 ++-
 5 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/cmake/llvm.cmake b/cmake/llvm.cmake
index 3ddb958..8253637 100644
--- a/cmake/llvm.cmake
+++ b/cmake/llvm.cmake
@@ -68,7 +68,7 @@ else()
         link_directories(${LLVM_LIBRARY_DIRS})
     endif()
 
-    llvm_map_components_to_libnames(llvm_libs support core irreader)
+    llvm_map_components_to_libnames(llvm_libs support core irreader passes)
 
     # Use an OBJECT library for shared sources to avoid flag leakage
     set(LIB_SOURCES
diff --git a/src/llvm.c b/src/llvm.c
index bf149f5..bd73c8e 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -4,6 +4,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <llvm-c/Transforms/PassBuilder.h>
+
 #include "common.h"
 #include "ir.h"
 
@@ -250,7 +252,7 @@ void right_bracket(struct llvm_context *ctx) {
         LLVMPositionBuilderAtEnd(ctx->builder, pair.exit);
 }
 
-LLVMModuleRef generate(struct program *program) {
+LLVMModuleRef generate(struct program *program, bool optimise) {
         struct llvm_context ctx = create_module_preamble(program, "main");
         create_main_function(&ctx);
         for (size_t cmd_index = 0; cmd_index < program->length; cmd_index++) {
@@ -303,5 +305,18 @@ LLVMModuleRef generate(struct program *program) {
         }
         LLVMBuildRet(ctx.builder, LLVMConstInt(int32_type(&ctx), 0, 0));
         LLVMDisposeBuilder(ctx.builder);
+        if (optimise) {
+                LLVMPassBuilderOptionsRef opts =
+                    LLVMCreatePassBuilderOptions();
+                LLVMErrorRef err = LLVMRunPasses(
+                    ctx.module, "mem2reg,instcombine,simplifycfg,gvn", NULL,
+                    opts);
+                if (err) {
+                        char *msg = LLVMGetErrorMessage(err);
+                        fprintf(stderr, "Pass error: %s\n", msg);
+                        LLVMDisposeErrorMessage(msg);
+                }
+                LLVMDisposePassBuilderOptions(opts);
+        }
         return ctx.module;
 }
diff --git a/src/llvm.h b/src/llvm.h
index 426b767..21adf80 100644
--- a/src/llvm.h
+++ b/src/llvm.h
@@ -1,14 +1,17 @@
 #ifndef LLVM_H
 #define LLVM_H
 
+#include <stdbool.h>
+
 #include <llvm-c/Core.h>
 
 #include "ir.h"
 
 /// Generate LLVM IR for a parsed Brainfuck program.
 /// @param program Parsed Brainfuck program.
+/// @param optimise Run LLVM optimisation passes (mem2reg, instcombine, etc.).
 /// @return Generated LLVM module.
-LLVMModuleRef generate(struct program *program);
+LLVMModuleRef generate(struct program *program, bool optimise);
 
 /// Release an LLVM module created by generate().
 /// @param module LLVM module created by `generate`.
diff --git a/src/main_bfc.c b/src/main_bfc.c
index f88d3de..1b5f4f4 100644
--- a/src/main_bfc.c
+++ b/src/main_bfc.c
@@ -73,7 +73,7 @@ int main(int argc, char **argv) {
         struct program parsed_program = string_to_program(program_str);
         free(program_str);
         optimise_program(&parsed_program);
-        LLVMModuleRef module = generate(&parsed_program);
+        LLVMModuleRef module = generate(&parsed_program, optimise);
         char *err = NULL;
         LLVMPrintModuleToFile(module, "/dev/stdout", &err);
         if (err)
diff --git a/test/main_fuzz.c b/test/main_fuzz.c
index 9b8fd04..670bb05 100644
--- a/test/main_fuzz.c
+++ b/test/main_fuzz.c
@@ -31,7 +31,8 @@ int main(int argc, char **argv) {
                 input[input_len] = '\0';
                 clean_whitespace(input);
                 struct program p = string_to_program(input);
-                LLVMModuleRef module = generate(&p);
+                optimise_program(&p);
+                LLVMModuleRef module = generate(&p, false);
                 char *module_str = LLVMPrintModuleToString(module);
                 // Optionally, do something with module_str (e.g., hash, check,
                 // etc.)

From da64baf903e3092aa08222ee2d8248e25e322def Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:20:05 +0100
Subject: [PATCH 6/8] plan: remove plan

---
 PLAN.md | 97 ---------------------------------------------------------
 1 file changed, 97 deletions(-)
 delete mode 100644 PLAN.md

diff --git a/PLAN.md b/PLAN.md
deleted file mode 100644
index ed825bd..0000000
--- a/PLAN.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Optimisation Plan
-
-Five optimisations for the produced LLVM IR, implemented as separate commits.
-
-## Status
-
-- [x] 1. Cancel opposing simple commands
-- [ ] 2. `CMD_CLEAR` — zero-loop `[-]` → `store i8 0`
-- [ ] 3. `CMD_MULTIPLY` — multiply-loop `[->N*+<]` → multiply-add
-- [ ] 4. `dp` as `alloca` (enables `mem2reg`)
-- [ ] 5. LLVM pass pipeline gated on `-O`
-- [ ] 6. Per-optimisation TOML config file
-
----
-
-## 1. Cancel opposing simple commands
-
-**Where**: `ir.c` — new `optimise_program()` called from both `main_bfc.c` and `main_bfi.c`.
-
-Peephole pass over the `cmds` array: adjacent INC/DEC or RIGHT/LEFT pairs subtract counts and, if they fully cancel, are removed. Re-computes bracket jump indices after compaction.
-
-**Example**: `+++--` → `CMD_SIMPLE_INC(1)` instead of `CMD_SIMPLE_INC(3), CMD_SIMPLE_DEC(2)`.
-
-Test changes: none (no existing test program uses cancellable patterns).
-
----
-
-## 2. CMD_CLEAR — zero-loop detection
-
-**Where**: `ir.c` `optimise_program()`, `llvm.c`, `interp.c`.
-
-New `CMD_CLEAR` IR node. Pattern detected after cancellation pass: `[` + single INC or DEC body + `]`. Replaced with a single `store i8 0` in codegen, `ctx->data[ctx->dp] = 0` in interpreter.
-
-**Example**: `+++++[-]` → `CMD_SIMPLE_INC(5), CMD_CLEAR`.
-
-Test changes: update `test/test_simple_loop.filecheck` (program is `+++++[-]`).
-
----
-
-## 3. CMD_MULTIPLY — multiply-loop detection
-
-**Where**: `ir.c` `optimise_program()`, `llvm.c`, `interp.c`.
-
-New `CMD_MULTIPLY` IR node with up to `MULTIPLY_MOVES_MAX` (8) offset/factor pairs. A loop body matches when: only `+`/`-`/`>`/`<` inside, net pointer movement is zero, loop counter cell has net delta −1. Each non-counter cell touched becomes a `{offset, factor}` move.
-
-Codegen: load counter, for each move `data[dp+offset] += counter * factor`, then `store i8 0` to counter cell. Interpreter: same arithmetic.
-
-**Example**: `[->+<]` at dp=1 → `CMD_MULTIPLY {moves=[{offset=-1, factor=1}]}`.
-
-Test changes: add `test/res/multiply.b` and `test/test_multiply.filecheck`.
-
----
-
-## 4. `dp` as `alloca`
-
-**Where**: `llvm.c` — `create_main_function()` creates `dp` as an alloca instead of a global.
-
-`dp` is removed from the global section and created with `LLVMBuildAlloca` in the entry block, immediately initialised to 0. The `LLVMValueRef ctx->dp` is still a pointer (alloca ptr vs global ptr) so all downstream load/store calls are unchanged.
-
-Without LLVM passes the IR still has explicit load/store; the benefit is unlocked in commit 5 when `mem2reg` promotes the alloca to a register.
-
-Test changes: update all FileCheck tests — remove `@dp = global i32 0` check, change `ptr @dp` references to `ptr %dp`.
-
----
-
-## 5. LLVM pass pipeline gated on `-O`
-
-**Where**: `llvm.c`, `llvm.h`, `cmake/llvm.cmake`, `main_bfc.c`.
-
-Wires the already-parsed `--optimise`/`-O` flag from `main_bfc.c` through `generate(struct program *, bool optimise)`. When `optimise` is true, runs `"mem2reg,instcombine,simplifycfg,gvn"` via `LLVMRunPasses` (LLVM new pass manager, LLVM ≥ 14). Adds `passes` to `llvm_map_components_to_libnames` in cmake.
-
-`mem2reg` promotes the `dp` alloca to a register; `gvn` eliminates redundant loads of `@data` elements; `instcombine` and `simplifycfg` clean up the resulting IR.
-
-Test changes: none (FileCheck tests do not pass `-O`).
-
----
-
-## 6. Per-optimisation TOML config file
-
-**Where**: new `src/config.h` / `src/config.c`, updated `main_bfc.c`, updated `optimise_program()` and `generate()` signatures.
-
-A flat TOML file (default `bf.toml` in the current directory, overridable with `-c`/`--config`) controls each optimisation independently:
-
-```toml
-[optimisations]
-cancel_opposing = true
-clear_loop      = true
-multiply_loop   = true
-dp_alloca       = true
-llvm_passes     = false
-```
-
-`struct opt_config` holds a boolean for each flag; a minimal built-in parser handles `[section]` headers and `key = true/false` lines. Missing file → all optimisations enabled by default. The `-O` flag becomes a shorthand for enabling all flags.
-
-`optimise_program(struct program *, const struct opt_config *)` and `generate(struct program *, const struct opt_config *)` are updated to gate each pass on its flag.
-
-Test changes: add `test/res/bf.toml` with specific flags for FileCheck regression tests if needed.

From e9bbacecad7de25ff87593bb09cceba3372cce53 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:25:26 +0100
Subject: [PATCH 7/8] fix: clang-format and cpplint violations in new code

Replace (unsigned long long) casts with (uint64_t) to satisfy cpplint
runtime/int rule. Re-run clang-format to fix indentation in multiply(),
detect_multiply_loops(), and the CMD_MULTIPLY interp case.
---
 src/interp.c |  4 ++--
 src/ir.c     |  6 +++---
 src/llvm.c   | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/interp.c b/src/interp.c
index d077755..bdd99c2 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -178,8 +178,8 @@ int interp(struct context_t *ctx, int out_fd, int in_fd, bool byte_output) {
                 ctx->data[ctx->dp] = 0;
                 break;
         case CMD_MULTIPLY:
-                for (size_t i = 0;
-                     i < current_cmd.value.multiply.n_moves; i++) {
+                for (size_t i = 0; i < current_cmd.value.multiply.n_moves;
+                     i++) {
                         int target = (int)ctx->dp +
                                      current_cmd.value.multiply.moves[i].offset;
                         ctx->data[target] +=
diff --git a/src/ir.c b/src/ir.c
index 7ca216d..29ac4eb 100644
--- a/src/ir.c
+++ b/src/ir.c
@@ -428,9 +428,9 @@ static void detect_multiply_loops(struct program *program) {
                                 if (!overflow) {
                                         for (size_t k = old; k <= close_idx;
                                              k++) {
-                                                old_to_new[k] =
-                                                    (k == old) ? new_len
-                                                               : SIZE_MAX;
+                                                old_to_new[k] = (k == old)
+                                                                    ? new_len
+                                                                    : SIZE_MAX;
                                         }
                                         new_to_old[new_len] = old;
                                         struct cmd mc = {
diff --git a/src/llvm.c b/src/llvm.c
index bd73c8e..22da5ba 100644
--- a/src/llvm.c
+++ b/src/llvm.c
@@ -1,6 +1,7 @@
 #include "llvm.h"
 
 #include <assert.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -116,8 +117,8 @@ void create_main_function(struct llvm_context *ctx) {
             LLVMAppendBasicBlockInContext(ctx->context, ctx->main, "entry");
         LLVMPositionBuilderAtEnd(ctx->builder, entry_block);
         ctx->dp = LLVMBuildAlloca(ctx->builder, int32_type(ctx), "dp");
-        LLVMBuildStore(ctx->builder,
-                       LLVMConstInt(int32_type(ctx), 0, 0), ctx->dp);
+        LLVMBuildStore(ctx->builder, LLVMConstInt(int32_type(ctx), 0, 0),
+                       ctx->dp);
 }
 
 LLVMValueRef get_dataptr(struct llvm_context *ctx) {
@@ -196,18 +197,18 @@ void multiply(struct llvm_context *ctx, struct multiply_move *moves,
                 LLVMValueRef dp_value =
                     LLVMBuildLoad2(ctx->builder, int32_type(ctx), ctx->dp, "");
                 LLVMValueRef offset =
-                    LLVMConstInt(int32_type(ctx), (unsigned long long)moves[i].offset, 1);
+                    LLVMConstInt(int32_type(ctx), (uint64_t)moves[i].offset, 1);
                 LLVMValueRef target_idx =
                     LLVMBuildAdd(ctx->builder, dp_value, offset, "");
                 LLVMValueRef indices[] = {LLVMConstInt(int32_type(ctx), 0, 0),
                                           target_idx};
                 LLVMValueRef target_ptr =
-                    LLVMBuildGEP2(ctx->builder, data_array_type(ctx),
-                                  ctx->data, indices, 2, "");
-                LLVMValueRef target =
-                    LLVMBuildLoad2(ctx->builder, int8_type(ctx), target_ptr, "");
+                    LLVMBuildGEP2(ctx->builder, data_array_type(ctx), ctx->data,
+                                  indices, 2, "");
+                LLVMValueRef target = LLVMBuildLoad2(
+                    ctx->builder, int8_type(ctx), target_ptr, "");
                 LLVMValueRef factor =
-                    LLVMConstInt(int8_type(ctx), (unsigned long long)moves[i].factor, 1);
+                    LLVMConstInt(int8_type(ctx), (uint64_t)moves[i].factor, 1);
                 LLVMValueRef product =
                     LLVMBuildMul(ctx->builder, counter, factor, "");
                 LLVMValueRef new_val =
@@ -306,8 +307,7 @@ LLVMModuleRef generate(struct program *program, bool optimise) {
         LLVMBuildRet(ctx.builder, LLVMConstInt(int32_type(&ctx), 0, 0));
         LLVMDisposeBuilder(ctx.builder);
         if (optimise) {
-                LLVMPassBuilderOptionsRef opts =
-                    LLVMCreatePassBuilderOptions();
+                LLVMPassBuilderOptionsRef opts = LLVMCreatePassBuilderOptions();
                 LLVMErrorRef err = LLVMRunPasses(
                     ctx.module, "mem2reg,instcombine,simplifycfg,gvn", NULL,
                     opts);

From 5df02a7c4016fe9af58c1e94978550a01244f900 Mon Sep 17 00:00:00 2001
From: benmandrew <benmandrew@gmail.com>
Date: Tue, 30 Jun 2026 16:27:42 +0100
Subject: [PATCH 8/8] fix: add missing stdint.h includes for SIZE_MAX and
 uint8_t

The clang static analyzer in CI flags SIZE_MAX (ir.c) and uint8_t
(interp.c) as undeclared without an explicit stdint.h include.
---
 src/interp.c | 1 +
 src/ir.c     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/interp.c b/src/interp.c
index bdd99c2..7dc2543 100644
--- a/src/interp.c
+++ b/src/interp.c
@@ -1,6 +1,7 @@
 #include "interp.h"
 
 #include <assert.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/src/ir.c b/src/ir.c
index 29ac4eb..baafdbf 100644
--- a/src/ir.c
+++ b/src/ir.c
@@ -2,6 +2,7 @@
 
 #include <assert.h>
 #include <stddef.h>
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>