From 58e2fee09a0b21c44f02e1d77e412d16b7be326d Mon Sep 17 00:00:00 2001 From: Konstantine Tsafatinos Date: Fri, 24 Apr 2026 00:37:40 -0400 Subject: [PATCH] update generate_book_v2 --- generate_book_v2.py | 263 ++++++++++++++++++ .../W2D3_Microlearning/W2D3_Tutorial1.ipynb | 207 +------------- 2 files changed, 267 insertions(+), 203 deletions(-) diff --git a/generate_book_v2.py b/generate_book_v2.py index 2298c8ea4..d39e1de8d 100644 --- a/generate_book_v2.py +++ b/generate_book_v2.py @@ -189,6 +189,36 @@ def main(): "github": f"https://github.com/{ORG}/{REPO}", "license": "CC-BY-4.0", "edit_url": None, # disable: auto-computed URL gets book/ prefix from symlink + # Global LaTeX macros for KaTeX — prevents "Undefined control sequence" + # errors when \newcommand definitions in one cell are not visible to + # math environments in other cells/sections during MyST rendering. + # Key is "math" (the valid myst-frontmatter project key); values are + # plain macro strings — KaTeX infers argument count from #1, #2, etc. + # Single-char names (h, y, T, f) are intentionally excluded: they + # conflict with the same letters used as plain variables throughout + # other notebooks, causing KaTeX stack overflows in text mode. 
def expand_latex_macros(content):
    r"""Inline custom ``\newcommand`` macros into a notebook's markdown cells.

    MyST/KaTeX build-time rendering does not carry ``\newcommand`` definitions
    from one cell over to math in other cells, so this pass makes every
    markdown cell self-contained:

    1. Collect every well-formed ``\newcommand`` definition found in any
       markdown cell (a later definition of the same name wins).
    2. Remove the definitions from the cells; math spans (``$...$`` or
       ``$$...$$``) that held nothing but definitions are removed together
       with their delimiters, and cells left empty are dropped.
    3. Expand macro usages in the remaining markdown cells, repeating until
       the text stops changing (bounded, so a self-referential macro cannot
       loop forever).
    4. Escape ``%`` that directly follows a digit inside ``$...$`` math
       (``$80%$`` -> ``$80\%$``), because ``%`` starts a LaTeX comment.

    Malformed definitions (for example an unterminated ``[n]`` or an unclosed
    expansion group) are treated as ordinary text instead of raising.

    Args:
        content: Notebook dict with a ``"cells"`` list; each cell has a
            ``"cell_type"`` and usually a ``"source"`` (list of strings).

    Returns:
        A shallow copy of ``content`` whose ``"cells"`` list holds the
        processed cells. Non-markdown cells are passed through untouched;
        every kept markdown cell is a copy whose ``"source"`` is a
        one-element list. The input notebook is not mutated.
    """
    # Placeholder marking where a definition was cut out. NUL is assumed not
    # to occur in notebook markdown.
    sentinel = "\x00"
    # Upper bound on expansion passes; nesting deeper than this stays as-is.
    max_passes = 5

    # ``\newcommand{\name}[n]`` or ``\newcommand\name[n]``, stopping right
    # before the ``{`` that opens the expansion.
    def_head = re.compile(
        r"\\newcommand\*?[ \t]*"
        r"(?:\{\s*(?P<braced>\\[^\s{}\[\]\\]+)\s*\}|(?P<bare>\\[^\s{}\[\]\\]+))"
        r"[ \t]*(?:\[\s*(?P<n>\d+)\s*\][ \t]*)?(?=\{)"
    )
    # Any inline/display math span. Matching *every* span (not only the
    # interesting ones) keeps opening/closing ``$`` correctly paired.
    math_span = re.compile(r"(?<!\\)(\${1,2})(.+?)\1", re.DOTALL)
    arg_ref = re.compile(r"#(\d)")
    bare_percent = re.compile(r"(?<=\d)%")

    def _group_end(text, pos):
        r"""Return the index just past the balanced ``{...}`` group at *pos*.

        Returns ``None`` when no group opens at *pos* or it is never closed.
        Backslash-escaped characters (``\{``, ``\}``) do not affect nesting.
        """
        if not text.startswith("{", pos):
            return None
        depth = 0
        idx = pos
        limit = len(text)
        while idx < limit:
            ch = text[idx]
            if ch == "\\":
                idx += 2  # skip the escaped character
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return idx + 1
            idx += 1
        return None

    def _find_definitions(src):
        r"""Return ``(start, end, name, n_args, expansion)`` per definition."""
        found = []
        pos = 0
        while True:
            head = def_head.search(src, pos)
            if head is None:
                return found
            body_start = head.end()
            body_end = _group_end(src, body_start)
            if body_end is None:
                # Unclosed expansion: not a usable definition, keep as text.
                pos = body_start
                continue
            name = head.group("braced") or head.group("bare")
            n_args = int(head.group("n") or 0)
            expansion = src[body_start + 1 : body_end - 1]
            found.append((head.start(), body_end, name, n_args, expansion))
            pos = body_end

    def _strip_definitions(src):
        r"""Remove every definition (and math spans holding only those)."""
        pieces = []
        cursor = 0
        for start, end, _name, _n_args, _expansion in _find_definitions(src):
            pieces.append(src[cursor:start])
            pieces.append(sentinel)
            cursor = end
        pieces.append(src[cursor:])
        marked = "".join(pieces)

        def _drop_if_only_definitions(match):
            inner = match.group(2)
            if sentinel in inner and not inner.replace(sentinel, "").strip():
                return ""
            return match.group(0)

        cleaned = math_span.sub(_drop_if_only_definitions, marked)
        cleaned = cleaned.replace(sentinel, "")
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
        return cleaned.strip()

    def _read_args(text, pos, count):
        r"""Read *count* brace groups at *pos*; return ``(args, end)``.

        ``args`` is ``None`` when fewer than *count* well-formed groups
        follow. Whitespace before each group is skipped, as TeX does for
        undelimited arguments.
        """
        args = []
        for _ in range(count):
            while pos < len(text) and text[pos] in " \t\n":
                pos += 1
            end = _group_end(text, pos)
            if end is None:
                return None, pos
            args.append(text[pos + 1 : end - 1])
            pos = end
        return args, pos

    def _expand_arg_macro(text, name, template, n_args):
        r"""Replace ``\name{a1}{a2}`` with *template* (``#1`` -> a1, ...)."""

        def _fill(match, args):
            index = int(match.group(1))
            # Placeholders beyond the declared arity are left untouched.
            return args[index - 1] if 1 <= index <= len(args) else match.group(0)

        out = []
        pos = 0
        while True:
            hit = text.find(name, pos)
            if hit < 0:
                out.append(text[pos:])
                return "".join(out)
            after = hit + len(name)
            args = None
            # A following letter means this is a longer macro name.
            if not text[after : after + 1].isalpha():
                args, end = _read_args(text, after, n_args)
            if args is None:
                out.append(text[pos : hit + 1])
                pos = hit + 1
                continue
            out.append(text[pos:hit])
            # Single-pass substitution: text inside an argument is never
            # re-interpreted as another placeholder.
            out.append(arg_ref.sub(lambda m, a=args: _fill(m, a), template))
            pos = end

    def _expand_simple_macro(text, name, expansion):
        r"""Replace ``\name`` wherever it is not followed by a letter."""
        # Only letters are excluded: ``\weight_{ij}`` must still expand.
        # A function replacement avoids backslash-escape processing.
        pattern = re.escape(name) + r"(?![a-zA-Z])"
        return re.sub(pattern, lambda _match: expansion, text)

    def _escape_percent(match):
        delim, inner = match.group(1), match.group(2)
        return delim + bare_percent.sub(r"\\%", inner) + delim

    # Collect all macros defined anywhere in the notebook.
    all_macros = {}
    for cell in content["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        src = "".join(cell.get("source", []))
        if "newcommand" in src:
            for _start, _end, name, n_args, expansion in _find_definitions(src):
                all_macros[name] = (expansion, n_args)

    # Longest names first so a short name never pre-empts a longer one.
    arg_macros = sorted(
        ((n, v) for n, v in all_macros.items() if v[1] > 0),
        key=lambda item: len(item[0]),
        reverse=True,
    )
    simple_macros = sorted(
        ((n, v) for n, v in all_macros.items() if v[1] == 0),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    new_cells = []
    for cell in content["cells"]:
        if cell["cell_type"] != "markdown":
            new_cells.append(cell)
            continue

        src = "".join(cell.get("source", []))

        # Definitions do not render in MyST: cut them out, and drop the cell
        # entirely if nothing else is left.
        if all_macros and "newcommand" in src:
            src = _strip_definitions(src)
            if not src.strip():
                continue

        if all_macros:
            # Repeat so macros introduced by an expansion (e.g. \var ->
            # \brackets{...}) are expanded too; stop once the text is stable.
            for _ in range(max_passes):
                before = src
                for name, (template, n_args) in arg_macros:
                    src = _expand_arg_macro(src, name, template, n_args)
                for name, (expansion, _n_args) in simple_macros:
                    src = _expand_simple_macro(src, name, expansion)
                if src == before:
                    break

        # Escape digit-adjacent % inside math spans ($80%$ -> $80\%$).
        src = math_span.sub(_escape_percent, src)

        cell = dict(cell)
        cell["source"] = [src]
        new_cells.append(cell)

    content = dict(content)
    content["cells"] = new_cells
    return content
algorithm. In the next section, we will be deriving and implementing the __Node Perturbation__ algorithm. Both of these methods of gradient estimation are very closely related to *finite differences* derivative approximation (the idea that a derivative can be understood as an approximation of the slope of the tangent line between two very close points).\n", - "\n", - "Suppose that we have some loss function, $\\loss(\\weight)$, which we would like to minimize by making some change in our synaptic weights, $\\weight$. The most natural way to decrease the loss would be to perform gradient descent; however, it is not reasonable to assume that a synapse in the brain could perform analytic gradient calculations for general loss functions $\\loss(\\weight)$, which may depend on the activity of many downstream neurons and the external environment. We know neurons in the brain can connect to many distant neurons, but not to the extent to mirror large-scale awareness of error signals that gradient descent would imply. Biological systems could solve this problem by *approximating* the gradient of the loss, which could be accomplished in many ways. \n", - "\n", - "To start, we will provide the __weight perturbation__ update rule, and will subsequently demonstrate why it provides an estimate of the gradient. We will first add noise to our weight matrix, using $\\weight' = \\weight + \\noisew$, where $\\noisew \\sim \\mathcal N(0, \\sigma^2)$ is a diagonal matrix with a fixed sigma on each diagonal element. We take as our update:\n", - "\n", - "\n", - "\\begin{equation}\n", - " \\Delta \\weight = - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\loss(\\noisew) - \\loss(0)\\right ) \\frac{(\\weight' - \\weight)}{\\sigma^2} \\right ].\n", - "\\end{equation}\n", - "\n", - "In this example, the notation $\\loss(\\noisew)$ stands for the loss value for the perturbed weights and $\\loss(0)$ represents the loss value for the original weights. 
This notation can also be understood also relating to $\\loss(W')$ and $\\loss(W)$, respectively, without any loss of generality. We will clarify why this parameter update is interesting from a neuroscientific perspective. If we look at the parameter update for a *single synapse*, $\\weight_{ij}$, we have:\n", - "\n", - "\\begin{align}\n", - " \\Delta \\weight_{ij} &= - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\loss(\\noisew) - \\loss(0)\\right ) \\frac{(\\weight'_{ij} - \\weight_{ij})}{\\sigma^2} \\right ] \\\\\n", - " & \\approx - \\eta \\frac{1}{K}\\sum_{k = 0}^K\\left [\\left (\\loss(\\noisew^{(k)}) - \\loss(0)\\right ) \\frac{(\\weight'^{(k)}_{ij} - \\weight_{ij})}{\\sigma^2} \\right ],\n", - "\\end{align}\n", - "\n", - "where for the last approximate equality we are substituting an expectation over $\\noisew$ for an empirical approximation over $K$ samples of $\\noisew$. This update only requires information about the global loss, $\\loss(\\noisew^{(k)})$ and the local parameter values, $\\weight'^{(k)}_{ij}$: using this update, a synapse in a neural network can adapt its strength with *very little* information about what is going on in the rest of the neural circuit.\n", - "\n", - "Lastly, we will show why this update is an approximation of the loss gradient: this section is only to satisfy your curiosity, and is not necessary for completing the coding exercises. We first notice that by first-order Taylor expansion $\\loss(\\noisew) \\approx \\loss(0) + \\derivative{\\loss}{\\weight}\\T \\noisew$. 
Plugging this approximation into our update equation, we get:\n", - "\\begin{align}\n", - " \\Delta \\weight_{ij} &= - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\derivative{\\loss}{\\weight}\\T \\noisew\\right ) \\frac{\\noisew_{ij}}{\\sigma^2} \\right ] \\\\\n", - " &= - \\eta \\derivative{\\loss}{\\weight_{ij}},\n", - "\\end{align}\n", - "where this last equality follows from the fact that $\\mathbb{E}_{\\noisew} \\left[\\noisew_{ij} \\noisew_{kl} \\right] = \\sigma^2$ if and only if $i = k$ and $j = l$, and is 0 otherwise. Therefore, in expectation over many noise samples $\\noisew$, our parameter update based purely on measuring how perturbations of the weights $\\weight'$ correlate with changes in the loss function $\\loss(\\noisew)$, ends up being an unbiased approximation of gradient descent.\n" + "\n$\n\\newcommand{\\stim}{\\mathbf{x}}\n\\newcommand{\\noisew}{\\boldsymbol \\Psi}\n\\newcommand{\\noiser}{\\boldsymbol \\xi}\n\\newcommand{\\target}{y}\n\\newcommand{\\targetdim}{\\mathbf{y}}\n\\newcommand{\\identity}{\\mathbf{I}}\n\\newcommand{\\blackbox}{f}\n\\newcommand{\\weight}{\\mathbf{W}}\n\\newcommand{\\loss}{\\mathcal{L}}\n\\newcommand{\\derivative}[2]{\\frac{d#1}{d#2}}\n\\newcommand{\\pderivative}[2]{\\frac{\\partial#1}{\\partial #2}}\n\\newcommand{\\rate}{\\mathbf{r}}\n\\newcommand{\\T}{^{\\top}}\n\\newcommand{\\RR}{\\mathbb{R}}\n\\newcommand{\\EE}{\\mathbb{E}}\n\\newcommand{\\brackets}[1]{\\left(#1\\right)}\n\\newcommand{\\sqbrackets}[1]{\\left[#1\\right]}\n\\newcommand{\\var}[1]{\\mathbb{V}\\mathrm{ar}\\brackets{#1}}$\n\nIn this first section, we will be deriving and implementing the __Weight Perturbation__ algorithm. In the next section, we will be deriving and implementing the __Node Perturbation__ algorithm. 
Both of these methods of gradient estimation are very closely related to *finite differences* derivative approximation (the idea that a derivative can be understood as an approximation of the slope of the tangent line between two very close points).\n\nSuppose that we have some loss function, $\\loss(\\weight)$, which we would like to minimize by making some change in our synaptic weights, $\\weight$. The most natural way to decrease the loss would be to perform gradient descent; however, it is not reasonable to assume that a synapse in the brain could perform analytic gradient calculations for general loss functions $\\loss(\\weight)$, which may depend on the activity of many downstream neurons and the external environment. We know neurons in the brain can connect to many distant neurons, but not to the extent to mirror large-scale awareness of error signals that gradient descent would imply. Biological systems could solve this problem by *approximating* the gradient of the loss, which could be accomplished in many ways. \n\nTo start, we will provide the __weight perturbation__ update rule, and will subsequently demonstrate why it provides an estimate of the gradient. We will first add noise to our weight matrix, using $\\weight' = \\weight + \\noisew$, where $\\noisew \\sim \\mathcal N(0, \\sigma^2)$ is a diagonal matrix with a fixed sigma on each diagonal element. We take as our update:\n\n\n\\begin{equation}\n \\Delta \\weight = - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\loss(\\noisew) - \\loss(0)\\right ) \\frac{(\\weight' - \\weight)}{\\sigma^2} \\right ].\n\\end{equation}\n\nIn this example, the notation $\\loss(\\noisew)$ stands for the loss value for the perturbed weights and $\\loss(0)$ represents the loss value for the original weights. This notation can also be understood as relating to $\\loss(W')$ and $\\loss(W)$, respectively, without any loss of generality.
We will clarify why this parameter update is interesting from a neuroscientific perspective. If we look at the parameter update for a *single synapse*, $\\weight_{ij}$, we have:\n\n\\begin{align}\n \\Delta \\weight_{ij} &= - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\loss(\\noisew) - \\loss(0)\\right ) \\frac{(\\weight'_{ij} - \\weight_{ij})}{\\sigma^2} \\right ] \\\\\n & \\approx - \\eta \\frac{1}{K}\\sum_{k = 0}^K\\left [\\left (\\loss(\\noisew^{(k)}) - \\loss(0)\\right ) \\frac{(\\weight'^{(k)}_{ij} - \\weight_{ij})}{\\sigma^2} \\right ],\n\\end{align}\n\nwhere for the last approximate equality we are substituting an expectation over $\\noisew$ for an empirical approximation over $K$ samples of $\\noisew$. This update only requires information about the global loss, $\\loss(\\noisew^{(k)})$ and the local parameter values, $\\weight'^{(k)}_{ij}$: using this update, a synapse in a neural network can adapt its strength with *very little* information about what is going on in the rest of the neural circuit.\n\nLastly, we will show why this update is an approximation of the loss gradient: this section is only to satisfy your curiosity, and is not necessary for completing the coding exercises. We first notice that by first-order Taylor expansion $\\loss(\\noisew) \\approx \\loss(0) + \\derivative{\\loss}{\\weight}\\T \\noisew$. Plugging this approximation into our update equation, we get:\n\\begin{align}\n \\Delta \\weight_{ij} &= - \\eta \\mathbb{E}_{\\noisew} \\left [\\left (\\derivative{\\loss}{\\weight}\\T \\noisew\\right ) \\frac{\\noisew_{ij}}{\\sigma^2} \\right ] \\\\\n &= - \\eta \\derivative{\\loss}{\\weight_{ij}},\n\\end{align}\nwhere this last equality follows from the fact that $\\mathbb{E}_{\\noisew} \\left[\\noisew_{ij} \\noisew_{kl} \\right] = \\sigma^2$ if and only if $i = k$ and $j = l$, and is 0 otherwise. 
Therefore, in expectation over many noise samples $\\noisew$, our parameter update based purely on measuring how perturbations of the weights $\\weight'$ correlate with changes in the loss function $\\loss(\\noisew)$, ends up being an unbiased approximation of gradient descent.\n" ] }, { @@ -1216,99 +1170,7 @@ "execution": {} }, "source": [ - "$\\newcommand{\\stim}{\\mathbf{x}}$\n", - "$\\newcommand{\\noisew}{\\boldsymbol \\Psi}$\n", - "$\\newcommand{\\noiser}{\\boldsymbol \\xi}$\n", - "$\\newcommand{\\target}{y}$\n", - "$\\newcommand{\\targetdim}{\\mathbf{y}}$\n", - "$\\newcommand{\\identity}{\\mathbf{I}}$\n", - "$\\newcommand{\\blackbox}{f}$\n", - "$\\newcommand{\\weight}{\\mathbf{W}}$\n", - "$\\newcommand{\\loss}{\\mathcal{L}}$\n", - "$\\newcommand{\\derivative}[2]{\\frac{d#1}{d#2}}$\n", - "$\\newcommand{\\rate}{\\mathbf{r}}$\n", - "$\\newcommand{\\T}{^{\\top}}$\n", - "$\\newcommand{\\RR}{\\mathbb{R}}$\n", - "$\\newcommand{\\EE}{\\mathbb{E}\\,}$\n", - "$\\newcommand{\\brackets}[1]{\\left(#1\\right)}$\n", - "$\\newcommand{\\sqbrackets}[1]{\\left[#1\\right]}$\n", - "$\\newcommand{\\var}[1]{\\mathbb{V}\\mathrm{ar}\\brackets{#1}}$\n", - "\n", - "The main issue of perturbation methods is noise, meaning that across many samples of input stimuli and network perturbations, the gradient estimates will be much more variable than would be the case for backpropagation. This means that many, many more perturbations/training samples will be required to obtain an accurate gradient estimate: the consequence will be either very slow or much less effective learning. \n", - "\n", - "Here, we will demonstrate the noisiness of these learning algorithms analytically for a simplified loss and network. This derivation is principally to satisfy your curiosity: no subsequent exercises will depend on your understanding of the mathematics here, and we will subsequently provide empirical evidence based on network simulations as well. 
First, we will work with a linear network so $\\widehat\\targetdim =\\weight\\stim$, where $\\widehat\\targetdim\\in\\RR^M$, $\\weight\\in\\RR^{M\\times N}$ and $\\stim\\in\\RR^N$. Second, we will assume that the target output is zero $\\targetdim=0$, so the loss becomes $\\loss(\\weight)=\\frac{1}{2}\\|\\weight\\stim\\|^2_2$. (This is equivalent to saying that $\\targetdim=\\weight^*\\stim$ and then shifting the actual weights to be $\\weight - \\weight^*$; notice that here we treat the loss as a function of $\\weight$, rather than $\\Delta \\weight$.)\n", - "\n", - "\n", - "With these changes, we will compute the variance of weight updates for a given input $\\stim$, i.e.\n", - "\\begin{equation*}\n", - " \\var{\\Delta \\weight}=\\EE\\brackets{\\Delta \\weight - \\EE\\Delta\\weight}^2 = \\EE\\brackets{\\Delta \\weight}^2 - \\brackets{\\EE\\Delta\\weight}^2\\,.\n", - "\\end{equation*}\n", - "We already know that the $\\EE\\Delta\\weight$ is the gradient update, so\n", - "\\begin{equation}\n", - " \\brackets{\\EE\\Delta\\weight_{ij}}^2 = \\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2.\n", - "\\end{equation}\n", - "\n", - "Therefore we only need to compute $\\EE(\\Delta\\weight)^2$ for both algorithms.\n", - "\n", - "**Weight perturbation** For a single weight $\\weight_{ij}$, we can use the approximate weight change:\n", - "\\begin{align}\n", - " \\Delta \\weight_{ij} \\,&= - \\eta \\sum_{kl} \\brackets{\\brackets{\\derivative{\\loss}{\\weight}}_{kl} \\noisew_{kl}} \\frac{\\noisew_{ij}}{\\sigma^2}\\,,\\\\\n", - " \\brackets{\\Delta \\weight_{ij}}^2 \\,&= \\frac{\\eta^2}{\\sigma^4} \\brackets{\\sum_{kl}\\brackets{\\derivative{\\loss}{\\weight}}_{kl} \\noisew_{kl}}^2 \\noisew_{ij}^2\\\\\n", - " &=\\frac{\\eta^2}{\\sigma^4} \\brackets{\\sum_{kldn}\\brackets{\\derivative{\\loss}{\\weight}}_{kl}\\brackets{\\derivative{\\loss}{\\weight}}_{dn} \\noisew_{kl}\\noisew_{dn}} \\noisew_{ij}^2\\,.\n", - "\\end{align}\n", - "\n", - "Now we can take the expectation of 
the last line w.r.t. the noise $\\noisew$. Since all entries of the noise matrix are independent and zero-mean Gaussian, we will have non-zero terms in two case: $kl=dn\\neq ij$ and $kl=dn=ij$:\n", - "\\begin{align}\n", - " \\EE\\noisew_{kl}\\noisew_{dn}\\noisew_{ij}^2 = \\begin{cases}\n", - " 0 & k \\neq d\\ \\mathrm{or}\\ l\\neq n\\\\\n", - " \\sigma^4 & k=d, l=n, (k\\neq i\\ \\mathrm{or}\\ l\\neq j)\\\\\n", - " 3\\,\\sigma^4 & k=d=i,l=n=j\n", - " \\end{cases}\n", - "\\end{align}\n", - "\n", - "Therefore,\n", - "\\begin{align}\n", - " \\EE_{\\noisew}\\brackets{\\brackets{\\Delta \\weight_{ij}}^2} \\,& = \\frac{\\eta^2}{\\sigma^4} \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 \\EE \\noisew_{ij}^4 + \\frac{\\eta^2}{\\sigma^4} \\sum_{kl\\neq ij} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2 \\EE\\brackets{\\noisew_{kl}^2 \\noisew_{ij}^2}\\\\\n", - " &=3\\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 + \\eta^2\\sum_{kl\\neq ij} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2\\,,\n", - "\\end{align}\n", - "\n", - "where we used that the 4th central of the Gaussian $\\EE \\noisew_{ij}^4=3\\sigma^4$.\n", - "\n", - "Using the above result, we arrive at\n", - "\\begin{align}\n", - " \\var{\\Delta \\weight_{ij}} = \\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 + \\eta^2\\sum_{kl} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2 = O(MN)\\,,\n", - "\\end{align}\n", - "where the scaling comes from having $MN$ terms in the sum.\n", - "\n", - "**Node perturbation** Again, for a single weight $\\weight_{ij}$, we can use the approximate weight change:\n", - "\\begin{align}\n", - " \\Delta \\weight_{ij} \\,&= -\\frac{\\eta}{\\sigma^2}\\brackets{\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k\\noiser_k} \\noiser_i\\stim_j\\,,\\\\\n", - " \\brackets{\\Delta \\weight_{ij}}^2 \\,&= \\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k\\noiser_k}^2 \\noiser_i^2\\stim_j^2\\\\\n", - " 
&=\\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k,d}\\brackets{\\derivative{\\loss}{\\rate}}_k\\brackets{\\derivative{\\loss}{\\rate}}_d\\noiser_k\\noiser_d} \\noiser_i^2\\stim_j^2\\,.\n", - "\\end{align}\n", - "\n", - "Again, computing the expectation over the last line will make use of the independent zero-mean Gaussian noise:\n", - "\\begin{align}\n", - " \\EE\\noiser_k\\noiser_d\\noiser_i^2 = \\begin{cases}\n", - " 0 & k \\neq d\\\\\n", - " \\sigma^4 & k=d\\neq i\\\\\n", - " 3\\,\\sigma^4 & k=d=i\n", - " \\end{cases}\n", - "\\end{align}\n", - "\n", - "Since only $k=d\\neq i$ and $k=d=i$ terms will remain non-zero, we obtain\n", - "\\begin{align}\n", - " \\EE_{\\noiser}\\brackets{\\brackets{\\Delta \\weight_{ij}}^2} \\,&= \\frac{\\eta^2}{\\sigma^4}\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\EE\\brackets{\\noiser_i^4}\\stim_j^2 + \\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k\\neq i}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\EE\\brackets{\\noiser_k^2 \\noiser_i^2}\\stim_j^2}\\\\\n", - " &=3 \\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2 + \\eta^2\\sum_{k\\neq i}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\stim_j^2\\,.\n", - "\\end{align}\n", - "\n", - "Now since $\\brackets{\\EE_{\\noiser}\\Delta \\weight_{ij}}^2=\\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2$, we have\n", - "\\begin{equation}\n", - " \\var{\\Delta \\weight_{ij}} = \\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2 + \\eta^2\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\stim_j^2 = O(M)\\,,\n", - "\\end{equation}\n", - "where the scaling comes from the sum over $M$ outputs. \n", - "\n", - "To conclude, we found that the variance of the __weight perturbation__ method scales as $O(MN)$ (variance increases if there are more inputs and/or more outputs), while the __node perturbation__ variance scales as $O(M)$ (variance increases only if there are more outputs). 
As such, node perturbation will scale better as the number of inputs or the number of neurons in the network increases, while weight perturbation will do worse. As we will see below, neither of these methods will scale as well as backpropagation. Becoming less effective at scale is a major problem for a learning algorithm operating in the brain, where synaptic modifications may occur in billions of neurons, and potentially trillions of synapses." + "$\\newcommand{\\stim}{\\mathbf{x}}$\n$\\newcommand{\\noisew}{\\boldsymbol \\Psi}$\n$\\newcommand{\\noiser}{\\boldsymbol \\xi}$\n$\\newcommand{\\target}{y}$\n$\\newcommand{\\targetdim}{\\mathbf{y}}$\n$\\newcommand{\\identity}{\\mathbf{I}}$\n$\\newcommand{\\blackbox}{f}$\n$\\newcommand{\\weight}{\\mathbf{W}}$\n$\\newcommand{\\loss}{\\mathcal{L}}$\n$\\newcommand{\\derivative}[2]{\\frac{d#1}{d#2}}$\n$\\newcommand{\\rate}{\\mathbf{r}}$\n$\\newcommand{\\T}{^{\\top}}$\n$\\newcommand{\\RR}{\\mathbb{R}}$\n$\\newcommand{\\EE}{\\mathbb{E}}$\n$\\newcommand{\\brackets}[1]{\\left(#1\\right)}$\n$\\newcommand{\\sqbrackets}[1]{\\left[#1\\right]}$\n$\\newcommand{\\var}[1]{\\mathbb{V}\\mathrm{ar}\\brackets{#1}}$\n\nThe main issue of perturbation methods is noise, meaning that across many samples of input stimuli and network perturbations, the gradient estimates will be much more variable than would be the case for backpropagation. This means that many, many more perturbations/training samples will be required to obtain an accurate gradient estimate: the consequence will be either very slow or much less effective learning. \n\nHere, we will demonstrate the noisiness of these learning algorithms analytically for a simplified loss and network. This derivation is principally to satisfy your curiosity: no subsequent exercises will depend on your understanding of the mathematics here, and we will subsequently provide empirical evidence based on network simulations as well. 
First, we will work with a linear network so $\\widehat{\\targetdim} =\\weight\\stim$, where $\\widehat{\\targetdim}\\in\\RR^M$, $\\weight\\in\\RR^{M\\times N}$ and $\\stim\\in\\RR^N$. Second, we will assume that the target output is zero $\\targetdim=0$, so the loss becomes $\\loss(\\weight)=\\frac{1}{2}\\|\\weight\\stim\\|^2_2$. (This is equivalent to saying that $\\targetdim=\\weight^*\\stim$ and then shifting the actual weights to be $\\weight - \\weight^*$; notice that here we treat the loss as a function of $\\weight$, rather than $\\Delta \\weight$.)\n\n\nWith these changes, we will compute the variance of weight updates for a given input $\\stim$, i.e.\n\\begin{equation*}\n \\var{\\Delta \\weight}=\\EE\\brackets{\\Delta \\weight - \\EE\\Delta\\weight}^2 = \\EE\\brackets{\\Delta \\weight}^2 - \\brackets{\\EE\\Delta\\weight}^2\\,.\n\\end{equation*}\nWe already know that the $\\EE\\Delta\\weight$ is the gradient update, so\n\\begin{equation}\n \\brackets{\\EE\\Delta\\weight_{ij}}^2 = \\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2.\n\\end{equation}\n\nTherefore we only need to compute $\\EE(\\Delta\\weight)^2$ for both algorithms.\n\n**Weight perturbation** For a single weight $\\weight_{ij}$, we can use the approximate weight change:\n\\begin{align}\n \\Delta \\weight_{ij} \\,&= - \\eta \\sum_{kl} \\brackets{\\brackets{\\derivative{\\loss}{\\weight}}_{kl} \\noisew_{kl}} \\frac{\\noisew_{ij}}{\\sigma^2}\\,,\\\\\n \\brackets{\\Delta \\weight_{ij}}^2 \\,&= \\frac{\\eta^2}{\\sigma^4} \\brackets{\\sum_{kl}\\brackets{\\derivative{\\loss}{\\weight}}_{kl} \\noisew_{kl}}^2 \\noisew_{ij}^2\\\\\n &=\\frac{\\eta^2}{\\sigma^4} \\brackets{\\sum_{kldn}\\brackets{\\derivative{\\loss}{\\weight}}_{kl}\\brackets{\\derivative{\\loss}{\\weight}}_{dn} \\noisew_{kl}\\noisew_{dn}} \\noisew_{ij}^2\\,.\n\\end{align}\n\nNow we can take the expectation of the last line w.r.t. the noise $\\noisew$. 
Since all entries of the noise matrix are independent and zero-mean Gaussian, we will have non-zero terms in two cases: $kl=dn\\neq ij$ and $kl=dn=ij$:\n\\begin{align}\n \\EE\\noisew_{kl}\\noisew_{dn}\\noisew_{ij}^2 = \\begin{cases}\n 0 & k \\neq d\\ \\mathrm{or}\\ l\\neq n\\\\\n \\sigma^4 & k=d, l=n, (k\\neq i\\ \\mathrm{or}\\ l\\neq j)\\\\\n 3\\,\\sigma^4 & k=d=i,l=n=j\n \\end{cases}\n\\end{align}\n\nTherefore,\n\\begin{align}\n \\EE_{\\noisew}\\brackets{\\brackets{\\Delta \\weight_{ij}}^2} \\,& = \\frac{\\eta^2}{\\sigma^4} \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 \\EE \\noisew_{ij}^4 + \\frac{\\eta^2}{\\sigma^4} \\sum_{kl\\neq ij} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2 \\EE\\brackets{\\noisew_{kl}^2 \\noisew_{ij}^2}\\\\\n &=3\\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 + \\eta^2\\sum_{kl\\neq ij} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2\\,,\n\\end{align}\n\nwhere we used that the 4th central moment of the Gaussian $\\EE \\noisew_{ij}^4=3\\sigma^4$.\n\nUsing the above result, we arrive at\n\\begin{align}\n \\var{\\Delta \\weight_{ij}} = \\eta^2 \\brackets{\\derivative{\\loss}{\\weight}}_{ij}^2 + \\eta^2\\sum_{kl} \\brackets{\\derivative{\\loss}{\\weight}}_{kl}^2 = O(MN)\\,,\n\\end{align}\nwhere the scaling comes from having $MN$ terms in the sum.\n\n**Node perturbation** Again, for a single weight $\\weight_{ij}$, we can use the approximate weight change:\n\\begin{align}\n \\Delta \\weight_{ij} \\,&= -\\frac{\\eta}{\\sigma^2}\\brackets{\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k\\noiser_k} \\noiser_i\\stim_j\\,,\\\\\n \\brackets{\\Delta \\weight_{ij}}^2 \\,&= \\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k\\noiser_k}^2 \\noiser_i^2\\stim_j^2\\\\\n &=\\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k,d}\\brackets{\\derivative{\\loss}{\\rate}}_k\\brackets{\\derivative{\\loss}{\\rate}}_d\\noiser_k\\noiser_d} \\noiser_i^2\\stim_j^2\\,.\n\\end{align}\n\nAgain, computing the expectation over
the last line will make use of the independent zero-mean Gaussian noise:\n\\begin{align}\n \\EE\\noiser_k\\noiser_d\\noiser_i^2 = \\begin{cases}\n 0 & k \\neq d\\\\\n \\sigma^4 & k=d\\neq i\\\\\n 3\\,\\sigma^4 & k=d=i\n \\end{cases}\n\\end{align}\n\nSince only $k=d\\neq i$ and $k=d=i$ terms will remain non-zero, we obtain\n\\begin{align}\n \\EE_{\\noiser}\\brackets{\\brackets{\\Delta \\weight_{ij}}^2} \\,&= \\frac{\\eta^2}{\\sigma^4}\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\EE\\brackets{\\noiser_i^4}\\stim_j^2 + \\frac{\\eta^2}{\\sigma^4}\\brackets{\\sum_{k\\neq i}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\EE\\brackets{\\noiser_k^2 \\noiser_i^2}\\stim_j^2}\\\\\n &=3 \\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2 + \\eta^2\\sum_{k\\neq i}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\stim_j^2\\,.\n\\end{align}\n\nNow since $\\brackets{\\EE_{\\noiser}\\Delta \\weight_{ij}}^2=\\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2$, we have\n\\begin{equation}\n \\var{\\Delta \\weight_{ij}} = \\eta^2\\brackets{\\derivative{\\loss}{\\rate}}_i^2 \\stim_j^2 + \\eta^2\\sum_{k}\\brackets{\\derivative{\\loss}{\\rate}}_k^2\\stim_j^2 = O(M)\\,,\n\\end{equation}\nwhere the scaling comes from the sum over $M$ outputs. \n\nTo conclude, we found that the variance of the __weight perturbation__ method scales as $O(MN)$ (variance increases if there are more inputs and/or more outputs), while the __node perturbation__ variance scales as $O(M)$ (variance increases only if there are more outputs). As such, node perturbation will scale better as the number of inputs or the number of neurons in the network increases, while weight perturbation will do worse. As we will see below, neither of these methods will scale as well as backpropagation. Becoming less effective at scale is a major problem for a learning algorithm operating in the brain, where synaptic modifications may occur in billions of neurons, and potentially trillions of synapses." 
] }, { @@ -1515,68 +1377,7 @@ "execution": {} }, "source": [ - "\n", - "$\\newcommand{\\stim}{\\mathbf{x}}$\n", - "$\\newcommand{\\h}{\\mathbf{h}}$\n", - "$\\newcommand{\\noisew}{\\boldsymbol \\Psi}$\n", - "$\\newcommand{\\noiser}{\\boldsymbol \\xi}$\n", - "$\\newcommand{\\target}{y}$\n", - "$\\newcommand{\\pred}{\\mathbf{\\hat{y}}}$\n", - "$\\newcommand{\\identity}{\\mathbf{I}}$\n", - "$\\newcommand{\\blackbox}{f}$\n", - "$\\newcommand{\\weight}{\\mathbf{W}}$\n", - "$\\newcommand{\\weightout}{\\mathbf{W}^{\\textrm{out}}}$\n", - "$\\newcommand{\\loss}{\\mathcal{L}}$\n", - "$\\newcommand{\\derivative}[2]{\\frac{\\partial#1}{\\partial#2}}$\n", - "$\\newcommand{\\rate}{\\mathbf{r}}$\n", - "$\\newcommand{\\error}{\\boldsymbol \\delta}$\n", - "$\\newcommand{\\losserror}{\\mathbf{e}}$\n", - "$\\newcommand{\\backweight}{\\mathbf{B}}$\n", - "\n", - "In this section, we describe the __Feedback Alignment__ algorithm. Unlike weight and node perturbation, feedback alignment provides a mechanism whereby individual neurons can receive *targeted* error signals. 
To start, we assume the following network setup:\n", - "\n", - "\\begin{align}\n", - " \\pred = \\blackbox(\\weight \\stim) = \\weightout\\sigma(\\weight\\stim) =\\weightout \\h\n", - "\\end{align}\n", - "\n", - "With a mean squared error loss over all of the output neurons.\n", - "\\begin{equation}\n", - " \\loss = \\frac{1}{2n} \\sum_{k=1}^{n}\\left (\\target_k - \\hat{y}_k \\right )^2\n", - "\\end{equation}\n", - "\n", - "Note here we have suppressed the batch index notation, and will calculate the following gradients as averages over batch elements.\n", - "\n", - "Backpropagation updates parameters using the gradient of the loss scaled by the learning rate $\\eta$.\n", - "\n", - "\\begin{align}\n", - " \\Delta \\weight_{ji} &= - \\eta \\derivative{\\loss}{\\weight}_{ji} \\\\\n", - " &= - \\eta \\underbrace{\\derivative{\\loss}{\\pred}\\derivative{\\pred}{h_j}}_{\\delta_j}\\derivative{h_j}{\\weight_{ji}}\\\\\n", - " &= - \\eta \\delta_j \\sigma^{\\prime}(\\weight\\stim)_j\\stim_i \\\\\n", - " &= - \\eta \\delta_j h^{\\prime}_j\\stim_i\n", - "\\end{align}\n", - "\n", - "While $h^{\\prime}_j$ and $\\stim_i$ are available locally to the neuron, calculating $\\delta_j$\n", - "involves non-local information, and is therefore biologically implausible.\n", - "\n", - "\\begin{align}\n", - " \\delta_j &= \\derivative{\\loss}{h_j} \\\\\n", - " &= \\sum_{k=1}^n \\derivative{\\loss}{\\hat{y}_k}\\derivative{\\hat{y}_k}{h_j} \\\\\n", - " &= \\sum_{k=1}^n \\overbrace{(y_k - \\hat{y_k})}^{e_k} \\weightout_{kj} \\\\\n", - " &= e_1 {\\color{red}\\weightout_{1j}} + e_2 {\\color{green}\\weightout_{2j}} + e_3{\\color{magenta}\\weightout_{3j}}\n", - "\\end{align}\n", - "\n", - "In order to calculate $\\delta_j$ we need to use all of of the outgoing weights from neuron $h_j$.\n", - "\n", - "Writing $\\error$ as a column vector (i.e. 
$\\derivative{\\loss}{\\h}$ in [denominator layout](https://en.wikipedia.org/wiki/Matrix_calculus#Layout_conventions)) we see that in order to calculate $\\error$ we need the transpose of the forward weights.\n", - "\\begin{align}\n", - " \\error &= \\weight_{out}^T \\losserror .\n", - "\\end{align}\n", - "\n", - "\n", - "\n", - "*From Lillicrap et al. (2016), CC-BY*\n", - "\n", - "Feedback alignment replaces $\\weight_{out}^T $ with a random matrix, $\\backweight$. This resolves the 'weight transport' problem, because the feedback weights are no longer the same as the feedforward weights. However, by replacing $\\weight_{out}^T$ with $\\backweight$, we are no longer calculating an accurate gradient! Interestingly, we will see empirically in subsequent sections that this replacement still produces reasonably good gradient estimates, though it still introduces *bias*, because the backward weights are not the same as the forward weights (as explained in the video above)." + "\n$\\newcommand{\\stim}{\\mathbf{x}}$\n$\\newcommand{\\h}{\\mathbf{h}}$\n$\\newcommand{\\noisew}{\\boldsymbol \\Psi}$\n$\\newcommand{\\noiser}{\\boldsymbol \\xi}$\n$\\newcommand{\\target}{y}$\n$\\newcommand{\\pred}{\\mathbf{\\hat{y}}}$\n$\\newcommand{\\identity}{\\mathbf{I}}$\n$\\newcommand{\\blackbox}{f}$\n$\\newcommand{\\weight}{\\mathbf{W}}$\n$\\newcommand{\\weightout}{\\mathbf{W}^{\\textrm{out}}}$\n$\\newcommand{\\loss}{\\mathcal{L}}$\n$\\newcommand{\\derivative}[2]{\\frac{\\partial#1}{\\partial #2}}$\n$\\newcommand{\\rate}{\\mathbf{r}}$\n$\\newcommand{\\error}{\\boldsymbol \\delta}$\n$\\newcommand{\\losserror}{\\mathbf{e}}$\n$\\newcommand{\\backweight}{\\mathbf{B}}$\n\nIn this section, we describe the __Feedback Alignment__ algorithm. Unlike weight and node perturbation, feedback alignment provides a mechanism whereby individual neurons can receive *targeted* error signals. 
To start, we assume the following network setup:\n\n\\begin{align}\n \\pred = \\blackbox(\\weight \\stim) = \\weightout\\sigma(\\weight\\stim) =\\weightout \\h\n\\end{align}\n\nWith a mean squared error loss over all of the output neurons.\n\\begin{equation}\n \\loss = \\frac{1}{2n} \\sum_{k=1}^{n}\\left (\\target_k - \\hat{y}_k \\right )^2\n\\end{equation}\n\nNote here we have suppressed the batch index notation, and will calculate the following gradients as averages over batch elements.\n\nBackpropagation updates parameters using the gradient of the loss scaled by the learning rate $\\eta$.\n\n\\begin{align}\n \\Delta \\weight_{ji} &= - \\eta \\derivative{\\loss}{\\weight}_{ji} \\\\\n &= - \\eta \\underbrace{\\derivative{\\loss}{\\pred}\\derivative{\\pred}{h_j}}_{\\delta_j}\\derivative{h_j}{\\weight_{ji}}\\\\\n &= - \\eta \\delta_j \\sigma^{\\prime}(\\weight\\stim)_j\\stim_i \\\\\n &= - \\eta \\delta_j h^{\\prime}_j\\stim_i\n\\end{align}\n\nWhile $h^{\\prime}_j$ and $\\stim_i$ are available locally to the neuron, calculating $\\delta_j$\ninvolves non-local information, and is therefore biologically implausible.\n\n\\begin{align}\n \\delta_j &= \\derivative{\\loss}{h_j} \\\\\n &= \\sum_{k=1}^n \\derivative{\\loss}{\\hat{y}_k}\\derivative{\\hat{y}_k}{h_j} \\\\\n &= \\sum_{k=1}^n \\overbrace{(y_k - \\hat{y}_k)}^{e_k} \\weightout_{kj} \\\\\n &= e_1 {\\color{red}\\weightout_{1j}} + e_2 {\\color{green}\\weightout_{2j}} + e_3{\\color{magenta}\\weightout_{3j}}\n\\end{align}\n\nIn order to calculate $\\delta_j$ we need to use all of the outgoing weights from neuron $h_j$.\n\nWriting $\\error$ as a column vector (i.e. $\\derivative{\\loss}{\\h}$ in [denominator layout](https://en.wikipedia.org/wiki/Matrix_calculus#Layout_conventions)) we see that in order to calculate $\\error$ we need the transpose of the forward weights.\n\\begin{align}\n \\error &= \\weight_{out}^T \\losserror .\n\\end{align}\n\n\n\n*From Lillicrap et al.
(2016), CC-BY*\n\nFeedback alignment replaces $\\weight_{out}^T $ with a random matrix, $\\backweight$. This resolves the 'weight transport' problem, because the feedback weights are no longer the same as the feedforward weights. However, by replacing $\\weight_{out}^T$ with $\\backweight$, we are no longer calculating an accurate gradient! Interestingly, we will see empirically in subsequent sections that this replacement still produces reasonably good gradient estimates, though it still introduces *bias*, because the backward weights are not the same as the forward weights (as explained in the video above)." ] }, { @@ -2254,4 +2055,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file