From 636f805a0610a4ad646a979647696795c92d3e80 Mon Sep 17 00:00:00 2001 From: Federico Trifoglio Date: Sat, 28 Oct 2023 18:37:11 +0100 Subject: [PATCH] add derivation of BFGS direct inverse --- docs/calculus/ca_extra.ipynb | 32772 ++++++++++++++++++++++++++++++++- 1 file changed, 32684 insertions(+), 88 deletions(-) diff --git a/docs/calculus/ca_extra.ipynb b/docs/calculus/ca_extra.ipynb index 4c9cf7c..87a1d67 100644 --- a/docs/calculus/ca_extra.ipynb +++ b/docs/calculus/ca_extra.ipynb @@ -82,8 +82,12 @@ "xx, yy = np.meshgrid(x_, y_)\n", "\n", "fig, ax = plt.subplots()\n", - "cs1 = ax.contour(xx, yy, sp.lambdify((x, y), f, \"numpy\")(xx, yy), levels=20, cmap=\"RdBu_r\")\n", - "cs2 = ax.contour(xx, yy, sp.lambdify((x, y), g, \"numpy\")(xx, yy), levels=[136], colors=\"k\")\n", + "cs1 = ax.contour(\n", + " xx, yy, sp.lambdify((x, y), f, \"numpy\")(xx, yy), levels=20, cmap=\"RdBu_r\"\n", + ")\n", + "cs2 = ax.contour(\n", + " xx, yy, sp.lambdify((x, y), g, \"numpy\")(xx, yy), levels=[136], colors=\"k\"\n", + ")\n", "ax.clabel(cs1, inline=True, fontsize=10)\n", "ax.clabel(cs2, inline=True, fontsize=10)\n", "ax.spines[[\"left\", \"bottom\"]].set_position((\"data\", 0))\n", @@ -92,7 +96,7 @@ "ax.set_xlim(-15, 15)\n", "ax.set_ylim(-15, 15)\n", "ax.set_aspect(\"equal\")\n", - "plt.title(\"Contour plot of $f(x, y)$ with constraint $g(x) = 136$\", pad=20.)\n", + "plt.title(\"Contour plot of $f(x, y)$ with constraint $g(x) = 136$\", pad=20.0)\n", "plt.show()" ] }, @@ -377571,9 +377575,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "w, b = params.values()\n", "xx1, xx2 = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))\n", @@ -763418,6 +795219,801 @@ " ),\n", ")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Derivation of the direct inverse of the BFGS update" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The objective is to derive\n", + "\n", + "$B^{-1}_{k+1} = \\left(I - \\cfrac{s_ky_k^T}{y_k^Ts_k} \\right)B^{-1}_k \\left(I - \\cfrac{y_ks_k^T}{y_k^Ts_k} \\right) + \\cfrac{s_ks_k^T}{y_k^Ts_k}$\n", + "\n", + "from\n", + "\n", + "$B^{-1}_{k+1} = \\left(B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k})\\right)^{-1}$\n", + "\n", + "Using Woodbury matrix identity we get that the inverse of $B_{k+1}$ is\n", + "\n", + "$B_{k+1}^{-1} = \\left(\\underbrace{B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}}_{(1)}\\right)^{-1} = (B_k+UV^T)^{-1} = \\underbrace{B_k^{-1}-B_k^{-1}U(I+V^TB_k^{-1}U)^{-1}V^TB_k^{-1}}_{(2)}$\n", + "\n", + "For the sake of a cleaner notation, let's remove the $k$ indexing from $(1)$.\n", + "\n", + "$B + \\cfrac{yy^T}{y^Ts} -\\cfrac{Bss^TB}{s^TBs}$\n", + "\n", + "It turns out the above expression can be rewritten as\n", + "\n", + "$\\underbrace{B}_{n \\times n}+\n", + "\\underbrace{\\begin{bmatrix}\\overbrace{Bs}^{n \\times 1}&&\\overbrace{y}^{n \\times 1}\\end{bmatrix}}_{U \\text{ } (n \\times 2)}\n", + "\\underbrace{\\overbrace{\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}}^{V_1 \\text{ } (2 \\times 2)}\n", + "\\overbrace{\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}}^{V_2 \\text{ } (2 \\times n)}}_{V^T \\text{ } (2 \\times n)}$\n", + "\n", + "To verify this and all the next operations we'll be making, let's create a simple example with $n=3$." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shapes:\n", + "-------\n", + "y: (3,)\n", + "s: (3,)\n", + "B: (3, 3)\n", + "U: (3, 2)\n", + "V1: (2, 2)\n", + "V2: (2, 3)\n", + "VT: (2, 3)\n" + ] + } + ], + "source": [ + "y = np.array([1, 2, 3])\n", + "s = np.array([3, 4, 5])\n", + "B = np.array([[1, 2, 3], [2, 2, 3], [3, 3, 3]])\n", + "B1 = (\n", + " B\n", + " + np.outer(y, y.T) / np.dot(y.T, s)\n", + " - np.outer(B @ s, s.T @ B) / np.dot(np.dot(s.T, B), s)\n", + ")\n", + "\n", + "U = np.hstack([(B @ s)[:, None], y[:, None]])\n", + "V1 = np.array([[-1 / (s.T @ B @ s), 0], [0, 1 / (y.T @ s)]])\n", + "V2 = np.vstack([s.T @ B, y.T])\n", + "VT = V1 @ V2\n", + "\n", + "print(\"Shapes:\\n-------\")\n", + "print(\"y: \", y.shape)\n", + "print(\"s: \", s.shape)\n", + "print(\"B: \", B.shape)\n", + "print(\"U: \", U.shape)\n", + "print(\"V1:\", V1.shape)\n", + "print(\"V2:\", V2.shape)\n", + "print(\"VT:\", VT.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's verify that\n", + "\n", + "$B + \\cfrac{yy^T}{y^Ts} -\\cfrac{Bss^TB}{s^TBs} = B + UV^T$" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(B1, B + U @ VT)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plug the substitutions for $U$ and $V^T$ into $(2)$ and we obtain\n", + "\n", + "$B^{-1}-B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\n", + "\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n", + "\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$\n", + "\n", + "Let's verify that we can simplify $B^{-1}U := B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}$ to $\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}$ and let's define it as $D$." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shapes:\n", + "-------\n", + "D: (3, 2)\n" + ] + } + ], + "source": [ + "D = np.hstack([s[:, None], (np.linalg.inv(B) @ y)[:, None]])\n", + "assert np.allclose(np.linalg.inv(B) @ U, D)\n", + "print(\"Shapes:\\n-------\")\n", + "print(\"D:\", D.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's verify we can simplify $V_2B^{-1} := \\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$ to $\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$ and that is equivalent to $D^T$, and so it follows that $B^{-1}V_2^T = D$.\n", + "\n", + "Note that $(B^{-1})^T$ is the same as $B^{-1}$ because $B$ is symmetric." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(V2 @ np.linalg.inv(B), np.vstack([s.T, y.T @ np.linalg.inv(B)]))\n", + "assert np.allclose(D.T, np.vstack([s.T, y.T @ np.linalg.inv(B)]))\n", + "assert np.allclose(V2 @ np.linalg.inv(B), D.T)\n", + "assert np.allclose(np.linalg.inv(B) @ V2.T, D)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let $M$ the big bracketed expression in the middle.\n", + "\n", + "$B^{-1}-\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n", + "\\underbrace{\\left(\\overbrace{I}^{(2 \\times 2)}+\n", + "\\overbrace{\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}}^{2 \\times n}\n", + "\\overbrace{B^{-1}}^{n \\times n}\n", + "\\overbrace{\\begin{bmatrix}Bs&&y\\end{bmatrix}}^{(n \\times 2)}\\right)^{-1}\n", + "\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}}_{M^{-1} \\text{ } (2 \\times 2)}\n", + "\\underbrace{\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}}_{D^T \\text{ } (2 \\times n)}$" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shapes:\n", + "-------\n", + "M_inv: (2, 2)\n" + ] + } + ], + "source": [ + "M_inv = np.linalg.inv(np.identity(2) + VT @ np.linalg.inv(B) @ U) @ V1\n", + "print(\"Shapes:\\n-------\")\n", + "print(\"M_inv:\", M_inv.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's verify $B^{-1}-DM^{-1}D^T$ is equivalent to $\\left(B_{k} + \\cfrac{y_ky_k^T}{y_k^Ts_k} -\\cfrac{B_ks_ks_k^TB_k}{s_k^TB_ks_k}\\right)^{-1}$." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(np.linalg.inv(B) - D @ M_inv @ D.T, np.linalg.inv(B1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's simplify $M^{-1}$.\n", + "\n", + "$M^{-1} = (I + V^T B^{-1} U)^{-1} V_1$\n", + "\n", + "$M^{-1} = [V_{1}^{-1} (I + V_1 V_2 B^{-1} U)]^{-1}$\n", + "\n", + "Earlier we simplified $B^{-1} U$ to $D$, so we get\n", + "\n", + "$M^{-1} = [V_{1}^{-1} (I + V_{1} V_{2} D)]^{-1}$\n", + "\n", + "$M^{-1} = (V_{1}^{-1} + V_2 D)^{-1}$\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(M_inv, np.linalg.inv(np.linalg.inv(V1) + V2 @ np.linalg.inv(B) @ U))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we have that $M^{-1}$ is\n", + "\n", + "$M^{-1} = \\left(\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}+\n", + "\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}$\n", + "\n", + "It might not be obvious that the inverse of $V1$ is $\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}$, so let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(np.linalg.inv(V1), np.array([[-s.T @ B @ s, 0], [0, y.T @ s]]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's verify $\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}$ becomes $\\begin{bmatrix}s^TBs&&s^Ty\\\\y^Ts&&y^TB^{-1}y\\end{bmatrix}$." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " V2 @ D, np.array([[s.T @ B @ s, s.T @ y], [y.T @ s, y.T @ np.linalg.inv(B) @ y]])\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So $M^{-1}$ now is\n", + "\n", + "$M^{-1} = \\left(\\begin{bmatrix}-s^TBs&&0\\\\0&&y^Ts\\end{bmatrix}+\n", + "\\begin{bmatrix}s^TBs&&s^Ty\\\\y^Ts&&y^TB^{-1}y\\end{bmatrix}\\right)^{-1}$\n", + "\n", + "And we can simplify it further to\n", + "\n", + "$M^{-1} = \\begin{bmatrix}0&&s^Ty\\\\y^Ts&&y^Ts+y^TB^{-1}y\\end{bmatrix}^{-1}$\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " M_inv,\n", + " np.linalg.inv(\n", + " np.array([[0, y.T @ s], [s.T @ y, y.T @ s + y.T @ np.linalg.inv(B) @ y]])\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To calculate the inverse of $M^{-1}$, recall\n", + "\n", + "$A^{-1} = \\begin{bmatrix}a&&b\\\\c&&d\\end{bmatrix}^{-1}$\n", + "\n", + "$A^{-1} = \\cfrac{1}{ab - cd}\\begin{bmatrix}d&&-b\\\\-c&&a\\end{bmatrix}$\n", + "\n", + "The determinant of $M$ is\n", + "\n", + "$\\text{det} M = -s^Tyy^Ts$\n", + "\n", + "So $M^{-1}$ becomes\n", + "\n", + "$M^{-1} = -\\cfrac{1}{s^Tyy^Ts} \\begin{bmatrix}y^Ts+y^TB^{-1}y&&-s^Ty\\\\-y^Ts&&0\\end{bmatrix}$\n", + "\n", + "$M^{-1} = \\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}$" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "det = s.T @ y * y @ s.T\n", + "assert np.allclose(\n", + " M_inv,\n", + " np.array(\n", + " [\n", + " [-(y.T @ s + y.T @ np.linalg.inv(B) @ y) / det, s.T @ y / det],\n", + " [y.T @ s / det, 0],\n", + " ]\n", + " ),\n", + ")\n", + "# replace M_inv with new equivalent\n", + "M_inv = np.array(\n", + " [\n", + " [-(y.T @ s + y.T @ np.linalg.inv(B) @ y) / det, s.T @ y / det],\n", + " [y.T @ s / det, 0],\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plug it back in into $B^{-1}-DM^{-1}D^T$ and we get\n", + "\n", + "$B^{-1}-\n", + "\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n", + "\\underbrace{\\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}}_{M^{-1} \\text{ } (2 \\times 2)}\n", + "\\underbrace{\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}}_{D^T \\text{ } (2 \\times n)}$\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(np.linalg.inv(B) - D @ M_inv @ D.T, np.linalg.inv(B1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$B^{-1}-\n", + "\\underbrace{\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}}_{D \\text{ } (n \\times 2)}\n", + "\\underbrace{\\begin{bmatrix}-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\\\s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\end{bmatrix}}_{M^{-1}D^T \\text{ } (2 \\times n)}$\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " M_inv @ D.T,\n", + " np.vstack(\n", + " [\n", + " -s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det\n", + " + (y.T @ np.linalg.inv(B)) * (s.T @ y / det),\n", + " s.T * (y.T @ s / det),\n", + " ]\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$B^{-1}-\\underbrace{\\left[s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)+B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)\\right]}_{DM^{-1}D^T \\text{} (n \\times n)}$\n", + "\n", + "The first and second element of `D` are column vectors.\n", + "\n", + "Recall, column vector by row vector is an outer product, whereas row vector by column vector is a dot product.\n", + "\n", + "So $s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)$ is an outer product.\n", + "\n", + "And $B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)$ is another outer product.\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " D @ M_inv @ D.T,\n", + " np.outer(\n", + " s,\n", + " (\n", + " -s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det\n", + " + (y.T @ np.linalg.inv(B)) * (s.T @ y / det)\n", + " ),\n", + " )\n", + " + np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s / det)),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's bring $s$ inside the bracket.\n", + "\n", + "$B^{-1}-\\left[-ss^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+sy^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}+B^{-1}ys^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right]$\n", + "\n", + "Let's verify." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "out1 = np.outer(-s, s.T * (y.T @ s + y.T @ np.linalg.inv(B) @ y) / det)\n", + "out2 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y / det))\n", + "out3 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s / det))\n", + "assert np.allclose(D @ M_inv @ D.T, out1 + out2 + out3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's bring $\\cfrac{1}{s^Tyy^Ts}$ outside.\n", + "\n", + "$B^{-1}-\\cfrac{1}{s^Tyy^Ts}\\left[-ss^Ty^Ts -ss^Ty^TB^{-1}y + sy^TB^{-1}s^Ty + B^{-1}ys^Ty^Ts\\right]$\n", + "\n", + "Let's verify." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "out1 = np.outer(-s, s.T * (y.T @ s))\n", + "out2 = np.outer(-s, s.T * (y.T @ np.linalg.inv(B) @ y))\n", + "out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y))\n", + "out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s))\n", + "assert np.allclose(D @ M_inv @ D.T, 1 / det * (out1 + out2 + out3 + out4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's swap the signs.\n", + "\n", + "$B^{-1} + \\cfrac{1}{s^Tyy^Ts}\\left[ss^Ty^Ts + ss^Ty^TB^{-1}y - sy^TB^{-1}s^Ty - B^{-1}ys^Ty^Ts\\right]$\n", + "\n", + "Let's verify." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "out1 = np.outer(s, s.T * (y.T @ s))\n", + "out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y))\n", + "out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y))\n", + "out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s))\n", + "assert np.allclose(\n", + " np.linalg.inv(B) - D @ M_inv @ D.T,\n", + " np.linalg.inv(B) + 1 / det * (out1 + out2 - out3 - out4),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$B^{-1} + \\cfrac{ss^Ty^Ts}{s^Tyy^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}s^Ty}{s^Tyy^Ts} - \\cfrac{B^{-1}ys^Ty^Ts}{s^Tyy^Ts}$\n", + "\n", + "Now, it can be really confusing to understand what's a dot product, an outer product or a simple multiplication.\n", + "\n", + "So let's take a look at the code.\n", + "\n", + "```\n", + "out1 = np.outer(s, s.T * (y.T @ s)) / (s.T @ y * y @ s.T)\n", + "out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (s.T @ y * y @ s.T)\n", + "out3 = np.outer(s, (y.T @ np.linalg.inv(B)) * (s.T @ y)) / (s.T @ y * y @ s.T)\n", + "out4 = np.outer(np.linalg.inv(B) @ y, s.T * (y.T @ s)) / (s.T @ y * y @ s.T)\n", + "```\n", + "\n", + "We can see that `(y.T @ s)`, `(s.T @ y)` and `(y.T @ s)` in `out1`, `out3` and `out4`, respectively can go away.\n", + "\n", + "$B^{-1} + \\cfrac{ss^T}{s^Ty} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{s^Ty}$\n", + "\n", + "Also note that $s^Ty = y^Ts$ and $s^Tyy^Ts = (y^Ts)^2$, but let's verify it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "assert s.T @ y == y.T @ s\n", + "assert s.T @ y * y @ s.T == (y.T @ s)**2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall, our objective is to get to this form\n", + "\n", + "$\\left(I - \\cfrac{sy^T}{y^Ts} \\right)B^{-1} \\left(I - \\cfrac{ys^T}{y^Ts} \\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "We're almost there.\n", + "\n", + "$B^{-1} + \\cfrac{ss^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts}$\n", + "\n", + "Let's verify it, though." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "out1 = np.outer(s, s.T) / (y.T @ s)\n", + "out2 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (y.T @ s)**2\n", + "out3 = np.outer(s, (y.T @ np.linalg.inv(B))) / (y.T @ s)\n", + "out4 = np.outer(np.linalg.inv(B) @ y, s.T) / (y.T @ s)\n", + "assert np.allclose(\n", + " np.linalg.inv(B) - D @ M_inv @ D.T,\n", + " np.linalg.inv(B) + out1 + out2 - out3 - out4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make the next simplifications as clear as possible, let's just change the order of the terms and put the ones that can be factored together.\n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "Although it was a trivial operation, let's verify it again." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "out1 = np.outer(s, (y.T @ np.linalg.inv(B))) / (y.T @ s)\n", + "out2 = np.outer(np.linalg.inv(B) @ y, s.T) / (y.T @ s)\n", + "out3 = np.outer(s, s.T * (y.T @ np.linalg.inv(B) @ y)) / (y.T @ s) ** 2\n", + "out4 = np.outer(s, s.T) / (y.T @ s)\n", + "assert np.allclose(\n", + " np.linalg.inv(B) - D @ M_inv @ D.T,\n", + " np.linalg.inv(B) - out1 - out2 + out3 + out4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It turns out that\n", + "\n", + "$\\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} = \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$\n", + "\n", + "And also\n", + "\n", + "$\\cfrac{sy^TB^{-1}}{y^Ts} = \\cfrac{sy^T}{y^Ts}B^{-1}$\n", + "\n", + "$\\cfrac{B^{-1}ys^T}{y^Ts} = B^{-1}\\cfrac{ys^T}{y^Ts}$\n", + "\n", + "Let's verify it." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(out1, (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n", + "assert np.allclose(out2, np.linalg.inv(B) @ (np.outer(y, s.T) / (y.T @ s)))\n", + "assert np.allclose(\n", + " out3,\n", + " (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B) @ np.outer(y, s.T) / (y.T @ s),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "Let's verify that \n", + "\n", + "$B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$ \n", + "\n", + "can be rewritten as\n", + "\n", + "$\\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts}$" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " out2 + out3,\n", + " (np.linalg.inv(B) + (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n", + " @ np.outer(y, s.T)\n", + " / (y.T @ s),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts} + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "Let's verify that \n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right) - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts}$\n", + "\n", + "can be rewritten as\n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right)$\n", + "\n", + "Note the signs are indeed correct.\n", + "\n", + "$B^{-1}$ with $- \\cfrac{ys^T}{y^Ts}$ becomes $-B^{-1}\\cfrac{ys^T}{y^Ts}$\n", + "\n", + "$- \\cfrac{sy^T}{y^Ts}B^{-1}$ with $- \\cfrac{ys^T}{y^Ts}$ becomes $\\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}$." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " np.linalg.inv(B) - out1 - out2 + out3,\n", + " (np.linalg.inv(B) - (np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B))\n", + " @ (np.identity(3) - np.outer(y, s.T) / (y.T @ s)),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "Now, let's verify that \n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)$\n", + "\n", + "can be rewritten as\n", + "\n", + "$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}$" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "assert np.allclose(\n", + " np.linalg.inv(B) - out1,\n", + " (np.identity(3) - np.outer(s, y.T) / (y.T @ s)) @ np.linalg.inv(B),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we finally have\n", + "\n", + "$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "which is indeed what we want\n", + "\n", + "$B^{-1}_{k+1} = \\left(I - \\cfrac{s_ky_k^T}{y_k^Ts_k} \\right)B^{-1}_k \\left(I - \\cfrac{y_ks_k^T}{y_k^Ts_k} \\right) + \\cfrac{s_ks_k^T}{y_k^Ts_k}$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To appreciate how far we've come, here's the full derivation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$B^{-1}-B^{-1}U(I+V^TB^{-1}U)^{-1}V^TB^{-1}$\n", + "\n", + "$B^{-1}-B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\n", + "\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n", + "\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}$\n", + "\n", + "$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n", + "\\left(I+\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\\begin{bmatrix}s^TB\\\\y^T\\end{bmatrix}B^{-1}\n", + "\\begin{bmatrix}Bs&&y\\end{bmatrix}\\right)^{-1}\n", + "\\begin{bmatrix}-\\cfrac{1}{s^TBs}&&0\\\\0&&\\cfrac{1}{y^Ts}\\end{bmatrix}\n", + "\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$\n", + "\n", + "$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n", + "\\begin{bmatrix}-\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}&&\\cfrac{s^Ty}{s^Tyy^Ts}\\\\\\cfrac{y^Ts}{s^Tyy^Ts}&&0\\end{bmatrix}\n", + "\\begin{bmatrix}s^T\\\\y^TB^{-1}\\end{bmatrix}$\n", + "\n", + "$B^{-1}-\\begin{bmatrix}s&&B^{-1}y\\end{bmatrix}\n", + "\\begin{bmatrix}-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\\\s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\end{bmatrix}$\n", + "\n", + "$B^{-1}-\\left[s\\left(-s^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+y^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}\\right)+B^{-1}y\\left(s^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right)\\right]$\n", + "\n", + "$B^{-1}-\\left[-ss^T\\cfrac{y^Ts+y^TB^{-1}y}{s^Tyy^Ts}+sy^TB^{-1}\\cfrac{s^Ty}{s^Tyy^Ts}+B^{-1}ys^T\\cfrac{y^Ts}{s^Tyy^Ts}\\right]$\n", + "\n", + "$B^{-1}-\\cfrac{1}{s^Tyy^Ts}\\left[-ss^Ty^Ts -ss^Ty^TB^{-1}y + sy^TB^{-1}s^Ty + B^{-1}ys^Ty^Ts\\right]$\n", + "\n", + "$B^{-1} + \\cfrac{1}{s^Tyy^Ts}\\left[ss^Ty^Ts + ss^Ty^TB^{-1}y - sy^TB^{-1}s^Ty - B^{-1}ys^Ty^Ts\\right]$\n", + "\n", + "$B^{-1} + \\cfrac{ss^Ty^Ts}{s^Tyy^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}s^Ty}{s^Tyy^Ts} - \\cfrac{B^{-1}ys^Ty^Ts}{s^Tyy^Ts}$\n", + "\n", + "$B^{-1} + \\cfrac{ss^T}{s^Ty} + \\cfrac{ss^Ty^TB^{-1}y}{s^Tyy^Ts} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{s^Ty}$\n", + "\n", + "$B^{-1} + \\cfrac{ss^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts}$\n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^TB^{-1}}{y^Ts} - \\cfrac{B^{-1}ys^T}{y^Ts} + \\cfrac{ss^Ty^TB^{-1}y}{(y^Ts)^2}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - B^{-1}\\cfrac{ys^T}{y^Ts} + \\cfrac{sy^T}{y^Ts}B^{-1}\\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "$B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1} - \\left(B^{-1} + \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\cfrac{ys^T}{y^Ts} + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "$\\left(B^{-1} - \\cfrac{sy^T}{y^Ts}B^{-1}\\right)\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n", + "\n", + "$\\left(I - \\cfrac{sy^T}{y^Ts}\\right)B^{-1}\\left(I - \\cfrac{ys^T}{y^Ts}\\right) + \\cfrac{ss^T}{y^Ts}$\n" + ] } ], "metadata": {