# 关于 Softmax 回归的反向传播求导数过程

## 全连接神经网络

$\ell(y,\hat{y})=-\sum_{i=1}^Cy_i\ln{\hat{y}_i}$

$J(W,b)=\frac{1}{m}\sum_{i=1}^m\ell(y^{(i)},\hat{y}^{(i)})$

## 反向传播求导推理

$\begin{split} \frac{\partial\ell}{\partial a^{[L]}}&=\frac{\partial}{\partial a^{[L]}}\left(-\sum_{i=1}^Cy_i\ln{\hat{y}_i}\right)\\ &=\frac{\partial}{\partial a^{[L]}}\left(-(y_1\ln{\hat{y}_1}+y_2\ln{\hat{y}_2}+\dots+y_C\ln{\hat{y}_C})\right)\\ &=\frac{\partial}{\partial a^{[L]}}\left(-(y_1\ln{a^{[L]}_1}+y_2\ln{a^{[L]}_2}+\dots+y_C\ln{a^{[L]}_C})\right) \end{split}$

$\begin{split} \frac{\partial\ell}{\partial a^{[L]}}&=\frac{\partial}{\partial a^{[L]}}\left(-(y_1\ln{a^{[L]}_1}+y_2\ln{a^{[L]}_2}+\dots+y_C\ln{a^{[L]}_C})\right)\\ &=\begin{bmatrix} \frac{\partial}{\partial a^{[L]}_1}\left(-(y_1\ln{a^{[L]}_1}+y_2\ln{a^{[L]}_2}+\dots+y_C\ln{a^{[L]}_C})\right)& \frac{\partial}{\partial a^{[L]}_2}\left(-(y_1\ln{a^{[L]}_1}+y_2\ln{a^{[L]}_2}+\dots+y_C\ln{a^{[L]}_C})\right)& \dots& \frac{\partial}{\partial a^{[L]}_C}\left(-(y_1\ln{a^{[L]}_1}+y_2\ln{a^{[L]}_2}+\dots+y_C\ln{a^{[L]}_C})\right) \end{bmatrix}\\ &=\begin{bmatrix} -\frac{y_1}{a^{[L]}_1}& -\frac{y_2}{a^{[L]}_2}& \dots& -\frac{y_C}{a^{[L]}_C}& \end{bmatrix}\\ &=-\frac{y}{a^{[L]}}\\ &=-\frac{y}{\hat{y}} \end{split}$

$\begin{split} \frac{\partial a^{[L]}}{\partial z^{[L]}}&=\begin{bmatrix} \frac{\partial a^{[L]}}{\partial z^{[L]}_1}& \frac{\partial a^{[L]}}{\partial z^{[L]}_2}& \dots& \frac{\partial a^{[L]}}{\partial z^{[L]}_C} \end{bmatrix} \end{split}$

$\begin{split} \frac{\partial a^{[L]}}{\partial z^{[L]}}&=\begin{bmatrix} \frac{\partial a^{[L]}}{\partial z^{[L]}_1}& \frac{\partial a^{[L]}}{\partial z^{[L]}_2}& \dots& \frac{\partial a^{[L]}}{\partial z^{[L]}_C} \end{bmatrix}\\ &=\begin{bmatrix} \frac{\partial}{\partial z^{[L]}_1}\left(\frac{e^{z^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}} \right)& \frac{\partial}{\partial z^{[L]}_2}\left(\frac{e^{z^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}} \right)& \dots& \frac{\partial}{\partial z^{[L]}_C}\left(\frac{e^{z^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}} \right) \end{bmatrix} \end{split}$

$\begin{split} \frac{\partial}{\partial z_1^{[L]}}\left(\frac{e^{z^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}}\right)&=\begin{bmatrix} \frac{\partial}{\partial z^{[L]}_1}\left(\frac{e^{z_1^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}}\right)& \frac{\partial}{\partial z^{[L]}_1}\left(\frac{e^{z_2^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}}\right)& \dots& \frac{\partial}{\partial z^{[L]}_1}\left(\frac{e^{z_C^{[L]}}}{\sum_{i=1}^Ce^{z^{[L]}_i}}\right)& \end{bmatrix} \end{split}$

$\begin{split} \frac{\partial a^{[L]}_1}{\partial z_1^{[L]}} &=\frac{\partial}{\partial z_1^{[L]}}\left(\frac{e^{z_1^{[L]}}}{\sum_{i=1}^Ce^{z_i^{[L]}}}\right)\\ &=\frac{(e^{z_1^{[L]}})’\sum_{i=1}^Ce^{z_i^{[L]}}-e^{z_1^{[L]}}(e^{z_1^{[L]}}+e^{z_2^{[L]}}+\dots+e^{z_C^{[L]}})’}{\left(\sum_{i=1}^Ce^{z_i^{[L]}}\right)^2}\\ &=\frac{e^{z_1^{[L]}}\sum_{i=1}^Ce^{z_i^{[L]}}-e^{z_1^{[L]}}e^{z_1^{[L]}}}{\left(\sum_{i=1}^Ce^{z_i^{[L]}}\right)^2}\\ &=\frac{e^{z_1^{[L]}}\sum_{i=1}^Ce^{z_i^{[L]}}}{(\sum_{i=1}^Ce^{z_i^{[L]}})^2}-\frac{e^{z_1^{[L]}}e^{z_1^{[L]}}}{(\sum_{i=1}^Ce^{z_i^{[L]}})^2}\\ &=\frac{e^{z_1^{[L]}}}{\sum_{i=1}^Ce^{z_i^{[L]}}}-\frac{e^{z_1^{[L]}}}{\sum_{i=1}^Ce^{z_i^{[L]}}}\frac{e^{z_1^{[L]}}}{\sum_{i=1}^Ce^{z_i^{[L]}}}\\ &=a^{[L]}_1(1-a^{[L]}_1) \end{split}$

$\frac{\partial a^{[L]}_i}{\partial z^{[L]}_i}=a^{[L]}_i(1-a^{[L]}_i)$（此为 $i=j$ 时的情形）

$\begin{split} \frac{\partial a^{[L]}_i}{\partial z_j^{[L]}} &=\frac{\partial}{\partial z_j^{[L]}}\left(\frac{e^{z_i^{[L]}}}{\sum_{k=1}^Ce^{z_k^{[L]}}}\right)\\ &=\frac{(e^{z_i^{[L]}})'\sum_{k=1}^Ce^{z_k^{[L]}}-e^{z_i^{[L]}}(e^{z_1^{[L]}}+e^{z_2^{[L]}}+\dots+e^{z_C^{[L]}})'}{\left(\sum_{k=1}^Ce^{z_k^{[L]}}\right)^2}\\ &=\frac{0\cdot\sum_{k=1}^Ce^{z_k^{[L]}}-e^{z_i^{[L]}}e^{z_j^{[L]}}}{\left(\sum_{k=1}^Ce^{z_k^{[L]}}\right)^2}\\ &=\frac{-e^{z_i^{[L]}}e^{z_j^{[L]}}}{\left(\sum_{k=1}^Ce^{z_k^{[L]}}\right)^2}\\ &=-\frac{e^{z_i^{[L]}}}{\sum_{k=1}^Ce^{z_k^{[L]}}}\cdot\frac{e^{z_j^{[L]}}}{\sum_{k=1}^Ce^{z_k^{[L]}}}\\ &=-a^{[L]}_ia^{[L]}_j \end{split}$（此为 $i\neq j$ 时的情形）

$\begin{split} \frac{\partial a^{[L]}}{\partial z^{[L]}}&=\begin{bmatrix} \frac{\partial a^{[L]}_1}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_1}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_1}{\partial z^{[L]}_C}\\ \frac{\partial a^{[L]}_2}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_2}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_2}{\partial z^{[L]}_C}\\ \vdots&\vdots&\ddots&\vdots\\ \frac{\partial a^{[L]}_C}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_C}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_C}{\partial z^{[L]}_C} \end{bmatrix}\\ \end{split}$

$\frac{\partial\ell}{\partial z^{[L]}}=\frac{\partial\ell}{\partial a^{[L]}}\frac{\partial a^{[L]}}{\partial z^{[L]}}$

$\begin{split} \frac{\partial\ell}{\partial a^{[L]}}\frac{\partial a^{[L]}}{\partial z^{[L]}}&= \begin{bmatrix} -\frac{y_1}{a^{[L]}_1}&-\frac{y_2}{a^{[L]}_2}&\dots&-\frac{y_C}{a^{[L]}_C} \end{bmatrix} \begin{bmatrix} \frac{\partial a^{[L]}_1}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_1}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_1}{\partial z^{[L]}_C}\\ \frac{\partial a^{[L]}_2}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_2}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_2}{\partial z^{[L]}_C}\\ \vdots&\vdots&\ddots&\vdots\\ \frac{\partial a^{[L]}_C}{\partial z^{[L]}_1}&\frac{\partial a^{[L]}_C}{\partial z^{[L]}_2}&\dots&\frac{\partial a^{[L]}_C}{\partial z^{[L]}_C} \end{bmatrix}\\ &=\begin{bmatrix} -\frac{y_1}{a^{[L]}_1}\frac{\partial a^{[L]}_1}{\partial z^{[L]}_1}-\frac{y_2}{a^{[L]}_2}\frac{\partial a^{[L]}_2}{\partial z^{[L]}_1}-\dots-\frac{y_C}{a^{[L]}_C}\frac{\partial a^{[L]}_C}{\partial z^{[L]}_1}\\ -\frac{y_1}{a^{[L]}_1}\frac{\partial a^{[L]}_1}{\partial z^{[L]}_2}-\frac{y_2}{a^{[L]}_2}\frac{\partial a^{[L]}_2}{\partial z^{[L]}_2}-\dots-\frac{y_C}{a^{[L]}_C}\frac{\partial a^{[L]}_C}{\partial z^{[L]}_2}\\ \vdots\\ -\frac{y_1}{a^{[L]}_1}\frac{\partial a^{[L]}_1}{\partial z^{[L]}_C}-\frac{y_2}{a^{[L]}_2}\frac{\partial a^{[L]}_2}{\partial z^{[L]}_C}-\dots-\frac{y_C}{a^{[L]}_C}\frac{\partial a^{[L]}_C}{\partial z^{[L]}_C}\\ \end{bmatrix}^T\\ &=\begin{bmatrix} -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_1}& -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_2}& \dots& -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_C} \end{bmatrix} \end{split}$

$\begin{split} \frac{\partial\ell}{\partial a^{[L]}}\frac{\partial a^{[L]}}{\partial z^{[L]}}&=\begin{bmatrix} -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_1}& -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_2}& \dots& -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_C} \end{bmatrix}\\ &=\begin{bmatrix} -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_j} \end{bmatrix},\quad j=1,2,\dots,C\\ \end{split}$

$\begin{split} \frac{\partial a^{[L]}_i}{\partial z^{[L]}_j}=\left\{\begin{matrix} a^{[L]}_i(1-a^{[L]}_i)&&i=j\\ -a^{[L]}_ia^{[L]}_j&&i\neq j \end{matrix}\right. \end{split}$

将求和拆分为 $i=j$ 的一项与 $i\neq j$ 的各项：

$\begin{split} -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_j}&= -\frac{y_j}{a^{[L]}_j}a^{[L]}_j(1-a^{[L]}_j)-\sum_{i\in\{i|i\neq j\}}\frac{y_i}{a^{[L]}_i}\left(-a^{[L]}_ia^{[L]}_j\right)\\ &=-y_j(1-a^{[L]}_j)+\sum_{i\in\{i|i\neq j\}}y_ia^{[L]}_j\\ &=-y_j+y_ja^{[L]}_j+\sum_{i\in\{i|i\neq j\}}y_ia^{[L]}_j \end{split}$

$\begin{split} -\sum_{i=1}^C\frac{y_i}{a^{[L]}_i}\frac{\partial a^{[L]}_i}{\partial z^{[L]}_j}&= {\color{red}{-y_j+y_ja^{[L]}_j}}+{\color{green}{\sum_{i\in{\{i|i\neq j\}}}y_ia^{[L]}_j}}\\ &=-y_j+{\color{blue}{y_ja^{[L]}_j+\sum_{i\in{\{i|i\neq j\}}}y_ia^{[L]}_j}}\\ &=-y_j+{\color{orange}{\sum_{i=1}^Cy_ia^{[L]}_j}}\\ &=-y_j+a^{[L]}_j\sum_{i=1}^Cy_i\\ &=a^{[L]}_j-y_j\quad\left(\text{因为 }y\text{ 是 one-hot 向量，}\sum_{i=1}^Cy_i=1\right) \end{split}$

将各分量写成向量形式，即得：

$\frac{\partial\ell}{\partial z^{[L]}}=a^{[L]}-y$

## 总结

$$Softmax$$ 回归的激活部分，和使用 $$Sigmoid/ReLU$$ 作为激活函数是有所不同的：在 $$Sigmoid/ReLU$$ 中，每一个神经元计算得到 $$z$$ 后，不需要将其它神经元的 $$z$$ 全部累加起来做概率的归一化；也就是说，以往用 $$Sigmoid/ReLU$$ 作为激活函数时，每一个神经元由 $$z$$ 计算 $$a$$ 是独立于其它神经元的。而 $$Softmax$$ 的归一化使每个 $$a_i$$ 依赖于所有的 $$z_j$$，所以在反向传播求导时我们会发现，计算 $$\frac{\partial a}{\partial z}$$ 不再是单独的一一对应关系，而是像正向传播那样，将上一层的结果全部集成到每一个神经元上。下面的图中，红色箭头表示了 $$Softmax$$ 与 $$Sigmoid/ReLU$$ 的反向传播路径有所不同。