殊途同归的策略梯度与零阶优化

$$\theta = \mathop{\arg\max}_{\theta}\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[r\left(y_t, \mathop{\arg\max}_y p_{\theta}(y|x_t)\right)\right]\label{eq:base}$$

$$\theta = \mathop{\arg\max}_{\theta}\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\sum_y p_{\theta}(y|x_t) r\left(y_t, y\right)\right]\label{eq:policy}$$

$$\begin{aligned}\sum_y p_{\theta}(y|x_t) (r\left(y_t, y\right) + c) =& \sum_y p_{\theta}(y|x_t) r\left(y_t, y\right) + \sum_y p_{\theta}(y|x_t) c\\
=& \sum_y p_{\theta}(y|x_t) r\left(y_t, y\right) + c
\end{aligned}$$

$$\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\sum_y \nabla_{\theta} p_{\theta}(y|x_t) r\left(y_t, y\right)\right]\label{eq:policy-grad-base}$$

$$\begin{aligned}
&\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\sum_y p_{\theta}(y|x_t)\frac{\nabla_{\theta} p_{\theta}(y|x_t)}{p_{\theta}(y|x_t)} r\left(y_t, y\right)\right]\\
=& \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\sum_y p_{\theta}(y|x_t)r\left(y_t, y\right)\nabla_{\theta} \log p_{\theta}(y|x_t)\right]\\
=& \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}, y\sim p_{\theta}(y|x_t)}\left[r\left(y_t, y\right)\nabla_{\theta}\log p_{\theta}(y|x_t)\right]
\end{aligned}$$

$$\mathbb{E}_{y\sim p_{\theta}(y|x_t)}\left[(r\left(y_t, y\right)-b)\nabla_{\theta}\log p_{\theta}(y|x_t)\right]\label{eq:var-reduce}$$

$$\mathbb{E}_{y\sim p_{\theta}(y|x_t)}\left[(r\left(y_t, y\right)-b)^2\Vert\nabla_{\theta}\log p_{\theta}(y|x_t)\Vert^2\right]$$

$$b = \frac{\mathbb{E}_{y\sim p_{\theta}(y|x_t)}\left[r\left(y_t, y\right)\Vert\nabla_{\theta}\log p_{\theta}(y|x_t)\Vert^2\right]}{\mathbb{E}_{y\sim p_{\theta}(y|x_t)}\left[\Vert\nabla_{\theta}\log p_{\theta}(y|x_t)\Vert^2\right]}$$

$$b = \mathbb{E}_{y\sim p_{\theta}(y|x_t)}\left[r\left(y_t, y\right)\right]$$

$$\tilde{\nabla}_{x}f(x)=\int p(u) u \left(u^{\top}\nabla_x f(x)\right)du=\int p(u) \left(u u^{\top}\right)\nabla_x f(x)du=\nabla_x f(x)$$

$$\tilde{\nabla}_{x}f(x)=\mathbb{E}_{u\sim p(u)}\left[\frac{f(x + \varepsilon u)}{\varepsilon}u\right] - \mathbb{E}_{u\sim p(u)}\left[\frac{f(x)}{\varepsilon}u\right]=\mathbb{E}_{u\sim p(u)}\left[\frac{f(x + \varepsilon u)}{\varepsilon}u\right]\label{eq:zero-grad-equal}$$

$$\mathbb{E}_{u\sim p(u)}\left[\left(\frac{f(x + \varepsilon u)-b}{\varepsilon}\right)^2\Vert u\Vert^2\right]$$

$$b=\frac{\mathbb{E}_{u\sim p(u)}\left[f(x + \varepsilon u)\Vert u\Vert^2\right]}{\mathbb{E}_{u\sim p(u)}\left[\Vert u\Vert^2\right]}$$

$$\mathcal{R}_{\theta}=\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[r\left(y_t, \mathop{\arg\max}_y p_{\theta}(y|x_t)\right)\right]$$

\begin{aligned}

\tilde{\nabla}_{\theta}\mathcal{R}_{\theta}=&\frac{1}{\varepsilon}\int \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[r\left(y_t, \mathop{\arg\max}_y p_{\theta + \varepsilon u}(y|x_t)\right)\right] p(u) u du\\

=&\mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\frac{1}{\varepsilon}\int r\left(y_t, \mathop{\arg\max}_y p_{\theta + \varepsilon u}(y|x_t)\right)p(u) u du\right]

\end{aligned}

$$\begin{aligned}
\tilde{\nabla}_{\theta}\mathcal{R}_{\theta}=& \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\frac{1}{\varepsilon}\sum_y\int_{\Omega_{y|x_t}} r\left(y_t, y\right)p(u) u du\right]\\
\Omega_{y|x_t} =& \left\{u\left|u\in\mathbb{R}^d, y=\mathop{\arg\max}_{\hat{y}} p_{\theta + \varepsilon u}(\hat{y}|x_t)\right.\right\}
\end{aligned}$$

$$\chi(y|x_t)=\left\{\begin{aligned}&1, &u\in\Omega_{y|x_t}\\
&0, &u\not\in\Omega_{y|x_t}\end{aligned}\right.$$

$$\begin{aligned}
\frac{1}{\varepsilon}\sum_y\int_{\Omega_{y|x_t}} r\left(y_t, y\right)p(u) u du=\frac{1}{\varepsilon}\sum_y\int \chi(y|x_t) r\left(y_t, y\right)p(u) u du
\end{aligned}$$

$$\chi(y|x_t)=\lim_{\tau\to 0} p_{\theta + \varepsilon u}(y|x_t)$$

$$\tilde{\nabla}_{\theta}\mathcal{R}_{\theta} \approx \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\frac{1}{\varepsilon}\sum_y\int p_{\theta + \varepsilon u}(y|x_t) r\left(y_t, y\right)p(u) u du\right]$$

$$\tilde{\nabla}_{\theta}\mathcal{R}_{\theta} \approx \mathbb{E}_{(x_t,y_t)\sim\mathcal{D}}\left[\sum_y r\left(y_t, y\right)\nabla_{\theta}p_{\theta}(y|x_t)\right]$$