Derivation of Sparse Bayesian Learning

System Model

We consider the following system
\begin{align}
\mathbf{y}=\mathbf{Hx}+\mathbf{w}
\end{align}
where $\mathbf{y}\in \mathbb{R}^M$ is the observed (received) signal, $\mathbf{H}\in \mathbb{R}^{M\times N}$ is the linear transform matrix, and $\mathbf{x}\in \mathbb{R}^{N}$ is the signal to be estimated. In addition, $\mathbf{w}\sim \mathcal{N}(\mathbf{w}|\mathbf{0},\sigma^2\mathbf{I})$ is additive Gaussian noise.
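For concreteness, a minimal Python sketch of this measurement model (the dimensions $M=64$, $N=128$, the sparsity level, and the noise level are arbitrary illustrative choices):

```python
import numpy as np

rng = np.random.default_rng(0)
M, N, K = 64, 128, 10        # measurements, unknowns, number of nonzeros (illustrative)
sigma = 0.05                 # noise standard deviation

H = rng.standard_normal((M, N)) / np.sqrt(M)   # linear transform matrix
x = np.zeros(N)
support = rng.choice(N, size=K, replace=False)
x[support] = rng.standard_normal(K)            # sparse signal to be estimated
w = sigma * rng.standard_normal(M)             # additive Gaussian noise
y = H @ x + w                                  # observed signal
```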

Derivation

$\underline{\textbf{Step 1} }$: Assume the prior $p(\mathbf{x};\boldsymbol{\lambda})= \mathcal{N}\left({\mathbf{x}|\mathbf{0},\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})}\right)$ with parameters $\boldsymbol{\lambda}=\left\{\lambda_1,\cdots,\lambda_N\right\}$, and $\mathbf{w}\sim \mathcal{N}(\mathbf{w}|\mathbf{0},\xi^{-1}\mathbf{I})$. Initialize $\boldsymbol{\lambda}=\mathbf{1}$ and $\xi^{-1}=\sigma^{2}$.

$\underline{\textbf{Step 2} }$: Calculate the posterior mean estimator (PME) of $\mathbf{x}$ using the Gaussian product lemma: for Gaussians $\mathcal{N}(\mathbf{x}|\mathbf{a},\mathbf{A})$ and $\mathcal{N}(\mathbf{x}|\mathbf{b},\mathbf{B})$, we have $\mathcal{N}(\mathbf{x}|\mathbf{a},\mathbf{A})\mathcal{N}(\mathbf{x}|\mathbf{b},\mathbf{B})=\mathcal{N}(\mathbf{0}|\mathbf{a}-\mathbf{b},\mathbf{A}+\mathbf{B})\mathcal{N}(\mathbf{x}|\mathbf{c},\mathbf{C})$, where $\mathbf{C}=(\mathbf{A}^{-1}+\mathbf{B}^{-1})^{-1}$ and $\mathbf{c}=\mathbf{C}(\mathbf{A}^{-1}\mathbf{a}+\mathbf{B}^{-1}\mathbf{b})$. Since the prior and the likelihood are both Gaussian, the posterior distribution is also Gaussian.
\begin{align}
p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)
&=\frac{p(\mathbf{y}|\mathbf{x};\xi)p(\mathbf{x};\boldsymbol{\lambda})}{p(\mathbf{y};\boldsymbol{\lambda},\xi)}\\
&\propto p(\mathbf{y}|\mathbf{x};\xi)p(\mathbf{x};\boldsymbol{\lambda})\\
&\propto \mathcal{N}\left(\mathbf{x}\left|(\mathbf{H}^T\mathbf{H})^{-1}\mathbf{H}^T\mathbf{y},(\xi\mathbf{H}^T\mathbf{H})^{-1}\right.\right)\mathcal{N}(\mathbf{x}|\mathbf{0},\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda}))\\
&\propto \mathcal{N}\left(\mathbf{x}|\boldsymbol{\mu},\mathbf{\Sigma}\right)
\end{align}
where the posterior covariance $\mathbf{\Sigma}$ and the PME $\boldsymbol{\mu}$ of $\mathbf{x}$ are given by
\begin{align}
\mathbf{\Sigma}&=\left(\xi\mathbf{H}^T\mathbf{H}+\text{Diag}(\boldsymbol{\lambda})\right)^{-1}\\
\boldsymbol{\mu}&=\xi \mathbf{\Sigma}\mathbf{H}^T\mathbf{y}
\end{align}
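In code, the posterior covariance and PME are a direct transcription of these formulas (a sketch; `H` and `y` are assumed to come from the system-model snippet above):

```python
import numpy as np

def posterior(H, y, lam, xi):
    """Posterior N(x | mu, Sigma) of x for fixed prior precisions lam and noise precision xi."""
    Sigma = np.linalg.inv(xi * H.T @ H + np.diag(lam))   # (xi H^T H + Diag(lambda))^{-1}
    mu = xi * Sigma @ H.T @ y                            # xi Sigma H^T y
    return mu, Sigma
```

Forming the explicit inverse mirrors the derivation; in practice a Cholesky-based solve is usually preferred for larger $N$.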
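The Gaussian product lemma used in Step 2 can also be sanity-checked numerically; the sketch below (arbitrary small test matrices, `scipy.stats.multivariate_normal`) evaluates both sides at a random point:

```python
import numpy as np
from scipy.stats import multivariate_normal as mvn

rng = np.random.default_rng(1)
n = 3
a, b = rng.standard_normal(n), rng.standard_normal(n)
Qa, Qb = rng.standard_normal((n, n)), rng.standard_normal((n, n))
A, B = np.eye(n) + Qa @ Qa.T, np.eye(n) + Qb @ Qb.T      # random SPD covariances

C = np.linalg.inv(np.linalg.inv(A) + np.linalg.inv(B))   # C = (A^{-1} + B^{-1})^{-1}
c = C @ (np.linalg.solve(A, a) + np.linalg.solve(B, b))  # c = C (A^{-1} a + B^{-1} b)

x_test = rng.standard_normal(n)
lhs = mvn.pdf(x_test, mean=a, cov=A) * mvn.pdf(x_test, mean=b, cov=B)
rhs = mvn.pdf(np.zeros(n), mean=a - b, cov=A + B) * mvn.pdf(x_test, mean=c, cov=C)
print(np.isclose(lhs, rhs))   # True
```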

$\underline{\textbf{Step 3} }$: Update $\boldsymbol{\lambda}$ and $\xi$. There are two schemes for updating the parameters $\boldsymbol{\lambda}$ and $\xi$. One is type II maximum likelihood and the other is expectation-maximization (EM). We first introduce type II maximum likelihood.

$\textbf{Scheme I:}$ Type II maximum likelihood. We first calculate the type II likelihood function $p(\mathbf{y};\boldsymbol{\lambda},\xi)$, also called the evidence function or partition function.
\begin{align}
p(\mathbf{y};\boldsymbol{\lambda},\xi)
&=\int p(\mathbf{y}|\mathbf{x};\xi)p(\mathbf{x};\boldsymbol{\lambda})\text{d}\mathbf{x}\\
&=\mathcal{N}(\mathbf{y}|\mathbf{0},\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T)
\end{align}
The last equality holds because $\mathbf{y}=\mathbf{Hx}+\mathbf{w}$ is a linear combination of independent zero-mean Gaussian vectors, so its covariance is $\mathbb{E}\{\mathbf{yy}^T\}=\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T$. Denote the log-evidence by
\begin{align}
\mathcal{L}(\boldsymbol{\lambda},\xi)
&=\log p(\mathbf{y};\boldsymbol{\lambda},\xi)\\
&=-\frac{M}{2}\log 2\pi-\frac{1}{2}\underbrace{\log |\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T|}_{(a)}-\frac{1}{2}\underbrace{\mathbf{y}^T(\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T)^{-1}\mathbf{y} }_{(b)}
\end{align}
where part $(a)$ is calculated by exploiting the determinant identity $|\mathbf{A}| |a^{-1}\mathbf{I}+\mathbf{H}\mathbf{A}^{-1}\mathbf{H}^T|=|a^{-1}\mathbf{I}||\mathbf{A}+a\mathbf{H}^T\mathbf{H}|$.
\begin{align}
|\text{Diag}(\boldsymbol{\lambda})| | \xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T|
&=|\xi^{-1}\mathbf{I}| |\text{Diag}(\boldsymbol{\lambda})+\xi\mathbf{H}^T\mathbf{H}|\\
&=|\xi^{-1}\mathbf{I}| |\mathbf{\Sigma}^{-1}|
\end{align}
therefore
\begin{align}
\log |\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T|=-M\log \xi-\log |\mathbf{\Sigma}|-\log |\text{Diag}(\boldsymbol{\lambda})|
\end{align}
and part $(b)$ is computed using the matrix inversion lemma (Woodbury identity).
\begin{align}
(\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T)^{-1}
&=\xi \mathbf{I}-\xi^2 \mathbf{H}(\xi\mathbf{H}^T\mathbf{H}+\text{Diag}(\boldsymbol{\lambda}))^{-1}\mathbf{H}^T\\
&=\xi \mathbf{I}-\xi^2 \mathbf{H}\mathbf{\Sigma}\mathbf{H}^T
\end{align}
therefore
\begin{align}
\mathbf{y}^T(\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash \boldsymbol{\lambda})\mathbf{H}^T)^{-1}\mathbf{y}
&=\xi\mathbf{y}^T\mathbf{y}-\xi^{2}\mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}
\end{align}
Thus, up to an additive constant independent of $(\boldsymbol{\lambda},\xi)$, $\mathcal{L}(\boldsymbol{\lambda},\xi)$ reads
\begin{align}
\mathcal{L}(\boldsymbol{\lambda},\xi)=\frac{M}{2}\log \xi +\frac{1}{2}\log |\mathbf{\Sigma}|+\frac{1}{2}\log |\text{Diag}(\boldsymbol{\lambda})|-\frac{1}{2}\xi \mathbf{y}^T\mathbf{y}+\frac{1}{2}\xi^2 \mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}
\end{align}
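The simplified expression can be checked against a direct evaluation of $\log\mathcal{N}(\mathbf{y}|\mathbf{0},\xi^{-1}\mathbf{I}+\mathbf{H}\text{Diag}(\mathbf{1}\oslash\boldsymbol{\lambda})\mathbf{H}^T)$; the sketch below restores the dropped constant $-\tfrac{M}{2}\log 2\pi$ so the two numbers agree (the helper names are my own):

```python
import numpy as np
from scipy.stats import multivariate_normal as mvn

def log_evidence_direct(H, y, lam, xi):
    M = len(y)
    cov = np.eye(M) / xi + H @ np.diag(1.0 / lam) @ H.T
    return mvn.logpdf(y, mean=np.zeros(M), cov=cov)

def log_evidence_simplified(H, y, lam, xi):
    M = len(y)
    Sigma = np.linalg.inv(xi * H.T @ H + np.diag(lam))
    L = (0.5 * M * np.log(xi) + 0.5 * np.linalg.slogdet(Sigma)[1]
         + 0.5 * np.sum(np.log(lam)) - 0.5 * xi * y @ y
         + 0.5 * xi**2 * y @ H @ Sigma @ H.T @ y)
    return L - 0.5 * M * np.log(2 * np.pi)   # add back the dropped constant

# e.g. with H, y, sigma from the model sketch:
# print(log_evidence_direct(H, y, np.ones(N), 1 / sigma**2),
#       log_evidence_simplified(H, y, np.ones(N), 1 / sigma**2))
```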

Taking the partial derivative of $\mathcal{L}(\boldsymbol{\lambda},\xi)$ w.r.t. $\lambda_i$ gives
\begin{align}
\frac{\partial \mathcal{L} }{\partial \lambda_i}
&=\frac{1}{2}\frac{\partial \log |\mathbf{\Sigma}|}{\partial \lambda_i}+\frac{1}{2}\frac{\partial \log |\text{Diag}(\boldsymbol{\lambda})|}{\partial \lambda_i}+\frac{1}{2}\xi^{2}\frac{\partial (\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}(\mathbf{H}^T\mathbf{y})}{\partial \lambda_i}\\
&=-\frac{\Sigma_{ii} }{2}+\frac{1}{2\lambda_i}-\frac{\mu_i^2}{2}
\end{align}
Setting this derivative to zero, we get
\begin{align}
\lambda_i^{-1} &=\Sigma_{ii}+\mu_i^2
\end{align}
The individual derivatives are computed as
\begin{align}
\frac{\partial \log |\mathbf{\Sigma}|}{\partial \lambda_i}
&=\text{tr}\left\{\mathbf{\Sigma}^{-1}\frac{\partial \mathbf{\Sigma} }{\partial \lambda_i}\right\}\\
&=\text{tr}\left\{-\frac{\partial \mathbf{\Sigma}^{-1} }{\partial \lambda_i}\mathbf{\Sigma}\right\}\\
&=-\text{tr}\left\{\boldsymbol{e}_i\boldsymbol{e}_i^T\mathbf{\Sigma}\right\}\\
&=-\Sigma_{ii}
\end{align}
and
\begin{align}
\frac{\partial (\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}(\mathbf{H}^T\mathbf{y})}{\partial \lambda_i}
&=\text{tr}\left\{(\mathbf{H}^T\mathbf{y})(\mathbf{H}^T\mathbf{y})^T\frac{\partial \mathbf{\Sigma} }{\partial \lambda_i}\right\}\\
&=-\text{tr}\left\{(\mathbf{H}^T\mathbf{y})(\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}\boldsymbol{e}_i\boldsymbol{e}_i^T\mathbf{\Sigma}\right\}\\
&=-\left[(\mathbf{\Sigma}\mathbf{H}^T\mathbf{y})_i\right]^2\\
&=-\frac{\mu_i^2}{\xi^2}
\end{align}

To verify that $\lambda_i=\left(\Sigma_{ii}+\mu_i^2\right)^{-1}$ is a maximum of $\mathcal{L}(\boldsymbol{\lambda},\xi)$ w.r.t. $\lambda_i$, one checks that the second partial derivative of $\mathcal{L}(\boldsymbol{\lambda},\xi)$ w.r.t. $\lambda_i$ is non-positive at this point:
\begin{align}
\left.\frac{\partial^2 \mathcal{L}(\boldsymbol{\lambda},\xi)}{\partial \lambda_i^2}\right|_{\lambda_i^{-1} =\Sigma_{ii}+\mu_i^2}\leq 0
\end{align}
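In code, the resulting $\boldsymbol{\lambda}$ update is a one-liner, and the derivative formula can be spot-checked with a finite difference of the evidence (a sketch reusing the illustrative `posterior` and `log_evidence_direct` helpers defined above):

```python
import numpy as np

def update_lambda(mu, Sigma):
    # lambda_i = 1 / (Sigma_ii + mu_i^2)
    return 1.0 / (np.diag(Sigma) + mu**2)

def dL_dlambda_i(H, y, lam, xi, i):
    # analytic derivative: -Sigma_ii/2 + 1/(2 lambda_i) - mu_i^2/2
    mu, Sigma = posterior(H, y, lam, xi)
    return -0.5 * Sigma[i, i] + 0.5 / lam[i] - 0.5 * mu[i]**2

def dL_dlambda_i_fd(H, y, lam, xi, i, eps=1e-6):
    # central finite difference of the log-evidence w.r.t. lambda_i
    lam_p, lam_m = lam.copy(), lam.copy()
    lam_p[i] += eps
    lam_m[i] -= eps
    return (log_evidence_direct(H, y, lam_p, xi)
            - log_evidence_direct(H, y, lam_m, xi)) / (2 * eps)
```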

Taking the partial derivative of $\mathcal{L}(\boldsymbol{\lambda},\xi)$ w.r.t. $\xi$ yields
\begin{align}
\frac{\partial \mathcal{L}(\boldsymbol{\lambda},\xi)}{\partial \xi}
&=\frac{M}{2\xi}+\frac{1}{2}\frac{\partial \log |\mathbf{\Sigma}|}{\partial \xi}-\frac{1}{2}\mathbf{y}^T\mathbf{y}+\frac{1}{2}\frac{\partial \xi^2(\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}(\mathbf{H}^T\mathbf{y})}{\partial \xi}\\
&=\frac{M}{2\xi}- \frac{1}{2}\text{tr}\left\{\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\right\}-\frac{1}{2}\mathbf{y}^T\mathbf{y}+\xi\mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}-\frac{1}{2}\xi^2\mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}\\
&=\frac{M}{2\xi}-\frac{1}{2}\text{tr}\left\{\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\right\}-\frac{1}{2}\mathbf{y}^T\mathbf{y}+\mathbf{y}^T\mathbf{H}\boldsymbol{\mu}-\frac{1}{2}\boldsymbol{\mu}^T\mathbf{H}^T\mathbf{H}\boldsymbol{\mu}\\
&=\frac{M}{2\xi}-\frac{1}{2}\text{tr}\left\{\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\right\}-\frac{1}{2}||\mathbf{y}-\mathbf{H}\boldsymbol{\mu}||^2
\end{align}
therefore
\begin{align}
\xi^{-1}=\frac{||\mathbf{y}-\mathbf{H}\boldsymbol{\mu}||^2+\text{tr}(\mathbf{\Sigma}\mathbf{H}^T\mathbf{H})}{M}
\end{align}
The individual derivatives are computed as
\begin{align}
\frac{\partial \log |\mathbf{\Sigma}|}{\partial \xi}
&=\text{tr}\left\{\mathbf{\Sigma}^{-1}\frac{\partial \mathbf{\Sigma} }{\partial \xi}\right\}\\
&=\text{tr}\left\{-\frac{\partial \mathbf{\Sigma}^{-1} }{\partial \xi}\mathbf{\Sigma}\right\}\\
&=\text{tr}\left\{- \mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\right\}
\end{align}
and
\begin{align}
\frac{\partial \xi^2(\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}(\mathbf{H}^T\mathbf{y})}{\partial \xi}
&=2\xi(\mathbf{H}^T\mathbf{y})^T\mathbf{\Sigma}(\mathbf{H}^T\mathbf{y})+\xi^2\text{tr}\left\{(\mathbf{H}^T\mathbf{y})(\mathbf{H}^T\mathbf{y})^T\frac{\partial \mathbf{\Sigma} }{\partial \xi}\right\}\\
&=2\xi\mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}-\xi^2\mathbf{y}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{H}\mathbf{\Sigma}\mathbf{H}^T\mathbf{y}
\end{align}
Similarly, the second partial derivative of $\mathcal{L}(\boldsymbol{\lambda},\xi)$ w.r.t. $\xi$ is non-positive at this stationary point:
\begin{align}
\left.\frac{\partial^2 \mathcal{L}(\boldsymbol{\lambda},\xi)}{\partial \xi^2}\right|_{\xi^{-1}=\frac{||\mathbf{y}-\mathbf{H}\boldsymbol{\mu}||^2+\text{tr}(\mathbf{\Sigma}\mathbf{H}^T\mathbf{H})}{M} } \leq 0
\end{align}
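The corresponding $\xi$ update and its derivative can be written analogously (again a sketch building on the illustrative `posterior` helper above):

```python
import numpy as np

def update_xi(H, y, mu, Sigma):
    # xi^{-1} = ( ||y - H mu||^2 + tr(Sigma H^T H) ) / M
    M = len(y)
    resid = y - H @ mu
    return M / (resid @ resid + np.trace(Sigma @ H.T @ H))

def dL_dxi(H, y, lam, xi):
    # analytic derivative: M/(2 xi) - tr(H Sigma H^T)/2 - ||y - H mu||^2 / 2
    mu, Sigma = posterior(H, y, lam, xi)
    resid = y - H @ mu
    return 0.5 * len(y) / xi - 0.5 * np.trace(H @ Sigma @ H.T) - 0.5 * resid @ resid
```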

$\textbf{Scheme II:}$ Expectation-maximization (EM). The EM algorithm consists of two parts: the expectation step (E-step) and the maximization step (M-step). In the E-step, the posterior distribution $p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)$ is computed, while in the M-step the parameters $\boldsymbol{\lambda}$ and $\xi$ are updated. Here we only need to carry out the M-step, since the posterior has already been obtained as $\mathcal{N}(\mathbf{x}|\boldsymbol{\mu},\mathbf{\Sigma})$ in Steps 1 and 2. We update $\boldsymbol{\lambda}$ and $\xi$ separately. The first update is
\begin{align}
\lambda_i
&= \underset{\lambda_i>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{\log p(\mathbf{y},\mathbf{x};\boldsymbol{\lambda},\xi)\right\}\\
&= \underset{\lambda_i>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{\log p(\mathbf{x};\boldsymbol{\lambda})\right\}\\
&= \underset{\lambda_i>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{\log p(x_i;\lambda_i)\right\}\\
&= \underset{\lambda_i>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{-\frac{1}{2}\log 2\pi+\frac{1}{2}\log \lambda_i-\frac{\lambda_i}{2}x_i^2\right\}\\
&= \underset{\lambda_i>0}{\arg \max} \ \left\{\frac{1}{2}\log \lambda_i-\frac{\lambda_i}{2}\left(\Sigma_{ii}+\mu_i^2\right)\right\}\\
&=(\mu_i^2+\Sigma_{ii})^{-1}
\end{align}
and the second is
\begin{align}
\xi
&=\underset{\xi>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{\log p(\mathbf{y}|\mathbf{x};\boldsymbol{\lambda},\xi)\right\}\\
&=\underset{\xi>0}{\arg \max} \ \mathbb{E}_{p(\mathbf{x}|\mathbf{y};\boldsymbol{\lambda},\xi)} \left\{\frac{M}{2}\log \xi-\frac{\xi}{2}\left(\mathbf{y}^T\mathbf{y}-2\mathbf{y}^T\mathbf{Hx}+\mathbf{x}^T\mathbf{H}^T\mathbf{Hx}\right)\right\}\\
&\overset{(a)}{=}\underset{\xi>0}{\arg \max} \ \left\{\frac{M}{2}\log \xi-\frac{\xi}{2}\left(\mathbf{y}^T\mathbf{y}-2\mathbf{y}^T\mathbf{H}\boldsymbol{\mu}+\boldsymbol{\mu}^T\mathbf{H}^T\mathbf{H}\boldsymbol{\mu}+\text{tr}\left\{\mathbf{\Sigma}\mathbf{H}^T\mathbf{H}\right\}\right)\right\}\\
&=\underset{\xi>0}{\arg \max} \ \left\{\frac{M}{2}\log \xi-\frac{\xi}{2}||\mathbf{y}-\mathbf{H}\boldsymbol{\mu}||^2-\frac{\xi}{2}\text{tr}\left\{\mathbf{\Sigma}\mathbf{H}^T\mathbf{H}\right\}\right\}\\
&=\left(\frac{||\mathbf{y}-\mathbf{H}\boldsymbol{\mu}||^2+\text{tr}(\mathbf{\Sigma}\mathbf{H}^T\mathbf{H})}{M}\right)^{-1}
\end{align}
where $(a)$ follows from the identity
\begin{align}
\mathbb{E}_{\mathcal{N}(\mathbf{x}|\boldsymbol{\mu},\mathbf{\Sigma})}\left\{\mathbf{x}^T\mathbf{B}\mathbf{x}\right\}=\boldsymbol{\mu}^T\mathbf{B}\boldsymbol{\mu}+\text{tr}\left\{\mathbf{\Sigma}\mathbf{B}\right\}
\end{align}
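This quadratic-form identity is easy to verify by Monte Carlo (a sketch with arbitrary small dimensions and sample size):

```python
import numpy as np

rng = np.random.default_rng(2)
n = 4
mu = rng.standard_normal(n)
Q = rng.standard_normal((n, n))
Sigma = np.eye(n) + Q @ Q.T                 # positive-definite covariance
B = rng.standard_normal((n, n))             # arbitrary matrix

samples = rng.multivariate_normal(mu, Sigma, size=200_000)
mc = np.mean(np.einsum('ij,jk,ik->i', samples, B, samples))  # E{x^T B x} by sampling
exact = mu @ B @ mu + np.trace(Sigma @ B)
print(mc, exact)                            # agree up to Monte Carlo error
```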

$\underline{\textbf{Step 4} }$: Return to $\textbf{Step 2}$ and iterate until convergence.
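Putting Steps 1-4 together, a minimal SBL iteration might look like the sketch below (the update formulas are those derived above; Scheme I and Scheme II give identical updates, and the stopping rule and iteration cap are arbitrary choices):

```python
import numpy as np

def sbl(H, y, sigma2_init=1.0, max_iter=200, tol=1e-6):
    """Sparse Bayesian learning: alternate the posterior computation (Step 2)
    with the hyperparameter updates (Step 3) until mu stops changing (Step 4)."""
    M, N = H.shape
    lam = np.ones(N)            # Step 1: initialize prior precisions lambda = 1
    xi = 1.0 / sigma2_init      #         and noise precision xi = 1/sigma^2
    mu = np.zeros(N)
    for _ in range(max_iter):
        # Step 2: posterior covariance and mean
        Sigma = np.linalg.inv(xi * H.T @ H + np.diag(lam))
        mu_new = xi * Sigma @ H.T @ y
        # Step 3: hyperparameter updates
        lam = 1.0 / (np.diag(Sigma) + mu_new**2)
        resid = y - H @ mu_new
        xi = M / (resid @ resid + np.trace(Sigma @ H.T @ H))
        # Step 4: return to Step 2 unless converged
        if np.linalg.norm(mu_new - mu) <= tol * (np.linalg.norm(mu) + 1e-12):
            mu = mu_new
            break
        mu = mu_new
    return mu, lam, xi
```

Components whose $\lambda_i$ grows very large contribute essentially nothing to $\boldsymbol{\mu}$, which is what produces the sparse estimate; implementations typically prune such components for speed.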
