In general, ELU > leaky ReLU (and its variants) > ReLU > tanh > logistic. If you care a lot about runtime performance, then you may prefer leaky ReLUs over ELUs. If you don't want to tweak yet another hyperparameter, you may just use the default $\alpha$ values suggested earlier (0.01 for the leaky ReLU, and 1 for ELU). If you have spare time and computing power, you can use cross-validation to evaluate other activation functions, in particular RReLU if your network is overfitting, or PReLU if you have a huge training set.
ReLU
\begin{equation}
ReLU(z) = max(0, z)
\end{equation}
tf.nn.relu
import matplotlib.pyplot as plt
import numpy as np
def relu(z):
    """Return the rectified linear unit of *z*, i.e. element-wise max(0, z).

    Args:
        z: A scalar or array-like of numbers.

    Returns:
        The NumPy result of ``np.maximum(0, z)``: negative entries become 0,
        non-negative entries are passed through unchanged.
    """
    return np.maximum(0, z)
# Plot ReLU over 200 evenly spaced inputs in [-5, 5] as a red dashed line.
z = np.linspace(-5, 5, 200)
plt.plot(z, relu(z), "r--", linewidth=2)
# Arrow style used by the annotation below.
props = dict(facecolor='black', shrink=0.1)
# Label text sits at (-3.5, 0.5); the arrow points at (-5, 0.1) near the flat part of the curve.
plt.annotate('ReLU', xytext=(-3.5, 0.5), xy=(-5, 0.1), arrowprops=props, fontsize=14, ha="center")
plt.title("ReLU activation function", fontsize=14)
# Draw the horizontal (y = 0) and vertical (x = 0) axes as solid black lines.
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-0.5, 4.2], 'k-')
plt.grid(True)
# Fix the visible window: x in [-5, 5], y in [-0.5, 4.2].
plt.axis([-5, 5, -0.5, 4.2])
plt.tight_layout()
plt.show()
leaky ReLU
\begin{equation}
LeakyReLU_{\alpha}(z) = max(\alpha z, z)
\end{equation}
tf.nn.leaky_relu
import matplotlib.pyplot as plt
import numpy as np
def leaky_relu(z, alpha=0.01):
    """Return the leaky ReLU of *z*, computed as element-wise max(alpha * z, z).

    Args:
        z: A scalar or NumPy array of numbers.
        alpha: Slope applied to the input before taking the maximum
            (default 0.01).

    Returns:
        The NumPy result of ``np.maximum(alpha * z, z)``.

    Note:
        Taking the maximum gives slope ``alpha`` for negative inputs and
        slope 1 for positive inputs only when ``alpha <= 1``.
    """
    return np.maximum(alpha * z, z)
# Plot leaky ReLU over 200 evenly spaced inputs in [-5, 5] as a solid blue line.
z = np.linspace(-5, 5, 200)
# alpha=0.05 is passed here instead of the function's 0.01 default.
plt.plot(z, leaky_relu(z, 0.05), "b-", linewidth=2)
# Draw the horizontal (y = 0) and vertical (x = 0) axes as solid black lines.
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-0.5, 4.2], 'k-')
plt.grid(True)
# Arrow style used by the annotation below.
props = dict(facecolor='black', shrink=0.1)
# Label text sits at (-3.5, 0.5); the arrow points at (-5, -0.2), just below the x axis.
plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.2), arrowprops=props, fontsize=14, ha="center")
plt.title("Leaky ReLU activation function", fontsize=14)
# Fix the visible window: x in [-5, 5], y in [-0.5, 4.2].
plt.axis([-5, 5, -0.5, 4.2])
plt.tight_layout()
plt.show()
ELU
\begin{equation}
\label{b}
ELU(z)=
\begin{cases}
\alpha(e^{z} - 1) & if\ z < 0 \\
z & if\ z\ge 0
\end{cases}
\end{equation}
tf.nn.elu
import matplotlib.pyplot as plt
import numpy as np
def elu(z, alpha=1):
    """Return the exponential linear unit (ELU) of *z*.

    ELU(z) is ``alpha * (exp(z) - 1)`` where ``z < 0`` and ``z`` otherwise.

    Args:
        z: A scalar or array-like of numbers.
        alpha: Scale of the negative branch (default 1).

    Returns:
        A NumPy array (0-d for scalar input) with ELU applied element-wise.
    """
    z = np.asarray(z)
    # np.where evaluates both branches for every element, so clamp the
    # exponent at 0: large positive inputs would otherwise overflow in
    # np.exp even though that branch's value is discarded for them.
    negative_branch = alpha * (np.exp(np.minimum(z, 0)) - 1)
    return np.where(z < 0, negative_branch, z)
# Plot ELU (default alpha=1) over 200 evenly spaced inputs in [-5, 5] as a solid green line.
z = np.linspace(-5, 5, 200)
plt.plot(z, elu(z), "g-", linewidth=2)
# Horizontal axis (y = 0), solid black.
plt.plot([-5, 5], [0, 0], 'k-')
# Dashed line at y = -1: the value alpha * (exp(z) - 1) approaches as z -> -inf when alpha = 1.
plt.plot([-5, 5], [-1, -1], 'k--')
# Vertical axis (x = 0), solid black.
plt.plot([0, 0], [-2.2, 3.2], 'k-')
plt.grid(True)
plt.title(r"ELU activation function ($\alpha=1$)", fontsize=14)
# Fix the visible window: x in [-5, 5], y in [-2.2, 3.2].
plt.axis([-5, 5, -2.2, 3.2])
plt.tight_layout()
plt.show()
ReLU6
\begin{equation}
ReLU6(z) = min(max(z, 0), 6)
\end{equation}
tf.nn.relu6
Swish
\begin{equation}
Swish(z) = z*sigmoid(\beta z)
\end{equation}
def swish(x, b=1):
    """Return the Swish activation ``x * sigmoid(b * x)`` using TensorFlow ops.

    Args:
        x: Input passed to ``tf.nn.sigmoid`` after scaling by *b*.
        b: The beta factor inside the sigmoid (default 1).

    Returns:
        The product of *x* and ``tf.nn.sigmoid(b * x)``.

    Note:
        Relies on ``tf`` being available in the enclosing scope.
    """
    return x * tf.nn.sigmoid(b * x)
import matplotlib.pyplot as plt
import numpy as np
def swish(z, b=1):
    """Return the Swish activation of *z*, computed with NumPy.

    Swish(z) = z * sigmoid(b * z), written here as ``z / (1 + exp(-b * z))``.

    Args:
        z: A numeric scalar or NumPy array.
        b: The beta factor inside the sigmoid (default 1).

    Returns:
        The value of ``z / (1 + np.exp(-b * z))``.
    """
    return z / (1 + np.exp(-b * z))
# Plot Swish (default b=1) over 200 evenly spaced inputs in [-5, 5] as a green dashed line.
z = np.linspace(-5, 5, 200)
plt.plot(z, swish(z), "g--", linewidth=2)
# Draw the horizontal (y = 0) and vertical (x = 0) axes as solid black lines.
plt.plot([-5, 5], [0, 0], 'k-')
plt.plot([0, 0], [-0.5, 5.2], 'k-')
plt.grid(True)
plt.title(r"Swish activation function", fontsize=14)
# Fix the visible window: x in [-5, 5], y in [-0.5, 5.2].
plt.axis([-5, 5, -0.5, 5.2])
plt.tight_layout()
plt.show()
知识兔View Code