, AdaDelta, - -. - - , (, , ), , .
, ,
D = {(A_1,b_1), (A_2,b_2), (A_3,b_3), ...}
A_k - k- , b_k - . , (, , , , )
x = (x_1, x_2, ..., x_n)
(A_k, b_k) , .. , x . - , D .
"" --- , , , (RMS) .
- , , x :
x_new <- x_old - gradient(RMS[predicted-actual])
, AdaGrad AdaDelta, "" , , , , AdaDelta, x "" .
, :
(.. x )
gt = (∂f/∂x_1, ∂f/∂x_2, ..., ∂f/∂x_n) (xt)
f(x_1, x_2, ..., x_n) - , ; x, . : xt.
, RMS -x
RMS[\Delta x]_{t-1} = \sqrt{ E[\Delta x^2]_{t-1} + \epsilon },
E[\Delta x^2]_t = \rho E[\Delta x^2]_{t-1} + (1-\rho) \Delta x_t^2,
E[\Delta x^2]_0 = 0.
AdaDelta - . :
(new_weights @ T) := (old_weights @ T-1) - [adaptive_learning_rate] * (gradient @ T)
adaptive_learning_rate := (RMS[Delta-x] @ T-1)/(RMS[gradient] @ T)
AdaDelta , , .
" " , ; , - (/, /, / ..), , , .
Delta-x - x . , x_i - , x_{i+1} - , Delta-x = (x_{i+1} - x_i).
(∂f/∂x) - , ( ML, f - ).