Add L2 Weight Decay
This illustrates adding an L2 penalty to the loss inside the training loop.
# Suppose total_loss accumulates the data loss already:
# L_total = L_data + (lambda/2) * sum(W^2)
def l2_penalty_mats(mats):
    # sum of squared entries over a list of matrices (lists of rows)
    return sum(sum(w * w for w in row) for M in mats for row in M)

def l2_penalty_vecs(vecs):
    # sum of squared entries over a flat vector or a list of vectors
    if vecs and isinstance(vecs[0], list):
        return sum(w * w for v in vecs for w in v)
    return sum(w * w for w in vecs)

# Example inside training, after accumulating gradients:
lam = 1e-3

# add the penalty to total_loss (W1, W2 shown)
total_loss += 0.5 * lam * (l2_penalty_mats([W1]) + l2_penalty_vecs(W2))

# and when updating grads, add lam * W terms (weight decay)
for j in range(hidden):
    for i in range(input_dim):
        dW1[j][i] += lam * W1[j][i]
for j in range(hidden):
    dW2[j] += lam * W2[j]
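
To see how these lines slot into a full step, here is a minimal, runnable sketch of a complete loop. The network shape, the single training example, the seed, and the learning rate are made up for illustration and are not the tutorial's actual model; it reuses l2_penalty_mats and l2_penalty_vecs from the snippet above.

import random

random.seed(0)
input_dim, hidden = 3, 4
lam, lr = 1e-3, 0.05

# W1: hidden x input_dim matrix, W2: length-hidden vector
W1 = [[random.uniform(-0.5, 0.5) for _ in range(input_dim)] for _ in range(hidden)]
W2 = [random.uniform(-0.5, 0.5) for _ in range(hidden)]

x, y = [1.0, 2.0, -1.0], 0.5   # a single made-up training example

for step in range(200):
    # forward pass: linear hidden layer, linear output
    h = [sum(W1[j][i] * x[i] for i in range(input_dim)) for j in range(hidden)]
    y_hat = sum(W2[j] * h[j] for j in range(hidden))

    # data loss plus the L2 penalty on both weight sets
    data_loss = 0.5 * (y_hat - y) ** 2
    total_loss = data_loss + 0.5 * lam * (l2_penalty_mats([W1]) + l2_penalty_vecs(W2))

    # gradients of the data loss
    d_out = y_hat - y
    dW2 = [d_out * h[j] for j in range(hidden)]
    dW1 = [[d_out * W2[j] * x[i] for i in range(input_dim)] for j in range(hidden)]

    # weight decay: add lam * W to every gradient, exactly as above
    for j in range(hidden):
        dW2[j] += lam * W2[j]
        for i in range(input_dim):
            dW1[j][i] += lam * W1[j][i]

    # gradient-descent update
    for j in range(hidden):
        W2[j] -= lr * dW2[j]
        for i in range(input_dim):
            W1[j][i] -= lr * dW1[j][i]

print("final total_loss:", round(total_loss, 6))

For plain gradient descent, adding lam * W to the gradient is exactly the derivative of the (lambda/2) * sum(W^2) term in the loss, so the two views are equivalent here; with adaptive optimizers they diverge (decoupled weight decay), but that is beyond this snippet.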