def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Backpropagation for the one-hidden-layer CBOW model.

    Inputs:
        x: average one-hot vector for the context (one column per example)
        yhat: prediction (estimate of y)
        y: target vector
        h: hidden vector (see eq. 1)
        W1, W2, b1, b2: matrices and biases
        batch_size: batch size (number of columns in x / y / yhat)
    Outputs:
        grad_W1, grad_W2, grad_b1, grad_b2: gradients of matrices and biases
    '''
    # Output-layer error; it appears in every gradient below, so compute it once.
    delta = yhat - y

    # Recompute the pre-activation z1 = W1.x + b1: the ReLU derivative is
    # 0 where z1 < 0 and 1 elsewhere, and h alone does not carry the sign.
    z1 = np.dot(W1, x) + b1

    # Backpropagate the error through W2: l1 = W2^T (yhat - y),
    # then zero the entries where the ReLU was inactive (z1 < 0).
    l1 = np.dot(W2.T, delta)
    l1[z1 < 0] = 0

    # Gradients, averaged over the batch.
    grad_W1 = np.dot(l1, x.T) / batch_size
    grad_W2 = np.dot(delta, h.T) / batch_size
    grad_b1 = np.sum(l1, axis=1, keepdims=True) / batch_size
    grad_b2 = np.sum(delta, axis=1, keepdims=True) / batch_size

    return grad_W1, grad_W2, grad_b1, grad_b2