This tutorial was written by Jacob Valdez (1001628688) without any human assistance or copying any code. It represents a powerful introduction to regularization for any ML student. Minor changes have been made since submission. Please see git log for details.

A Simple Demonstration of Overfitting

We've all done it. Have you ever studied all night for an exam, only to realize that you were studying the wrong material? You were overfitting. The same thing can happen with machine learning models. Whether you're looking at an extremely simple linear regression model or a 1000-layer residual neural network, if it isn't properly regularized, the model can begin to systematically bias itself toward the quirks of the training dataset. In this notebook, I want to show you how this can happen in the simpler case (a linear regression model) and present one way to minimize overfitting.

Let's start by importing the libraries we'll need and building a plotting utility.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython import display

def plot(true_points=None, pred_points=None, data_points=None, ax=None):
    """Plot the true function (green), model predictions (red), and data points (blue) on one axes."""
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    if true_points is not None:
        x_true, y_true = true_points
        ax.plot(x_true, y_true, 'g')
    
    if pred_points is not None:
        x_pred, y_pred = pred_points
        ax.plot(x_pred, y_pred, 'r')
    
    if data_points is not None:
        x_data, y_data = data_points
        ax.plot(x_data, y_data, 'bo')
    
    ax.set_xlabel('x')
    ax.set_ylabel('y')

Great. Imports and utilities are set up. Let's start by building a simple linear regression model. We'll fit a polynomial, which still counts as a linear model because it is linear in its coefficients. Formally, the model is a function of the form:

$$f(x) = w_0 + w_1 x + w_2 x^2 + ... + w_n x^n$$

For example, a zeroth-order polynomial is just a constant $f(x) = w_0$. A first-order polynomial is a linear function $f(x) = w_0 + w_1 x$. A second-order polynomial is a quadratic function $f(x) = w_0 + w_1 x + w_2 x^2$. And so on. The highest exponent determines the order of the polynomial.

Since we're dealing with a linear combination of the exponentiated values, we can represent our regression model $f(x)$ succinctly as a vector dot product:

$$f(x) = \vec{w} \cdot \vec{x}$$

where $\vec{w} = \langle w_0 \ w_1 \ w_2 \ ... \ w_n \rangle$ is the vector of coefficients and $\vec{x} = \langle 1 \ x \ x^2 \ ... \ x^n \rangle$ is the vector of exponentiated values. Besides being succinct in math, the dot product is also a convenient way to implement the model in code:

In [2]:
def model(x, w):
    """
    Linear regression model

    Computes f(x) = w_0 + w_1 x + w_2 x^2 + ... + w_n x^n
    for a given x and weights w.
    """

    # compute increasing exponents of x
    X = np.stack([x**i for i in range(w.shape[0])])

    # compute dot-product of weights and x
    return w.dot(X)

Let's try out this function with a simple example.

In [3]:
x = 2
w = np.array([3, 4, 5])
y = model(x, w)

x, w, y
Out[3]:
(2, array([3, 4, 5]), 31)

Does the math look correct? $\vec{w} \cdot \vec{x} = 3 + 4 \cdot 2 + 5 \cdot 2^2 = 31 = y$. Correct!

Before moving on, I should emphasize that our linear regression model is going to operate on lots and lots of data. Currently, model must be called once for each data point, which is inefficient. We can speed up the process with a single matrix multiplication.
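
To see why, stack one row of powers of $x$ per data point into a design matrix $X$ (a Vandermonde matrix). A single matrix-vector product then evaluates the model at every data point at once:

$$X \vec{w} = \begin{bmatrix} 1 & x_1 & x_1^2 & \cdots & x_1^n \\ 1 & x_2 & x_2^2 & \cdots & x_2^n \\ \vdots & \vdots & \vdots & & \vdots \\ 1 & x_m & x_m^2 & \cdots & x_m^n \end{bmatrix} \begin{bmatrix} w_0 \\ w_1 \\ \vdots \\ w_n \end{bmatrix} = \begin{bmatrix} f(x_1) \\ f(x_2) \\ \vdots \\ f(x_m) \end{bmatrix}$$

In code, this looks like: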

In [4]:
def model_matmul(xs, w):
    """
    Linear regression model

    Computes f(x) = w_0 + w_1 x + w_2 x^2 + ... + w_n x^n
    for a given set of xs and weights w.
    """
    
    # compute increasing exponents of x
    X = np.stack([xs**i for i in range(w.shape[0])], axis=1)

    # compute dot-product of weights and *all* xs using matrix multiplication
    return X @ w

Let's make sure our new model is correct by comparing it to the old one.

In [5]:
xs = np.array([1, 2, 3, 4, 5, 6, 7])
w = np.array([3, 4, 5])

ys = [model(x, w) for x in xs]
ys_matmul = model_matmul(xs, w)

ys==ys_matmul
Out[5]:
array([ True,  True,  True,  True,  True,  True,  True])
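
As an optional aside, NumPy can build this power matrix for us: np.vander with increasing=True produces the same matrix of increasing powers that model_matmul constructs internally, so the following quick check should agree with our model:

xs = np.array([1, 2, 3, 4, 5, 6, 7])
w = np.array([3, 4, 5])
X = np.vander(xs, N=w.shape[0], increasing=True)  # columns are x^0, x^1, x^2
print(np.allclose(X @ w, model_matmul(xs, w)))    # expect: True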

Looks good! Now it's time to build vectors for our data points and targets. In the real world, you'd load your data from a database or file, but I'm going to keep things simple with:

$$x_{data} \sim \mathcal{U}(0,1)$$ and $$y_{data} = \sin(2 \pi x_{data}) + \zeta$$

with $\zeta \sim \mathcal{N}(0,\,0.1^2)$ (Gaussian noise with standard deviation 0.1) and 20 data points.

Ideally, our data points approximate the true function $$y_{true} = \sin(2 \pi x_{true})$$ for $x_{true} \in [0,1]$. Let's see this in code:

In [6]:
np.random.seed(0)  # set seed for reproducibility

x_data = np.random.uniform(0, 1, size=20)
y_data = np.sin(2 * np.pi * x_data) + np.random.normal(0, 0.1, size=20)

x_true = np.linspace(0, 1, 100)
y_true = np.sin(2 * np.pi * x_true)

# plot data points and true function
fig, ax = plt.subplots(1, 2)
plot(true_points=(x_true, y_true), data_points=(x_data, y_data), ax=ax[0])
# plot kde of data points
_x_data_kde = np.linspace(0, 1, 10000)
_y_data_kde = np.sin(2 * np.pi * _x_data_kde) + np.random.normal(0, 0.1, size=10000)
sns.kdeplot(x=_x_data_kde, y=_y_data_kde, ax=ax[1], shade=True)
plt.show()
[Figure: noisy samples with the true sine curve (left) and a KDE of the data-generating distribution (right)]

Now it's time to train our model. We'll split the data into training and test sets.

In [7]:
x_train = x_data[:10]
y_train = y_data[:10]
x_test = x_data[10:]
y_test = y_data[10:]

Training the model is easy. There are faster closed-form methods (ordinary least squares) for this simple model, but I'm going to use backpropagation to illustrate the overfitting problem. If you aren't familiar with backpropagation, it propagates the error backwards through the computation graph, applying the chain rule locally at each node. We're fortunate to have a simple model: a linear combination of the exponentiated values combined with some loss function (e.g. MSE).

$$f(x) = \vec{w} \cdot \vec{x}$$ $$\mathcal{L}(f(x), y) = (f(x) - y)^2$$

Now we train by finding the vector $\vec{w}$ that minimizes the loss function $\mathcal{L}$. Formally, this is

$$\min_{\vec{w}} (\vec{w} \cdot \vec{x} - y)^2$$

Going backwards, we derive our parameter gradients:

$$d\mathcal{L} = 1$$ $$de = 2\,(f(x) - y)\, d\mathcal{L}$$ $$df(x) = de$$ $$d\vec{w} = \vec{x}^T \cdot df(x)$$

where $e = f(x) - y$ is the error term.
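
Each training step then nudges the weights by a small learning rate $\eta$ in the direction of the negative gradient:

$$\vec{w} \leftarrow \vec{w} - \eta \, d\vec{w}$$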

Just to stay consistent with deep learning conventions, we'll run these iterative $\eta$-weighted update steps instead of solving for the parameters all at once. In code this becomes:

In [8]:
def backprop(xs, y_true, w, lr=0.01):

    # build power-bases of xs
    X = np.stack([xs**i for i in range(w.shape[0])], axis=1)  # [n_samples, order]

    # forward pass
    y_pred = X @ w  # [n_samples, order] x [order, 1] = [n_samples]
    err = y_pred - y_true  # [n_samples]
    loss = np.sum(err**2)  # []

    # compute gradient
    dloss = 1  # []
    derr = 2 * err * dloss  # [n_samples]
    dy_pred = derr  # [n_samples]
    dw = X.T @ dy_pred  # [order, n_samples] x [n_samples] = [order]

    # update weights
    w = w - lr * dw

    return w, loss

Let's run a few iterations of the update step. If it's working, we should see the model's predictions begin to converge on the true function.

In [9]:
def train_loop(w_init, lr=0.01, n_epochs=100, quiet=False):

    xs = np.linspace(0, 1, 100)
    w = w_init

    if not quiet:
        fig, ax = plt.subplots(2, 5, figsize=(20, 10))
    for i in range(1, n_epochs+1):

        w, loss = backprop(x_train, y_train, w, lr=lr)

        if i % (n_epochs//10) == 0:

            y_pred = model_matmul(xs, w)
            _, val_loss = backprop(x_test, y_test, w)

            if not quiet:
                print('Epoch: %d\tLoss: %6.3f\tVal. Loss: %6.3f\t' % (i, loss, val_loss))
                n = i // (n_epochs//10) - 1
                plot(true_points=(x_true, y_true), 
                     pred_points=(xs, y_pred), 
                     data_points=(x_test, y_test), 
                     ax=ax[n//5, n%5])

    if not quiet:
        plt.suptitle('Linear Regression Training')
        plt.show()

    return w

np.random.seed(0)  # set seed for reproducibility
w = np.random.normal(0, 0.1, size=11)
_ = train_loop(w, lr=0.01)
Epoch: 10	Loss:  2.266	Val. Loss:  2.337	
Epoch: 20	Loss:  2.149	Val. Loss:  2.114	
Epoch: 30	Loss:  2.039	Val. Loss:  1.927	
Epoch: 40	Loss:  1.935	Val. Loss:  1.758	
Epoch: 50	Loss:  1.836	Val. Loss:  1.607	
Epoch: 60	Loss:  1.743	Val. Loss:  1.471	
Epoch: 70	Loss:  1.655	Val. Loss:  1.349	
Epoch: 80	Loss:  1.572	Val. Loss:  1.240	
Epoch: 90	Loss:  1.493	Val. Loss:  1.144	
Epoch: 100	Loss:  1.419	Val. Loss:  1.060	
[Figure: 'Linear Regression Training', model predictions vs. the true sine curve every 10 epochs]

Everything looks good! Now let's crank up the polynomial order and see what happens.

In [10]:
orders = [0, 1, 6, 9, 16, 23, 47, 93, 127]
fig, ax = plt.subplots(1, len(orders), figsize=(20, 5))
xs = np.linspace(0, 1, 100)

for i, order in enumerate(orders):
    np.random.seed(0)  # set seed for reproducibility
    w_init = np.random.normal(0, 0.1, size=(order+1))
    w_fin = train_loop(w_init, lr=0.01/(1+0.001*order), quiet=True)
    plot(true_points=(x_true, y_true), 
         pred_points=(xs, model_matmul(xs, w_fin)), 
         data_points=(x_test, y_test), 
         ax=ax[i])

fig.suptitle('Various Poly Orders')
plt.show()
[Figure: 'Various Poly Orders', fitted curves for each polynomial order]

Note that as we increase the order, the model becomes more complex. Compared with the low-order fits, the high-order models start to overfit: they bend to follow the noisy training points rather than the underlying sine curve. What can we do to prevent this?

Regularization to the rescue! We'll add a small amount of L2 regularization to our loss function to reduce the overfitting. It goes like this:

$$\mathcal{L}(f(x), y) = (f(x) - y)^2 + \lambda \sum_{i=0}^n w_i^2$$

Backpropagation now includes the additional term:

$$d\vec{w} = \vec{x}^T \cdot df(x) + \underbrace{2 \lambda \vec{w}}_{\text{L2 loss penalty}}$$
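
Plugging this into the update step shows why L2 regularization is often called weight decay: every step first shrinks the weights by a constant factor before applying the data-driven part of the gradient,

$$\vec{w} \leftarrow \vec{w} - \eta \, d\vec{w} = (1 - 2 \eta \lambda)\, \vec{w} - \eta \, \vec{x}^T \cdot df(x)$$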

In code, this becomes:

In [11]:
def backprop_L2(xs, y_true, w, lambda_L2=0.0, lr=0.01):

    # build power-bases of xs
    X = np.stack([xs**i for i in range(w.shape[0])], axis=1)  # [n_samples, order]

    # forward pass
    y_pred = X @ w  # [n_samples, order] x [order, 1] = [n_samples]
    err = y_pred - y_true  # [n_samples]
    loss = np.sum(err**2)  # []

    # compute gradient
    dloss = 1  # []
    derr = 2 * err * dloss  # [n_samples]
    dy_pred = derr  # [n_samples]
    dw = X.T @ dy_pred  # [order, n_samples] x [n_samples] = [order]
    dw = dw + 2 * lambda_L2 * w  # L2 regularization

    # update weights
    w = w - lr * dw

    return w, loss
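
If you want to convince yourself that the analytic gradient (including the L2 term) is correct, a finite-difference check is a handy sanity test. Here's a minimal sketch; the helper functions and the tiny example inputs are my own illustrative choices, not part of the notebook's pipeline:

def analytic_grad(xs, y_true, w, lambda_L2):
    # same gradient that backprop_L2 computes, without the update step
    X = np.stack([xs**i for i in range(w.shape[0])], axis=1)
    err = X @ w - y_true
    return X.T @ (2 * err) + 2 * lambda_L2 * w

def numeric_grad(xs, y_true, w, lambda_L2, eps=1e-6):
    # central-difference estimate of the gradient of the regularized loss
    def f(w_):
        X = np.stack([xs**i for i in range(w_.shape[0])], axis=1)
        return np.sum((X @ w_ - y_true)**2) + lambda_L2 * np.sum(w_**2)
    g = np.zeros_like(w)
    for k in range(w.shape[0]):
        d = np.zeros_like(w)
        d[k] = eps
        g[k] = (f(w + d) - f(w - d)) / (2 * eps)
    return g

xs_check = np.array([0.1, 0.4, 0.7])
y_check = np.sin(2 * np.pi * xs_check)
w_check = np.array([0.3, -0.2, 0.1])
print(np.allclose(analytic_grad(xs_check, y_check, w_check, 0.5),
                  numeric_grad(xs_check, y_check, w_check, 0.5)))  # expect: True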

Let's see if this makes a difference in our training loop (here with $\lambda = 0.1$). First, I'm going to run a quick sanity check to make sure the new code didn't break our loss function.

In [12]:
def train_loop(w_init, lr=0.01, lambda_L2=0.0, n_epochs=100, quiet=False):
    xs = np.linspace(0, 1, 100)
    w = w_init
    n = 0

    if not quiet:
        fig, ax = plt.subplots(2, 5, figsize=(20, 10))
    for i in range(1, n_epochs+1):

        w, loss = backprop_L2(x_train, y_train, w, lambda_L2=lambda_L2, lr=lr)

        if i % (n_epochs//10) == 0:

            y_pred = model_matmul(xs, w)
            _, val_loss = backprop_L2(x_test, y_test, w, lambda_L2=0.0)

            if not quiet:
                print('Epoch: %d\tLoss: %6.3f\tVal. Loss: %6.3f\t' % (i, loss, val_loss))
                plot(true_points=(x_true, y_true), 
                     pred_points=(xs, y_pred), 
                     data_points=(x_test, y_test), 
                     ax=ax[n//5, n%5])
                n += 1

    if not quiet:
        plt.suptitle('Linear Regression Training with L2 Regularization')
        plt.show()

    return w

np.random.seed(0)  # set seed for reproducibility
w = np.random.normal(0, 5, size=9)
_ = train_loop(w, lr=0.02, lambda_L2=0.1)
Epoch: 10	Loss:  6.302	Val. Loss:  7.193	
Epoch: 20	Loss:  5.250	Val. Loss:  6.005	
Epoch: 30	Loss:  4.697	Val. Loss:  5.432	
Epoch: 40	Loss:  4.237	Val. Loss:  5.015	
Epoch: 50	Loss:  3.836	Val. Loss:  4.680	
Epoch: 60	Loss:  3.483	Val. Loss:  4.405	
Epoch: 70	Loss:  3.174	Val. Loss:  4.177	
Epoch: 80	Loss:  2.902	Val. Loss:  3.986	
Epoch: 90	Loss:  2.661	Val. Loss:  3.825	
Epoch: 100	Loss:  2.449	Val. Loss:  3.688	
[Figure: 'Linear Regression Training with L2 Regularization', model predictions vs. the true sine curve every 10 epochs]

Good. Now let's compare various combinations of polynomial order with and without regularization, then draw some conclusions about how well the L2 approach combats overfitting.

In [13]:
orders = [0, 1, 6, 9, 16, 23, 47, 93]
lambda_L2 = [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]

fig, ax = plt.subplots(len(lambda_L2), len(orders), figsize=(20, 20))

for i, l2 in enumerate(lambda_L2):
    for j, order in enumerate(orders):
        np.random.seed(0)  # set seed for reproducibility
        w_init = np.random.normal(0, 0.1, size=(order+1))
        w_fin = train_loop(w_init,  lr=0.02, lambda_L2=l2, quiet=True)
        
        plot(true_points=(x_true, y_true), 
             pred_points=(xs, model_matmul(xs, w_fin)), 
             data_points=(x_test, y_test), 
             ax=ax[i,j])
        ax[i,j].title.set_text(f'L2={l2} order={order}')

fig.suptitle('Various poly orders and L2 penalties')
plt.show()
[Figure: 'Various poly orders and L2 penalties', grid of fits for each regularization weight (rows) and polynomial order (columns)]

Remember: moving down the grid increases the regularization weight, and moving right increases the model complexity (polynomial order). Notice the trade-off between the two: too much complexity with too little regularization leads to overfitting, while too much regularization suppresses the weights and leads to underfitting. In the right balance, one can compensate for the other.
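
Since the right amount of regularization depends on the data and the polynomial order, a simple practical recipe is to sweep a few candidate penalties and keep the one with the lowest held-out loss. Here is a minimal sketch using the train_loop and backprop_L2 defined above; the candidate values and the order are illustrative choices, and I reuse the held-out split as a validation set just as the training loop already does:

candidate_l2 = [0.0, 0.01, 0.05, 0.1, 0.5, 1.0]
order = 9
val_losses = []
for l2 in candidate_l2:
    np.random.seed(0)  # same initialization for a fair comparison
    w_init = np.random.normal(0, 0.1, size=order+1)
    w_fin = train_loop(w_init, lr=0.02, lambda_L2=l2, quiet=True)
    _, val_loss = backprop_L2(x_test, y_test, w_fin, lambda_L2=0.0)  # unregularized held-out loss
    val_losses.append(val_loss)

best_l2 = candidate_l2[int(np.argmin(val_losses))]
print('best lambda_L2 by held-out loss:', best_l2)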

Now let's do some quantitative analysis. Look at the converged weights for the various orders with and without regularization:

In [14]:
orders = [0, 1, 6, 9, 16, 23, 47, 93]
lambda_L2 = [0.0, 0.05, 0.1, 1.0]

means = np.zeros((len(lambda_L2), len(orders)))

for i, l2 in enumerate(lambda_L2):

    W = np.zeros((max(orders)+3, len(orders)))

    for j, order in enumerate(orders):
        np.random.seed(0)  # set seed for reproducibility
        w_init = np.random.normal(0, 0.1, size=(order+1))
        w_fin = train_loop(w_init, lr=0.02, lambda_L2=l2, quiet=True)
        W[:order+1, j] = w_fin

    sum = W[-2] = np.sum(np.abs(W[:-2]), axis=0)
    mean = W[-1] = sum / (np.array(orders)+1)
    means[i] = mean

    df = pd.DataFrame(W, 
        columns=['M = %d' % order for order in orders], 
        index=['w_%d' % i for i in range(max(orders)+1)] + ['abs sum', 'abs mean'])

    print(f'Results for lambda_L2={l2}')
    display.display(df)

for i, l2 in enumerate(lambda_L2):
    plt.plot(orders, means[i])
plt.legend([f'lambda_L2={l2}' for l2 in lambda_L2])
plt.suptitle('Poly order to mean weight')
plt.show()
Results for lambda_L2=0.0
M = 0 M = 1 M = 6 M = 9 M = 16 M = 23 M = 47 M = 93
w_0 -0.231592 0.383946 0.451798 0.554775 0.626970 0.631668 0.614110 0.614178
w_1 0.000000 -1.012298 -0.813740 -0.845509 -0.807546 -0.789180 -0.764531 -0.764077
w_2 0.000000 0.000000 -0.698745 -0.828729 -0.873127 -0.863600 -0.831517 -0.831236
w_3 0.000000 0.000000 -0.205091 -0.400933 -0.526576 -0.532125 -0.506861 -0.506838
w_4 0.000000 0.000000 0.137173 -0.100272 -0.292319 -0.313132 -0.299889 -0.300068
... ... ... ... ... ... ... ... ...
w_91 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.131540
w_92 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.029785
w_93 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.106296
abs sum 0.231592 1.396244 3.048557 4.535315 6.488994 7.129568 8.860466 12.118300
abs mean 0.231592 0.698122 0.435508 0.453531 0.381706 0.297065 0.184593 0.128918

96 rows × 8 columns

Results for lambda_L2=0.05
M = 0 M = 1 M = 6 M = 9 M = 16 M = 23 M = 47 M = 93
w_0 -0.23044 0.338691 0.402107 0.495026 0.562539 0.568147 0.553851 0.553765
w_1 0.00000 -0.939155 -0.741123 -0.771563 -0.740826 -0.724465 -0.702466 -0.701770
w_2 0.00000 0.000000 -0.643342 -0.762507 -0.807156 -0.799845 -0.772130 -0.771506
w_3 0.00000 0.000000 -0.206262 -0.384445 -0.501760 -0.508774 -0.487752 -0.487436
w_4 0.00000 0.000000 0.108379 -0.106850 -0.282949 -0.304216 -0.294320 -0.294331
... ... ... ... ... ... ... ... ...
w_91 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.109010
w_92 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.025668
w_93 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.088251
abs sum 0.23044 1.277846 2.777711 4.153107 5.973123 6.552749 8.018477 10.652182
abs mean 0.23044 0.638923 0.396816 0.415311 0.351360 0.273031 0.167052 0.113321

96 rows × 8 columns

Results for lambda_L2=0.1
M = 0 M = 1 M = 6 M = 9 M = 16 M = 23 M = 47 M = 93
w_0 -0.229299 0.298701 0.358543 0.442504 0.505692 0.512035 0.500541 0.500360
w_1 0.000000 -0.874363 -0.678051 -0.707208 -0.682607 -0.668072 -0.648421 -0.647550
w_2 0.000000 0.000000 -0.594576 -0.704030 -0.748618 -0.743259 -0.719366 -0.718504
w_3 0.000000 0.000000 -0.205803 -0.368236 -0.478008 -0.486244 -0.468908 -0.468399
w_4 0.000000 0.000000 0.084187 -0.111283 -0.273234 -0.294774 -0.287760 -0.287673
... ... ... ... ... ... ... ... ...
w_91 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.090501
w_92 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.022260
w_93 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.073422
abs sum 0.229299 1.173064 2.538353 3.814889 5.515162 6.082663 7.313230 9.450002
abs mean 0.229299 0.586532 0.362622 0.381489 0.324421 0.253444 0.152359 0.100532

96 rows × 8 columns

Results for lambda_L2=1.0
M = 0 M = 1 M = 6 M = 9 M = 16 M = 23 M = 47 M = 93
w_0 -0.210538 0.003131 0.050081 0.066962 0.088549 0.095486 0.099204 0.099569
w_1 0.000000 -0.381871 -0.255695 -0.269463 -0.277091 -0.276953 -0.274493 -0.273932
w_2 0.000000 0.000000 -0.241408 -0.273167 -0.302047 -0.308511 -0.309654 -0.309431
w_3 0.000000 0.000000 -0.142424 -0.184495 -0.227974 -0.239875 -0.244989 -0.245266
w_4 0.000000 0.000000 -0.042228 -0.089930 -0.143063 -0.159159 -0.167913 -0.168706
... ... ... ... ... ... ... ... ...
w_91 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.007342
w_92 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.005440
w_93 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.006551
abs sum 0.210538 0.385002 0.866922 1.299949 2.036679 2.456913 3.177809 3.648683
abs mean 0.210538 0.192501 0.123846 0.129995 0.119805 0.102371 0.066204 0.038816

96 rows × 8 columns

[Figure: 'Poly order to mean weight', mean absolute weight vs. polynomial order for each lambda_L2]

Notice that while the weights decrease with increasing order, they lose sensitivity to order as the regularization weight increases (the relationship might even approach a power law!). Again, we see the regularization term in action.

I'm actually curious to see whether it really is a power law:

In [15]:
orders = list(range(100))
lambda_L2 = 10 ** np.linspace(-2,1,10)

means = np.zeros((len(lambda_L2), len(orders)))

for i, l2 in enumerate(lambda_L2):

    W = np.zeros((max(orders)+3, len(orders)))

    for j, order in enumerate(orders):
        np.random.seed(0)  # set seed for reproducibility
        w_init = np.random.normal(0, 0.1, size=(order+1))
        w_fin = train_loop(w_init, lr=0.02, lambda_L2=l2, quiet=True)
        W[:order+1, j] = w_fin

    sum = W[-2] = np.sum(np.abs(W[:-2]), axis=0)
    mean = W[-1] = sum / (np.array(orders)+1)
    means[i] = mean

fig, ax = plt.subplots(1, 2, figsize=(10,5))

for i, l2 in enumerate(lambda_L2):
    ax[0].plot(orders, means[i])
    ax[1].plot(orders, means[i])
plt.legend([f'lambda_L2={l2}' for l2 in lambda_L2])
ax[0].set_xscale('log')
ax[1].set_xscale('log')
ax[1].set_yscale('log')
ax[0].set_title('(log-linear)')
ax[1].set_title('(log-log)')
plt.suptitle('Poly order to mean weight')

plt.show()
[Figure: 'Poly order to mean weight', log-linear (left) and log-log (right) scales]

No, it doesn't look like one on superficial inspection, although something interesting appears to be occurring around order = 5.

Let's consider train vs. test error across the various orders with and without regularization:

In [16]:
def loss(xs, y_true, w):
    X = np.stack([xs**i for i in range(w.shape[0])], axis=1)  # [n_samples, order]
    y_pred = X @ w  # [n_samples, order] x [order, 1] = [n_samples]
    err = y_pred - y_true  # [n_samples]
    return np.sum(err**2)  # []
      
orders = list(range(12))
lambda_L2 = 10.**-np.arange(5)

fig, ax = plt.subplots(len(lambda_L2), 1, figsize=(20, 20))

for i, l2 in enumerate(lambda_L2):
    loss_train = []
    loss_test = []

    for j, order in enumerate(orders):
        np.random.seed(0)  # set seed for reproducibility
        w_init = np.random.normal(0, 0.1, size=(order+1))
        w_fin = train_loop(w_init, lambda_L2=l2, quiet=True)
        loss_train.append(loss(x_train, y_train, w_fin))
        loss_test.append(loss(x_test, y_test, w_fin))

    ax[i].title.set_text(f'L2 penalty = {l2}')
    ax[i].plot(orders, loss_train, 'bo-', label='Training')
    ax[i].plot(orders, loss_test, 'ro-', label='Validation')
    ax[i].legend()

fig.suptitle('Train and test loss over polynomial order')
plt.show()
[Figure: 'Train and test loss over polynomial order', one panel per L2 penalty]

It's hard to draw firm conclusions from this data alone, but I hope you spotted some patterns yourself. That wraps up a fun adventure! I hope you enjoyed this tutorial. If you have any questions, please let me know. I'm also attaching some references below for further reading on other applications of regularization:

References

Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets

Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

Generalization Error Analysis of Neural Networks with Gradient Based Regularization