Regularization applications¶
Amirhossein (Miros) ZOHREHVAND
mgmt.ucl.ac.uk/people/amirzohrehvand
11 March 2019
In this notebook, I aim to provide an overview of how to operationalize regularization using Python. I start by introducing a few fundamental tools, then move on to some simulated exercises in Python. I wrap up the session by going through an example from my own research, where I applied ElasticNet to estimate the effect of a merger of two companies on their customers’ sentiment.
Necessary tools¶
Let’s start by setting the plot properties using a magic function (more about the magic line here) and importing some packages.
For producing the graphics, a good package is seaborn, which is essentially a wrapper around matplotlib.
%matplotlib inline
import numpy as np
import pandas as pd
import random
from google.colab import files
import matplotlib.pyplot as plt
import seaborn as sns
# for latex equations
from IPython.display import Math, Latex
from IPython.core.display import Image
import warnings
warnings.filterwarnings('ignore')
# size and color settings for plots; they can be adjusted later as well
# from matplotlib.pylab import rcParams
# rcParams['figure.figsize'] = 10, 10
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize':(15,5)})
scipy is another package that we use in this notebook. I use it to generate different distributions.
from scipy.stats import norm
# ~ N(0,1)
normal_dist_data = norm.rvs(size=10000,loc=0,scale=1)
Let’s inspect the distribution using a seaborn histogram and then download it as a PDF.
normal_dist_ax = sns.distplot(normal_dist_data,
bins=100,
kde=True,
color='skyblue',
hist_kws={"linewidth": 15,'alpha':1})
normal_dist_ax.set(xlabel='Normal Distribution', ylabel='Frequency')
fig = normal_dist_ax.get_figure()
fig.savefig("normal_dist.pdf")
files.download('normal_dist.pdf')
from scipy.stats import gamma
data_gamma = gamma.rvs(a=1.9, size=10000)
ax = sns.distplot(data_gamma,
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
bins=100,
color='skyblue',
hist_kws={"linewidth": 15,'alpha':1})
ax.set(xlabel='Gamma Distribution', ylabel='Frequency')
There are other ways to produce the same distribution. For example, the next cell uses numpy to generate such a distribution. You can read more on numpy distributions here.
numpy_gamma = np.random.gamma(1.9, size=10000)
ax = sns.distplot(numpy_gamma,
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
bins=100,
color='skyblue',
hist_kws={"linewidth": 15,'alpha':1})
ax.set(xlabel='Gamma Distribution', ylabel='Frequency')
scikit-learn package¶
This package is one of the most important tools for many machine learning applications. Not only does it cover many types of prediction models, but it also offers many useful tools tailored to machine learning problems, e.g., pipelines.
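As a quick aside illustrating the latter, here is a minimal sketch (my own addition, not from the package guide; the alpha value is arbitrary) of a Pipeline that chains standardization with a Lasso model:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
# sketch (added for illustration): standardize the inputs, then fit an L1-penalized model
pipe = Pipeline([('scale', StandardScaler()),
                 ('lasso', Lasso(alpha=0.1))])  # alpha is an arbitrary choice
pipe.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
print(pipe.named_steps['lasso'].coef_)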
Let’s start by taking a look at the example from the online guide of the package explaining the Lasso function.
from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.1)
clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
print(clf.coef_)
print(clf.intercept_)
Let’s go through a better example:
np.random.seed(42)
N, p = 800, 5000
X = np.random.randn(N, p)
coef = 3 * np.random.randn(p)
# adding sparsity: zero out all but 10 of the coefficients
inds = np.arange(p)
np.random.shuffle(inds)
coef[inds[10:]] = 0
y = np.dot(X, coef)
# noise
y += 0.01 * np.random.normal(size=N)
# train set and test set
X_train, y_train = X[:N // 2], y[:N // 2]
X_test, y_test = X[N // 2:], y[N // 2:]
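As an aside, the same 50/50 split could be obtained with scikit-learn’s train_test_split helper (a sketch added here; shuffle=False mimics the manual slicing above):
from sklearn.model_selection import train_test_split
# sketch (added for illustration): equivalent to the manual slicing above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)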
From the scikit-learn website, where a and b are the weights of the L1 and L2 penalties, respectively:
alpha = a + b and l1_ratio = a / (a + b)
“The parameter l1_ratio corresponds to alpha in the glmnet R package while alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, unless you supply your own sequence of alpha.”
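In other words, following the scikit-learn documentation and writing $\rho$ for l1_ratio, ElasticNet minimizes
$$\frac{1}{2 n_{\mathrm{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \rho \lVert w \rVert_1 + \frac{\alpha (1 - \rho)}{2} \lVert w \rVert_2^2 .$$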
# ElasticNet
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
alphas = np.logspace(-5, 2, 100)  # a logarithmic grid of alpha values
# Let's play with l1_ratio (the same role as alpha in R's glmnet)
elnet = ElasticNet(l1_ratio=1)
train_errors = list()
test_errors = list()
for alpha in alphas:
elnet.set_params(alpha=alpha)
elnet.fit(X_train, y_train)
train_errors.append(elnet.score(X_train, y_train))
test_errors.append(elnet.score(X_test, y_test))
i_alpha_optim = np.argmax(test_errors)
alpha_optim = alphas[i_alpha_optim]
print("Optimal regularization parameter : %s" % alpha_optim)
# Estimate the coef_ on full data with optimal regularization parameter
elnet.set_params(alpha=alpha_optim)
coef_ = elnet.fit(X, y).coef_
y_pred_elnet = elnet.fit(X_train, y_train).predict(X_test)
r2_score_elnet = r2_score(y_test, y_pred_elnet)
print(elnet)
print("r^2 on test data : %f" % r2_score_elnet)
plt.subplot(3, 1, 1)
plt.plot(coef_, label='Estimated coef')
plt.legend()
plt.subplot(3, 1, 2)
plt.plot(coef, label='Original coef')
plt.legend()
plt.show()
# plt.subplot(3, 1, 3)
# plt.semilogx(alphas, train_errors, label='Train')
# plt.semilogx(alphas, test_errors, label='Test')
# plt.legend()
# plt.show()
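The manual grid search above can also be delegated to ElasticNetCV, which selects alpha by cross-validation (a minimal sketch added here, not part of the original example; cv=5 is an assumed choice):
from sklearn.linear_model import ElasticNetCV
# sketch (added for illustration): cv=5 is an assumed choice
elnet_cv = ElasticNetCV(l1_ratio=1, alphas=alphas, cv=5)
elnet_cv.fit(X_train, y_train)
print("CV-selected alpha:", elnet_cv.alpha_)
print("r^2 on test data :", elnet_cv.score(X_test, y_test))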
For the regularization path, look at the following example (source: scikit-learn).
from sklearn.linear_model import lasso_path, enet_path
from itertools import cycle
# Compute paths
eps = 5e-3  # the smaller it is, the longer the path
print("Computing regularization path using the lasso...")
alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps, fit_intercept=False)
print("Computing regularization path using the positive lasso...")
alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path(
X, y, eps, positive=True, fit_intercept=False)
print("Computing regularization path using the elastic net...")
alphas_enet, coefs_enet, _ = enet_path(
X, y, eps=eps, l1_ratio=0.8, fit_intercept=False)
print("Computing regularization path using the positive elastic net...")
alphas_positive_enet, coefs_positive_enet, _ = enet_path(
X, y, eps=eps, l1_ratio=0.8, positive=True, fit_intercept=False)
# Display results
plt.figure(1)
colors = cycle(['b', 'r', 'g', 'c', 'k'])
neg_log_alphas_lasso = -np.log10(alphas_lasso)
neg_log_alphas_enet = -np.log10(alphas_enet)
for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and Elastic-Net Paths')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='lower left')
plt.axis('tight')
plt.figure(2)
neg_log_alphas_positive_lasso = -np.log10(alphas_positive_lasso)
for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors):
l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
l2 = plt.plot(neg_log_alphas_positive_lasso, coef_pl, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and positive Lasso')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'positive Lasso'), loc='lower left')
plt.axis('tight')
plt.figure(3)
neg_log_alphas_positive_enet = -np.log10(alphas_positive_enet)
for (coef_e, coef_pe, c) in zip(coefs_enet, coefs_positive_enet, colors):
l1 = plt.plot(neg_log_alphas_enet, coef_e, c=c)
l2 = plt.plot(neg_log_alphas_positive_enet, coef_pe, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Elastic-Net and positive Elastic-Net')
plt.legend((l1[-1], l2[-1]), ('Elastic-Net', 'positive Elastic-Net'),
loc='lower left')
plt.axis('tight')
plt.show()
eps = 5e-3  # the smaller it is, the longer the path
print("Computing regularization path using the lasso...")
alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps, fit_intercept=False)
print("Computing regularization path using the elastic net...")
alphas_enet, coefs_enet, _ = enet_path(
X, y, eps=eps, l1_ratio=0.3, fit_intercept=False)
# Display results
colors = cycle(['b', 'r', 'g', 'c', 'k'])
neg_log_alphas_lasso = -np.log10(alphas_lasso)
neg_log_alphas_enet = -np.log10(alphas_enet)
for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors):
l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c)
l2 = plt.plot(neg_log_alphas_enet, coef_e, linestyle='--', c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Lasso and Elastic-Net Paths')
plt.legend((l1[-1], l2[-1]), ('Lasso', 'Elastic-Net'), loc='lower left')
plt.axis('tight')
We can inspect the performance of Elastic Net for different l1_ratios.
l1_ratios = [x/10 for x in range(1,9,2)]
colors = cycle(['b', 'r', 'g', 'c', 'k'])
sns.set(rc={'figure.figsize':(15,20)})
count = 1
for i in l1_ratios:
print("Computing regularization path using l1_ratio = {} elastic net.".format(i))
    alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=i, fit_intercept=False)
neg_log_alphas_enet = -np.log10(alphas_enet)
plt.subplot(len(l1_ratios), 1, count)
for coef_e, c in zip(coefs_enet, colors):
l1 = plt.plot(neg_log_alphas_enet, coef_e , c=c)
plt.xlabel('-Log(alpha)')
plt.ylabel('coefficients')
plt.title('Elastic-Net Paths for l1_ratio {}'.format(i))
plt.axis('tight')
count += 1
plt.show()
sns.set(rc={'figure.figsize':(15,5)})
Another example from here.
Now let’s visually inspect the effect of the penalty term.
Let’s create 2D data from $y = \sin(x)$ plus random noise with mean 0 and standard deviation 0.1.
# set new figure size
sns.set(rc={'figure.figsize':(10,8)})
# Just a shift of the numbers from degrees to radians
x = np.array([i*np.pi/180 for i in range(50,290,5)])
# For reproducibility
np.random.seed(111)
y = np.sin(x) + np.random.normal(0,0.1,len(x))
# Let's create a pandas DataFrame. A pandas DataFrame is similar to an R data frame.
data = pd.DataFrame(np.column_stack([x,y]),columns=['x','y'])
plt.plot(data['x'],data['y'],'.')
true_y, = plt.plot(data['x'], data['x'].apply(np.sin), label='True Y')
plt.legend(handles=[true_y], loc=1)
# plt.show()
Adding powers of x (up to 15) as additional predictors:
for i in range(2,16):
data['x_%d'%i] = data['x']**i
from sklearn.linear_model import LinearRegression
def linear_regression(data, power, models_to_plot):
#initialize predictors:
predictors=['x']
if power>=2:
predictors.extend(['x_%d'%i for i in range(2,power+1)])
#Fit the model
linreg = LinearRegression(normalize=True)
linreg.fit(data[predictors],data['y'])
y_pred = linreg.predict(data[predictors])
#Check if a plot is to be made for the entered power
if power in models_to_plot:
plt.subplot(models_to_plot[power])
plt.tight_layout()
plt.plot(data['x'],y_pred)
plt.plot(data['x'],data['y'],'.')
plt.title('Plot for power: %d'%power)
#Return the result in pre-defined format
rss = sum((y_pred-data['y'])**2)
ret = [rss]
ret.extend([linreg.intercept_])
ret.extend(linreg.coef_)
return ret
#Initialize a dataframe to store the results:
col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1,16)]
ind = ['model_pow_%d'%i for i in range(1,16)]
coef_matrix_simple = pd.DataFrame(index=ind, columns=col)
#Define the powers for which a plot is required:
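#The values are matplotlib subplot codes, e.g., 231 = 2x3 grid, panel 1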
models_to_plot = {1:231,3:232,6:233,9:234,12:235,15:236}
#Iterate through all powers and assimilate results
for i in range(1,16):
coef_matrix_simple.iloc[i-1,0:i+2] = linear_regression(data, power=i, models_to_plot=models_to_plot)
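The assembled table can be displayed in the same way as the ridge and lasso tables further below (two lines added here for convenience; the scientific display format is an arbitrary choice):
# added for convenience: show RSS and coefficients of the unregularized fits
pd.options.display.float_format = '{:,.2g}'.format
coef_matrix_simple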
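Now let’s fit ridge regression, which adds an L2 penalty on the coefficients, to the same data over a range of alpha values.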
from sklearn.linear_model import Ridge
def ridge_regression(data, predictors, alpha, models_to_plot={}):
#Fit the model
ridgereg = Ridge(alpha=alpha,normalize=True)
ridgereg.fit(data[predictors],data['y'])
y_pred = ridgereg.predict(data[predictors])
#Check if a plot is to be made for the entered alpha
if alpha in models_to_plot:
plt.subplot(models_to_plot[alpha])
plt.tight_layout()
plt.plot(data['x'],y_pred)
plt.plot(data['x'],data['y'],'.')
plt.title('Plot for alpha: %.3g'%alpha)
#Return the result in pre-defined format
rss = sum((y_pred-data['y'])**2)
ret = [rss]
ret.extend([ridgereg.intercept_])
ret.extend(ridgereg.coef_)
return ret
#Initialize predictors to be set of 15 powers of x
predictors=['x']
predictors.extend(['x_%d'%i for i in range(2,16)])
#Set the different values of alpha to be tested
alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3,1e-2, 1, 5, 10, 20]
#Initialize the dataframe for storing coefficients.
col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1,16)]
ind = ['alpha_%.2g'%alpha_ridge[i] for i in range(0,10)]
coef_matrix_ridge = pd.DataFrame(index=ind, columns=col)
models_to_plot = {1e-15:231, 1e-10:232, 1e-4:233, 1e-3:234, 1e-2:235, 5:236}
for i in range(10):
coef_matrix_ridge.iloc[i,] = ridge_regression(data, predictors, alpha_ridge[i], models_to_plot)
#Set the display format to be scientific for ease of analysis
pd.options.display.float_format = '{:,.2g}'.format
coef_matrix_ridge
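Ridge regression shrinks coefficients towards zero but rarely sets them exactly to zero; we can check this by counting the zero coefficients at each alpha: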
coef_matrix_ridge.apply(lambda x: sum(x.values==0),axis=1)
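Let’s repeat the exercise with the lasso, which adds an L1 penalty and can shrink coefficients exactly to zero.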
from sklearn.linear_model import Lasso
def lasso_regression(data, predictors, alpha, models_to_plot={}):
#Fit the model
lassoreg = Lasso(alpha=alpha,normalize=True, max_iter=1e5)
lassoreg.fit(data[predictors],data['y'])
y_pred = lassoreg.predict(data[predictors])
#Check if a plot is to be made for the entered alpha
if alpha in models_to_plot:
plt.subplot(models_to_plot[alpha])
plt.tight_layout()
plt.plot(data['x'],y_pred)
plt.plot(data['x'],data['y'],'.')
plt.title('Plot for alpha: %.3g'%alpha)
#Return the result in pre-defined format
rss = sum((y_pred-data['y'])**2)
ret = [rss]
ret.extend([lassoreg.intercept_])
ret.extend(lassoreg.coef_)
return ret
#Initialize predictors to all 15 powers of x
predictors=['x']
predictors.extend(['x_%d'%i for i in range(2,16)])
#Define the alpha values to test
alpha_lasso = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]
#Initialize the dataframe to store coefficients
col = ['rss','intercept'] + ['coef_x_%d'%i for i in range(1,16)]
ind = ['alpha_%.2g'%alpha_lasso[i] for i in range(0,10)]
coef_matrix_lasso = pd.DataFrame(index=ind, columns=col)
#Define the models to plot
models_to_plot = {1e-10:231, 1e-5:232,1e-4:233, 1e-3:234, 1e-2:235, 1:236}
#Iterate over the 10 alpha values:
for i in range(10):
coef_matrix_lasso.iloc[i,] = lasso_regression(data, predictors, alpha_lasso[i], models_to_plot)
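Counting the zeros per row shows the lasso switching more and more coefficients off as alpha grows, in contrast to ridge: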
coef_matrix_lasso.apply(lambda x: sum(x.values==0),axis=1)