Lesser-known data science techniques you should add to your toolkit (code)
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
sns.set_style("whitegrid")
get_ipython().run_line_magic('matplotlib', 'inline')
# synthetic example: quadratic trend with heavy Gaussian noise
np.random.seed(0)
n = 100
X = np.linspace(0, 30, n)
y = 150 + 0.3*X + 1*X**2 + np.random.normal(loc=50, scale=100, size=n)
X = X[:, np.newaxis]
plt.figure(figsize=(17, 7))
plt.scatter(X, y, alpha=.7);
class QuantileRegression(BaseEstimator, RegressorMixin):
    """Sklearn-compatible wrapper around statsmodels' QuantReg."""

    def __init__(self, quantile=0.5, **kwargs):
        self.quantile = quantile
        self.kwargs = kwargs
        self.model = None
        self.fitted = None

    def fit(self, X, y=None):
        # statsmodels expects an explicit intercept column
        X = sm.add_constant(X)
        self.model = sm.QuantReg(endog=y, exog=X, **self.kwargs)
        self.fitted = self.model.fit(q=self.quantile)
        return self  # sklearn convention; enables cloning and cross-validation

    def predict(self, X):
        X = sm.add_constant(X)
        return self.fitted.predict(X)
# instantiate models
qr_05 = QuantileRegression(quantile=.05)
qr_50 = QuantileRegression(quantile=.5)
qr_95 = QuantileRegression(quantile=.95)
# design matrix: linear and quadratic terms
model_input = np.c_[X, X**2]
# fit models
qr_05.fit(model_input, y)
qr_50.fit(model_input, y)
qr_95.fit(model_input, y)
qr_50.fitted.summary()  # statsmodels summary for the median model
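# The payoff of the sklearn-style wrapper (an addition, not in the original
# post): the estimator plugs straight into sklearn tooling. A minimal sketch
# using the default R^2 scorer from RegressorMixin; scores on ordered data
# like this can be poor since each fold extrapolates.
from sklearn.model_selection import cross_val_score
cross_val_score(QuantileRegression(quantile=.5), model_input, y, cv=5)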
plt.figure(figsize=(25, 13))
plt.scatter(X, y)
preds_05 = qr_05.predict(model_input)
preds_50 = qr_50.predict(model_input)
preds_95 = qr_95.predict(model_input)
plt.plot(X, preds_05, color="orange", label="0.05 Quantile")
plt.plot(X, preds_50, color="red", label="0.5 Quantile")
plt.plot(X, preds_95, color="green", label="0.95 Quantile")
plt.fill_between(X.ravel(), preds_05, preds_95, color="blue", alpha=.3)
fs = 30
plt.legend(fontsize=fs/2)
plt.title("Quantile Regression Predictions", fontsize=fs)
plt.xlabel("X", fontsize=fs)
plt.ylabel("y", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xticks(fontsize=fs);
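# Quick sanity check (an addition, not in the original post): the band between
# the 0.05 and 0.95 quantile fits should cover roughly 90% of the observations
# if the quantile fits are well calibrated.
coverage = np.mean((y >= preds_05) & (y <= preds_95))
print(f"empirical coverage of the 5%-95% band: {coverage:.2f}")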
medium_post_views = np.array([3500, 1600, 482, 245, 198, 116])
days_since_launch = np.arange(medium_post_views.shape[0])
plt.figure(figsize=(25, 13))
plt.plot(days_since_launch, medium_post_views)
plt.scatter(days_since_launch, medium_post_views)
plt.title("Medium views decay", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.xticks(fontsize=fs);
from scipy.optimize import curve_fit
from sklearn.base import BaseEstimator, RegressorMixin
class ExponentialDecayRegressor(BaseEstimator, RegressorMixin):
    """Fits an exponential decay curve y = a * exp(-k*X) + b."""

    def __init__(self, starting_values=(1., 1.e-5, 1.), **kwargs):
        self.starting_values = starting_values
        self.kwargs = kwargs
        self.params = None

    def fit(self, X, y=None):
        # scipy's curve_fit returns (optimal parameters, covariance matrix)
        self.params, _ = curve_fit(self.exp_decay_f, X, y, p0=self.starting_values)
        return self

    def predict(self, X):
        return self.exp_decay_f(X, *self.params)

    @staticmethod
    def exp_decay_f(X, a, k, b):
        return a * np.exp(-k*X) + b
xd = ExponentialDecayRegressor()
xd.fit(days_since_launch, medium_post_views)
days_since_launch_plus_future = np.arange(12)
xd_preds = xd.predict(days_since_launch_plus_future)
plt.figure(figsize=(25, 13))
plt.plot(days_since_launch_plus_future, xd_preds, label="exponential decay fit")
plt.plot(days_since_launch, medium_post_views, label="actual")
plt.legend(fontsize=fs/2)
plt.title("Medium views decay", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.xticks(fontsize=fs);
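# Interpreting the fit (an addition, not in the original post): with
# y = a * exp(-k*X) + b, the decaying component halves every ln(2)/k days.
a, k, b = xd.params
print(f"a={a:.1f}, k={k:.3f}, b={b:.1f}, half-life ~ {np.log(2)/k:.2f} days")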
# normalizing the target to [0, 1] lets curve_fit converge from its default
# starting values (p0=None starts every parameter at 1)
medium_post_views / medium_post_views.max()
xd = ExponentialDecayRegressor(starting_values=None)
xd.fit(days_since_launch, medium_post_views/medium_post_views.max())
days_since_launch_plus_future = np.arange(12)
xd_preds = xd.predict(days_since_launch_plus_future)
plt.figure(figsize=(25, 13))
plt.plot(days_since_launch_plus_future, xd_preds, label="exponential decay fit")
plt.plot(days_since_launch, medium_post_views/medium_post_views.max(), label="actual")
plt.legend(fontsize=fs/2)
plt.title("Medium views decay", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.xticks(fontsize=fs);
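# Note (an addition, not in the original post): rescaling y only rescales a
# and b in a*exp(-k*X) + b, so the fitted decay rate k, and hence the
# half-life, should match the unnormalized fit above.
xd.params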
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
gbm = GradientBoostingClassifier(n_estimators=30, max_depth=4)  # standalone GBM for reference; the wrapper below builds its own
X, y = load_breast_cancer(return_X_y=True)
train_ix, test_ix = train_test_split(np.arange(X.shape[0]))
train_X, train_y = X[train_ix], y[train_ix]
test_X, test_y = X[test_ix], y[test_ix]
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
class TreeEmbeddingLogisticRegression(BaseEstimator, ClassifierMixin):
    """Fits a logistic regression model on one-hot-encoded tree-leaf embeddings."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.gbm = GradientBoostingClassifier(**kwargs)
        self.lr = LogisticRegression(penalty="l1", solver="liblinear")
        self.bin = OneHotEncoder()

    def fit(self, X, y=None):
        self.gbm.fit(X, y)
        # apply() returns the leaf index each sample lands in, per tree
        X_emb = self.gbm.apply(X).reshape(X.shape[0], -1)
        self.X_emb_ = self.bin.fit_transform(X_emb)  # stored for inspection below
        self.lr.fit(self.X_emb_, y)
        return self

    def predict(self, X, with_tree=False):
        if with_tree:
            return self.gbm.predict(X)
        X_emb = self.bin.transform(self.gbm.apply(X).reshape(X.shape[0], -1))
        return self.lr.predict(X_emb)

    def predict_proba(self, X, with_tree=False):
        if with_tree:
            return self.gbm.predict_proba(X)
        X_emb = self.bin.transform(self.gbm.apply(X).reshape(X.shape[0], -1))
        return self.lr.predict_proba(X_emb)
lr_tree = TreeEmbeddingLogisticRegression(n_estimators=30, max_depth=4)
lr_tree.fit(train_X, train_y)
lr_tree.X_emb_.toarray().shape  # one-hot leaf embedding: (n_samples, total number of leaves)
train_X.shape
lr_tree.X_emb_
lr_tree_preds = lr_tree.predict(test_X)
tree_preds = lr_tree.predict(test_X, with_tree=True)
metrics.roc_auc_score(test_y, lr_tree_preds)  # AUC on hard labels; see the probability-based comparison below
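# Comparing both routes on held-out data (an addition, not in the original
# post): AUC is more informative on predicted probabilities than on hard labels.
lr_tree_probs = lr_tree.predict_proba(test_X)[:, 1]
tree_probs = lr_tree.predict_proba(test_X, with_tree=True)[:, 1]
print("embedding LR AUC:", metrics.roc_auc_score(test_y, lr_tree_probs))
print("raw GBM AUC:", metrics.roc_auc_score(test_y, tree_probs))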
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Skeleton for a custom sklearn-compatible classifier."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # train the model, then return self
        pass

    def predict(self, X):
        pass

    def predict_proba(self, X):
        pass

class CustomRegressor(BaseEstimator, RegressorMixin):
    """Skeleton for a custom sklearn-compatible regressor."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # train the model, then return self
        pass

    def predict(self, X):
        pass
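# A minimal example of filling in the regressor skeleton (an addition, not in
# the original post): a toy model that always predicts the training-set mean.
class MeanRegressor(BaseEstimator, RegressorMixin):
    """Toy regressor: always predicts the mean of the training targets."""

    def fit(self, X, y=None):
        self.mean_ = np.mean(y)
        return self

    def predict(self, X):
        return np.full(np.asarray(X).shape[0], self.mean_)

mr = MeanRegressor().fit(train_X, train_y)
mr.predict(test_X)[:3]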