from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.base import ClassifierMixin
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

# Notebook-wide presentation setup: seaborn grid style and inline matplotlib
# rendering (``get_ipython`` only exists when running under IPython/Jupyter).
sns.set_style("whitegrid")
get_ipython().run_line_magic('matplotlib', 'inline')

# Import from the public ``IPython.display`` module; importing ``display``
# from ``IPython.core.display`` is deprecated since IPython 7.14.
from IPython.display import display, HTML
# Widen the classic-notebook page container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))
# Synthetic regression data: a quadratic trend in X plus Gaussian noise
# (mean 50, standard deviation 100). No seed is set, so every run differs.
n = 100
grid = np.linspace(0, 30, n)
noise = np.random.normal(loc=50, scale=100, size=n)
y = 150 + 0.3*grid + 1*grid**2 + noise
# Column-vector view of the grid, shape (n, 1).
X = grid.reshape(-1, 1)

plt.figure(figsize=(17, 7))
plt.scatter(X, y, alpha=.7);
class QuantileRegression(BaseEstimator, RegressorMixin):
    """Sklearn wrapper for statsmodels Quantile Regression.

    Parameters
    ----------
    quantile : float, default=0.5
        Quantile to estimate; passed to ``QuantReg.fit`` as ``q``.
    **kwargs
        Extra keyword arguments forwarded to the ``sm.QuantReg`` constructor.

    Attributes
    ----------
    model
        The ``sm.QuantReg`` instance built by ``fit`` (``None`` before).
    fitted
        The results object returned by ``model.fit`` (``None`` before).
    """
    def __init__(self, quantile=0.5, **kwargs):
        self.quantile = quantile
        self.kwargs = kwargs
        self.model = None
        self.fitted = None
        # Whether ``fit`` had to add an intercept column; ``predict`` mirrors it.
        self._constant_added = True

    def fit(self, X, y=None):
        """Fit the quantile regression of ``y`` on ``X`` plus an intercept.

        An intercept column is added unless ``X`` already contains a constant
        column (statsmodels' default ``has_constant='skip'`` behaviour).

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        n_features = 1 if np.ndim(X) < 2 else np.shape(X)[1]
        design = sm.add_constant(X)
        # Remember the decision made on the training data so that prediction
        # builds a design matrix with the same number of columns.
        self._constant_added = np.shape(design)[1] > n_features
        self.model = sm.QuantReg(endog=y, exog=design, **self.kwargs)
        self.fitted = self.model.fit(q=self.quantile)
        return self

    def predict(self, X, y=None):
        """Predict the fitted quantile for the rows of ``X``.

        ``y`` is accepted for signature compatibility and ignored.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This exception subclasses both
            ``ValueError`` and ``AttributeError``.
        """
        if self.fitted is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This QuantileRegression instance is not fitted yet; "
                "call 'fit' before 'predict'."
            )
        # With the default 'skip', a single row (or any batch whose columns
        # happen to be constant) would get no intercept column and no longer
        # match the fitted coefficients; force the column when fit added one.
        mode = "add" if self._constant_added else "skip"
        X = sm.add_constant(X, has_constant=mode)
        return self.fitted.predict(X)
# Regenerate the synthetic data: quadratic trend in X plus Gaussian noise.
n = 100
X = np.linspace(0, 30, n) 
y = 150 + 0.3*X + 1*X**2 + np.random.normal(loc=50, scale=100, size=n)
X = X[:, np.newaxis]

plt.figure(figsize=(17, 7))
plt.scatter(X, y, alpha=.7);

# instantiate models: one estimator per quantile of interest
qr_05 = QuantileRegression(.05)
qr_50 = QuantileRegression(quantile=.5)
qr_95 = QuantileRegression(quantile=.95)

# structure data for model: linear and squared terms as two feature columns
model_input = np.c_[X, X**2]

# fit models
qr_05.fit(model_input, y)
qr_50.fit(model_input, y)
qr_95.fit(model_input, y)
# Show the statsmodels summary of the median model. The original line used
# the undefined name `qr`, which raises NameError.
qr_50.fitted.summary()
QuantReg Regression Results
Dep. Variable: y Pseudo R-squared: 0.6992
Model: QuantReg Bandwidth: 104.1
Method: Least Squares Sparsity: 268.4
Date: Sat, 10 Apr 2021 No. Observations: 100
Time: 17:29:23 Df Residuals: 97
Df Model: 2
coef std err t P>|t| [0.025 0.975]
const 185.1674 39.463 4.692 0.000 106.844 263.491
x1 8.3116 6.080 1.367 0.175 -3.755 20.378
x2 0.7510 0.196 3.830 0.000 0.362 1.140


The condition number is large, 1.2e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Base font size for this figure; later plotting cells reuse `fs`.
fs = 30

plt.figure(figsize=(25, 13))
plt.scatter(X, y)

# In-sample predictions from each fitted quantile model.
preds_05 = qr_05.predict(model_input)
preds_50 = qr_50.predict(model_input)
preds_95 = qr_95.predict(model_input)

# One line per quantile, drawn in ascending order so the legend keeps that order.
for curve, colour, tag in (
    (preds_05, "orange", "0.05 Quantile"),
    (preds_50, "red", "0.5 Quantile"),
    (preds_95, "green", "0.95 Quantile"),
):
    plt.plot(X, curve, color=colour, label=tag)

# Shade the band between the 0.05 and 0.95 quantile curves.
plt.fill_between(
    X.ravel(),
    preds_05.ravel(),
    preds_95.ravel(),
    color="blue",
    alpha=.3,
)

plt.title("Quantile Regression Predictions", fontsize=fs)
plt.legend(fontsize=fs/2)
plt.ylabel("y", fontsize=fs)
plt.xlabel("X", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xticks(fontsize=fs);
# Observed view counts, one entry per day, indexed from day 0.
medium_post_views = np.array([3500, 1600, 482, 245, 198, 116])
days_since_launch = np.arange(len(medium_post_views))

plt.figure(figsize=(25, 13))
# Line through the observations (x defaults to 0..len-1) with markers on top.
plt.plot(medium_post_views)
plt.scatter(days_since_launch, medium_post_views)

plt.title("Medium views decay", fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xticks(fontsize=fs);
from scipy.optimize import curve_fit
from sklearn.base import BaseEstimator, RegressorMixin
import statsmodels.api as sm

class ExponentialDecayRegressor(BaseEstimator, RegressorMixin):
    """Fits an exponential decay curve ``a * exp(-k * X) + b``.

    Parameters
    ----------
    starting_values : sequence of 3 floats or None, default=(1., 1.e-5, 1.)
        Initial guess for ``(a, k, b)``, passed to ``curve_fit`` as ``p0``.
        ``None`` is passed through unchanged so scipy picks its own default.
    **kwargs
        Extra keyword arguments forwarded to ``scipy.optimize.curve_fit``
        (for example ``maxfev`` or ``bounds``). Do not pass ``p0`` here; use
        ``starting_values`` instead.

    Attributes
    ----------
    params
        Optimal ``(a, k, b)`` found by ``fit`` (``None`` before fitting).
    """

    # The default is an immutable tuple rather than a list so the same object
    # cannot be mutated and shared between instances.
    def __init__(self, starting_values=(1., 1.e-5, 1.), **kwargs,):
        self.starting_values = starting_values
        self.kwargs = kwargs
        self.params = None

    def fit(self, X, y=None):
        """Estimate ``(a, k, b)`` by non-linear least squares.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        # The parameter covariance returned by curve_fit is not used.
        self.params, _ = curve_fit(
            self.exp_decay_f, X, y, p0=self.starting_values, **self.kwargs
        )
        return self

    def predict(self, X, y=None):
        """Evaluate the fitted decay curve at ``X``; ``y`` is ignored."""
        return self.exp_decay_f(X, *self.params)

    @staticmethod
    def exp_decay_f(X, a, k, b):
        """Return ``a * exp(-k * X) + b``."""
        return a * np.exp(-k*X) + b
# Observed view counts, one entry per day, indexed from day 0.
medium_post_views = np.array([3500, 1600, 482, 245, 198, 116])
days_since_launch = np.arange(medium_post_views.shape[0])
xd = ExponentialDecayRegressor()
xd.fit(days_since_launch, medium_post_views)

# Predict over the observed days plus a few future ones (days 0..11). The
# original line passed the undefined name `days_since_launch_pred`, which
# raises NameError.
days_since_launch_plus_future = np.arange(12)
xd_preds = xd.predict(days_since_launch_plus_future)
plt.figure(figsize=(25, 13))

plt.plot(days_since_launch_plus_future, xd_preds, label="exponential decay fit")
plt.plot(days_since_launch, medium_post_views, label="actual")
plt.legend(fontsize=fs/2)
plt.title("Medium views decay", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.xticks(fontsize=fs);
# Divide the daily view counts by their maximum so the largest value becomes 1.
medium_post_views/medium_post_views.max()
array([1.        , 0.45714286, 0.13771429, 0.07      , 0.05657143,
       0.03314286])
# Refit on the view counts divided by their maximum. starting_values=None is
# handed to curve_fit as p0=None, leaving the initial guess to scipy.
scaled_views = medium_post_views/medium_post_views.max()
xd = ExponentialDecayRegressor(starting_values=None)
xd.fit(days_since_launch, scaled_views)

# Predict over days 0..11. The original line passed the undefined name
# `days_since_launch_pred`, which raises NameError.
days_since_launch_plus_future = np.arange(12)
xd_preds = xd.predict(days_since_launch_plus_future)
plt.figure(figsize=(25, 13))

plt.plot(days_since_launch_plus_future, xd_preds, label="exponential decay fit")
plt.plot(days_since_launch, scaled_views, label="actual")
plt.legend(fontsize=fs/2)
plt.title("Medium views decay", fontsize=fs)
plt.ylabel("Views", fontsize=fs)
plt.yticks(fontsize=fs)
plt.xlabel("Days since launch", fontsize=fs)
plt.xticks(fontsize=fs);
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_diabetes
# Load the breast-cancer features/labels and build a stand-alone GBM
# (bound to the name `rf` despite being gradient boosting).
X, y = load_breast_cancer(return_X_y=True)
rf = GradientBoostingClassifier(max_depth=4, n_estimators=30)

# Split row indices rather than the arrays, then index X and y with them.
# No random_state is given, so the split differs between runs.
train_ix, test_ix = train_test_split(np.arange(len(X)))
train_X, test_X = X[train_ix], X[test_ix]
train_y, test_y = y[train_ix], y[test_ix]
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

class TreeEmbeddingLogisticRegression(BaseEstimator, ClassifierMixin):
    """Fits a logistic regression model on tree embeddings.

    A gradient-boosting classifier is fitted first; each sample is then
    described by the leaf it reaches in every tree, those leaf indices are
    one-hot encoded, and an L1-penalised logistic regression is trained on
    the encoded matrix.

    Parameters
    ----------
    **kwargs
        Keyword arguments forwarded to ``GradientBoostingClassifier``.

    Attributes
    ----------
    gbm, lr, bin
        The gradient-boosting model, the logistic regression and the
        one-hot encoder.
    X_emb
        One-hot leaf embedding of the training data, set by ``fit``.
    """
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.gbm = GradientBoostingClassifier(**kwargs)
        self.lr = LogisticRegression(penalty="l1", solver="liblinear")
        self.bin = OneHotEncoder()

    def _leaf_indices(self, X):
        """Return the leaf index reached in each tree, one row per sample."""
        # ``apply`` returns one index per sample and tree; flatten the
        # trailing axes into a 2-D (n_samples, -1) table for the encoder.
        return self.gbm.apply(X).reshape(X.shape[0], -1)

    def fit(self, X, y=None):
        """Fit the GBM, the one-hot encoder and the logistic regression.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.
        """
        self.gbm.fit(X, y)
        # Keep the training embedding on the instance so it can be inspected
        # after fitting (e.g. ``est.X_emb.toarray().shape``).
        self.X_emb = self.bin.fit_transform(self._leaf_indices(X))
        self.lr.fit(self.X_emb, y)
        return self

    def predict(self, X, y=None, with_tree=False):
        """Predict class labels.

        With ``with_tree=True`` the labels come straight from the GBM;
        otherwise from the logistic regression on the leaf embedding.
        ``y`` is ignored.
        """
        if with_tree:
            return self.gbm.predict(X)
        X_emb = self.bin.transform(self._leaf_indices(X))
        return self.lr.predict(X_emb)

    def predict_proba(self, X, y=None, with_tree=False):
        """Predict class probabilities.

        With ``with_tree=True`` the probabilities come straight from the
        GBM; otherwise from the logistic regression on the leaf embedding.
        ``y`` is ignored.
        """
        if with_tree:
            return self.gbm.predict_proba(X)
        X_emb = self.bin.transform(self._leaf_indices(X))
        return self.lr.predict_proba(X_emb)
# Fit the tree-embedding model on the training split; the keyword arguments
# are forwarded to the underlying GradientBoostingClassifier.
lr_tree = TreeEmbeddingLogisticRegression(n_estimators=30, max_depth=4)
lr_tree.fit(train_X, train_y)
# Dense shape of the one-hot leaf embedding read from the estimator's `X_emb`.
lr_tree.X_emb.toarray().shape
(426, 440)
# Shape of the raw training features, for comparison with the embedding shape.
train_X.shape
(426, 30)
# Display the embedding object itself (its repr reports format and sparsity).
lr_tree.X_emb
<426x440 sparse matrix of type '<class 'numpy.float64'>'
	with 12780 stored elements in Compressed Sparse Row format>
# Hard class labels from the logistic-regression head and from the raw GBM.
lr_tree_preds = lr_tree.predict(test_X)
tree_preds = lr_tree.predict(test_X, with_tree=True)

# ROC AUC measures how well a continuous score ranks positives above
# negatives, so score it with the positive-class probability rather than the
# thresholded 0/1 labels (which reduce the curve to a single operating point).
# NOTE: the value recorded below this cell was computed from hard labels.
lr_tree_scores = lr_tree.predict_proba(test_X)[:, 1]
metrics.roc_auc_score(test_y, lr_tree_scores)
0.9444799658994033
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin

class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Empty scikit-learn classifier skeleton; every method is a no-op."""

    def __init__(self):
        """Take no hyper-parameters and store nothing."""

    def fit(self, X, y=None):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None

    def predict(self, X, y=None, with_tree=False):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None

    def predict_proba(self, X, y=None, with_tree=False):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None

class CustomRegressor(BaseEstimator, RegressorMixin):
    """Empty scikit-learn regressor skeleton; every method is a no-op."""

    def __init__(self):
        """Take no hyper-parameters and store nothing."""

    def fit(self, X, y=None):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None

    def predict(self, X, y=None, with_tree=False):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None

    def predict_proba(self, X, y=None, with_tree=False):
        """Placeholder: ignores its arguments and returns ``None``."""
        return None