import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pymc3 as pm
sns.set_style("whitegrid")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

df = pd.read_csv("/Users/conormcdonald/Desktop/medium_post_data.csv", skiprows=1).iloc[:, 1:-4]
df = df[~df.internal_views.isnull()]

h = df.member_reading_time.astype(str).str.split(".", expand=True)
h.loc[:, 0] = (h.loc[:, 0].astype(float) * 60)/60
h.loc[:, 1] = ("." + h.loc[:, 1]).astype(float)
h.loc[:, 1] = (h.loc[:, 1] * 100)/60
df["member_reading_time_adj"] = h.sum(1)

plt.figure(figsize=(14, 8))
plt.scatter(np.log(df.member_reading_time_adj), np.log(df.earnings), alpha=.5)

m = sm.OLS(endog=df.earnings, exog=sm.add_constant(df[["member_reading_time_adj"]])) 
fitted = m.fit()
fitted.summary()

plt.figure(figsize=(14, 8))
plt.scatter(df.member_reading_time_adj, df.earnings, alpha=.4, color="red")

pd.to_datetime("2021-04-20")-pd.to_datetime("2017-11-27")

plt.figure(figsize=(14, 7))
np.random.seed(123)
x = np.clip(sm.add_constant(np.random.poisson(lam=4.61, size=1240)), 0, np.Inf)
plt.plot(x[:, 1], c="green", alpha=.5)
plt.ylabel("Member reading time", fontsize=16)
plt.xlabel("Days since publishing", fontsize=16)

x[:, 1].sum()/60

fitted.predict(x).sum()

df["yhat"] = fitted.predict(sm.add_constant(df.member_reading_time_adj))

df.assign(error = lambda x: x.earnings-x.yhat).error.mean()

## Bayesian Model 

hours_model = pm.Model()
with hours_model:
    
    a = pm.Normal("a", 0, 1)
    b = pm.Normal("b", 0, 1)
    error = pm.HalfCauchy("error", 3)
    
    yhat = a + b * df.member_reading_time_adj

    lik = pm.Normal("lik", yhat, error, observed=df.earnings)

with hours_model:
    trace = pm.sample(2000, chains=4)

plt.figure(figsize=(14, 7))
pm.plot_dist(trace["b"])
plt.xlabel("Model estimate of earnings $", fontsize=16)
plt.ylabel("Probability density", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

pm.traceplot(trace);

pm.summary(trace)

preds = pm.sample_posterior_predictive(trace, model=hours_model, samples=2000)

plt.figure(figsize=(14, 8))
plt.scatter(df.member_reading_time_adj, df.earnings, alpha=.4, color="blue")
plt.plot(df.member_reading_time_adj, pd.DataFrame(preds["lik"]).mean(), alpha=.8, color="red", linewidth=3)
plt.xlabel("Member reading time (hrs)", fontsize=16)
plt.ylabel("Earnings $", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

pm.trace_to_dataframe(trace)

views_model = pm.Model()
with views_model:
    
    a = pm.Normal("a", 0, 1)
    b = pm.Normal("b", 0, 1)
    error = pm.HalfCauchy("error", 3)
    
    yhat = a + b * df.internal_views

    lik = pm.Normal("lik", yhat, error, observed=df.earnings)

with views_model:
    trace = pm.sample(2000, chains=4)

plt.figure(figsize=(14, 7))
pm.plot_dist(trace["b"])
plt.xlabel("Model estimate of earnings $", fontsize=16)
plt.ylabel("Probability density", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14);

pm.traceplot(trace);

pm.summary(trace)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [error, b, a]
100.00% [12000/12000 00:04<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 5 seconds.
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/arviz/data/io_pymc3.py:89: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  FutureWarning,
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/arviz/data/io_pymc3.py:89: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  FutureWarning,
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/pymc3/sampling.py:1618: UserWarning: samples parameter is smaller than nchains times ndraws, some draws and/or chains may not be represented in the returned posterior predictive sample
  "samples parameter is smaller than nchains times ndraws, some draws "
100.00% [2000/2000 00:02<00:00]
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [error, b, a]
100.00% [12000/12000 00:07<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 7 seconds.
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/arviz/data/io_pymc3.py:89: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  FutureWarning,
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/arviz/data/io_pymc3.py:89: FutureWarning: Using `from_pymc3` without the model will be deprecated in a future release. Not using the model will return less accurate and less useful results. Make sure you use the model argument or call from_pymc3 within a model context.
  FutureWarning,
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_mean ess_sd ess_bulk ess_tail r_hat
a 0.389 0.206 0.016 0.791 0.003 0.002 5749.0 5298.0 5773.0 5585.0 1.0
b 0.020 0.001 0.017 0.022 0.000 0.000 6077.0 6077.0 6092.0 5587.0 1.0
error 1.323 0.137 1.075 1.573 0.002 0.001 5469.0 5469.0 5363.0 4415.0 1.0