__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# Seed both generators so results are repeatable. The simulations in this
# notebook draw from NumPy's global RNG (np.random.binomial), which
# random.seed() alone does not affect.
random.seed(42)
np.random.seed(42)


# Read the raw A/B test log into a DataFrame and preview its first five rows.
df = pd.read_csv('ab_data.csv')
df.head(5)


# Total number of rows in the raw dataset.
len(df)

294478


# Count the distinct user_id values in the raw data (nunique returns one integer).
df.user_id.nunique()

290584


# Overall conversion proportion: the mean of the `converted` column.
df['converted'].mean()

0.11965919355605512


# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


# Count rows where group and landing page disagree, i.e. exactly one of
# "group is treatment" and "page is new_page" is true.
df[(df['group'] == 'treatment') != (df['landing_page'] == 'new_page')].shape[0]

3893


# True if any cell in the raw frame is missing; df.info() above suggests False.
df.isnull().any().any()

False


# Build df2 by discarding every inconsistent row: treatment must be paired with
# new_page and control with old_page. A row is dropped when either pairing
# is violated.
df2 = df.drop(
    df[
        ((df['group'] == 'treatment') != (df['landing_page'] == 'new_page'))
        | ((df['group'] == 'control') != (df['landing_page'] == 'old_page'))
    ].index
)


# Sanity check on the filter: count rows of df2 where "group is treatment" and
# "page is new_page" still disagree. This should be 0.
df2[(df2['group'] == 'treatment') != (df2['landing_page'] == 'new_page')].shape[0]

0


# Count the distinct user_id values remaining in df2 (a single integer).
df2.user_id.nunique()

290584


# user_id values that appear more than once in df2, returned as an array.
df2.loc[df2['user_id'].duplicated(), 'user_id'].unique()

array([773192])


# Show every row that shares its user_id with another row
# (keep=False flags all copies, not just the later ones).
df2[df2['user_id'].duplicated(keep=False)]


# Drop repeated user_id rows, keeping each user's first occurrence.
# .copy() makes df2 an independent frame rather than a filtered selection of the
# previous one, so the columns assigned to df2 later in the notebook do not
# trigger pandas' SettingWithCopyWarning.
df2 = df2[~df2.user_id.duplicated(keep='first')].copy()


# Overall conversion proportion in the cleaned data.
df2.converted.mean()

0.11959708724499628


# Conversion proportion among users in the control group.
df2.loc[df2['group'] == 'control', 'converted'].mean()

0.1203863045004612


# Conversion proportion among users in the treatment group.
df2.loc[df2['group'] == 'treatment', 'converted'].mean()

0.11880806551510564


# Share of users who received the new page: new_page rows over all rows.
df2[df2['landing_page'] == 'new_page'].shape[0] / len(df2)

0.5000619442226688


# Under the null hypothesis both pages share one conversion rate, so p_new is
# the pooled conversion proportion over all of df2, regardless of page.
p_new = df2['converted'].mean()
p_new

0.11959708724499628


# Under the null, p_old is the same pooled conversion proportion as p_new.
p_old = df2['converted'].mean()
p_old

0.11959708724499628


# Number of users in df2 who were shown the new page.
n_new = len(df2[df2['landing_page'] == 'new_page'])
n_new

145310


# Number of users in df2 who were shown the old page.
n_old = len(df2[df2['landing_page'] == 'old_page'])
n_old

145274


# Simulate one experiment for the new page under the null: n_new independent
# draws, each a binomial with a single trial (n=1) and success probability
# p_new, so every element is 0 or 1. The mean is the simulated conversion rate.
new_page_converted = np.random.binomial(n=1, p=p_new, size=n_new)
new_page_converted.mean()

0.11871860161035029


# Simulate one experiment for the old page under the null: n_old single-trial
# binomial draws with success probability p_old, then take the mean.
old_page_converted = np.random.binomial(n=1, p=p_old, size=n_old)
old_page_converted.mean()

0.12101959056679103


# Difference between the simulated conversion rates (new page minus old page).
np.mean(new_page_converted) - np.mean(old_page_converted)

-0.002300988956440736


# Build the sampling distribution of (new rate - old rate) under the null from
# 10,000 simulated experiments.
#
# The number of conversions among n independent users who each convert with
# probability p follows Binomial(n, p). Each simulated experiment therefore
# needs one binomial draw per page, instead of n individual 0/1 draws that are
# averaged inside a Python loop. The sampled distribution is the same, but the
# individual random values differ from those of a per-user simulation.
n_simulations = 10000
new_rates = np.random.binomial(n_new, p_new, n_simulations) / n_new
old_rates = np.random.binomial(n_old, p_old, n_simulations) / n_old

# p_diffs is a NumPy array with one simulated rate difference per experiment.
p_diffs = new_rates - old_rates


# Observed effect in the real data: the conversion proportion of the treatment
# group minus the conversion proportion of the control group.
obs_diff = (
    df2.loc[df2['group'] == 'treatment', 'converted'].mean()
    - df2.loc[df2['group'] == 'control', 'converted'].mean()
)
obs_diff

-0.0015782389853555567


# Histogram of the simulated differences, with the observed difference marked
# by a red vertical line.
plt.hist(p_diffs)
plt.axvline(x=obs_diff, color='red');


# Display the observed treatment-minus-control conversion difference again.
obs_diff

-0.0015782389853555567


# Simulated one-sided p-value: the share of differences generated under the
# null that are greater than the observed difference.
p_diff_proportion = np.mean(p_diffs > obs_diff)
p_diff_proportion

0.90669999999999995


# Hypotheses for the z-test below.
#   null:        new-page conversion rate equals old-page conversion rate
#   alternative: new-page conversion rate is greater than old-page rate
import statsmodels.api as sm

# Number of converting users and total users for each landing page.
convert_old = len(df2[(df2['landing_page'] == 'old_page') & (df2['converted'] == 1)])
convert_new = len(df2[(df2['landing_page'] == 'new_page') & (df2['converted'] == 1)])
n_old = len(df2[df2['landing_page'] == 'old_page'])
n_new = len(df2[df2['landing_page'] == 'new_page'])


# Counts are passed in the order [old, new], so alternative='smaller' tests
# whether the old-page proportion is smaller than the new-page proportion,
# which matches the alternative "new page > old page". Without the argument the
# test would be two-sided.
z_score, p_value = sm.stats.proportions_ztest(
    count=[convert_old, convert_new],
    nobs=[n_old, n_new],
    alternative='smaller',
    prop_var=False,
)
z_score, p_value

(1.3109241984234394, 0.90505831275902449)


# z statistic returned by proportions_ztest
z_score

1.3109241984234394


# p-value returned by proportions_ztest
p_value

0.90505831275902449


# In the regression that follows, the control group (old page) is the baseline
# that the intercept represents.

# Categorical fields must be turned into numeric dummy columns before they can
# be used as regressors.

# Add a constant column of 1s so the logistic model can estimate an intercept.
# The control group is the reference level; the treatment indicator is a
# separate dummy column (ab_page).
df2['intercept'] = 1


# Append one indicator column per landing_page value to df2 (joined on index).
# Note that both resulting columns are kept; neither is dropped here.
df2 = df2.join(other=pd.get_dummies(data=df2['landing_page']))


# ab_page is the treatment indicator: 1 for the treatment group, 0 otherwise
# (control is the baseline absorbed by the intercept).
# It is derived from df2 itself rather than from the unfiltered df, and stored
# as int so sm.Logit receives a numeric design matrix. pd.get_dummies returns
# bool columns on pandas >= 2.0, which do not combine with the int intercept.
df2['ab_page'] = (df2['group'] == 'treatment').astype(int)
df2.head()


# Logistic regression of `converted` on the intercept and the ab_page indicator.
lo = sm.Logit(endog=df2['converted'], exog=df2[['intercept', 'ab_page']])


# Fit the model; statsmodels prints the optimizer's termination status.
result = lo.fit()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


# Coefficient table and fit statistics for the fitted ab_page model.
result.summary2()


# Attach each user's country: inner-join countries.csv to df2 on user_id, so
# df_new is indexed by user_id and keeps only users present in both tables.
countries_df = pd.read_csv('./countries.csv')
df_new = countries_df.set_index(keys='user_id').join(
    other=df2.set_index(keys='user_id'),
    how='inner',
)
df_new.head(5)


# List the distinct values that occur in the country column.
df_new.country.unique()

array(['UK', 'US', 'CA'], dtype=object)


# Add one indicator column per country (CA, UK, US).
# Each column is copied by label, so the result does not depend on the order
# in which get_dummies emits its columns. The dummies are cast to int because
# pandas >= 2.0 returns bool columns, which sm.Logit cannot combine with the
# integer intercept column.
country_dummies = pd.get_dummies(df_new['country']).astype(int)
for country_code in ['CA', 'UK', 'US']:
    df_new[country_code] = country_dummies[country_code]
df_new.head()


# Logistic regression of `converted` on country. The US column is left out of
# the regressors, which makes US the baseline the CA and UK terms compare to.
logit_mod = sm.Logit(
    endog=df_new['converted'],
    exog=df_new[['intercept', 'CA', 'UK']],
)
results = logit_mod.fit()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6


# Coefficient table and fit statistics for the fitted country model.
results.summary2()


# df3 was assigned into without ever being created, which raises NameError.
# Start it as an independent copy of df_new, then add one int indicator column
# per country. Columns are copied by label, and cast to int because
# pandas >= 2.0 returns bool dummies, which sm.Logit rejects alongside the
# integer intercept.
df3 = df_new.copy()
country_dummies = pd.get_dummies(df_new['country']).astype(int)
for country_code in ['CA', 'UK', 'US']:
    df3[country_code] = country_dummies[country_code]


# Country model with US and CA as regressors; UK is omitted and so acts as the
# baseline.
logit1 = sm.Logit(
    endog=df3['converted'],
    exog=df3[['intercept', 'US', 'CA']],
).fit()
logit1.summary2()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6


# Country model with UK and CA as regressors; US is omitted and so acts as the
# baseline.
logit2 = sm.Logit(
    endog=df3['converted'],
    exog=df3[['intercept', 'UK', 'CA']],
).fit()
logit2.summary2()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6


# Country model with UK and US as regressors; CA is omitted and so acts as the
# baseline.
logit3 = sm.Logit(
    endog=df3['converted'],
    exog=df3[['intercept', 'UK', 'US']],
).fit()
logit3.summary2()

Optimization terminated successfully.
         Current function value: 0.366116
         Iterations 6

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2020-04-20 22:49	AIC:	212780.3502
No. Observations:	290584	BIC:	212801.5095
Df Model:	1	Log-Likelihood:	-1.0639e+05
Df Residuals:	290582	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-1.9888	0.0081	-246.6690	0.0000	-2.0046	-1.9730
ab_page	-0.0150	0.0114	-1.3109	0.1899	-0.0374	0.0074

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2020-04-20 22:50	AIC:	212780.8333
No. Observations:	290584	BIC:	212812.5723
Df Model:	2	Log-Likelihood:	-1.0639e+05
Df Residuals:	290581	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-1.9967	0.0068	-292.3145	0.0000	-2.0101	-1.9833
CA	-0.0408	0.0269	-1.5178	0.1291	-0.0935	0.0119
UK	0.0099	0.0133	0.7458	0.4558	-0.0161	0.0360

Model:	Logit	No. Iterations:	6.0000
Dependent Variable:	converted	Pseudo R-squared:	0.000
Date:	2020-04-20 22:50	AIC:	212780.8333
No. Observations:	290584	BIC:	212812.5723
Df Model:	2	Log-Likelihood:	-1.0639e+05
Df Residuals:	290581	LL-Null:	-1.0639e+05
Converged:	1.0000	Scale:	1.0000

Table of Contents¶

Introduction¶

Part I - Probability¶

Part II - A/B Test¶

Part III - A regression approach¶

	user_id	timestamp	group	landing_page	converted
0	851104	2017-01-21 22:11:48.556739	control	old_page	0
1	804228	2017-01-12 08:01:45.159739	control	old_page	0
2	661590	2017-01-11 16:55:06.154213	treatment	new_page	0
3	853541	2017-01-08 18:28:03.143765	treatment	new_page	0
4	864975	2017-01-21 01:52:26.210827	control	old_page	1

	user_id	timestamp	group	landing_page	converted
1899	773192	2017-01-09 05:37:58.781806	treatment	new_page	0
2893	773192	2017-01-14 02:55:59.590927	treatment	new_page	0

	country	timestamp	group	landing_page	converted	intercept	new_page	old_page	ab_page
user_id
834778	UK	2017-01-14 23:08:43.304998	control	old_page	0	1	0	1	0
928468	US	2017-01-23 14:44:16.387854	treatment	new_page	0	1	1	0	1
822059	UK	2017-01-16 14:04:14.719771	treatment	new_page	1	1	1	0	1
711597	UK	2017-01-22 03:14:24.763511	control	old_page	0	1	0	1	0
710616	UK	2017-01-16 13:14:44.000513	treatment	new_page	0	1	1	0	1

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-1.9868	0.0114	-174.1736	0.0000	-2.0092	-1.9645
US	-0.0099	0.0133	-0.7458	0.4558	-0.0360	0.0161
CA	-0.0507	0.0284	-1.7863	0.0740	-0.1064	0.0049

	Coef.	Std.Err.	z	P>\|z\|	[0.025	0.975]
intercept	-2.0375	0.0260	-78.3639	0.0000	-2.0885	-1.9866
UK	0.0507	0.0284	1.7863	0.0740	-0.0049	0.1064
US	0.0408	0.0269	1.5178	0.1291	-0.0119	0.0935