__author__ = "Donald Ghazi"
__email__ = "donald@donaldghazi.com"
__website__ = "donaldghazi.com"


# import packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


# import PISA 2012 data
pisa_2012 = pd.read_csv("/Users/donaldghazi/Desktop/pisa2012.csv",encoding='latin1',low_memory=False)


# import PISA 2012 Dictionary data 
pisa_dict_2012 = pd.read_csv("/Users/donaldghazi/Desktop/pisadict2012.csv",encoding='latin1',low_memory=False)


pisa_2012.sample(20)


# inspect df
pisa_2012.shape[0]

485490


pisa_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Columns: 636 entries, Unnamed: 0 to VER_STU
dtypes: float64(250), int64(18), object(368)
memory usage: 2.3+ GB


# now inspect the column descriptions
pisa_dict_2012


pisa_dict_2012.head(10)


# CNT = Country Code
# NC  = National Centre Code 
# use groupby based on 'NC' then within each 'NC', we group based on 'CNT' 
# then count and sort values in decreaing amount 
pisa_2012.groupby('NC')['CNT'].count().sort_values(ascending=False)

NC
Mexico                       33806
Italy                        31073
Spain                        25313
Canada                       21544
Brazil                       19204
                             ...  
New Zealand                   4291
Iceland                       3508
United Kingdom (Scotland)     2945
Perm (Russian Federation)     1761
Liechtenstein                  293
Name: CNT, Length: 66, dtype: int64


# make a copy of the original df 
pisa_2012_clean = pisa_2012.copy()


# keep the columns that we will need for our analysis 
pisa_2012_clean = pisa_2012_clean[['CNT','ST04Q01','ST26Q12','AGE','ST26Q07','ST25Q01','TCHBEHFA','TCHBEHSO','TCHBEHTD','PV1MATH',
                                   'PV2MATH', 'PV3MATH', 'PV4MATH', 'PV5MATH', 'PV1READ', 'PV2READ', 'PV3READ', 
                                   'PV4READ', 'PV5READ','PV1SCIE', 'PV2SCIE', 'PV3SCIE', 'PV4SCIE', 'PV5SCIE']]


# inspect 
pisa_2012_clean


# doublecheck 
pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 24 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   CNT       485490 non-null  object 
 1   ST04Q01   485490 non-null  object 
 2   ST26Q12   474039 non-null  object 
 3   AGE       485374 non-null  float64
 4   ST26Q07   465860 non-null  object 
 5   ST25Q01   465496 non-null  object 
 6   TCHBEHFA  314678 non-null  float64
 7   TCHBEHSO  315114 non-null  float64
 8   TCHBEHTD  315519 non-null  float64
 9   PV1MATH   485490 non-null  float64
 10  PV2MATH   485490 non-null  float64
 11  PV3MATH   485490 non-null  float64
 12  PV4MATH   485490 non-null  float64
 13  PV5MATH   485490 non-null  float64
 14  PV1READ   485490 non-null  float64
 15  PV2READ   485490 non-null  float64
 16  PV3READ   485490 non-null  float64
 17  PV4READ   485490 non-null  float64
 18  PV5READ   485490 non-null  float64
 19  PV1SCIE   485490 non-null  float64
 20  PV2SCIE   485490 non-null  float64
 21  PV3SCIE   485490 non-null  float64
 22  PV4SCIE   485490 non-null  float64
 23  PV5SCIE   485490 non-null  float64
dtypes: float64(19), object(5)
memory usage: 88.9+ MB


#https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.isfinite.html
pisa_2012_clean.loc[np.isfinite(pisa_2012_clean['AGE']) == False, 'AGE'] = pisa_2012_clean['AGE'].mean()
pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 24 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   CNT       485490 non-null  object 
 1   ST04Q01   485490 non-null  object 
 2   ST26Q12   474039 non-null  object 
 3   AGE       485490 non-null  float64
 4   ST26Q07   465860 non-null  object 
 5   ST25Q01   465496 non-null  object 
 6   TCHBEHFA  314678 non-null  float64
 7   TCHBEHSO  315114 non-null  float64
 8   TCHBEHTD  315519 non-null  float64
 9   PV1MATH   485490 non-null  float64
 10  PV2MATH   485490 non-null  float64
 11  PV3MATH   485490 non-null  float64
 12  PV4MATH   485490 non-null  float64
 13  PV5MATH   485490 non-null  float64
 14  PV1READ   485490 non-null  float64
 15  PV2READ   485490 non-null  float64
 16  PV3READ   485490 non-null  float64
 17  PV4READ   485490 non-null  float64
 18  PV5READ   485490 non-null  float64
 19  PV1SCIE   485490 non-null  float64
 20  PV2SCIE   485490 non-null  float64
 21  PV3SCIE   485490 non-null  float64
 22  PV4SCIE   485490 non-null  float64
 23  PV5SCIE   485490 non-null  float64
dtypes: float64(19), object(5)
memory usage: 88.9+ MB


# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.isfinite.html
# repeat the same process above for each teacher behavior 
pisa_2012_clean.loc[np.isfinite(pisa_2012_clean['TCHBEHFA']) == False, 'TCHBEHFA'] = pisa_2012_clean['TCHBEHFA'].mean()
pisa_2012_clean.loc[np.isfinite(pisa_2012_clean['TCHBEHSO']) == False, 'TCHBEHSO'] = pisa_2012_clean['TCHBEHSO'].mean()
pisa_2012_clean.loc[np.isfinite(pisa_2012_clean['TCHBEHTD']) == False, 'TCHBEHTD'] = pisa_2012_clean['TCHBEHTD'].mean()
pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 24 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   CNT       485490 non-null  object 
 1   ST04Q01   485490 non-null  object 
 2   ST26Q12   474039 non-null  object 
 3   AGE       485490 non-null  float64
 4   ST26Q07   465860 non-null  object 
 5   ST25Q01   465496 non-null  object 
 6   TCHBEHFA  485490 non-null  float64
 7   TCHBEHSO  485490 non-null  float64
 8   TCHBEHTD  485490 non-null  float64
 9   PV1MATH   485490 non-null  float64
 10  PV2MATH   485490 non-null  float64
 11  PV3MATH   485490 non-null  float64
 12  PV4MATH   485490 non-null  float64
 13  PV5MATH   485490 non-null  float64
 14  PV1READ   485490 non-null  float64
 15  PV2READ   485490 non-null  float64
 16  PV3READ   485490 non-null  float64
 17  PV4READ   485490 non-null  float64
 18  PV5READ   485490 non-null  float64
 19  PV1SCIE   485490 non-null  float64
 20  PV2SCIE   485490 non-null  float64
 21  PV3SCIE   485490 non-null  float64
 22  PV4SCIE   485490 non-null  float64
 23  PV5SCIE   485490 non-null  float64
dtypes: float64(19), object(5)
memory usage: 88.9+ MB


# we have 5 plausible values per subject, so add them in their own respective subject and divide by 5 
# store each results in its own respective subject column 
pisa_2012_clean['Math Score'] = (pisa_2012_clean['PV1MATH'] + pisa_2012_clean['PV2MATH'] + pisa_2012_clean['PV3MATH']+ pisa_2012_clean['PV4MATH'] + pisa_2012_clean['PV5MATH']) / 5
pisa_2012_clean['Reading Score'] = (pisa_2012_clean['PV1READ'] + pisa_2012_clean['PV2READ'] + pisa_2012_clean['PV3READ']+ pisa_2012_clean['PV4READ'] + pisa_2012_clean['PV5READ']) / 5
pisa_2012_clean['Science Score'] = (pisa_2012_clean['PV1SCIE'] + pisa_2012_clean['PV2SCIE'] + pisa_2012_clean['PV3SCIE']+ pisa_2012_clean['PV4SCIE'] + pisa_2012_clean['PV5SCIE']) / 5


pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 27 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   CNT            485490 non-null  object 
 1   ST04Q01        485490 non-null  object 
 2   ST26Q12        474039 non-null  object 
 3   AGE            485490 non-null  float64
 4   ST26Q07        465860 non-null  object 
 5   ST25Q01        465496 non-null  object 
 6   TCHBEHFA       485490 non-null  float64
 7   TCHBEHSO       485490 non-null  float64
 8   TCHBEHTD       485490 non-null  float64
 9   PV1MATH        485490 non-null  float64
 10  PV2MATH        485490 non-null  float64
 11  PV3MATH        485490 non-null  float64
 12  PV4MATH        485490 non-null  float64
 13  PV5MATH        485490 non-null  float64
 14  PV1READ        485490 non-null  float64
 15  PV2READ        485490 non-null  float64
 16  PV3READ        485490 non-null  float64
 17  PV4READ        485490 non-null  float64
 18  PV5READ        485490 non-null  float64
 19  PV1SCIE        485490 non-null  float64
 20  PV2SCIE        485490 non-null  float64
 21  PV3SCIE        485490 non-null  float64
 22  PV4SCIE        485490 non-null  float64
 23  PV5SCIE        485490 non-null  float64
 24  Math Score     485490 non-null  float64
 25  Reading Score  485490 non-null  float64
 26  Science Score  485490 non-null  float64
dtypes: float64(22), object(5)
memory usage: 100.0+ MB


# now we can drop the columns
pisa_2012_clean.drop(pisa_2012_clean.iloc[:, 9:24], inplace = True, axis = 1) 
pisa_2012_clean


# double check
pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   CNT            485490 non-null  object 
 1   ST04Q01        485490 non-null  object 
 2   ST26Q12        474039 non-null  object 
 3   AGE            485490 non-null  float64
 4   ST26Q07        465860 non-null  object 
 5   ST25Q01        465496 non-null  object 
 6   TCHBEHFA       485490 non-null  float64
 7   TCHBEHSO       485490 non-null  float64
 8   TCHBEHTD       485490 non-null  float64
 9   Math Score     485490 non-null  float64
 10  Reading Score  485490 non-null  float64
 11  Science Score  485490 non-null  float64
dtypes: float64(7), object(5)
memory usage: 44.4+ MB


# replace all NaN values for Dictionary as NA
pisa_2012_clean.loc[pisa_2012_clean['ST26Q12'].isna() == True,'ST26Q12'] = 'NA'


# replace all NaN values for Literature as NA
pisa_2012_clean.loc[pisa_2012_clean['ST26Q07'].isna() == True,'ST26Q07'] = 'NA'


# replace all Nan values for International Language at Home as NA
pisa_2012_clean.loc[pisa_2012_clean['ST25Q01'].isna() == True,'ST25Q01'] = 'NA'


# https://www.oecd.org/pisa/pisaproducts/PISA%202012%20Technical%20Report_Chapter%2016.pdf
# rename the column names 
pisa_2012_clean.rename({'CNT':'Country',
                        'AGE':'Age',
                        'ST04Q01':'Gender',
                        'ST26Q12': 'Dictionary',
                        'ST26Q07': 'Literature',
                        'ST25Q01': 'Test Language', # IT SHOWS IF THE STUDENT TOOK THE TEST IN THEIR NATIVE TOUNGE 
                        'TCHBEHFA':'Formative Assessment',
                        'TCHBEHSO' :'Student Orientation',
                        'TCHBEHTD' : 'Teacher-Directed Instruction'}, axis = 'columns', inplace = True)


# check 
pisa_2012_clean.sample(10)


pisa_2012_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485490 entries, 0 to 485489
Data columns (total 12 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Country                       485490 non-null  object 
 1   Gender                        485490 non-null  object 
 2   Dictionary                    485490 non-null  object 
 3   Age                           485490 non-null  float64
 4   Literature                    485490 non-null  object 
 5   Test Language                 485490 non-null  object 
 6   Formative Assessment          485490 non-null  float64
 7   Student Orientation           485490 non-null  float64
 8   Teacher-Directed Instruction  485490 non-null  float64
 9   Math Score                    485490 non-null  float64
 10  Reading Score                 485490 non-null  float64
 11  Science Score                 485490 non-null  float64
dtypes: float64(7), object(5)
memory usage: 44.4+ MB


# historgram gives the density of distributions from point to point in general terms.
# we want to see the distribution of scores for each of the subject 
# we need 3 subplots as there's three subjects (Math, Reading, and Science)

features = ['Math Score','Reading Score','Science Score']
pisa_2012_clean[features].hist(figsize=(13, 10));


features2 = ['Formative Assessment','Student Orientation','Teacher-Directed Instruction']
pisa_2012_clean[features2].hist(figsize=(13, 10));


#https://stackoverflow.com/questions/43549901/visualize-data-from-one-column
labels = []
for i, dfi in enumerate(pisa_2012_clean.groupby(["Test Language"])):
    labels.append(dfi[0])
    plt.bar(i, dfi[1].count(), label=dfi[0])
plt.xticks(range(len(labels)), labels)
plt.legend()
plt.show()


#https://stackoverflow.com/questions/43549901/visualize-data-from-one-column
labels = []
for i, dfi in enumerate(pisa_2012_clean.groupby(["Gender"])):
    labels.append(dfi[0])
    plt.bar(i, dfi[1].count(), label=dfi[0])
plt.xticks(range(len(labels)), labels)
plt.legend()
plt.show()


import seaborn as sns
sns.boxplot(x = pisa_2012_clean['Reading Score'], y = pisa_2012_clean['Gender'] );


sns.boxplot(x = pisa_2012_clean['Reading Score'], y = pisa_2012_clean['Dictionary'] );


#https://stackoverflow.com/questions/47809646/how-to-make-a-histogram-for-non-numeric-variables-in-python?rq=1

plt.style.use('ggplot')

pisa_2012_clean.groupby(['Dictionary', 'Literature'])\
      .Literature.count().unstack().plot.bar(legend=True)

plt.show()


import seaborn as sns
sns.boxplot(x = pisa_2012_clean['Math Score'], y = pisa_2012_clean['Gender'] );


import seaborn as sns
sns.boxplot(x = pisa_2012_clean['Science Score'], y = pisa_2012_clean['Gender'] );


import seaborn as sns
sns.boxplot(x = pisa_2012_clean['Reading Score'], y = pisa_2012_clean['Test Language'] );


#https://datatofish.com/correlation-matrix-pandas/

df_1 = pd.DataFrame(pisa_2012_clean,columns=['Formative Assessment','Student Orientation','Teacher-Directed Instruction'])
corrMatrix = df_1.corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()


df_2 = pd.DataFrame(pisa_2012_clean,columns=['Math Score','Reading Score','Science Score'])
corrMatrix = df_2.corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()


df_3 = pd.DataFrame(pisa_2012_clean,columns=['Age','Science Score','Formative Assessment'])
corrMatrix = df_3.corr()
sns.heatmap(corrMatrix, annot=True)
plt.show()

	Unnamed: 0	CNT	SUBNATIO	STRATUM	OECD	NC	SCHOOLID	STIDSTD	ST01Q01	ST02Q01	...	W_FSTR75	W_FSTR76	W_FSTR77	W_FSTR78	W_FSTR79	W_FSTR80	WVARSTRR	VAR_UNIT	SENWGT_STU	VER_STU
199688	199689	United Kingdom	8260000	GBR1105	OECD	United Kingdom (excl.Scotland)	424	10572	11	1.0	...	100.6136	168.5348	89.4344	168.5348	80.4909	187.2609	30	3	0.2010	22NOV13
177231	177232	Finland	2460000	FIN0003	OECD	Finland	56	1557	9	1.0	...	13.6850	4.5651	4.2998	13.7251	12.8907	13.6850	62	1	0.1475	22NOV13
169839	169840	Spain	7240900	ESP0918	OECD	Spain	866	24257	10	1.0	...	41.6396	41.6396	41.6396	13.8799	13.8799	41.6396	47	2	0.0743	22NOV13
464250	464251	Tunisia	7880000	TUN0013	Non-OECD	Tunisia	115	3268	10	2.0	...	36.1375	12.0458	12.0458	12.0458	12.0458	36.1375	54	2	0.1995	22NOV13
424540	424541	Singapore	7020000	SGP0201	Non-OECD	Singapore	54	1765	10	1.0	...	12.1988	3.7651	11.7470	11.2952	11.7470	11.7470	39	2	0.1533	22NOV13
250436	250437	Italy	3800000	ITA1902	OECD	Italy	380	9847	10	2.0	...	29.6272	29.6272	29.6272	29.6272	9.8757	9.8757	49	2	0.0379	22NOV13
401561	401562	China-Shanghai	1560000	QCN0002	Non-OECD	China (Shanghai)	40	1345	10	2.0	...	23.1880	23.1880	23.1880	11.0744	11.0744	23.1880	38	3	0.2016	22NOV13
26394	26395	Australia	360000	AUS0205	OECD	Australia	229	4244	9	1.0	...	2.0723	2.6125	6.8208	6.1229	2.5062	2.4722	71	2	0.0180	22NOV13
254233	254234	Italy	3800000	ITA1501	OECD	Italy	518	13644	10	1.0	...	4.4363	13.3090	13.3090	4.4363	4.4363	13.3090	55	2	0.0170	22NOV13
246809	246810	Italy	3800000	ITA1902	OECD	Italy	240	6220	9	2.0	...	11.5863	11.5863	11.5863	11.2369	32.4395	33.5556	49	1	0.0430	22NOV13
382385	382386	Poland	6160000	POL0001	OECD	Poland	140	3464	9	1.0	...	61.4053	168.1104	57.4735	58.7977	58.8062	60.0252	1	2	0.3063	22NOV13
318838	318839	Mexico	4840000	MEX0410	OECD	Mexico	134	3136	10	5.0	...	12.1540	12.1786	12.1540	4.3495	12.1540	4.3495	22	1	0.0063	22NOV13
13865	13866	United Arab Emirates	7840000	ARE0769	Non-OECD	United Arab Emirates	362	9123	10	2.0	...	4.0771	3.9669	1.3228	1.3598	1.3228	1.3224	66	2	0.0657	22NOV13
14884	14885	United Arab Emirates	7840100	ARE0101	Non-OECD	United Arab Emirates	402	10142	8	1.0	...	2.5341	2.5341	2.5341	7.6022	7.6022	2.5341	7	2	0.1248	22NOV13
14395	14396	United Arab Emirates	7840100	ARE0108	Non-OECD	United Arab Emirates	383	9653	10	1.0	...	7.4775	2.4500	7.4775	7.5442	7.5867	7.5867	48	2	0.1219	22NOV13
393135	393136	Qatar	6340000	QAT0004	Non-OECD	Qatar	47	3885	10	4.0	...	0.5009	1.5027	0.5009	0.5009	0.5009	0.5009	79	2	0.0910	22NOV13
47940	47941	Belgium	560100	BEL0111	OECD	Belgium	220	6554	9	6.0	...	9.5093	24.0369	24.7308	24.7308	26.5182	10.0582	49	2	0.1492	22NOV13
351214	351215	Montenegro	4990000	MNE0006	Non-OECD	Montenegro	19	1706	10	2.0	...	0.6061	1.8182	0.6061	0.6061	0.5855	0.5855	57	2	0.1544	22NOV13
484805	484806	Vietnam	7040000	VNM0208	Non-OECD	Viet Nam	141	4275	10	2.0	...	270.2383	90.0794	90.0794	90.0794	90.0794	270.2383	60	1	0.1883	22NOV13
162993	162994	Spain	7240700	ESP0713	OECD	Spain	616	17411	10	1.0	...	5.6311	17.5992	5.6311	16.7849	16.7849	16.7849	8	1	0.0309	22NOV13

	Unnamed: 0	x
0	CNT	Country code 3-character
1	SUBNATIO	Adjudicated sub-region code 7-digit code (3-di...
2	STRATUM	Stratum ID 7-character (cnt + region ID + orig...
3	OECD	OECD country
4	NC	National Centre 6-digit Code
...	...	...
630	W_FSTR80	FINAL STUDENT REPLICATE BRR-FAY WEIGHT80
631	WVARSTRR	RANDOMIZED FINAL VARIANCE STRATUM (1-80)
632	VAR_UNIT	RANDOMLY ASSIGNED VARIANCE UNIT
633	SENWGT_STU	Senate weight - sum of weight within the count...
634	VER_STU	Date of the database creation

	Unnamed: 0	x
0	CNT	Country code 3-character
1	SUBNATIO	Adjudicated sub-region code 7-digit code (3-di...
2	STRATUM	Stratum ID 7-character (cnt + region ID + orig...
3	OECD	OECD country
4	NC	National Centre 6-digit Code
5	SCHOOLID	School ID 7-digit (region ID + stratum ID + 3-...
6	STIDSTD	Student ID
7	ST01Q01	International Grade
8	ST02Q01	National Study Programme
9	ST03Q01	Birth - Month

	CNT	ST04Q01	ST26Q12	AGE	ST26Q07	ST25Q01	TCHBEHFA	TCHBEHSO	TCHBEHTD	PV1MATH	...	PV1READ	PV2READ	PV3READ	PV4READ	PV5READ	PV1SCIE	PV2SCIE	PV3SCIE	PV4SCIE	PV5SCIE
0	Albania	Female	Yes	16.17	No	Language of the test	1.3625	0.9374	0.4297	406.8469	...	249.5762	254.3420	406.8496	175.7053	218.5981	341.7009	408.8400	348.2283	367.8105	392.9877
1	Albania	Female	Yes	16.17	Yes	Language of the test	NaN	NaN	NaN	486.1427	...	406.2936	349.8975	400.7334	369.7553	396.7618	548.9929	471.5964	471.5964	443.6218	454.8116
2	Albania	Female	Yes	15.58	Yes	Language of the test	NaN	NaN	NaN	533.2684	...	401.2100	404.3872	387.7067	431.3938	401.2100	499.6643	428.7952	492.2044	512.7191	499.6643
3	Albania	Female	Yes	15.67	Yes	Language of the test	0.7644	3.3108	2.3916	412.2215	...	547.3630	481.4353	461.5776	425.0393	471.9036	438.6796	481.5740	448.9370	474.1141	426.5573
4	Albania	Female	Yes	15.50	Yes	Language of the test	0.7644	0.9374	0.4297	381.9209	...	311.7707	141.7883	293.5015	272.8495	260.1405	361.5628	275.7740	372.7527	403.5248	422.1746
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
485485	Vietnam	Female	Yes	15.83	No	Language of the test	NaN	NaN	NaN	477.1849	...	460.2272	476.1134	472.9362	472.1419	481.6736	559.8098	528.1052	519.7128	535.5651	538.3626
485486	Vietnam	Male	Yes	16.17	Yes	Language of the test	-0.2859	-0.1057	0.4297	518.9360	...	490.9325	479.7053	448.4294	565.5134	451.6372	538.7355	493.9761	493.0436	561.1153	535.0056
485487	Vietnam	Male	Yes	15.83	No	Language of the test	-0.9632	-0.1057	-0.5612	475.2376	...	462.6239	514.7503	434.5558	457.8122	511.5425	536.8706	571.3726	488.3812	548.9929	563.9127
485488	Vietnam	Male	Yes	15.83	No	Language of the test	-0.2859	0.2217	0.4297	550.9503	...	505.2873	522.1282	513.3068	528.5437	522.9301	511.0407	532.4879	524.0955	551.1376	514.7706
485489	Vietnam	Female	Yes	15.33	Yes	Language of the test	NaN	NaN	NaN	470.0187	...	532.3506	483.1034	479.9261	459.2741	488.6635	530.6229	473.7411	477.4711	477.4711	505.4457

	CNT	ST04Q01	ST26Q12	AGE	ST26Q07	ST25Q01	TCHBEHFA	TCHBEHSO	TCHBEHTD	Math Score	Reading Score	Science Score
0	Albania	Female	Yes	16.17	No	Language of the test	1.36250	0.937400	0.429700	366.18634	261.01424	371.91348
1	Albania	Female	Yes	16.17	Yes	Language of the test	0.13793	0.209052	0.147423	470.56396	384.68832	478.12382
2	Albania	Female	Yes	15.58	Yes	Language of the test	0.13793	0.209052	0.147423	505.53824	405.18154	486.60946
3	Albania	Female	Yes	15.67	Yes	Language of the test	0.76440	3.310800	2.391600	449.45476	477.46376	453.97240
4	Albania	Female	Yes	15.50	Yes	Language of the test	0.76440	0.937400	0.429700	385.50398	256.01010	367.15778
...	...	...	...	...	...	...	...	...	...	...	...	...
485485	Vietnam	Female	Yes	15.83	No	Language of the test	0.13793	0.209052	0.147423	486.22058	472.61846	536.31110
485486	Vietnam	Male	Yes	16.17	Yes	Language of the test	-0.28590	-0.105700	0.429700	529.21794	487.24356	524.37522
485487	Vietnam	Male	Yes	15.83	No	Language of the test	-0.96320	-0.105700	-0.561200	486.29850	476.25694	541.90600
485488	Vietnam	Male	Yes	15.83	No	Language of the test	-0.28590	0.221700	0.429700	522.90856	518.43922	526.70646
485489	Vietnam	Female	Yes	15.33	Yes	Language of the test	0.13793	0.209052	0.147423	454.43994	488.66354	492.95038

Dataset Description¶

PISA (Programme for International Student Assessment)¶

Gathering Data¶

Assessing Data¶

Brainstorming Questions¶

Cleaning Data¶

I'm looking at the column above and my goal is to decrease the number of columns.¶

Exploratory Data Analysis¶

Method : Univariate Analysis¶

1) How did students perform in each subject?¶

Histogram Visualization Analysis¶

2) What was the distribution for each teacher behavior score?¶

3) About how many students were non-native speakers?¶

4) Which gender was more represented?¶

Method 2: Bivariate Analysis¶

5) Which gender performed better in reading ?¶

6) Did students who possess dictionaries perform better in reading section?¶

7) Do students with dictionaries more likely to possess literature books?¶

8) Which gender performed better in Math ?¶

9) Which gender performed better in Science ?¶

10) How did non native speakers perform compared to non-native speakers, in Reading section ?¶

Method : Multivariate Analysis¶

11) What's the correlation between the three teacher behavior measurements?¶

12) What's the correlation between the three subject scores?¶

13) What's the correlation between Age, Science Score, and Formative Assessment?¶

	Country	Gender	Dictionary	Age	Literature	Test Language	Formative Assessment	Student Orientation	Teacher-Directed Instruction	Math Score	Reading Score	Science Score
204007	Greece	Female	Yes	15.33	Yes	Language of the test	0.13793	0.209052	0.147423	438.54962	540.61140	450.14918
163094	Spain	Female	Yes	15.58	Yes	Other language	-0.28590	0.485500	-0.808300	509.35506	564.12298	549.45916
191421	United Kingdom	Male	Yes	15.42	Yes	Language of the test	0.13793	0.209052	0.147423	423.59402	499.91432	527.54568
225826	Indonesia	Male	Yes	15.92	No	Other language	1.36250	1.154700	-0.079800	262.27606	302.47542	267.66132
102436	Switzerland	Female	Yes	16.08	No	Language of the test	-0.96320	0.221700	-1.673100	598.38758	601.61444	594.31182
467828	Turkey	Female	Yes	16.08	Yes	Language of the test	0.25090	-0.580900	0.167200	535.60524	527.26700	533.60688
428715	Serbia	Male	Yes	15.33	Yes	Language of the test	1.04160	0.485500	1.076800	592.85714	485.31892	547.50092
326119	Mexico	Male	Yes	15.75	Yes	Language of the test	0.13793	0.209052	0.147423	380.59666	390.04782	396.34466
339143	Mexico	Female	NA	16.08	No	NA	0.50540	1.382300	2.563000	314.38698	310.89690	280.80938
308910	Latvia	Male	Yes	16.08	Yes	Language of the test	0.13793	0.209052	0.147423	486.29848	537.04438	521.39128