import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import *
from sklearn.model_selection import *
import statsmodels.api as sm
import scipy.stats as stats
            
import regex as re
import requests
from bs4 import BeautifulSoup


# Read data/final.csv and use its "Country" column as the row index.
df_final = pd.read_csv("data/final.csv").set_index("Country")
df_final.head()


# Descriptive statistics for the columns of df_final.
df_final.describe()


# Frame for Hypothesis 1: life expectancy next to "cal", ordered by "cal".
df_h1 = df_final[["Life Expectancy", "cal"]].sort_values(by="cal")
df_h1.head()


sns.set(rc={'figure.figsize':(17,4)})

# Hypothesis 1: regress life expectancy on the square root of "cal".
# Used sqrt transformation because data showed strong curve and slightly heteroskedastic
X = np.sqrt(df_h1["cal"].to_numpy().reshape(-1,1))
y = df_h1["Life Expectancy"]
calories_train, calories_test, exp_train, exp_test = train_test_split(X, y, test_size=0.2, random_state=10)
model_h1 = LinearRegression().fit(calories_train,exp_train)
train_predictions = model_h1.predict(calories_train)
test_predictions = model_h1.predict(calories_test)

# errors: R squared and RMSE on the train and test splits
train_r_squared = model_h1.score(calories_train,exp_train)
test_r_squared = model_h1.score(calories_test,exp_test)
train_residuals = exp_train - train_predictions
test_residuals = exp_test - test_predictions
train_rmse = np.sqrt(np.mean(train_residuals**2))
test_rmse = np.sqrt(np.mean(test_residuals**2))

# Significance test on the full sample. sm.OLS does not add an intercept on its
# own, so a constant column is prepended; the contrast [0, 1] then selects the
# slope, and the upper tail of the t distribution gives the right-sided p-value
# (t_test itself reports a two-sided one).
OLS_model_h1 = sm.OLS(y, sm.add_constant(X)).fit()
slope_test_h1 = OLS_model_h1.t_test([0, 1])
right_sided_p_h1 = stats.t.sf(float(np.squeeze(slope_test_h1.tvalue)), slope_test_h1.df_denom)

# plots: full-data fit, test residuals against predictions, residual histogram
f, (ax1, ax2, ax3) = plt.subplots(1, 3)
sns.regplot(x = X,y = y, ax = ax1).set(title = "Data Scatterplot", xlabel='Square Root of Calories Consumes', ylabel='Life Expectancy')
sns.scatterplot(x = test_predictions, y = test_residuals, ax = ax2).set(title = "Residuals Scatterplot", xlabel='Predicted Life Expectancy', ylabel='Residuals')
sns.histplot(test_residuals, ax = ax3).set(title = "Residuals Histogram",xlabel='Residuals', ylabel='Count')
plt.show()

# summary
print("Slope of Model:",f'{model_h1.coef_[0]:.2f}',"\nIntercept of Model:",f'{model_h1.intercept_:.2f}')
print("Train RMSE:",f'{train_rmse:.2f}',"\nTest RMSE:",f'{test_rmse:.2f}')
print("Train R Squared:",f'{train_r_squared:.2f}',"\nTest R Squared:",f'{test_r_squared:.2f}')
print("\nRight-sided P-Value at 97.5% CL:",right_sided_p_h1)

Slope of Model: 1.14 
Intercept of Model: 11.83
Train RMSE: 5.05 
Test RMSE: 3.57
Train R Squared: 0.46 
Test R Squared: 0.73

Right-sided P-Value at 97.5% CL: 4.700633001111161e-130


# Frame for Hypothesis 2: life expectancy next to mean years of schooling,
# ordered by schooling.
df_h2 = df_final[["Life Expectancy","Mean years of schooling"]].sort_values('Mean years of schooling')
df_h2.head()


# Hypothesis 2: regress life expectancy on mean years of schooling (no transform).
X = df_h2["Mean years of schooling"].to_numpy().reshape(-1,1)
y = df_h2["Life Expectancy"]
schooling_train, schooling_test, exp_train, exp_test = train_test_split(X, y, test_size=0.2, random_state=95)
model_h2 = LinearRegression().fit(schooling_train,exp_train)
train_predictions = model_h2.predict(schooling_train)
test_predictions = model_h2.predict(schooling_test)

# errors: R squared and RMSE on the train and test splits
train_r_squared = model_h2.score(schooling_train,exp_train)
test_r_squared = model_h2.score(schooling_test,exp_test)
train_residuals = exp_train - train_predictions
test_residuals = exp_test - test_predictions
train_rmse = np.sqrt(np.mean(train_residuals**2))
test_rmse = np.sqrt(np.mean(test_residuals**2))

# Significance test on the full sample. sm.OLS does not add an intercept on its
# own, so a constant column is prepended; the contrast [0, 1] then selects the
# slope, and the upper tail of the t distribution gives the right-sided p-value
# (t_test itself reports a two-sided one).
OLS_model_h2 = sm.OLS(y, sm.add_constant(X)).fit()
slope_test_h2 = OLS_model_h2.t_test([0, 1])
right_sided_p_h2 = stats.t.sf(float(np.squeeze(slope_test_h2.tvalue)), slope_test_h2.df_denom)

# plots: full-data fit, test residuals against predictions, residual histogram
f, (ax1, ax2, ax3) = plt.subplots(1, 3)
sns.regplot(x = X,y = y, ax = ax1).set(title = "Data Scatterplot", xlabel='Mean Years of Schooling', ylabel='Life Expectancy')
sns.scatterplot(x = test_predictions, y = test_residuals, ax = ax2).set(title = "Residuals Scatterplot", xlabel='Predicted Life Expectancy', ylabel='Residuals')
sns.histplot(test_residuals, ax = ax3).set(title = "Residuals Histogram",xlabel='Residuals', ylabel='Count')
plt.show()

#summary
print("Slope of Model:",f'{model_h2.coef_[0]:.2f}',"\nIntercept of Model:",f'{model_h2.intercept_:.2f}')
print("Train RMSE:",f'{train_rmse:.2f}',"\nTest RMSE:",f'{test_rmse:.2f}')
print("Train R Squared:",f'{train_r_squared:.2f}',"\nTest R Squared:",f'{test_r_squared:.2f}')
print("\nRight-sided P-Value at 97.5% CL:",right_sided_p_h2)

Slope of Model: 1.59 
Intercept of Model: 59.16
Train RMSE: 4.86 
Test RMSE: 4.92
Train R Squared: 0.50 
Test R Squared: 0.52

Right-sided P-Value at 97.5% CL: 5.5363113503056174e-67


# Frame for Hypothesis 3: life expectancy next to GNI per capita, ordered by GNI.
df_h3 = df_final[["Life Expectancy","Gross national income (GNI) per capita"]].sort_values("Gross national income (GNI) per capita")
df_h3.head()


# Hypothesis 3: regress life expectancy on log GNI per capita.
#apply log transformation because data shows strong log(x)-type, diminishing returns curve
X = np.log(df_h3["Gross national income (GNI) per capita"].to_numpy().reshape(-1,1))
y = df_h3["Life Expectancy"]
GNI_train, GNI_test, exp_train, exp_test = train_test_split(X, y, test_size=0.2, random_state=94)
model_h3 = LinearRegression().fit(GNI_train,exp_train)
train_predictions = model_h3.predict(GNI_train)
test_predictions = model_h3.predict(GNI_test)

# errors: R squared and RMSE on the train and test splits
train_r_squared = model_h3.score(GNI_train,exp_train)
test_r_squared = model_h3.score(GNI_test,exp_test)
train_residuals = exp_train - train_predictions
test_residuals = exp_test - test_predictions
train_rmse = np.sqrt(np.mean(train_residuals**2))
test_rmse = np.sqrt(np.mean(test_residuals**2))

# plots: full-data fit, test residuals against predictions, residual histogram
f, (ax1, ax2, ax3) = plt.subplots(1, 3)
sns.regplot(x = X,y = y, ax = ax1).set(title = "Data Scatterplot", xlabel='Log GNI per Capita', ylabel='Life Expectancy')
sns.scatterplot(x = test_predictions, y = test_residuals, ax = ax2).set(title = "Residuals Scatterplot", xlabel='Predicted Life Expectancy', ylabel='Residuals')
sns.histplot(test_residuals, ax = ax3).set(title = "Residuals Histogram",xlabel='Residuals', ylabel='Count')
plt.show()

# Significance test on the full sample. sm.OLS does not add an intercept on its
# own, so a constant column is prepended; the contrast [0, 1] then selects the
# slope, and the upper tail of the t distribution gives the right-sided p-value
# (t_test itself reports a two-sided one).
OLS_model_h3 = sm.OLS(y, sm.add_constant(X)).fit()
slope_test_h3 = OLS_model_h3.t_test([0, 1])
right_sided_p_h3 = stats.t.sf(float(np.squeeze(slope_test_h3.tvalue)), slope_test_h3.df_denom)

#summary
print("Slope of Model:",f'{model_h3.coef_[0]:.2f}',"\nIntercept of Model:",f'{model_h3.intercept_:.2f}')
print("Train RMSE:",f'{train_rmse:.2f}',"\nTest RMSE:",f'{test_rmse:.2f}')
print("Train R Squared:",f'{train_r_squared:.2f}',"\nTest R Squared:",f'{test_r_squared:.2f}')
print("\nRight-sided P-Value at 97.5% CL:",right_sided_p_h3)

Slope of Model: 5.30 
Intercept of Model: 23.82
Train RMSE: 3.54 
Test RMSE: 4.19
Train R Squared: 0.73 
Test R Squared: 0.59

Right-sided P-Value at 97.5% CL: 4.621439687292658e-134


# Hypothesis 4: regress life expectancy on "happiness_index" (no transform).
#setup and run regression
X = df_final["happiness_index"].to_numpy().reshape(-1,1)
y = df_final["Life Expectancy"]
happy_train, happy_test, exp_train, exp_test = train_test_split(X, y, test_size=0.2, random_state = 78)
model_h4 = LinearRegression().fit(happy_train,exp_train)

#get errors: R squared and RMSE on the train and test splits
train_r_squared = model_h4.score(happy_train,exp_train)
test_r_squared = model_h4.score(happy_test,exp_test)
train_residuals = exp_train - model_h4.predict(happy_train)
test_preds = model_h4.predict(happy_test)
test_residuals = exp_test - test_preds
train_rmse = np.sqrt(np.mean(train_residuals**2))
test_rmse = np.sqrt(np.mean(test_residuals**2))

#plot regression: full-data fit, test residuals against predictions, residual histogram
f, (ax1, ax2, ax3) = plt.subplots(1, 3)
sns.regplot(x = X,y = y, ax = ax1).set(title = "Data Scatterplot", xlabel='Happiness Index', ylabel='Life Expectancy')
# The residuals must be plotted against this model's own test predictions
# (test_preds), not a test_predictions array left over from another cell.
sns.scatterplot(x = test_preds, y = test_residuals, ax = ax2).set(title = "Residuals Scatterplot", xlabel='Predicted Life Expectancy', ylabel='Residuals')
sns.histplot(test_residuals, ax = ax3).set(title = "Residuals Histogram",xlabel='Residuals', ylabel='Count')
plt.show()

# Significance test on the full sample. sm.OLS does not add an intercept on its
# own, so a constant column is prepended; the contrast [0, 1] then selects the
# slope, and the upper tail of the t distribution gives the right-sided p-value
# (t_test itself reports a two-sided one).
OLS_model_h4 = sm.OLS(y, sm.add_constant(X)).fit()
slope_test_h4 = OLS_model_h4.t_test([0, 1])
right_sided_p_h4 = stats.t.sf(float(np.squeeze(slope_test_h4.tvalue)), slope_test_h4.df_denom)

#summary
print("Slope of Model:",f'{model_h4.coef_[0]:.2f}',"\nIntercept of Model:",f'{model_h4.intercept_:.2f}')
print("Train RMSE:",f'{train_rmse:.2f}',"\nTest RMSE:",f'{test_rmse:.2f}')
print("Train R Squared:",f'{train_r_squared:.2f}',"\nTest R Squared:",f'{test_r_squared:.2f}')
print("\nRight-sided P-Value at 97.5% CL:",right_sided_p_h4)

Slope of Model: 5.36 
Intercept of Model: 43.53
Train RMSE: 4.51 
Test RMSE: 4.45
Train R Squared: 0.58 
Test R Squared: 0.57

Right-sided P-Value at 97.5% CL: 2.9328005457793113e-98


# Hypothesis 5: compare life expectancy between countries with and without UHC.
X = df_final["has_uhc"]
y = df_final["Life Expectancy"]

# Life Expectancy datasets according to presence of UHC.
# Any has_uhc value other than 1.0 (missing values included) lands in the
# "no UHC" group, because the split is X == 1.0 versus X != 1.0.
y_has_uhc = y[X == 1.0]
y_no_uhc = y[X != 1.0]

# Boxplot
fig, ax = plt.subplots()
ax.boxplot([y_no_uhc, y_has_uhc])
plt.xticks([1, 2], ["Countries without UHC", "Countries with UHC"])
plt.ylabel("Life Expectancy")
# plt.show must be called; the bare attribute reference does nothing.
plt.show()

# Sample Difference: both medians are rounded the same way before printing.
print("Median Life Expectancy of Countries without UHC:", round(y_no_uhc.median(),2), "years")
print("Median Life Expectancy of Countries with UHC:", round(y_has_uhc.median(),2), "years")
sample_diff = round(y_has_uhc.median() - y_no_uhc.median(), 2)
print("Sample Difference in the Medians:", sample_diff, "years")

Median Life Expectancy of Countries without UHC: 71.61 years
Median Life Expectancy of Countries with UHC: 77.46 years
Sample Difference in the Medians: 5.85 years


# Permutation test: reshuffle the has_uhc labels 10000 times and store the
# difference in median life expectancy (with UHC minus without) for each shuffle.
n_permutations = 10000
perm_diff = np.zeros(n_permutations)
for i in range(n_permutations):
    shuffled_labels = np.random.choice(X, X.size, replace=False)
    labelled_uhc = shuffled_labels == 1.0
    median_with = y[labelled_uhc].median()
    median_without = y[~labelled_uhc].median()
    perm_diff[i] = round(median_with - median_without, 2)

# Plot result: histogram of the shuffled differences, with the observed
# sample difference marked as a dashed red line.
sns.histplot(perm_diff)
plt.axvline(sample_diff, color='r', ls='--')

<matplotlib.lines.Line2D at 0x22993102790>


# Express the observed difference in units of the permutation distribution's
# standard deviation, then take the upper tail of a standard normal as p-value.
perm_mean, perm_std = perm_diff.mean(), perm_diff.std()
std_diff = (sample_diff - perm_mean) / perm_std
upper_tail_p = stats.norm.sf(std_diff)

print("Mean:", round(perm_mean, 2))
print("Standard Deviation:", round(perm_std, 2))
print("Sample difference is", round(std_diff, 2), "standard deviations away from the mean")
print("P-value:", upper_tail_p)

Mean: -0.03
Standard Deviation: 1.19
Sample difference is 4.94 standard deviations away from the mean
P-value: 3.9889520309137815e-07


sns.set(rc={"figure.figsize": (10, 8)})

# Pairwise correlations between every column of df_final except the first,
# drawn as an annotated heatmap.
collinearity_matrix = df_final.iloc[:, 1:].corr()
matrix_labels = collinearity_matrix.columns
sns.heatmap(
    collinearity_matrix,
    annot=True,
    cmap="crest",
    xticklabels=matrix_labels,
    yticklabels=matrix_labels,
)

<AxesSubplot:>


sns.set(rc={"figure.figsize": (8, 5)})

# Correlations among the four predictors kept for the multivariate model,
# drawn as an annotated heatmap.
kept_predictors = df_final[[
    "Homicide Rate",
    "Gross national income (GNI) per capita",
    "has_uhc",
    "genetic_index",
]]
collinearity_matrix2 = kept_predictors.corr()
matrix2_labels = collinearity_matrix2.columns
sns.heatmap(
    collinearity_matrix2,
    annot=True,
    cmap="crest",
    xticklabels=matrix2_labels,
    yticklabels=matrix2_labels,
)

<AxesSubplot:>


sns.set(rc={"figure.figsize": (10, 4)})

# Determine possible transformations.
# Hypothesis 3 used a log transformation on GNI per capita; the raw and logged
# versions are plotted side by side below.
df_model2 = df_final[["Life Expectancy"]].copy()
df_model2["Log GNI per Capita"] = np.log(
    df_final["Gross national income (GNI) per capita"]
)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
sns.regplot(
    x="Gross national income (GNI) per capita",
    y="Life Expectancy",
    data=df_final,
    ax=ax1,
)
sns.regplot(x="Log GNI per Capita", y="Life Expectancy", data=df_model2, ax=ax2)
plt.show()


# Homicide Rate transformations.
# The raw variable shows strongly heteroskedastic behavior, so a log transform
# is stored in df_model2 and the raw and logged versions are plotted side by side.
homicide_rate = df_final["Homicide Rate"]
df_model2["Log Homicide Rate"] = np.log(homicide_rate)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
sns.regplot(x="Homicide Rate", y="Life Expectancy", data=df_final, ax=ax1)
sns.regplot(x="Log Homicide Rate", y="Life Expectancy", data=df_model2, ax=ax2)
plt.show()


sns.set(rc={"figure.figsize": (7, 4)})

# Genetic index transformation check: plot the raw variable against life expectancy.
sns.regplot(x="genetic_index", y="Life Expectancy", data=df_final)
plt.show()

# has_uhc and genetic_index are copied into df_model2 without any transform
# (the genetic index already looks linear in the plot above).
for untransformed_column in ("has_uhc", "genetic_index"):
    df_model2[untransformed_column] = df_final[untransformed_column]


df_model2.head()


#Model creation
# Multivariate linear regression of life expectancy on the four pruned predictors.
X = df_model2[["Log Homicide Rate","Log GNI per Capita","has_uhc","genetic_index"]]
y = df_model2["Life Expectancy"]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=78)
model2 = LinearRegression().fit(x_train,y_train)

# Predictions, residuals and RMSE on the train and test splits
train_predictions = model2.predict(x_train)
test_predictions = model2.predict(x_test)
train_residuals = y_train - train_predictions
test_residuals = y_test - test_predictions
train_rmse = np.sqrt(np.mean(train_residuals**2))
test_rmse = np.sqrt(np.mean(test_residuals**2))
print("Train RMSE:",f'{train_rmse:.2f}',"\nTest RMSE:",f'{test_rmse:.2f}')
print("Train R Squared",f'{model2.score(x_train,y_train):.2f}',"\nTest R Squared",f'{model2.score(x_test,y_test):.2f}')
print("Intercept", f'{model2.intercept_:.2f}')

# One-row table of rounded coefficients, labelled by predictor name.
# np.round replaces np.round_, which NumPy 2.0 removed; reshape(1, -1) follows
# the number of predictors instead of hard-coding 4.
slopes_model2 = pd.DataFrame(data = np.round(model2.coef_.reshape(1,-1),decimals=2),columns = X.columns)
print("Coefficients:")
slopes_model2.head()

Train RMSE: 3.05 
Test RMSE: 2.57
Train R Squared 0.81 
Test R Squared 0.86
Intercept 43.69
Coefficients:


# Residual diagnostics for the multivariate model on the test split:
# residuals against predictions (left) and a residual histogram (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
residual_scatter = sns.scatterplot(x=test_predictions, y=test_residuals, ax=ax1)
residual_scatter.set(xlabel='Predicted Life Expectancy', ylabel='Residuals')
residual_hist = sns.histplot(test_residuals, ax=ax2)
residual_hist.set(xlabel='Residuals', ylabel='Count')
plt.show()


# Coefficient table, displayed again alongside the analysis that follows.
slopes_model2.head()


slopes_model2.head()


#Find Minimum Value Country for Negative Coefficients and Maximum Value Countries for Positive Coefficients

# Country with lowest Log Homicide Rate (printed back on the original scale via exp)
country_minHomicide = df_model2["Log Homicide Rate"].idxmin()
minHomicide = df_model2.loc[country_minHomicide,"Log Homicide Rate"]
print("The country with the lowest Homicide Rate is",country_minHomicide,"with",f'{np.exp(minHomicide):.2f}')

# Country with highest Log GNI per Capita.
# The printed number is exp(maxGNI), i.e. GNI on its original scale, so the
# message names "GNI Per Capita" rather than "Log GNI Per Capita".
country_maxGNI = df_model2["Log GNI per Capita"].idxmax()
maxGNI = df_model2.loc[country_maxGNI,"Log GNI per Capita"]
print("The country with the highest GNI Per Capita is",country_maxGNI,"with",f'{np.exp(maxGNI):.2f}')

# has_uhc has a negative coefficient, so a country without universal healthcare will be used for the perfect country
# NOTE(review): the sign is not visible in this cell; confirm against slopes_model2.

# Country with the lowest genetic index for heart disease and later-life diseases
country_minGenetics = df_model2["genetic_index"].idxmin()
minGenetics = df_model2.loc[country_minGenetics,"genetic_index"]
print("The country with the lowest genetic index is",country_minGenetics,"with",f'{minGenetics:.2f}')

The country with the lowest Homicide Rate is Japan with 0.26
The country with the highest Log GNI Per Capita is Luxembourg with 84649.47
The country with the lowest genetic index is Switzerland with 7.90


# Finding our perfect country's life expectancy
# Single-row frame holding the best observed value of each predictor, with the
# columns named and ordered like the frame model2 was trained on.
perfectCountry = pd.DataFrame({"Log Homicide Rate":[minHomicide],
                            "Log GNI per Capita":[maxGNI],
                            "has_uhc":[0],
                            "genetic_index":[minGenetics]})
print("Perfect Country Life Expectancy",f'{model2.predict(perfectCountry)[0]:.2f}')

# Actual highest life expectancy
# The lookup uses country_maxExp, the label returned by idxmax on the line
# before; the former name country_maxExp_i was never defined.
country_maxExp = df_model2["Life Expectancy"].idxmax()
maxExp = df_model2.loc[country_maxExp,"Life Expectancy"]
print("The country with the actual highest Life Expectancy is",country_maxExp,"with",f'{maxExp:.2f}')

Perfect Country Life Expectancy 85.59
The country with the actual highest Life Expectancy is Japan with 84.62

	Life Expectancy	Homicide Rate	Mean years of schooling	Gross national income (GNI) per capita	kg meat/person	cal	has_uhc	happiness_index	genetic_index	level of human development
Country
Albania	78.686	2.29	11.286455	14131.11039	47.51	3360.0	1.0	5.117	11.4	3
United Arab Emirates	78.120	0.46	12.694030	62573.59181	62.03	3314.0	1.0	6.561	18.5	4
Argentina	76.813	5.32	11.147269	20925.26814	109.39	3307.0	1.0	5.929	15.7	4
Armenia	75.224	1.69	11.330300	13157.99390	45.64	2997.0	0.0	5.283	19.9	3
Australia	83.200	0.89	12.726820	49238.43335	121.61	3391.0	1.0	7.183	8.6	4

	Life Expectancy	Homicide Rate	Mean years of schooling	Gross national income (GNI) per capita	kg meat/person	cal	has_uhc	happiness_index	genetic_index	level of human development
count	109.000000	109.000000	109.000000	109.000000	109.000000	109.000000	109.000000	109.000000	109.000000	109.000000
mean	74.181112	6.087523	9.538524	22451.396131	49.363211	2957.293578	0.541284	5.671046	17.846789	2.990826
std	6.934363	10.228273	3.063096	19594.529183	29.018068	452.467500	0.500594	1.062122	6.897471	1.084362
min	54.836000	0.260000	2.114962	1198.073924	3.780000	1908.000000	0.000000	3.145000	7.900000	1.000000
25%	70.056000	1.170000	7.192013	6589.980037	20.340000	2662.000000	0.000000	4.934000	11.000000	2.000000
50%	75.387805	2.200000	10.427910	15241.914650	53.490000	3019.000000	1.000000	5.813000	17.800000	3.000000
75%	79.208000	5.370000	12.191084	37931.303590	73.010000	3322.000000	1.000000	6.317000	22.600000	4.000000
max	84.615610	52.020000	14.090967	84649.474670	121.610000	3885.000000	1.000000	7.842000	42.700000	4.000000

	Life Expectancy	cal
Country
Zimbabwe	61.738	1908.0
Uganda	63.713	1981.0
Zambia	64.194	2002.0
Mozambique	61.387	2103.0
Tajikistan	71.301	2109.0

	Life Expectancy	Mean years of schooling
Country
Burkina Faso	61.981	2.114962
Niger	62.792	2.116717
Senegal	68.213	2.937938
Mozambique	61.387	3.197642
Ethiopia	66.953	3.201521

	Life Expectancy	Gross national income (GNI) per capita
Country
Mozambique	61.387	1198.073924
Niger	62.792	1239.866936
Liberia	64.423	1288.742350
Malawi	64.694	1465.635064
Sierra Leone	55.066	1621.512579

Life Expectancies Analysis¶

Introduction¶

Data Description¶

What are the observations (rows) and the attributes (columns)?

Why was this dataset created?

What processes might have influenced what data was observed and recorded and what was not?

What preprocessing was done, and how did the data come to be in the form that you are using?

Where can your raw source data be found, if applicable? Provide a link to the raw data (hosted in a Cornell Google Drive or Cornell Box).

Imports¶

Basic Data Analysis¶

Summary Statistics¶

Preregistration¶

Do people that eat more live longer?¶

Are more educated people living longer?¶

Do people with higher incomes live longer?¶

Are happier people living longer?¶

Do countries with universal healthcare have longer-living citizens?¶

Methods of testing¶

Hypothesis Testing and Evaluation of Significance¶

Do people that eat more live longer?¶

Are more educated people living longer?¶

Do people in first world countries live longer?¶

Adapted from Hypothesis 3 in Phase 3¶

Are happier people living longer?¶

Do people in countries with UHC (Universal Healthcare) live longer?¶

Permutation test on UHC¶

Interpreting the permutation test¶

Interpretations for Preregistrations¶

Data Analysis¶

Collinearity Matrix¶

Remove Highly Collinear Variables¶

Multivariate Regression Model (With Pruned Variables)¶

GNI per capita: non-linear; log transform needed¶

Homicide Rate: heteroskedastic; log transform applied¶

has_UHC: binary variable; no transform possible to improve¶

Genetic Index: approximately linear already; no transforms needed¶

Make the model¶

Overfitting Analysis¶

Linear Assumptions and Analysis of Residuals¶

Model R Squared and Coefficient Analysis¶

The Perfect Country¶

Finding the best country for each variable¶

Combine into a perfect country¶

Conclusion¶

Data Limitations¶

Source Code¶