import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the two CSV exports. The paths are relative, so both files must sit in
# the current working directory.
df_hours_and_GDP = pd.read_csv('annual-working-hours-vs-gdp-per-capita-pwt.csv')
df_productivity_and_hours = pd.read_csv('productivity-vs-annual-hours-worked.csv')

# Left join on (Entity, Year): every row of the hours/GDP table is kept and
# productivity is attached where a matching row exists. Non-key columns that
# appear in both inputs come back suffixed with _x (left) and _y (right).
df_merged = pd.merge(
    df_hours_and_GDP,
    df_productivity_and_hours,
    how='left',
    on=['Entity', 'Year'],
)

# Drop the country codes, both population columns, and the right-hand (_y)
# duplicates of working hours and continent.
df_merged_clean = df_merged.drop(columns=[
    'Code_x',
    'Code_y',
    'Annual working hours per worker_y',
    'Population (historical estimates)_y',
    'Continent_y',
    'Population (historical estimates)_x',
])

# Give the remaining columns short, presentation-friendly names.
df_merged_clean_renamed = df_merged_clean.rename(columns={
    'Entity': 'Country',
    'Annual working hours per worker_x': 'Annual Working Hours per Worker',
    'GDP per capita (output, multiple price benchmarks)': 'GDP per Capita',
    'Productivity: output per hour worked': 'Productivity',
    'Continent_x': 'Continent',
})

# Keep only rows where every plotted quantity is present. Continent is not in
# the subset, so rows with a missing Continent are kept.
required_columns = [
    'Country',
    'Year',
    'Annual Working Hours per Worker',
    'GDP per Capita',
    'Productivity',
]
df_all = df_merged_clean_renamed.dropna(subset=required_columns)

# Renumber the surviving rows 0..n-1. drop=True discards the old index
# directly instead of materialising it as an 'index' column and deleting it
# afterwards.
df_all = df_all.reset_index(drop=True)


# Bare expression: displays the frame when it is the last statement of a
# notebook cell; it has no effect when the file is run as a plain script.
df_all


# GDP per capita against annual working hours.
# x is assigned here and read again by the productivity plot further down.
x = df_all['Annual Working Hours per Worker']
y = df_all['GDP per Capita']

# One small, translucent marker per row of df_all.
plt.scatter(x, y, s=1, alpha=0.6, c='#DE6666')
plt.title('GDP per Capita vs. Annual Working Hours per Worker Across the World (1950 - 2019)')
plt.xlabel('Annual Working Hours per Worker')
plt.ylabel('GDP per Capita')

# Trend line: a least-squares cubic fitted to the scatter, evaluated on an
# evenly spaced grid of working hours from 1250 to 3000.
model = np.poly1d(np.polyfit(x, y, deg=3))
polyline = np.linspace(start=1250, stop=3000)
plt.plot(polyline, model(polyline), color='#853e3e')

[<matplotlib.lines.Line2D at 0x124db1010>]


# plot productivity data
# Start a fresh figure first: pyplot calls draw on the current figure, so
# without this the productivity scatter would land on the same axes as any
# plot drawn before it (the GDP plot when the file runs top to bottom),
# mixing two different y-scales and overwriting that plot's title and labels.
plt.figure()

# x still holds the working-hours column assigned for the GDP plot; only y
# is rebound here.
y = df_all['Productivity']
plt.scatter(x, y, c='#4772D1', label='Productivity', s=1, alpha=0.6)
plt.xlabel('Annual Working Hours per Worker')
plt.ylabel('Productivity')
plt.title('Productivity vs. Annual Working Hours per Worker Across the World (1950 - 2019)')

# Trend line: a least-squares cubic fitted to the scatter, evaluated on an
# evenly spaced grid of working hours from 1250 to 3000.
model = np.poly1d(np.polyfit(x, y, 3))
polyline = np.linspace(1250, 3000)
plt.plot(polyline, model(polyline), color='#12398c')

[<matplotlib.lines.Line2D at 0x124cea4d0>]

	Country	Year	Annual Working Hours per Worker	GDP per Capita	Continent	Productivity
0	Argentina	1950	2034.0000	2931.7388	NaN	3.727674
1	Argentina	1951	2037.8667	2940.7954	NaN	3.752668
2	Argentina	1952	2041.7408	2629.9502	NaN	3.365233
3	Argentina	1953	2045.6223	2747.4377	NaN	3.522695
4	Argentina	1954	2049.5112	2821.9634	NaN	3.623416
...	...	...	...	...	...	...
3487	Vietnam	2015	2191.3704	6180.3580	Asia	4.946606
3488	Vietnam	2016	2169.5515	6368.6510	NaN	5.156925
3489	Vietnam	2017	2131.9683	6841.6543	NaN	5.652919
3490	Vietnam	2018	2131.9683	7217.9240	NaN	5.982665
3491	Vietnam	2019	2131.9683	7506.8170	NaN	6.739149

Working Hours, GDP, and Productivity

Data Dictionary