College Majors from Tidy Tuesday - with Pandas
<!DOCTYPE html>
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Read the Tidy Tuesday recent-grads CSV from the rfordatascience GitHub repository.
recent_grads = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv"
)
# Title-case the major names.
recent_grads = recent_grads.assign(Major=recent_grads['Major'].str.title())
recent_grads.head()
# Distribution of the per-major median salary.
# sns.distplot is deprecated in seaborn; histplot is its replacement for a
# plain (no-KDE) histogram.
sns.histplot(recent_grads['Median'])
# Same distribution through pandas, with a finer binning.
recent_grads['Median'].hist(bins=50)
# Pass the series as x= explicitly: newer seaborn releases treat a lone
# positional argument as `data`, which changes the orientation of the box.
sns.boxplot(x=recent_grads['Median'])
# Median salary broken down by major category.
sns.boxplot(x='Median', y='Major_category', data=recent_grads)
sns.barplot(x='Median', y='Major_category', data=recent_grads)
What are the highest earning majors?
# One row per (Major, Major_category, P25th, P75th) combination, carrying the
# median of the 'Median' column; the keys are turned back into regular columns.
df = (
    recent_grads
    .pivot_table(
        values='Median',
        index=['Major', 'Major_category', 'P25th', 'P75th'],
        aggfunc='median',
    )
    .reset_index()
)
# Highest medians first.
df.sort_values(by='Median', ascending=False)
# The 20 majors with the largest 'Median', re-ordered ascending and re-indexed
# from zero before plotting.
df2 = (
    recent_grads
    .nlargest(20, 'Median')
    .sort_values('Median', ascending=True)
    .reset_index(drop=True)
)
sns.scatterplot(data=df2, x='Median', y='Major', hue='Major_category')
# "Tie Fighter" plot: the median salary of each of the top-20 majors as a point,
# with horizontal bars reaching out to the 25th and 75th percentiles.
df2 = recent_grads.nlargest(20, 'Median')
df2 = df2.sort_values('Median', ascending=True).reset_index(drop=True)
# sns.scatterplot forwards unknown keywords to matplotlib's scatter, which has
# no xerr parameter, so the bars are drawn with Axes.errorbar instead.
ax = sns.scatterplot(x='Median', y='Major', data=df2, hue='Major_category')
# errorbar expects distances from the point (left, right), not absolute
# positions. Distances are clipped at zero because matplotlib rejects negative
# error values (assumes P25th <= Median <= P75th in the data).
left_reach = (df2['Median'] - df2['P25th']).clip(lower=0).to_numpy()
right_reach = (df2['P75th'] - df2['Median']).clip(lower=0).to_numpy()
ax.errorbar(
    x=df2['Median'],
    y=df2['Major'],
    xerr=[left_reach, right_reach],
    fmt='none',      # bars only; the markers come from the scatterplot
    ecolor='gray',
    zorder=0,        # keep the bars underneath the markers
)
# Median salary against sample size, with the x axis on a log scale.
# The x_jitter argument was removed: seaborn's scatterplot does not implement
# it, so depending on the seaborn version it either has no effect or is
# rejected as an unknown keyword.
sns.scatterplot(x='Sample_size', y='Median', data=recent_grads).set_xscale("log")
# TODO: add a text label to each point. This is possible but not trivial; see
# https://stackoverflow.com/questions/46027653/adding-labels-in-x-y-scatter-plot-with-seaborn
# Sum of 'Total' per major category, as a flat two-column frame.
df3 = (
    recent_grads
    .groupby('Major_category')
    .agg({'Total': 'sum'})
    .reset_index()
)
# Bars are ordered from the largest category total to the smallest.
category_order = df3.sort_values('Total', ascending=False)['Major_category']
sns.barplot(data=df3, x='Total', y='Major_category', order=category_order)
# Sum of 'Total' per major (indexed by Major), then the 20 largest as a flat frame.
df3 = recent_grads.groupby('Major').agg({'Total': 'sum'})
df4 = df3.nlargest(20, 'Total').reset_index()
# Bars are ordered from the largest total to the smallest.
major_order = df4.sort_values('Total', ascending=False)['Major']
sns.barplot(data=df4, x='Total', y='Major', order=major_order)
# The 20 rows with the largest 'Total', taken from the frame sorted by 'Total'.
dfa = (
    recent_grads
    .sort_values('Total')
    .nlargest(20, 'Total')
)
How does gender breakdown relate to typical earnings?
# Men / Women counts for the majors selected in dfa.
dfa = dfa[['Major', 'Men', 'Women']]

# Long format (one row per Major/Gender pair) for the grouped seaborn bar chart.
dfa1 = pd.melt(dfa, id_vars=['Major'], var_name='Gender', value_name='Number')
sns.barplot(x='Number', y='Major', data=dfa1, hue='Gender')

# The stacked charts need the wide format: after the melt, dfa1 only has the
# columns Major / Gender / Number, so indexing it with ['Men', 'Women'] raises
# KeyError. Build them from dfa instead, indexed by Major so the bars are
# labelled with the major names.
gender_wide = dfa.set_index('Major')[['Men', 'Women']]
gender_wide.plot(kind='barh', stacked=True)

# Same chart with the bars ordered by the combined head count.
gender_wide = gender_wide.assign(mandf=gender_wide['Men'] + gender_wide['Women'])
gender_wide.sort_values('mandf', ascending=True)[['Men', 'Women']].plot(kind='barh', stacked=True)
# Head counts per major category and the share of women in each.
dfg = recent_grads.groupby('Major_category').agg({'Total': 'sum', 'Men': 'sum', 'Women': 'sum'})
dfg['ShareWomen'] = dfg['Women'] / dfg['Total']
# dropna returns a new frame rather than modifying dfg, so the result has to be
# assigned for the filter to take effect.
dfg = dfg.dropna(subset=['ShareWomen'])
dfg.sort_values('ShareWomen', ascending=False)
# Typical salary per major category: the sample-size-weighted mean of the
# per-major medians,
#     sum(Median * Sample_size) / sum(Sample_size)   within each category.
# The previous expression multiplied and divided by the same overall
# Sample_size total, so it reduced to recent_grads['Median'].sum() and put one
# constant on every row.
category_key = recent_grads['Major_category']
weighted_salary = (recent_grads['Median'] * recent_grads['Sample_size']).groupby(category_key).transform('sum')
category_samples = recent_grads['Sample_size'].groupby(category_key).transform('sum')
# Every row carries its category's value, so the column can serve as a grouping key.
recent_grads['MedianSalary'] = weighted_salary / category_samples

# Computed once: the earlier repeated passes rebuilt the same dfg each time.
dfg = recent_grads.groupby(['Major_category', 'MedianSalary'], as_index=False).agg({'Total': 'sum', 'Men': 'sum', 'Women': 'sum'})
dfg['ShareWomen'] = dfg['Women'] / dfg['Total']
# dropna returns a new frame; assign it so the filter actually applies.
dfg = dfg.dropna(subset=['ShareWomen'])
by_major_category = dfg.sort_values('ShareWomen', ascending=False)
by_major_category
# Per-major share of women against median salary: raw points first, then the
# same relationship with seaborn's fitted regression line.
sns.scatterplot(data=recent_grads, x='ShareWomen', y='Median')
sns.lmplot(data=recent_grads, x="ShareWomen", y="Median")