9 minute read

<!DOCTYPE html>

college-majors
In [14]:
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns
In [4]:
recent_grads = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-10-16/recent-grads.csv")
In [6]:
recent_grads['Major'] = recent_grads['Major'].str.title()
In [7]:
recent_grads.head()
Out[7]:
Rank Major_code Major Total Men Women Major_category ShareWomen Sample_size Employed ... Part_time Full_time_year_round Unemployed Unemployment_rate Median P25th P75th College_jobs Non_college_jobs Low_wage_jobs
0 1 2419 Petroleum Engineering 2339.0 2057.0 282.0 Engineering 0.120564 36 1976 ... 270 1207 37 0.018381 110000 95000 125000 1534 364 193
1 2 2416 Mining And Mineral Engineering 756.0 679.0 77.0 Engineering 0.101852 7 640 ... 170 388 85 0.117241 75000 55000 90000 350 257 50
2 3 2415 Metallurgical Engineering 856.0 725.0 131.0 Engineering 0.153037 3 648 ... 133 340 16 0.024096 73000 50000 105000 456 176 0
3 4 2417 Naval Architecture And Marine Engineering 1258.0 1123.0 135.0 Engineering 0.107313 16 758 ... 150 692 40 0.050125 70000 43000 80000 529 102 0
4 5 2405 Chemical Engineering 32260.0 21239.0 11021.0 Engineering 0.341631 289 25694 ... 5180 16697 1672 0.061098 65000 50000 75000 18314 4440 972

5 rows × 21 columns

In [15]:
sns.distplot(recent_grads['Median'], kde = False)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba5e3d7a90>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [12]:
recent_grads['Median'].hist(bins = 50)
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba5bfc0890>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [16]:
sns.boxplot(recent_grads['Median'])
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba5e8c6290>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [19]:
sns.boxplot(x='Median', y='Major_category', data = recent_grads)
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba5fccf490>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [20]:
sns.barplot(x='Median', y='Major_category', data = recent_grads)
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba600a5090>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">

What are the highest earning majors?

In [33]:
df = recent_grads.pivot_table('Median', ['Major', 'Major_category', 'P25th', 'P75th'], aggfunc='median').reset_index()
df.sort_values('Median', ascending = False)
Out[33]:
Major Major_category P25th P75th Median
140 Petroleum Engineering Engineering 95000 125000 110000
115 Mining And Mineral Engineering Engineering 55000 90000 75000
112 Metallurgical Engineering Engineering 50000 105000 73000
131 Naval Architecture And Marine Engineering Engineering 43000 80000 70000
23 Chemical Engineering Engineering 50000 75000 65000
... ... ... ... ... ...
172 Zoology Biology & Life Science 20000 39000 26000
26 Clinical Psychology Psychology & Social Work 25000 40000 25000
51 Educational Psychology Psychology & Social Work 24000 34000 25000
42 Counseling Psychology Psychology & Social Work 19200 26000 23400
98 Library Science Education 20000 22000 22000

173 rows × 5 columns

In [47]:
df2 = recent_grads.nlargest(20,'Median')
df2 = df2.sort_values('Median', ascending = True).reset_index(drop=True)
sns.scatterplot(x='Median', y='Major', data = df2, hue = 'Major_category')
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba60653950>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [49]:
# Unable to figure out how to do 'Tie Fighter plot, with error bars for the 25th and 75th percentile)
df2 = recent_grads.nlargest(20,'Median')
df2 = df2.sort_values('Median', ascending = True).reset_index(drop=True)
sns.scatterplot(x='Median', y='Major', data = df2, hue = 'Major_category', xerr= df2['P25th'])
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-49-5a89b0f66161> in <module>
      1 df2 = recent_grads.nlargest(20,'Median')
      2 df2 = df2.sort_values('Median', ascending = True).reset_index(drop=True)
----> 3 sns.scatterplot(x='Median', y='Major', data = df2, hue = 'Major_category', yerr= df2['P25th'])

/opt/anaconda3/lib/python3.7/site-packages/seaborn/relational.py in scatterplot(x, y, hue, style, size, data, palette, hue_order, hue_norm, sizes, size_order, size_norm, markers, style_order, x_bins, y_bins, units, estimator, ci, n_boot, alpha, x_jitter, y_jitter, legend, ax, **kwargs)
   1406         ax = plt.gca()
   1407 
-> 1408     p.plot(ax, kwargs)
   1409 
   1410     return ax

/opt/anaconda3/lib/python3.7/site-packages/seaborn/relational.py in plot(self, ax, kws)
    916         # function will advance the axes property cycle.
    917 
--> 918         scout = ax.scatter([], [], **kws)
    919         s = kws.pop("s", scout.get_sizes())
    920         c = kws.pop("c", scout.get_facecolors())

/opt/anaconda3/lib/python3.7/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
   1597     def inner(ax, *args, data=None, **kwargs):
   1598         if data is None:
-> 1599             return func(ax, *map(sanitize_sequence, args), **kwargs)
   1600 
   1601         bound = new_sig.bind(ax, *args, **kwargs)

/opt/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
   4498                 )
   4499         collection.set_transform(mtransforms.IdentityTransform())
-> 4500         collection.update(kwargs)
   4501 
   4502         if colors is None:

/opt/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in update(self, props)
    972 
    973         with cbook._setattr_cm(self, eventson=False):
--> 974             ret = [_update_property(self, k, v) for k, v in props.items()]
    975 
    976         if len(ret):

/opt/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in <listcomp>(.0)
    972 
    973         with cbook._setattr_cm(self, eventson=False):
--> 974             ret = [_update_property(self, k, v) for k, v in props.items()]
    975 
    976         if len(ret):

/opt/anaconda3/lib/python3.7/site-packages/matplotlib/artist.py in _update_property(self, k, v)
    968                 if not callable(func):
    969                     raise AttributeError('{!r} object has no property {!r}'
--> 970                                          .format(type(self).__name__, k))
    971                 return func(v)
    972 

AttributeError: 'PathCollection' object has no property 'yerr'
In [56]:
sns.scatterplot(x='Sample_size', y='Median', data = recent_grads, x_jitter = 0.25).set_xscale("log")
# Note that the next step is to text label the points - 
# It is possible [here](https://stackoverflow.com/questions/46027653/adding-labels-in-x-y-scatter-plot-with-seaborn) but not trivial
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [72]:
df3 = recent_grads.groupby('Major_category').agg({'Total': 'sum'})
df3 = df3.reset_index()
sns.barplot(x='Total', y='Major_category', data = df3, order=df3.sort_values('Total', ascending=False).Major_category)
Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba642ea950>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [119]:
df3 = recent_grads.groupby('Major').agg({'Total': 'sum'})
df4 = df3.nlargest(20,'Total')
df4 = df4.reset_index()
sns.barplot(x='Total', y='Major', data = df4, order=df4.sort_values('Total', ascending=False).Major)
Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba67896ed0>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [143]:
dfa=recent_grads.sort_values('Total').nlargest(20, 'Total')
Out[143]:
Rank Major_code Major Total Men Women Major_category ShareWomen Sample_size Employed ... Part_time Full_time_year_round Unemployed Unemployment_rate Median P25th P75th College_jobs Non_college_jobs Low_wage_jobs
145 146 5200 Psychology 393735.0 86648.0 307087.0 Psychology & Social Work 0.779933 2584 307933 ... 115172 174438 28169 0.083811 31500 24000 41000 125148 141860 48207
76 77 6203 Business Management And Administration 329927.0 173809.0 156118.0 Business 0.473190 4212 276234 ... 50357 199897 21502 0.072218 38000 29000 50000 36720 148395 32395
123 124 3600 Biology 280709.0 111762.0 168947.0 Biology & Life Science 0.601858 1370 182295 ... 72371 100336 13874 0.070725 33400 24000 45000 88232 81109 28339
57 58 6200 General Business 234590.0 132238.0 102352.0 Business 0.436302 2380 190183 ... 36241 138299 14946 0.072861 40000 30000 55000 29334 100831 27320
93 94 1901 Communications 213996.0 70619.0 143377.0 Communications & Journalism 0.669999 2394 179633 ... 49889 116251 14602 0.075177 35000 27000 45000 40763 97964 27440
34 35 6107 Nursing 209394.0 21773.0 187621.0 Health 0.896019 2554 180903 ... 40818 122817 8497 0.044863 48000 39000 58000 151643 26146 6193
77 78 6206 Marketing And Marketing Research 205211.0 78857.0 126354.0 Business 0.615727 2684 178862 ... 35829 127230 11663 0.061215 38000 30000 50000 25320 93889 27968
40 41 6201 Accounting 198633.0 94519.0 104114.0 Business 0.524153 2042 165527 ... 27693 123169 12411 0.069749 45000 34000 56000 11417 39323 10886
137 138 3301 English Language And Literature 194673.0 58227.0 136446.0 Humanities & Liberal Arts 0.700898 1436 149180 ... 57825 81180 14345 0.087724 32000 23000 41000 57690 71827 26503
78 79 5506 Political Science And Government 182621.0 93880.0 88741.0 Social Science 0.485930 1387 133454 ... 43711 83236 15022 0.101175 38000 28000 50000 36854 66947 19803
35 36 6207 Finance 174506.0 115030.0 59476.0 Business 0.340825 2189 145696 ... 21463 108595 9413 0.060686 47000 35000 64000 24243 48447 9910
138 139 2304 Elementary Education 170862.0 13029.0 157833.0 Education 0.923745 1629 149339 ... 37965 86540 7297 0.046586 32000 23400 38000 108085 36972 11502
94 95 5301 Criminal Justice And Fire Protection 152824.0 80231.0 72593.0 Law & Public Policy 0.475010 1728 125393 ... 32242 88548 11268 0.082452 35000 26000 45000 24348 88858 18404
113 114 2300 General Education 143718.0 26893.0 116825.0 Education 0.812877 919 118241 ... 29558 73531 7195 0.057360 34000 26000 41000 82007 31112 11443
114 115 6402 History 141951.0 78253.0 63698.0 Humanities & Liberal Arts 0.448732 1058 105646 ... 40657 59218 11176 0.095667 34000 25000 47000 35336 54569 16839
36 37 5501 Economics 139247.0 89749.0 49498.0 Social Science 0.355469 1322 104117 ... 25325 70740 11452 0.099092 47000 35000 65000 25582 37057 10653
20 21 2102 Computer Science 128319.0 99743.0 28576.0 Computers & Mathematics 0.222695 1196 102087 ... 18726 70932 6884 0.063173 53000 39000 70000 68622 25667 5144
139 140 4101 Physical Fitness Parks Recreation And Leisure 125074.0 62181.0 62893.0 Industrial Arts & Consumer Services 0.502846 1014 103078 ... 38515 57978 5593 0.051467 32000 24000 43000 27581 63946 16838
124 125 5507 Sociology 115433.0 32510.0 82923.0 Social Science 0.718365 1024 92721 ... 29639 56561 8608 0.084951 33000 25000 44000 29051 48899 13748
95 96 6004 Commercial Art And Graphic Design 103480.0 32041.0 71439.0 Arts 0.690365 1186 83483 ... 24387 52243 8947 0.096798 35000 25000 45000 37389 38119 14839

20 rows × 21 columns

How does gender breakdown relative to typical earnings?

In [146]:
dfa= dfa[['Major','Men', 'Women']]
dfa1= pd.melt(dfa, id_vars = ['Major'], var_name = 'Gender', value_name = 'Number')
In [147]:
sns.barplot(x='Number', y='Major' , data=dfa1, hue = 'Gender')
Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba6510d7d0>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [158]:
dfa1[['Men','Women']].plot(kind='barh', stacked=True)
Out[158]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba68ace810>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [165]:
dfa1['mandf']= dfa1["Men"] + dfa1['Women']
dfa1.sort_values('mandf', ascending=True)[['Men','Women']].plot(kind='barh', stacked=True)
Out[165]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba690c59d0>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [177]:
dfg = recent_grads.groupby('Major_category').agg({'Total': 'sum','Men':'sum', 'Women':'sum'})
dfg['ShareWomen'] = dfg['Women']/dfg['Total']
dfg.dropna(subset=['ShareWomen'])
dfg.sort_values('ShareWomen', ascending=False)
Out[177]:
Total Men Women ShareWomen
Major_category
Health 463230.0 75517.0 387713.0 0.836977
Education 559129.0 103526.0 455603.0 0.814844
Psychology & Social Work 481007.0 98115.0 382892.0 0.796022
Interdisciplinary 12296.0 2817.0 9479.0 0.770901
Communications & Journalism 392601.0 131921.0 260680.0 0.663982
Arts 357130.0 134390.0 222740.0 0.623694
Humanities & Liberal Arts 713468.0 272846.0 440622.0 0.617578
Biology & Life Science 453862.0 184919.0 268943.0 0.592566
Industrial Arts & Consumer Services 229792.0 103781.0 126011.0 0.548370
Social Science 529966.0 256834.0 273132.0 0.515376
Law & Public Policy 179107.0 91129.0 87978.0 0.491204
Business 1302376.0 667852.0 634524.0 0.487205
Physical Sciences 185479.0 95390.0 90089.0 0.485710
Agriculture & Natural Resources 75620.0 40357.0 35263.0 0.466318
Computers & Mathematics 299008.0 208725.0 90283.0 0.301942
Engineering 537583.0 408307.0 129276.0 0.240476
In [200]:
recent_grads['MedianSalary'] = recent_grads['Median'].sum() * recent_grads['Sample_size'].sum() / recent_grads['Sample_size'].sum() 
dfg = recent_grads.groupby('Major_category').agg({'Total': 'sum','Men':'sum', 'Women':'sum'})
dfg['ShareWomen'] = dfg['Women']/dfg['Total']
dfg.dropna(subset=['ShareWomen'])
dfg.sort_values('ShareWomen', ascending=False)
Out[200]:
Total Men Women ShareWomen
Major_category
Health 463230.0 75517.0 387713.0 0.836977
Education 559129.0 103526.0 455603.0 0.814844
Psychology & Social Work 481007.0 98115.0 382892.0 0.796022
Interdisciplinary 12296.0 2817.0 9479.0 0.770901
Communications & Journalism 392601.0 131921.0 260680.0 0.663982
Arts 357130.0 134390.0 222740.0 0.623694
Humanities & Liberal Arts 713468.0 272846.0 440622.0 0.617578
Biology & Life Science 453862.0 184919.0 268943.0 0.592566
Industrial Arts & Consumer Services 229792.0 103781.0 126011.0 0.548370
Social Science 529966.0 256834.0 273132.0 0.515376
Law & Public Policy 179107.0 91129.0 87978.0 0.491204
Business 1302376.0 667852.0 634524.0 0.487205
Physical Sciences 185479.0 95390.0 90089.0 0.485710
Agriculture & Natural Resources 75620.0 40357.0 35263.0 0.466318
Computers & Mathematics 299008.0 208725.0 90283.0 0.301942
Engineering 537583.0 408307.0 129276.0 0.240476
In [209]:
recent_grads['MedianSalary'] = recent_grads['Median'].sum() * recent_grads['Sample_size'].sum() / recent_grads['Sample_size'].sum() 
dfg = recent_grads.groupby(['Major_category', 'MedianSalary'],as_index=False).agg({'Total': 'sum','Men':'sum', 'Women':'sum'})
dfg['ShareWomen'] = dfg['Women']/dfg['Total']
dfg.dropna(subset=['ShareWomen'])
by_major_category = dfg.sort_values('ShareWomen', ascending=False)
by_major_category 
Out[209]:
Major_category MedianSalary Total Men Women ShareWomen
8 Health 6946200.0 463230.0 75517.0 387713.0 0.836977
6 Education 6946200.0 559129.0 103526.0 455603.0 0.814844
14 Psychology & Social Work 6946200.0 481007.0 98115.0 382892.0 0.796022
11 Interdisciplinary 6946200.0 12296.0 2817.0 9479.0 0.770901
4 Communications & Journalism 6946200.0 392601.0 131921.0 260680.0 0.663982
1 Arts 6946200.0 357130.0 134390.0 222740.0 0.623694
9 Humanities & Liberal Arts 6946200.0 713468.0 272846.0 440622.0 0.617578
2 Biology & Life Science 6946200.0 453862.0 184919.0 268943.0 0.592566
10 Industrial Arts & Consumer Services 6946200.0 229792.0 103781.0 126011.0 0.548370
15 Social Science 6946200.0 529966.0 256834.0 273132.0 0.515376
12 Law & Public Policy 6946200.0 179107.0 91129.0 87978.0 0.491204
3 Business 6946200.0 1302376.0 667852.0 634524.0 0.487205
13 Physical Sciences 6946200.0 185479.0 95390.0 90089.0 0.485710
0 Agriculture & Natural Resources 6946200.0 75620.0 40357.0 35263.0 0.466318
5 Computers & Mathematics 6946200.0 299008.0 208725.0 90283.0 0.301942
7 Engineering 6946200.0 537583.0 408307.0 129276.0 0.240476
In [218]:
sns.scatterplot(x='ShareWomen', y = 'Median', data=recent_grads)
Out[218]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba4b2b6350>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [231]:
sns.lmplot(x="ShareWomen", y="Median", data=recent_grads)
Out[231]:
<seaborn.axisgrid.FacetGrid at 0x7fba69becc10>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
In [ ]: