def desc(df, nvalues=10):
for var in df.columns:
print ">>> %s" % var
print "(missing: %d)" % df[var].isnull().sum()
if df[var].unique().size <= nvalues:
print df[var].value_counts()
else:
print df[var].describe()
print ""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from pylab import *
import statsmodels.formula.api as smf
%matplotlib inline
ipeds = pd.read_csv('data/ipeds2012/CSV_512014-1015/CSV_512014-1015.csv')
ipeds2 = pd.read_csv('data/ipeds2012/CSV_512014-1002/CSV_512014-1002.csv')
#ipeds = pd.merge(ipeds, pd.read_csv('data/ipeds2012/CSV_512014-1002/CSV_512014-1002.csv'), on='unitid name')
rename_map = {'DRVIC2012.Percent admitted - total': 'admit_pct',
'DRVIC2012.Admissions yield - total': 'admit_yield',
'DRVIC2012.Tuition and fees, 2012-13': 'tuition_fees',
'HD2012.Institution size category': 'size_cat',
'HD2012.Geographic region': 'region',
'HD2012.Sector of institution': 'inst_sector',
'HD2012.Level of institution': 'inst_lvl',
'HD2012.Control of institution': 'inst_ctrl',
#'HD2012.Degree-granting status': 'degree_grt_status',
'HD2012.Historically Black College or University': 'hbcu',
'HD2012.Degree of urbanization (Urban-centric locale)': 'urban',
'HD2012.Institutional category': 'inst_cat',
'DRVEF2012.Undergraduate enrollment': 'ug_enrol',
'DRVEF2012.Graduate enrollment': 'grad_enrol',
'DRVEF2012.Full-time undergraduate enrollment': 'ft_ug_enrol',
'DRVEF2012.Part-time undergraduate enrollment': 'pt_ug_enrol',
'DRVEF2012.Percent of total enrollment that are Black or African American': 'pct_black',
'DRVEF2012.Percent of total enrollment that are Hispanic/Latino': 'pct_latino',
'DRVEF2012.Percent of total enrollment that are White': 'pct_white',
'DRVEF2012.Percent of total enrollment that are Race/ethnicity unknown': 'pct_unknown_race',
'DRVEF2012.Percent of total enrollment that are women': 'pct_female',
'EF2012D.Student-to-faculty ratio': 'stud_fac_ratio',
'EF2012D.Full-time retention rate, 2012': 'ft_reten_rate',
'EF2012D.Part-time retention rate, 2012': 'pt_reten_rate',
'DRVEF2012.Percent of undergraduate enrollment 18-24': 'pct_ug_18_24',
'DRVEF2012.Percent of undergraduate enrollment, 25-64': 'pct_ug_25_64',
'DRVEF2012.Percent of undergraduate enrollment over 65': 'pct_ug_65p',
'DRVEFDE2012.Percent of students enrolled exclusively in distance education courses': 'pct_all_dist_ed',
'DRVEFDE2012.Percent of students enrolled in some but not all distance education courses': 'pct_any_dist_ed',
'DRVEFDE2012.Percent of students not enrolled in any distance education courses': 'pct_no_dist_ed',
'DRVGR2012.Graduation rate, total cohort': 'grad_rate',
'DRVGR2012.Transfer-out rate, total cohort': 'transfer_rate',
'GR200_12.Graduation rate - degree/certificate within 100% of normal time': 'grad_ontime_rate',
'SFA1112.Percent of full-time first-time undergraduates receiving any financial aid': 'pct_fin_aid',
'SFA1112.Average amount of federal, state, local or institutional grant aid received': 'mean_fin_aid',
'DRVF2012.Core revenues, total dollars (GASB)': 'revenues_public',
'DRVF2012.Tuition and fees as a percent of core revenues (GASB)': 'tuition_pct_rev_public',
'DRVF2012.Core revenues, total dollars (FASB)': 'revenues_private',
'DRVF2012.Tuition and fees as a percent of core revenues (FASB)': 'tuition_pct_rev_private',
'DRVF2012.Revenues from tuition and fees per FTE (GASB)': 'tuit_rev_per_fte_public',
'DRVF2012.Revenues from tuition and fees per FTE (FASB)': 'tuit_rev_per_fte_private',
'DRVF2012.Core expenses, total dollars (GASB)': 'expen_public',
'DRVF2012.Instruction expenses as a percent of total core expenses (GASB)': 'expen_pct_instr_public',
'DRVF2012.Academic support expenses as a percent of total core expenses (GASB)': 'expen_pct_acadsup_public',
'DRVF2012.Core expenses, total dollars (FASB)': 'expen_private',
'DRVF2012.Instruction expenses as a percent of total core expenses (FASB)': 'expen_pct_instr_private',
'DRVF2012.Academic support expenses as a percent of total core expenses (FASB)': 'expen_pct_acadsup_private',
'DRVF2012.Instruction expenses per FTE (GASB)': 'instr_expen_per_fte_public',
'DRVF2012.Instruction expenses per FTE (FASB)': 'instr_expen_per_fte_private',
'DRVF2012.Salaries, wages, and benefit expenses for core expenses as a percent of total core expenses (GASB)':
'expen_pct_labor_public',
'DRVF2012.Salaries, wages, and benefit expenses for instruction as a percent of total expenses for instruction (GASB)':
'expen_pct_instlabor_public',
'DRVF2012.Salaries, wages, and benefit expenses for core expenses as a percent of total core expenses (FASB)':
'expen_pct_labor_private',
'DRVF2012.Salaries, wages, and benefit expenses for instruction as a percent of total expenses for instruction (FASB)':
'expen_pct_instlabor_private',
'DRVF2012.Endowment assets (year end) per FTE enrollment (GASB)': 'endow_per_fte_public',
'DRVF2012.Endowment assets (year end) per FTE enrollment (FASB)': 'endow_per_fte_private',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - all ranks': 'mean_ft_sal_all',
'DRVHR2012.Average salary equated to 9 months of full-time insructional staff - professors': 'mean_ft_sal_prof',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - associate professors': 'mean_ft_sal_assoc',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - assistant professors': 'mean_ft_sal_assis',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - instructors': 'mean_ft_sal_instr',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - lecturers': 'mean_ft_sal_lect',
'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - No academic rank': 'mean_ft_sal_norank',
'DRVHR2012.Total FTE staff': 'total_staff_fte',
'DRVHR2012.Postsecondary Teachers FTE staff': 'teachers_fte',
'DRVHR2012.Postsecondary Teachers Instructional FTE': 'teach_instr_fte',
'DRVHR2012.Management FTE': 'manag_fte'}
ipeds.rename(columns=rename_map, inplace=True)
ipeds = ipeds.set_index(['unitid', 'institution name'])[rename_map.values()] # keep only the renamed vars
ipeds.info()
ipeds2 = ipeds2[ipeds2['S2012_OC.Occupation and full- and part-time status'] ==
'Part-time, Instructional staff']
ipeds2['pt_teachers'] = ipeds2['S2012_OC.Grand total']
ipeds2['pt_teachers_pct_female'] = ipeds2['S2012_OC.Grand total women'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_white'] = ipeds2['S2012_OC.White total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_black'] = ipeds2['S2012_OC.Black or African American total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_latino'] = ipeds2['S2012_OC.Hispanic or Latino total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_other'] = (ipeds2['pt_teachers']
- ipeds2['S2012_OC.White total']
- ipeds2['S2012_OC.Black or African American total']
- ipeds2['S2012_OC.Hispanic or Latino total']) / ipeds2['pt_teachers']
ipeds2 = ipeds2.set_index(['unitid','institution name']).filter(like='pt_teachers')
ipeds2.info()
ipeds = pd.merge(ipeds, ipeds2, how='left', left_index=True, right_index=True)
ipeds.info()
df = pd.merge(pd.read_csv('data/adjunct.csv'), ipeds.reset_index(), how='left', left_on='college_unit_id', right_on='unitid')
df.info()
df = df.drop(['has_contract', 'college_hbcu', 'college_flagship'], axis=1) # completely empty
df = df.drop(df.filter(like='paytype').columns, axis=1) # no longer kept in new version, very few values
df = df.drop(['college_class', 'college_carnegie'], axis=1) # the fields from IPEDS are higher quality
df = df.drop(['institution name', 'user_supplied_department_name', 'display_name'], axis=1)
df.info()
df = df.sort(axis=1) # sort by column name
df.info()
First, has_union
is incorrectly specified. It is defined as union_name != NULL
, but in fact often union_name == 'None'
, yet has_union == 1
. Since
print df.has_union.value_counts()
pd.crosstab(df.union_name, df.has_union)
df.loc[df.union_name=='None', 'has_union'] = 0
df.has_union.value_counts(normalize=True)
Now rename other has_*
variables and create new verions that are true dummies. Note that, as described in the data dictionary, a response of 4
means "I don't know", which I code as missing (NaN
).
df.rename(columns={
'has_health_insurance':'health_insurance_type',
'has_retirement':'retirement_type',
'has_governance':'governance_type'}, inplace=True)
# When health ins. is non-missing (i.e. = 0,1,2, or 3), replace with True when = 1,2, or 3
df.loc[df.health_insurance_type.isin([0,1,2,3]), 'has_health_ins'] = df.health_insurance_type.isin([1,2,3])
df.loc[df.retirement_type.isin([0,1,2,3]), 'has_retirement'] = df.retirement_type.isin([1,2,3])
df.loc[df.governance_type.isin([0,1]), 'has_governance'] = df.governance_type.isin([1])
Create a new variables with meaningful labels for the has_advanced_degree
variable.
df.rename(columns={'has_advanced_degree':'advanced_degree_type'}, inplace=True)
degree_map = {0: 'None', 1: 'MA/MS', 2: 'PhD', 3: 'JD', 4: 'MD'}
df.loc[df.advanced_degree_type.notnull(), 'adv_degree'] = df.advanced_degree_type.replace(degree_map)
df.loc[df.advanced_degree_type.notnull(), 'has_adv_degree'] \
= df.advanced_degree_type.isin([1,2,3,4])
Create meaninful divisions from the department_type_id
, according to coding listed in the data dictionary.
# Coding is based on first number of department_id
divisions = {
'1': 'Gen/LibA', # General/Liberal Arts
'2': 'Arts/Hum', # Arts/Humanities
'3': 'SocSci', # Social Sciences
'4': 'Sci/Tech', # Science/Technology
'6': 'Prof/ApS', # Professions/Applied Sciences
'8': np.nan, # Not specified
'9': 'Other'
}
df['division'] = df.department_type_id.apply(lambda x: str(x)[0:1]).replace(divisions)
Create two variables describing the number of part-time teachers as a ratio of total staff FTEs and instructor teaching staff FTEs. Note that pt_teachers
is a count of total teachers, not of FTEs, and thus it is possible, for example, for pt_teachers / teach_instr_fte
to be greater than 1 (e.g. if there were 100 teachers, all part time, each of whome teaches only a 1/4 load, then the value of adjunct_teach_rate
would be 4: 100 teachers / 25 instructor FTEs.)
df['adjunct_staff_rate'] = df.pt_teachers / df.total_staff_fte
df['adjunct_teach_rate'] = df.pt_teachers / df.teach_instr_fte
df.loc[df.tuit_rev_per_fte_private.notnull(), 'tuit_rev_per_fte'] = df.tuit_rev_per_fte_private
df.loc[df.tuit_rev_per_fte_public.notnull(), 'tuit_rev_per_fte'] = df.tuit_rev_per_fte_public
df.loc[df.endow_per_fte_private.notnull(), 'endow_per_fte'] = df.endow_per_fte_private
df.loc[df.endow_per_fte_public.notnull(), 'endow_per_fte'] = df.endow_per_fte_public
df.loc[df.instr_expen_per_fte_private.notnull(), 'instr_expen_per_fte'] = df.instr_expen_per_fte_private
df.loc[df.instr_expen_per_fte_public.notnull(), 'instr_expen_per_fte'] = df.instr_expen_per_fte_public
Currently the variable urban
has 12 categories, which is a lot. Let's create a dummy variable instead.
def urb_map(x):
if str(x).find("City") != -1:
return "Urban/Suburban"
elif str(x).find("Suburb") != -1:
return "Urban/Suburban"
elif str(x).find("Rural") != -1:
return "Rural/Town"
elif str(x).find("Town") != -1:
return "Rural/Town"
else:
return np.nan
df['urb'] = df.urban.apply(urb_map)
urb_map = {
'City: Large':'Urban/Suburban',
'City: Small':'Urban/Suburban',
'City: Midsize':'Urban/Suburban',
'Suburb: Large':'Urban/Suburban',
'Suburb: Midsize':'Urban/Suburban',
'Suburb: Small':'Urban/Suburban',
'Town: Fringe':'Town/Rural',
'Town: Distant':'Town/Rural',
'Town: Remote':'Town/Rural',
'Rural: Fringe':'Town/Rural',
'Rural: Distant':'Town/Rural',
'Rural: Remote':'Town/Rural'
}
df['urban_rural'] = df.urban.replace(urb_map)
We restrict data to certain variables and then remove some data that has too few observations.
df.columns.values
data = df[['college_name', 'unitid',
'college_state', 'region',
'admit_pct', 'admit_yield',
'ft_reten_rate', 'pt_reten_rate',
'ft_ug_enrol', 'pt_ug_enrol', 'ug_enrol',
'grad_enrol', 'grad_rate',
'hbcu', 'inst_ctrl',
'inst_lvl', 'inst_sector',
'is_school_rep',
'mean_fin_aid',
'mean_ft_sal_all', 'mean_ft_sal_assis', 'mean_ft_sal_assoc', 'mean_ft_sal_instr',
'mean_ft_sal_lect', 'mean_ft_sal_norank', 'mean_ft_sal_prof',
'office_space',
'payaverage',
'pct_all_dist_ed', 'pct_any_dist_ed', 'pct_no_dist_ed',
'pct_black', 'pct_latino', 'pct_unknown_race', 'pct_white',
'pct_female',
'pct_fin_aid',
'pct_ug_18_24', 'pct_ug_25_64', 'pct_ug_65p',
'pt_teachers', 'pt_teachers_pct_female',
'pt_teachers_pct_black', 'pt_teachers_pct_latino', 'pt_teachers_pct_other',
'pt_teachers_pct_white',
'size_cat', 'stud_fac_ratio',
'student_evals',
'teach_instr_fte', 'teachers_fte',
'total_staff_fte',
'transfer_rate',
'tuition_fees',
'has_health_ins', 'has_retirement', 'has_governance', 'has_union',
'adv_degree', 'has_adv_degree', 'division',
'tuit_rev_per_fte', 'endow_per_fte', 'instr_expen_per_fte',
'urban_rural']]
dlen = len(data)
data = data[(data.adv_degree.isin(['MA/MS','PhD','None']) | (data.adv_degree.isnull()))]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[(data.size_cat != 'Not applicable') & (data.size_cat.notnull())]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[~(data.inst_sector.isin(['Administrative Unit',
'Private not-for-profit, 2-year',
'Private for-profit, 2-year']))]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[data.division != "Other"]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[data.ug_enrol > 0]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
desc(data)
Start with a basic OLS regression of pay on full-time salary of professors.
res = smf.ols("log(payaverage) ~ log(mean_ft_sal_prof)", data=data).fit()
res.summary()
So professor salary explains almost 36% of the variation in average pay.
res = smf.ols("log(payaverage) ~ "
"division + inst_ctrl + inst_lvl + region + urban_rural"
"+ log(ug_enrol) + log(mean_fin_aid) + log(mean_ft_sal_prof)"
"+ log(endow_per_fte)"
"+ has_union + has_health_ins + has_governance + has_retirement", data=data).fit()
res.summary()
df.region.value_counts()
df.filter(like='pt_').info()
There are several ways to classify institution type.
foo= df.groupby('urban').payaverage.describe()
bar = pd.DataFrame(foo).unstack().sort()
bar
bar
stats.ttest_ind(df.loc[df.urban=='Town: Remote', 'payaverage'], df.loc[df.urban=='Suburb: Small', 'payaverage'])
pd.pivot_table(df, values='payaverage', cols='urban', aggfunc=[np.mean, np.percentile(50)])
pd.crosstab(df.inst_cat, df.college_class, dropna=False)
df.T.loc[:,:10]
def desc(df, nvalues=10):
for var in df.columns:
print ">>> %s" % var
print "(missing: %d)" % df[var].isnull().sum()
if df[var].unique().size <= nvalues: # and df[var].dtype == np.object:
print df[var].value_counts()
else:# if df[var].dtype != np.object:
print df[var].describe()
#else:
# print "%s unique string values" % df[var].unique().size
print ""
df['institution name'].describe()
desc(df, nvalues=30)
df.info()
res = smf.ols("log(payaverage) ~ C(adv_degree, Treatment(reference='None'))" \
"+ C(division, Treatment(reference='Gen/LibA'))" \
"+ has_union + has_health_ins + has_retirement + has_governance" \
"+ inst_ctrl + inst_lvl + region" \
"+ admit_pct + log(tuit_rev_per_fte)" \
" + log(mean_ft_sal_prof)", data=df).fit()
res.summary()
df.boxplot('payaverage', by='urban', figsize=(10,10), rot=45)
def f(x):
if x.find("Urban") != -1:
return "Urban"
elif x.find("Suburban") != -1:
return "Suburban"
elif x.find("Rural") != -1:
return "Rural"
elif x.find("Town") != -1:
return "Town"
else:
return np.nan
if "Urban".find("Urb") != -1:
print "True"
def f(x):
if str(x).find("City") != -1:
return "Urban/Suburban"
elif str(x).find("Suburb") != -1:
return "Urban/Suburban"
elif str(x).find("Rural") != -1:
return "Rural/Town"
elif str(x).find("Town") != -1:
return "Rural/Town"
else:
return np.nan
df['urb'] = df.urban.apply(f)
df['urb'].value_counts()
len(df[df.payaverage < 2500])/float(len(df))
df[df.adv_degree.notnull()].adv_degree.value_counts(normalize=True)
df[df.has_health_ins.notnull()].has_health_ins.value_counts(normalize=True)