def desc(df, nvalues=10):
    for var in df.columns:
        print ">>> %s" % var
        print "(missing: %d)" % df[var].isnull().sum()    
        if df[var].unique().size <= nvalues:
            print df[var].value_counts()
        else:
            print df[var].describe()
        print ""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from pylab import *
import statsmodels.formula.api as smf
%matplotlib inline

Clean IPEDS Data

ipeds = pd.read_csv('data/ipeds2012/CSV_512014-1015/CSV_512014-1015.csv')
ipeds2 = pd.read_csv('data/ipeds2012/CSV_512014-1002/CSV_512014-1002.csv')
#ipeds = pd.merge(ipeds, pd.read_csv('data/ipeds2012/CSV_512014-1002/CSV_512014-1002.csv'), on='unitid name')

rename_map = {'DRVIC2012.Percent admitted - total': 'admit_pct',
              'DRVIC2012.Admissions yield - total': 'admit_yield',
              'DRVIC2012.Tuition and fees, 2012-13': 'tuition_fees',
              'HD2012.Institution size category': 'size_cat',
              'HD2012.Geographic region': 'region',
              'HD2012.Sector of institution': 'inst_sector',
              'HD2012.Level of institution': 'inst_lvl',
              'HD2012.Control of institution': 'inst_ctrl',
              #'HD2012.Degree-granting status': 'degree_grt_status',
              'HD2012.Historically Black College or University': 'hbcu',
              'HD2012.Degree of urbanization (Urban-centric locale)': 'urban',
              'HD2012.Institutional category': 'inst_cat',
              'DRVEF2012.Undergraduate enrollment': 'ug_enrol',
              'DRVEF2012.Graduate enrollment': 'grad_enrol',
              'DRVEF2012.Full-time undergraduate enrollment': 'ft_ug_enrol',
              'DRVEF2012.Part-time undergraduate enrollment': 'pt_ug_enrol',
              'DRVEF2012.Percent of total enrollment that are Black or African American': 'pct_black',
              'DRVEF2012.Percent of total enrollment that are Hispanic/Latino': 'pct_latino',
              'DRVEF2012.Percent of total enrollment that are White': 'pct_white',
              'DRVEF2012.Percent of total enrollment that are Race/ethnicity unknown': 'pct_unknown_race',
              'DRVEF2012.Percent of total enrollment that are women': 'pct_female',
              'EF2012D.Student-to-faculty ratio': 'stud_fac_ratio',
              'EF2012D.Full-time retention rate, 2012': 'ft_reten_rate',
              'EF2012D.Part-time retention rate, 2012': 'pt_reten_rate',
              'DRVEF2012.Percent of undergraduate enrollment 18-24': 'pct_ug_18_24',
              'DRVEF2012.Percent of undergraduate enrollment, 25-64': 'pct_ug_25_64',
              'DRVEF2012.Percent of undergraduate enrollment over 65': 'pct_ug_65p',
              'DRVEFDE2012.Percent of students enrolled exclusively in distance education courses': 'pct_all_dist_ed',
              'DRVEFDE2012.Percent of students enrolled in some but not all distance education courses': 'pct_any_dist_ed',
              'DRVEFDE2012.Percent of students not enrolled in any distance education courses': 'pct_no_dist_ed',
              'DRVGR2012.Graduation rate, total cohort': 'grad_rate',
              'DRVGR2012.Transfer-out rate, total cohort': 'transfer_rate',
              'GR200_12.Graduation rate - degree/certificate within 100% of normal time': 'grad_ontime_rate',
              'SFA1112.Percent of full-time first-time undergraduates receiving any financial aid': 'pct_fin_aid',
              'SFA1112.Average amount of federal, state, local or institutional grant aid received': 'mean_fin_aid',
              'DRVF2012.Core revenues, total dollars (GASB)': 'revenues_public',
              'DRVF2012.Tuition and fees as a percent of core revenues (GASB)': 'tuition_pct_rev_public',
              'DRVF2012.Core revenues, total dollars (FASB)': 'revenues_private',
              'DRVF2012.Tuition and fees as a percent of core revenues (FASB)': 'tuition_pct_rev_private',
              'DRVF2012.Revenues from tuition and fees per FTE (GASB)': 'tuit_rev_per_fte_public',
              'DRVF2012.Revenues from tuition and fees per FTE (FASB)': 'tuit_rev_per_fte_private',
              'DRVF2012.Core expenses, total dollars (GASB)': 'expen_public',
              'DRVF2012.Instruction expenses as a percent of total core expenses (GASB)': 'expen_pct_instr_public',
              'DRVF2012.Academic support expenses as a percent of total core expenses (GASB)': 'expen_pct_acadsup_public',
              'DRVF2012.Core expenses, total dollars (FASB)': 'expen_private',
              'DRVF2012.Instruction expenses as a percent of total core expenses (FASB)': 'expen_pct_instr_private',
              'DRVF2012.Academic support expenses as a percent of total core expenses (FASB)': 'expen_pct_acadsup_private',
              'DRVF2012.Instruction expenses per FTE  (GASB)': 'instr_expen_per_fte_public',
              'DRVF2012.Instruction expenses per FTE  (FASB)': 'instr_expen_per_fte_private',
              'DRVF2012.Salaries, wages, and benefit expenses for core expenses as a percent of total core expenses (GASB)':
                  'expen_pct_labor_public',
              'DRVF2012.Salaries, wages, and benefit expenses for instruction as a percent of total expenses for instruction (GASB)':
                  'expen_pct_instlabor_public',
              'DRVF2012.Salaries, wages, and benefit expenses for core expenses as a percent of total core expenses (FASB)':
                  'expen_pct_labor_private',
              'DRVF2012.Salaries, wages, and benefit expenses for instruction as a percent of total expenses for instruction (FASB)':
                  'expen_pct_instlabor_private',
              'DRVF2012.Endowment assets (year end) per FTE enrollment (GASB)': 'endow_per_fte_public',
              'DRVF2012.Endowment assets (year end) per FTE enrollment (FASB)': 'endow_per_fte_private',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - all ranks': 'mean_ft_sal_all',
              'DRVHR2012.Average salary equated to 9 months of full-time insructional staff - professors': 'mean_ft_sal_prof',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - associate professors': 'mean_ft_sal_assoc',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - assistant professors': 'mean_ft_sal_assis',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - instructors': 'mean_ft_sal_instr',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - lecturers': 'mean_ft_sal_lect',
              'DRVHR2012.Average salary equated to 9 months of full-time instructional staff - No academic rank': 'mean_ft_sal_norank',
              'DRVHR2012.Total FTE staff': 'total_staff_fte',
              'DRVHR2012.Postsecondary Teachers FTE staff': 'teachers_fte',
              'DRVHR2012.Postsecondary Teachers Instructional FTE': 'teach_instr_fte',
              'DRVHR2012.Management FTE': 'manag_fte'}
ipeds.rename(columns=rename_map, inplace=True)
ipeds = ipeds.set_index(['unitid', 'institution name'])[rename_map.values()] # keep only the renamed vars
ipeds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7330 entries, (100654, Alabama A & M University) to (480569, Florida Institute of Technology-Florida Tech Online)
Data columns (total 65 columns):
hbcu                           7330 non-null object
pct_unknown_race               7249 non-null float64
pt_reten_rate                  3673 non-null float64
expen_pct_labor_private        1866 non-null float64
transfer_rate                  2223 non-null float64
endow_per_fte_private          1409 non-null float64
pct_ug_18_24                   3179 non-null float64
expen_pct_instr_public         2003 non-null float64
mean_ft_sal_norank             1307 non-null float64
tuition_pct_rev_public         2003 non-null float64
tuition_fees                   4166 non-null float64
instr_expen_per_fte_public     1936 non-null float64
manag_fte                      7330 non-null int64
ft_reten_rate                  6160 non-null float64
pct_ug_25_64                   3179 non-null float64
tuit_rev_per_fte_private       1863 non-null float64
pct_latino                     7249 non-null float64
expen_pct_instlabor_public     1968 non-null float64
tuit_rev_per_fte_public        1936 non-null float64
total_staff_fte                7330 non-null int64
size_cat                       7330 non-null object
mean_ft_sal_assis              2361 non-null float64
grad_enrol                     7249 non-null float64
urban                          7330 non-null object
pt_ug_enrol                    7249 non-null float64
mean_fin_aid                   6624 non-null float64
teachers_fte                   7330 non-null int64
revenues_public                2004 non-null float64
ft_ug_enrol                    7249 non-null float64
pct_ug_65p                     3179 non-null float64
teach_instr_fte                7330 non-null int64
instr_expen_per_fte_private    1863 non-null float64
inst_sector                    7330 non-null object
expen_private                  1867 non-null float64
expen_pct_acadsup_public       2003 non-null float64
pct_female                     7249 non-null float64
endow_per_fte_public           1237 non-null float64
inst_lvl                       7330 non-null object
pct_white                      7249 non-null float64
grad_ontime_rate               3664 non-null float64
inst_cat                       7330 non-null object
mean_ft_sal_lect               734 non-null float64
admit_yield                    2257 non-null float64
ug_enrol                       7249 non-null float64
region                         7330 non-null object
expen_pct_acadsup_private      1866 non-null float64
mean_ft_sal_assoc              2383 non-null float64
admit_pct                      2260 non-null float64
pct_all_dist_ed                7249 non-null float64
mean_ft_sal_all                4425 non-null float64
expen_public                   2004 non-null float64
stud_fac_ratio                 6929 non-null float64
pct_no_dist_ed                 7249 non-null float64
pct_black                      7249 non-null float64
expen_pct_labor_public         2003 non-null float64
expen_pct_instr_private        1866 non-null float64
inst_ctrl                      7330 non-null object
revenues_private               1867 non-null float64
grad_rate                      6209 non-null float64
mean_ft_sal_prof               2459 non-null float64
tuition_pct_rev_private        1860 non-null float64
expen_pct_instlabor_private    1862 non-null float64
mean_ft_sal_instr              2966 non-null float64
pct_any_dist_ed                7249 non-null float64
pct_fin_aid                    6670 non-null float64
dtypes: float64(53), int64(4), object(8)

ipeds2 = ipeds2[ipeds2['S2012_OC.Occupation and full- and part-time status'] == 
                'Part-time, Instructional staff']

ipeds2['pt_teachers'] = ipeds2['S2012_OC.Grand total']
ipeds2['pt_teachers_pct_female'] = ipeds2['S2012_OC.Grand total women'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_white'] = ipeds2['S2012_OC.White total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_black'] = ipeds2['S2012_OC.Black or African American total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_latino'] = ipeds2['S2012_OC.Hispanic or Latino total'] / ipeds2['pt_teachers']
ipeds2['pt_teachers_pct_other'] = (ipeds2['pt_teachers'] 
                                   - ipeds2['S2012_OC.White total'] 
                                   - ipeds2['S2012_OC.Black or African American total'] 
                                   - ipeds2['S2012_OC.Hispanic or Latino total']) / ipeds2['pt_teachers']
ipeds2 = ipeds2.set_index(['unitid','institution name']).filter(like='pt_teachers')
ipeds2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3087 entries, (100706, University of Alabama in Huntsville) to (480569, Florida Institute of Technology-Florida Tech Online)
Data columns (total 6 columns):
pt_teachers               3087 non-null int64
pt_teachers_pct_female    3087 non-null float64
pt_teachers_pct_white     3087 non-null float64
pt_teachers_pct_black     3087 non-null float64
pt_teachers_pct_latino    3087 non-null float64
pt_teachers_pct_other     3087 non-null float64
dtypes: float64(5), int64(1)

ipeds = pd.merge(ipeds, ipeds2, how='left', left_index=True, right_index=True)
ipeds.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7330 entries, (100654, Alabama A & M University) to (480569, Florida Institute of Technology-Florida Tech Online)
Data columns (total 71 columns):
hbcu                           7330 non-null object
pct_unknown_race               7249 non-null float64
pt_reten_rate                  3673 non-null float64
expen_pct_labor_private        1866 non-null float64
transfer_rate                  2223 non-null float64
endow_per_fte_private          1409 non-null float64
pct_ug_18_24                   3179 non-null float64
expen_pct_instr_public         2003 non-null float64
mean_ft_sal_norank             1307 non-null float64
tuition_pct_rev_public         2003 non-null float64
tuition_fees                   4166 non-null float64
instr_expen_per_fte_public     1936 non-null float64
manag_fte                      7330 non-null int64
ft_reten_rate                  6160 non-null float64
pct_ug_25_64                   3179 non-null float64
tuit_rev_per_fte_private       1863 non-null float64
pct_latino                     7249 non-null float64
expen_pct_instlabor_public     1968 non-null float64
tuit_rev_per_fte_public        1936 non-null float64
total_staff_fte                7330 non-null int64
size_cat                       7330 non-null object
mean_ft_sal_assis              2361 non-null float64
grad_enrol                     7249 non-null float64
urban                          7330 non-null object
pt_ug_enrol                    7249 non-null float64
mean_fin_aid                   6624 non-null float64
teachers_fte                   7330 non-null int64
revenues_public                2004 non-null float64
ft_ug_enrol                    7249 non-null float64
pct_ug_65p                     3179 non-null float64
teach_instr_fte                7330 non-null int64
instr_expen_per_fte_private    1863 non-null float64
inst_sector                    7330 non-null object
expen_private                  1867 non-null float64
expen_pct_acadsup_public       2003 non-null float64
pct_female                     7249 non-null float64
endow_per_fte_public           1237 non-null float64
inst_lvl                       7330 non-null object
pct_white                      7249 non-null float64
grad_ontime_rate               3664 non-null float64
inst_cat                       7330 non-null object
mean_ft_sal_lect               734 non-null float64
admit_yield                    2257 non-null float64
ug_enrol                       7249 non-null float64
region                         7330 non-null object
expen_pct_acadsup_private      1866 non-null float64
mean_ft_sal_assoc              2383 non-null float64
admit_pct                      2260 non-null float64
pct_all_dist_ed                7249 non-null float64
mean_ft_sal_all                4425 non-null float64
expen_public                   2004 non-null float64
stud_fac_ratio                 6929 non-null float64
pct_no_dist_ed                 7249 non-null float64
pct_black                      7249 non-null float64
expen_pct_labor_public         2003 non-null float64
expen_pct_instr_private        1866 non-null float64
inst_ctrl                      7330 non-null object
revenues_private               1867 non-null float64
grad_rate                      6209 non-null float64
mean_ft_sal_prof               2459 non-null float64
tuition_pct_rev_private        1860 non-null float64
expen_pct_instlabor_private    1862 non-null float64
mean_ft_sal_instr              2966 non-null float64
pct_any_dist_ed                7249 non-null float64
pct_fin_aid                    6670 non-null float64
pt_teachers                    3087 non-null float64
pt_teachers_pct_female         3087 non-null float64
pt_teachers_pct_white          3087 non-null float64
pt_teachers_pct_black          3087 non-null float64
pt_teachers_pct_latino         3087 non-null float64
pt_teachers_pct_other          3087 non-null float64
dtypes: float64(59), int64(4), object(8)

df = pd.merge(pd.read_csv('data/adjunct.csv'), ipeds.reset_index(), how='left', left_on='college_unit_id', right_on='unitid')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6929 entries, 0 to 6928
Data columns (total 108 columns):
department_id                    int64
payaverage                       int64
paytype                          float64
user_supplied_department_name    object
course_type                      float64
contract_type                    object
period_term                      float64
period_year                      float64
has_contract                     float64
has_retirement                   float64
has_health_insurance             float64
has_governance                   float64
has_union                        float64
union_name                       object
is_school_rep                    int64
course_syllabus                  float64
office_space                     float64
student_evals                    float64
has_advanced_degree              float64
paytype_credits                  float64
paytype_class_size               float64
paytype_hours                    float64
paytype_weeks                    float64
paytype_courses                  float64
department_type_id               int64
display_name                     object
department_type_name             object
college_class                    object
college_carnegie                 object
college_unit_id                  int64
college_state                    object
college_id                       int64
college_hbcu                     float64
college_flagship                 float64
college_name                     object
unitid                           float64
institution name                 object
hbcu                             object
pct_unknown_race                 float64
pt_reten_rate                    float64
expen_pct_labor_private          float64
transfer_rate                    float64
endow_per_fte_private            float64
pct_ug_18_24                     float64
expen_pct_instr_public           float64
mean_ft_sal_norank               float64
tuition_pct_rev_public           float64
tuition_fees                     float64
instr_expen_per_fte_public       float64
manag_fte                        float64
ft_reten_rate                    float64
pct_ug_25_64                     float64
tuit_rev_per_fte_private         float64
pct_latino                       float64
expen_pct_instlabor_public       float64
tuit_rev_per_fte_public          float64
total_staff_fte                  float64
size_cat                         object
mean_ft_sal_assis                float64
grad_enrol                       float64
urban                            object
pt_ug_enrol                      float64
mean_fin_aid                     float64
teachers_fte                     float64
revenues_public                  float64
ft_ug_enrol                      float64
pct_ug_65p                       float64
teach_instr_fte                  float64
instr_expen_per_fte_private      float64
inst_sector                      object
expen_private                    float64
expen_pct_acadsup_public         float64
pct_female                       float64
endow_per_fte_public             float64
inst_lvl                         object
pct_white                        float64
grad_ontime_rate                 float64
inst_cat                         object
mean_ft_sal_lect                 float64
admit_yield                      float64
ug_enrol                         float64
region                           object
expen_pct_acadsup_private        float64
mean_ft_sal_assoc                float64
admit_pct                        float64
pct_all_dist_ed                  float64
mean_ft_sal_all                  float64
expen_public                     float64
stud_fac_ratio                   float64
pct_no_dist_ed                   float64
pct_black                        float64
expen_pct_labor_public           float64
expen_pct_instr_private          float64
inst_ctrl                        object
revenues_private                 float64
grad_rate                        float64
mean_ft_sal_prof                 float64
tuition_pct_rev_private          float64
expen_pct_instlabor_private      float64
mean_ft_sal_instr                float64
pct_any_dist_ed                  float64
pct_fin_aid                      float64
pt_teachers                      float64
pt_teachers_pct_female           float64
pt_teachers_pct_white            float64
pt_teachers_pct_black            float64
pt_teachers_pct_latino           float64
pt_teachers_pct_other            float64
dtypes: float64(84), int64(6), object(18)

Remove empty/unnecessary variables

df = df.drop(['has_contract', 'college_hbcu', 'college_flagship'], axis=1) # completely empty
df = df.drop(df.filter(like='paytype').columns, axis=1) # no longer kept in new version, very few values
df = df.drop(['college_class', 'college_carnegie'], axis=1) # the fields from IPEDS are higher quality
df = df.drop(['institution name', 'user_supplied_department_name', 'display_name'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6929 entries, 0 to 6928
Data columns (total 94 columns):
department_id                  6929 non-null int64
payaverage                     6929 non-null int64
course_type                    1471 non-null float64
contract_type                  1724 non-null object
period_term                    4517 non-null float64
period_year                    4517 non-null float64
has_retirement                 6224 non-null float64
has_health_insurance           6249 non-null float64
has_governance                 6202 non-null float64
has_union                      5797 non-null float64
union_name                     4348 non-null object
is_school_rep                  6929 non-null int64
course_syllabus                4503 non-null float64
office_space                   4429 non-null float64
student_evals                  3857 non-null float64
has_advanced_degree            4532 non-null float64
department_type_id             6929 non-null int64
department_type_name           6929 non-null object
college_unit_id                6929 non-null int64
college_state                  6929 non-null object
college_id                     6929 non-null int64
college_name                   6929 non-null object
unitid                         6876 non-null float64
hbcu                           6876 non-null object
pct_unknown_race               6848 non-null float64
pt_reten_rate                  5412 non-null float64
expen_pct_labor_private        2595 non-null float64
transfer_rate                  3837 non-null float64
endow_per_fte_private          2552 non-null float64
pct_ug_18_24                   5197 non-null float64
expen_pct_instr_public         3906 non-null float64
mean_ft_sal_norank             2549 non-null float64
tuition_pct_rev_public         3906 non-null float64
tuition_fees                   6789 non-null float64
instr_expen_per_fte_public     3881 non-null float64
manag_fte                      6876 non-null float64
ft_reten_rate                  6633 non-null float64
pct_ug_25_64                   5197 non-null float64
tuit_rev_per_fte_private       2595 non-null float64
pct_latino                     6848 non-null float64
expen_pct_instlabor_public     3904 non-null float64
tuit_rev_per_fte_public        3881 non-null float64
total_staff_fte                6876 non-null float64
size_cat                       6876 non-null object
mean_ft_sal_assis              5726 non-null float64
grad_enrol                     6848 non-null float64
urban                          6876 non-null object
pt_ug_enrol                    6848 non-null float64
mean_fin_aid                   6810 non-null float64
teachers_fte                   6876 non-null float64
revenues_public                3906 non-null float64
ft_ug_enrol                    6848 non-null float64
pct_ug_65p                     5197 non-null float64
teach_instr_fte                6876 non-null float64
instr_expen_per_fte_private    2595 non-null float64
inst_sector                    6876 non-null object
expen_private                  2595 non-null float64
expen_pct_acadsup_public       3906 non-null float64
pct_female                     6848 non-null float64
endow_per_fte_public           3456 non-null float64
inst_lvl                       6876 non-null object
pct_white                      6848 non-null float64
grad_ontime_rate               1669 non-null float64
inst_cat                       6876 non-null object
mean_ft_sal_lect               2660 non-null float64
admit_yield                    4597 non-null float64
ug_enrol                       6848 non-null float64
region                         6876 non-null object
expen_pct_acadsup_private      2595 non-null float64
mean_ft_sal_assoc              5706 non-null float64
admit_pct                      4597 non-null float64
pct_all_dist_ed                6848 non-null float64
mean_ft_sal_all                6798 non-null float64
expen_public                   3906 non-null float64
stud_fac_ratio                 6824 non-null float64
pct_no_dist_ed                 6848 non-null float64
pct_black                      6848 non-null float64
expen_pct_labor_public         3906 non-null float64
expen_pct_instr_private        2595 non-null float64
inst_ctrl                      6876 non-null object
revenues_private               2595 non-null float64
grad_rate                      6769 non-null float64
mean_ft_sal_prof               5771 non-null float64
tuition_pct_rev_private        2595 non-null float64
expen_pct_instlabor_private    2595 non-null float64
mean_ft_sal_instr              5126 non-null float64
pct_any_dist_ed                6848 non-null float64
pct_fin_aid                    6810 non-null float64
pt_teachers                    4790 non-null float64
pt_teachers_pct_female         4790 non-null float64
pt_teachers_pct_white          4790 non-null float64
pt_teachers_pct_black          4790 non-null float64
pt_teachers_pct_latino         4790 non-null float64
pt_teachers_pct_other          4790 non-null float64
dtypes: float64(75), int64(6), object(13)

df = df.sort(axis=1) # sort by column name
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6929 entries, 0 to 6928
Data columns (total 94 columns):
admit_pct                      4597 non-null float64
admit_yield                    4597 non-null float64
college_id                     6929 non-null int64
college_name                   6929 non-null object
college_state                  6929 non-null object
college_unit_id                6929 non-null int64
contract_type                  1724 non-null object
course_syllabus                4503 non-null float64
course_type                    1471 non-null float64
department_id                  6929 non-null int64
department_type_id             6929 non-null int64
department_type_name           6929 non-null object
endow_per_fte_private          2552 non-null float64
endow_per_fte_public           3456 non-null float64
expen_pct_acadsup_private      2595 non-null float64
expen_pct_acadsup_public       3906 non-null float64
expen_pct_instlabor_private    2595 non-null float64
expen_pct_instlabor_public     3904 non-null float64
expen_pct_instr_private        2595 non-null float64
expen_pct_instr_public         3906 non-null float64
expen_pct_labor_private        2595 non-null float64
expen_pct_labor_public         3906 non-null float64
expen_private                  2595 non-null float64
expen_public                   3906 non-null float64
ft_reten_rate                  6633 non-null float64
ft_ug_enrol                    6848 non-null float64
grad_enrol                     6848 non-null float64
grad_ontime_rate               1669 non-null float64
grad_rate                      6769 non-null float64
has_advanced_degree            4532 non-null float64
has_governance                 6202 non-null float64
has_health_insurance           6249 non-null float64
has_retirement                 6224 non-null float64
has_union                      5797 non-null float64
hbcu                           6876 non-null object
inst_cat                       6876 non-null object
inst_ctrl                      6876 non-null object
inst_lvl                       6876 non-null object
inst_sector                    6876 non-null object
instr_expen_per_fte_private    2595 non-null float64
instr_expen_per_fte_public     3881 non-null float64
is_school_rep                  6929 non-null int64
manag_fte                      6876 non-null float64
mean_fin_aid                   6810 non-null float64
mean_ft_sal_all                6798 non-null float64
mean_ft_sal_assis              5726 non-null float64
mean_ft_sal_assoc              5706 non-null float64
mean_ft_sal_instr              5126 non-null float64
mean_ft_sal_lect               2660 non-null float64
mean_ft_sal_norank             2549 non-null float64
mean_ft_sal_prof               5771 non-null float64
office_space                   4429 non-null float64
payaverage                     6929 non-null int64
pct_all_dist_ed                6848 non-null float64
pct_any_dist_ed                6848 non-null float64
pct_black                      6848 non-null float64
pct_female                     6848 non-null float64
pct_fin_aid                    6810 non-null float64
pct_latino                     6848 non-null float64
pct_no_dist_ed                 6848 non-null float64
pct_ug_18_24                   5197 non-null float64
pct_ug_25_64                   5197 non-null float64
pct_ug_65p                     5197 non-null float64
pct_unknown_race               6848 non-null float64
pct_white                      6848 non-null float64
period_term                    4517 non-null float64
period_year                    4517 non-null float64
pt_reten_rate                  5412 non-null float64
pt_teachers                    4790 non-null float64
pt_teachers_pct_black          4790 non-null float64
pt_teachers_pct_female         4790 non-null float64
pt_teachers_pct_latino         4790 non-null float64
pt_teachers_pct_other          4790 non-null float64
pt_teachers_pct_white          4790 non-null float64
pt_ug_enrol                    6848 non-null float64
region                         6876 non-null object
revenues_private               2595 non-null float64
revenues_public                3906 non-null float64
size_cat                       6876 non-null object
stud_fac_ratio                 6824 non-null float64
student_evals                  3857 non-null float64
teach_instr_fte                6876 non-null float64
teachers_fte                   6876 non-null float64
total_staff_fte                6876 non-null float64
transfer_rate                  3837 non-null float64
tuit_rev_per_fte_private       2595 non-null float64
tuit_rev_per_fte_public        3881 non-null float64
tuition_fees                   6789 non-null float64
tuition_pct_rev_private        2595 non-null float64
tuition_pct_rev_public         3906 non-null float64
ug_enrol                       6848 non-null float64
union_name                     4348 non-null object
unitid                         6876 non-null float64
urban                          6876 non-null object
dtypes: float64(75), int64(6), object(13)

Recode variables

union variable

First, has_union is incorrectly specified. It is defined as union_name != NULL, but in fact often union_name == 'None', yet has_union == 1. Since

print df.has_union.value_counts()
pd.crosstab(df.union_name, df.has_union)

1    4348
0    1449
dtype: int64

df.loc[df.union_name=='None', 'has_union'] = 0
df.has_union.value_counts(normalize=True)

0    0.680185
1    0.156588
dtype: float64

has_* variables

Now rename other has_* variables and create new verions that are true dummies. Note that, as described in the data dictionary, a response of 4 means "I don't know", which I code as missing (NaN).

df.rename(columns={
    'has_health_insurance':'health_insurance_type',
    'has_retirement':'retirement_type',
    'has_governance':'governance_type'}, inplace=True)
# When health ins. is non-missing (i.e. = 0,1,2, or 3), replace with True when = 1,2, or 3
df.loc[df.health_insurance_type.isin([0,1,2,3]), 'has_health_ins'] = df.health_insurance_type.isin([1,2,3])
df.loc[df.retirement_type.isin([0,1,2,3]), 'has_retirement'] = df.retirement_type.isin([1,2,3])
df.loc[df.governance_type.isin([0,1]), 'has_governance'] = df.governance_type.isin([1])

advanced degrees

Create a new variables with meaningful labels for the has_advanced_degree variable.

df.rename(columns={'has_advanced_degree':'advanced_degree_type'}, inplace=True)
degree_map = {0: 'None', 1: 'MA/MS', 2: 'PhD', 3: 'JD', 4: 'MD'}
df.loc[df.advanced_degree_type.notnull(), 'adv_degree'] = df.advanced_degree_type.replace(degree_map)
df.loc[df.advanced_degree_type.notnull(), 'has_adv_degree'] \
    = df.advanced_degree_type.isin([1,2,3,4])

Academic divisions

Create meaninful divisions from the department_type_id, according to coding listed in the data dictionary.

# Coding is based on first number of department_id
divisions = {
'1': 'Gen/LibA', # General/Liberal Arts
'2': 'Arts/Hum', # Arts/Humanities
'3': 'SocSci', # Social Sciences
'4': 'Sci/Tech', # Science/Technology
'6': 'Prof/ApS', # Professions/Applied Sciences
'8': np.nan, # Not specified
'9': 'Other'
}

df['division'] = df.department_type_id.apply(lambda x: str(x)[0:1]).replace(divisions)

Adjunct staffing rate

Create two variables describing the number of part-time teachers as a ratio of total staff FTEs and instructor teaching staff FTEs. Note that pt_teachers is a count of total teachers, not of FTEs, and thus it is possible, for example, for pt_teachers / teach_instr_fte to be greater than 1 (e.g. if there were 100 teachers, all part time, each of whome teaches only a 1/4 load, then the value of adjunct_teach_rate would be 4: 100 teachers / 25 instructor FTEs.)

df['adjunct_staff_rate'] = df.pt_teachers / df.total_staff_fte
df['adjunct_teach_rate'] = df.pt_teachers / df.teach_instr_fte

Revenues and expenses per FTE student

Tuition revenue per FTE student

df.loc[df.tuit_rev_per_fte_private.notnull(), 'tuit_rev_per_fte'] =  df.tuit_rev_per_fte_private
df.loc[df.tuit_rev_per_fte_public.notnull(), 'tuit_rev_per_fte'] =  df.tuit_rev_per_fte_public

Endowment per FTE student

df.loc[df.endow_per_fte_private.notnull(), 'endow_per_fte'] =  df.endow_per_fte_private
df.loc[df.endow_per_fte_public.notnull(), 'endow_per_fte'] =  df.endow_per_fte_public

Instruction Expenses per FTE Student

df.loc[df.instr_expen_per_fte_private.notnull(), 'instr_expen_per_fte'] =  df.instr_expen_per_fte_private
df.loc[df.instr_expen_per_fte_public.notnull(), 'instr_expen_per_fte'] =  df.instr_expen_per_fte_public

Urban/nonurban location

Currently the variable urban has 12 categories, which is a lot. Let's create a dummy variable instead.

def urb_map(x):
    if str(x).find("City") != -1:
        return "Urban/Suburban"
    elif str(x).find("Suburb") != -1:
        return "Urban/Suburban"
    elif str(x).find("Rural") != -1:
        return "Rural/Town"
    elif str(x).find("Town") != -1:
        return "Rural/Town"
    else: 
        return np.nan

df['urb'] = df.urban.apply(urb_map)

urb_map = {
'City: Large':'Urban/Suburban', 
'City: Small':'Urban/Suburban', 
'City: Midsize':'Urban/Suburban', 
'Suburb: Large':'Urban/Suburban',
'Suburb: Midsize':'Urban/Suburban',
'Suburb: Small':'Urban/Suburban',
'Town: Fringe':'Town/Rural',
'Town: Distant':'Town/Rural',
'Town: Remote':'Town/Rural', 
'Rural: Fringe':'Town/Rural',
'Rural: Distant':'Town/Rural', 
'Rural: Remote':'Town/Rural'
}
df['urban_rural'] = df.urban.replace(urb_map)

Subset Data

We restrict data to certain variables and then remove some data that has too few observations.

df.columns.values

array(['admit_pct', 'admit_yield', 'college_id', 'college_name',
       'college_state', 'college_unit_id', 'contract_type',
       'course_syllabus', 'course_type', 'department_id',
       'department_type_id', 'department_type_name',
       'endow_per_fte_private', 'endow_per_fte_public',
       'expen_pct_acadsup_private', 'expen_pct_acadsup_public',
       'expen_pct_instlabor_private', 'expen_pct_instlabor_public',
       'expen_pct_instr_private', 'expen_pct_instr_public',
       'expen_pct_labor_private', 'expen_pct_labor_public',
       'expen_private', 'expen_public', 'ft_reten_rate', 'ft_ug_enrol',
       'grad_enrol', 'grad_ontime_rate', 'grad_rate',
       'advanced_degree_type', 'governance_type', 'health_insurance_type',
       'retirement_type', 'has_union', 'hbcu', 'inst_cat', 'inst_ctrl',
       'inst_lvl', 'inst_sector', 'instr_expen_per_fte_private',
       'instr_expen_per_fte_public', 'is_school_rep', 'manag_fte',
       'mean_fin_aid', 'mean_ft_sal_all', 'mean_ft_sal_assis',
       'mean_ft_sal_assoc', 'mean_ft_sal_instr', 'mean_ft_sal_lect',
       'mean_ft_sal_norank', 'mean_ft_sal_prof', 'office_space',
       'payaverage', 'pct_all_dist_ed', 'pct_any_dist_ed', 'pct_black',
       'pct_female', 'pct_fin_aid', 'pct_latino', 'pct_no_dist_ed',
       'pct_ug_18_24', 'pct_ug_25_64', 'pct_ug_65p', 'pct_unknown_race',
       'pct_white', 'period_term', 'period_year', 'pt_reten_rate',
       'pt_teachers', 'pt_teachers_pct_black', 'pt_teachers_pct_female',
       'pt_teachers_pct_latino', 'pt_teachers_pct_other',
       'pt_teachers_pct_white', 'pt_ug_enrol', 'region',
       'revenues_private', 'revenues_public', 'size_cat', 'stud_fac_ratio',
       'student_evals', 'teach_instr_fte', 'teachers_fte',
       'total_staff_fte', 'transfer_rate', 'tuit_rev_per_fte_private',
       'tuit_rev_per_fte_public', 'tuition_fees',
       'tuition_pct_rev_private', 'tuition_pct_rev_public', 'ug_enrol',
       'union_name', 'unitid', 'urban', 'has_health_ins', 'has_retirement',
       'has_governance', 'adv_degree', 'has_adv_degree', 'division',
       'tuit_rev_per_fte', 'endow_per_fte', 'instr_expen_per_fte',
       'urban_rural'], dtype=object)

data = df[['college_name', 'unitid',
       'college_state', 'region',
       'admit_pct', 'admit_yield', 
       'ft_reten_rate', 'pt_reten_rate',
       'ft_ug_enrol', 'pt_ug_enrol', 'ug_enrol',
       'grad_enrol', 'grad_rate',
       'hbcu', 'inst_ctrl',
       'inst_lvl', 'inst_sector', 
       'is_school_rep',
       'mean_fin_aid',
       'mean_ft_sal_all', 'mean_ft_sal_assis', 'mean_ft_sal_assoc', 'mean_ft_sal_instr', 
       'mean_ft_sal_lect', 'mean_ft_sal_norank', 'mean_ft_sal_prof', 
       'office_space',
       'payaverage', 
       'pct_all_dist_ed', 'pct_any_dist_ed', 'pct_no_dist_ed',
       'pct_black', 'pct_latino', 'pct_unknown_race', 'pct_white', 
       'pct_female', 
       'pct_fin_aid', 
       'pct_ug_18_24', 'pct_ug_25_64', 'pct_ug_65p', 
       'pt_teachers', 'pt_teachers_pct_female',
       'pt_teachers_pct_black', 'pt_teachers_pct_latino', 'pt_teachers_pct_other',
       'pt_teachers_pct_white',  
       'size_cat', 'stud_fac_ratio',
       'student_evals', 
       'teach_instr_fte', 'teachers_fte',
       'total_staff_fte', 
       'transfer_rate', 
       'tuition_fees',
       'has_health_ins', 'has_retirement', 'has_governance', 'has_union', 
       'adv_degree', 'has_adv_degree', 'division',
       'tuit_rev_per_fte', 'endow_per_fte', 'instr_expen_per_fte',
       'urban_rural']]

dlen = len(data)
data = data[(data.adv_degree.isin(['MA/MS','PhD','None']) | (data.adv_degree.isnull()))]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[(data.size_cat != 'Not applicable') & (data.size_cat.notnull())]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[~(data.inst_sector.isin(['Administrative Unit', 
                                     'Private not-for-profit, 2-year',
                                     'Private for-profit, 2-year']))]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[data.division != "Other"]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)
data = data[data.ug_enrol > 0]
print "%d obs. dropped" % (dlen - len(data))
dlen = len(data)

68 obs. dropped
80 obs. dropped
53 obs. dropped
9 obs. dropped
22 obs. dropped

desc(data)

>>> college_name
(missing: 0)
count                           6728
unique                          1871
top       Sinclair Community College
freq                              48
Name: college_name, dtype: object

>>> unitid
(missing: 0)
count      6728.000000
mean     184423.411415
std       55856.910463
min      100663.000000
25%      146296.000000
50%      182661.000000
75%      212133.000000
max      475121.000000
Name: unitid, dtype: float64

>>> college_state
(missing: 0)
count         6728
unique          51
top       New York
freq           587
Name: college_state, dtype: object

>>> region
(missing: 0)
Mid East DE DC MD NJ NY PA                       1569
Southeast AL AR FL GA KY LA MS NC SC TN VA WV    1469
Great Lakes IL IN MI OH WI                       1084
Far West AK CA HI NV OR WA                        743
New England CT ME MA NH RI VT                     656
Southwest AZ NM OK TX                             495
Plains IA KS MN MO NE ND SD                       483
Rocky Mountains CO ID MT UT WY                    229
dtype: int64

>>> admit_pct
(missing: 2193)
count    4535.000000
mean       62.071444
std        18.781243
min         6.000000
25%        52.000000
50%        64.000000
75%        75.000000
max       100.000000
Name: admit_pct, dtype: float64

>>> admit_yield
(missing: 2193)
count    4535.000000
mean       33.640132
std        15.021061
min         7.000000
25%        23.000000
50%        31.000000
75%        40.000000
max       100.000000
Name: admit_yield, dtype: float64

>>> ft_reten_rate
(missing: 209)
count    6519.000000
mean       71.979751
std        16.080456
min         0.000000
25%        61.000000
50%        74.000000
75%        84.000000
max       100.000000
Name: ft_reten_rate, dtype: float64

>>> pt_reten_rate
(missing: 1401)
count    5327.000000
mean       49.235029
std        23.652469
min         0.000000
25%        37.000000
50%        47.000000
75%        62.000000
max       100.000000
Name: pt_reten_rate, dtype: float64

>>> ft_ug_enrol
(missing: 0)
count      6728.000000
mean       8678.054994
std       15817.134929
min           0.000000
25%        2179.000000
50%        5023.000000
75%       10223.000000
max      208742.000000
Name: ft_ug_enrol, dtype: float64

>>> pt_ug_enrol
(missing: 0)
count     6728.000000
mean      3917.099584
std       6696.336395
min          0.000000
25%        385.000000
50%       1560.000000
75%       4716.750000
max      62610.000000
Name: pt_ug_enrol, dtype: float64

>>> ug_enrol
(missing: 0)
count      6728.000000
mean      12595.154578
std       17800.658788
min           0.000000
25%        3240.000000
50%        8010.000000
75%       17528.000000
max      208742.000000
Name: ug_enrol, dtype: float64

>>> grad_enrol
(missing: 0)
count     6728.000000
mean      2973.441885
std       5307.357010
min          0.000000
25%          0.000000
50%        788.500000
75%       3960.000000
max      47660.000000
Name: grad_enrol, dtype: float64

>>> grad_rate
(missing: 76)
count    6652.000000
mean       47.059531
std        24.500656
min         0.000000
25%        25.000000
50%        47.000000
75%        66.000000
max       100.000000
Name: grad_rate, dtype: float64

>>> hbcu
(missing: 0)
No     6658
Yes      70
dtype: int64

>>> inst_ctrl
(missing: 0)
Public                    3940
Private not-for-profit    2465
Private for-profit         323
dtype: int64

>>> inst_lvl
(missing: 0)
Four or more years                  5114
At least 2 but less than 4 years    1614
dtype: int64

>>> inst_sector
(missing: 0)
Private not-for-profit, 4-year or above    2465
Public, 4-year or above                    2326
Public, 2-year                             1614
Private for-profit, 4-year or above         323
dtype: int64

>>> is_school_rep
(missing: 0)
0    6660
1      68
dtype: int64

>>> mean_fin_aid
(missing: 36)
count     6692.000000
mean     10562.514345
std       7417.885768
min       1387.000000
25%       4872.500000
50%       7371.000000
75%      15396.000000
max      41555.000000
Name: mean_fin_aid, dtype: float64

>>> mean_ft_sal_all
(missing: 48)
count      6680.000000
mean      69302.136826
std       19607.944944
min        9000.000000
25%       56142.000000
50%       66528.000000
75%       80397.000000
max      171657.000000
Name: mean_ft_sal_all, dtype: float64

>>> mean_ft_sal_assis
(missing: 1071)
count      5657.000000
mean      61662.977373
std       13840.536397
min       25470.000000
25%       51588.000000
50%       60111.000000
75%       69066.000000
max      119790.000000
Name: mean_ft_sal_assis, dtype: float64

>>> mean_ft_sal_assoc
(missing: 1091)
count      5637.000000
mean      71728.090474
std       16395.882552
min       22761.000000
25%       60066.000000
50%       70020.000000
75%       82125.000000
max      134559.000000
Name: mean_ft_sal_assoc, dtype: float64

>>> mean_ft_sal_instr
(missing: 1680)
count      5048.000000
mean      50002.327655
std       13993.236061
min        9000.000000
25%       42246.000000
50%       47637.000000
75%       54801.000000
max      191997.000000
Name: mean_ft_sal_instr, dtype: float64

>>> mean_ft_sal_lect
(missing: 4100)
count      2628.000000
mean      50671.743151
std       12332.580440
min       14724.000000
25%       43026.750000
50%       49284.000000
75%       58446.000000
max      124632.000000
Name: mean_ft_sal_lect, dtype: float64

>>> mean_ft_sal_norank
(missing: 4236)
count      2492.000000
mean      54850.276083
std       22255.909404
min        9783.000000
25%       40797.000000
50%       50809.500000
75%       63846.000000
max      195003.000000
Name: mean_ft_sal_norank, dtype: float64

>>> mean_ft_sal_prof
(missing: 1022)
count      5706.000000
mean      93685.607256
std       29514.802350
min       29988.000000
25%       72351.000000
50%       87813.000000
75%      111429.000000
max      207630.000000
Name: mean_ft_sal_prof, dtype: float64

>>> office_space
(missing: 2454)
0    1412
3    1290
2    1185
1     387
dtype: int64

>>> payaverage
(missing: 0)
count     6728.000000
mean      3074.302616
std       1532.654505
min        300.000000
25%       2050.000000
50%       2700.000000
75%       3700.000000
max      20000.000000
Name: payaverage, dtype: float64

>>> pct_all_dist_ed
(missing: 0)
count    6728.000000
mean        9.246879
std        16.219768
min         0.000000
25%         1.000000
50%         4.000000
75%        10.000000
max       100.000000
Name: pct_all_dist_ed, dtype: float64

>>> pct_any_dist_ed
(missing: 0)
count    6728.000000
mean       12.109839
std        12.377563
min         0.000000
25%         2.000000
50%        10.000000
75%        18.000000
max       100.000000
Name: pct_any_dist_ed, dtype: float64

>>> pct_no_dist_ed
(missing: 0)
count    6728.000000
mean       78.626784
std        21.051841
min         0.000000
25%        70.000000
50%        83.000000
75%        95.000000
max       100.000000
Name: pct_no_dist_ed, dtype: float64

>>> pct_black
(missing: 0)
count    6728.000000
mean       12.140458
std        13.237863
min         0.000000
25%         4.000000
50%         8.000000
75%        16.000000
max        99.000000
Name: pct_black, dtype: float64

>>> pct_latino
(missing: 0)
count    6728.000000
mean       10.154578
std        11.202949
min         0.000000
25%         4.000000
50%         6.000000
75%        12.000000
max       100.000000
Name: pct_latino, dtype: float64

>>> pct_unknown_race
(missing: 0)
count    6728.000000
mean        6.811831
std         8.356322
min         0.000000
25%         2.000000
50%         5.000000
75%         9.000000
max        78.000000
Name: pct_unknown_race, dtype: float64

>>> pct_white
(missing: 0)
count    6728.000000
mean       58.331005
std        19.419048
min         0.000000
25%        47.000000
50%        61.000000
75%        73.000000
max        97.000000
Name: pct_white, dtype: float64

>>> pct_female
(missing: 0)
count    6728.000000
mean       57.696492
std         9.529501
min         0.000000
25%        53.000000
50%        57.000000
75%        61.000000
max       100.000000
Name: pct_female, dtype: float64

>>> pct_fin_aid
(missing: 36)
count    6692.000000
mean       83.745218
std        13.860227
min        13.000000
25%        75.000000
50%        86.000000
75%        96.000000
max       100.000000
Name: pct_fin_aid, dtype: float64

>>> pct_ug_18_24
(missing: 1601)
count    5127.000000
mean       71.077043
std        20.059016
min         6.000000
25%        55.000000
50%        75.000000
75%        89.000000
max        99.000000
Name: pct_ug_18_24, dtype: float64

>>> pct_ug_25_64
(missing: 1601)
count    5127.000000
mean       24.859957
std        18.366481
min         0.000000
25%         9.000000
50%        22.000000
75%        36.000000
max        94.000000
Name: pct_ug_25_64, dtype: float64

>>> pct_ug_65p
(missing: 1601)
count    5127.000000
mean        0.146284
std         0.689971
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        18.000000
Name: pct_ug_65p, dtype: float64

>>> pt_teachers
(missing: 2015)
count     4713.000000
mean       621.835137
std       1440.525194
min          1.000000
25%        191.000000
50%        376.000000
75%        686.000000
max      17225.000000
Name: pt_teachers, dtype: float64

>>> pt_teachers_pct_female
(missing: 2015)
count    4713.000000
mean        0.522251
std         0.083018
min         0.000000
25%         0.473214
50%         0.522046
75%         0.569975
max         1.000000
Name: pt_teachers_pct_female, dtype: float64

>>> pt_teachers_pct_black
(missing: 2015)
count    4713.000000
mean        0.065704
std         0.091039
min         0.000000
25%         0.019868
50%         0.040404
75%         0.073394
max         1.000000
Name: pt_teachers_pct_black, dtype: float64

>>> pt_teachers_pct_latino
(missing: 2015)
count    4713.000000
mean        0.037800
std         0.052514
min         0.000000
25%         0.013483
50%         0.025210
75%         0.042424
max         0.848101
Name: pt_teachers_pct_latino, dtype: float64

>>> pt_teachers_pct_other
(missing: 2015)
count    4713.000000
mean        0.127199
std         0.119608
min         0.000000
25%         0.054140
50%         0.093108
75%         0.155080
max         0.975791
Name: pt_teachers_pct_other, dtype: float64

>>> pt_teachers_pct_white
(missing: 2015)
count    4713.000000
mean        0.769297
std         0.153069
min         0.000000
25%         0.707228
50%         0.811676
75%         0.873087
max         1.000000
Name: pt_teachers_pct_white, dtype: float64

>>> size_cat
(missing: 0)
20,000 and above    1792
1,000 - 4,999       1741
10,000 - 19,999     1601
5,000 - 9,999       1332
Under 1,000          262
dtype: int64

>>> stud_fac_ratio
(missing: 22)
count    6706.000000
mean       17.057113
std         5.868013
min         2.000000
25%        13.000000
50%        17.000000
75%        20.000000
max        82.000000
Name: stud_fac_ratio, dtype: float64

>>> student_evals
(missing: 3015)
1    1620
2    1516
0     456
3     121
dtype: int64

>>> teach_instr_fte
(missing: 0)
count    6728.000000
mean      729.783442
std       908.405278
min         3.000000
25%       194.000000
50%       400.000000
75%       922.000000
max      6410.000000
Name: teach_instr_fte, dtype: float64

>>> teachers_fte
(missing: 0)
count    6728.000000
mean      837.187723
std      1137.585244
min         3.000000
25%       196.000000
50%       408.000000
75%       934.000000
max      6410.000000
Name: teachers_fte, dtype: float64

>>> total_staff_fte
(missing: 0)
count     6728.000000
mean      2308.127973
std       3453.725066
min         16.000000
25%        447.750000
50%        950.000000
75%       2434.000000
max      25761.000000
Name: total_staff_fte, dtype: float64

>>> transfer_rate
(missing: 2939)
count    3789.000000
mean       19.757720
std        10.331307
min         0.000000
25%        13.000000
50%        20.000000
75%        26.000000
max        59.000000
Name: transfer_rate, dtype: float64

>>> tuition_fees
(missing: 41)
count     6687.000000
mean     15823.710034
std      13574.518815
min         80.000000
25%       4660.000000
50%       9620.000000
75%      27730.000000
max      47246.000000
Name: tuition_fees, dtype: float64

>>> has_health_ins
(missing: 955)
False    4611
True     1162
dtype: int64

>>> has_retirement
(missing: 1090)
False    4089
True     1549
dtype: int64

>>> has_governance
(missing: 1735)
False    3354
True     1639
dtype: int64

>>> has_union
(missing: 1100)
0    4566
1    1062
dtype: int64

>>> adv_degree
(missing: 2357)
MA/MS    2627
PhD      1551
None      193
dtype: int64

>>> has_adv_degree
(missing: 2357)
True     4178
False     193
dtype: int64

>>> division
(missing: 283)
Arts/Hum    3184
Prof/ApS    1140
SocSci       994
Sci/Tech     674
Gen/LibA     444
Other          9
dtype: int64

>>> tuit_rev_per_fte
(missing: 323)
count     6405.000000
mean     10445.660890
std       8020.163342
min         26.000000
25%       3557.000000
50%       8723.000000
75%      15542.000000
max      33294.000000
Name: tuit_rev_per_fte, dtype: float64

>>> endow_per_fte
(missing: 783)
count       5945.000000
mean       26802.885450
std        75021.825756
min            0.000000
25%         1532.000000
50%         6545.000000
75%        21727.000000
max      1208456.000000
Name: endow_per_fte, dtype: float64

>>> instr_expen_per_fte
(missing: 323)
count     6405.000000
mean      9567.221233
std       8712.895958
min        673.000000
25%       5153.000000
50%       7580.000000
75%      11058.000000
max      92131.000000
Name: instr_expen_per_fte, dtype: float64

>>> urban_rural
(missing: 0)
Urban/Suburban    5741
Town/Rural         987
dtype: int64

Start with a basic OLS regression of pay on full-time salary of professors.

res = smf.ols("log(payaverage) ~ log(mean_ft_sal_prof)", data=data).fit()
res.summary()

So professor salary explains almost 36% of the variation in average pay.

res = smf.ols("log(payaverage) ~ "
              "division + inst_ctrl + inst_lvl + region + urban_rural"
              "+ log(ug_enrol) + log(mean_fin_aid) + log(mean_ft_sal_prof)"
              "+ log(endow_per_fte)"
              "+ has_union + has_health_ins + has_governance + has_retirement", data=data).fit()
res.summary()

df.region.value_counts()

Mid East DE DC MD NJ NY PA                       1608
Southeast AL AR FL GA KY LA MS NC SC TN VA WV    1500
Great Lakes IL IN MI OH WI                       1105
Far West AK CA HI NV OR WA                        761
New England CT ME MA NH RI VT                     673
Southwest AZ NM OK TX                             508
Plains IA KS MN MO NE ND SD                       491
Rocky Mountains CO ID MT UT WY                    230
dtype: int64

df.filter(like='pt_').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6929 entries, 0 to 6928
Data columns (total 8 columns):
pt_reten_rate             5412 non-null float64
pt_ug_enrol               6848 non-null float64
pt_teachers               4790 non-null float64
pt_teachers_pct_female    4790 non-null float64
pt_teachers_pct_white     4790 non-null float64
pt_teachers_pct_black     4790 non-null float64
pt_teachers_pct_latino    4790 non-null float64
pt_teachers_pct_other     4790 non-null float64
dtypes: float64(8)

Institution type

There are several ways to classify institution type.

foo= df.groupby('urban').payaverage.describe()
bar = pd.DataFrame(foo).unstack().sort()
bar

bar
stats.ttest_ind(df.loc[df.urban=='Town: Remote', 'payaverage'], df.loc[df.urban=='Suburb: Small', 'payaverage'])

(array(-3.7191145916390367), 0.00023306187618191029)

pd.pivot_table(df, values='payaverage', cols='urban', aggfunc=[np.mean, np.percentile(50)])

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-66-c1b935f273e3> in <module>()
----> 1 pd.pivot_table(df, values='payaverage', cols='urban', aggfunc=[np.mean, np.percentile(50)])

TypeError: percentile() takes at least 2 arguments (1 given)

pd.crosstab(df.inst_cat, df.college_class, dropna=False)

df.T.loc[:,:10]

def desc(df, nvalues=10):
    for var in df.columns:
        print ">>> %s" % var
        print "(missing: %d)" % df[var].isnull().sum()    
        if df[var].unique().size <= nvalues: # and df[var].dtype == np.object:
            print df[var].value_counts()
        else:# if df[var].dtype != np.object:
            print df[var].describe()
        #else:
        #    print "%s unique string values" % df[var].unique().size
        print ""

df['institution name'].describe()

count                           6876
unique                          1902
top       Sinclair Community College
freq                              48
Name: institution name, dtype: object

desc(df, nvalues=30)

>>> department_id
(missing: 0)
count    6929.000000
mean     2979.756531
std      1784.278913
min         3.000000
25%      1421.000000
50%      2899.000000
75%      4508.000000
max      6213.000000
Name: department_id, dtype: float64

>>> payaverage
(missing: 0)
count     6929.000000
mean      3056.757974
std       1543.561399
min        300.000000
25%       2025.000000
50%       2700.000000
75%       3675.000000
max      20000.000000
Name: payaverage, dtype: float64

>>> user_supplied_department_name
(missing: 210)
count        6719
unique       1231
top       English
freq         1036
Name: user_supplied_department_name, dtype: object

>>> course_type
(missing: 5458)
0    1229
1     242
dtype: int64

>>> contract_type
(missing: 5205)
Semester     1267
None          308
Annual         72
Quarter        41
Term           19
Multiyear      17
dtype: int64

>>> period_term
(missing: 2412)
1    2206
3    1553
4     329
2     266
0     163
dtype: int64

>>> period_year
(missing: 2412)
2013    2728
2012    1162
2014     380
0        163
2011      84
dtype: int64

>>> retirement_type
(missing: 705)
0    4215
1     919
2     481
4     416
3     193
dtype: int64

>>> health_insurance_type
(missing: 680)
0    4738
1     614
3     324
4     305
2     268
dtype: int64

>>> governance_type
(missing: 727)
0    3451
1    1686
4    1065
dtype: int64

>>> has_union
(missing: 1131)
0    4713
1    1085
dtype: int64

>>> union_name
(missing: 2581)
count     4348
unique      52
top       None
freq      3265
Name: union_name, dtype: object

>>> is_school_rep
(missing: 0)
0    6861
1      68
dtype: int64

>>> course_syllabus
(missing: 2426)
2    1886
3    1166
1    1048
0     403
dtype: int64

>>> office_space
(missing: 2500)
0    1476
3    1355
2    1204
1     394
dtype: int64

>>> student_evals
(missing: 3072)
1    1683
2    1581
0     470
3     123
dtype: int64

>>> advanced_degree_type
(missing: 2397)
1    2700
2    1564
0     200
3      62
4       6
dtype: int64

>>> department_type_id
(missing: 0)
count      6929.000000
mean     331078.198153
std      196584.444057
min      101000.000000
25%      203200.000000
50%      205000.000000
75%      407000.000000
max      999999.000000
Name: department_type_id, dtype: float64

>>> display_name
(missing: 0)
count        6929
unique       1115
top       English
freq         1061
Name: display_name, dtype: object

>>> department_type_name
(missing: 0)
count        6929
unique         63
top       English
freq         1114
Name: department_type_name, dtype: object

>>> college_class
(missing: 0)
4-year private not-for-profit    2494
4-year public                    2330
2-year public                    1675
4-year private for-profit         313
2-year private for-profit          53
Administrative Unit                41
2-year Public                      13
2-year private not-for-profit      10
dtype: int64

>>> college_carnegie
(missing: 0)
count                                                  6929
unique                                                   44
top       Masters Colleges and Universities--larger prog...
freq                                                   1437
Name: college_carnegie, dtype: object

>>> college_unit_id
(missing: 0)
count      6929.000000
mean     185454.349978
std       58497.922707
min      100663.000000
25%      146393.000000
50%      182500.000000
75%      212869.000000
max      475121.000000
Name: college_unit_id, dtype: float64

>>> college_state
(missing: 0)
count         6929
unique          51
top       New York
freq           609
Name: college_state, dtype: object

>>> college_id
(missing: 0)
count    6929.000000
mean     2137.284168
std      1225.261931
min         1.000000
25%      1064.000000
50%      2156.000000
75%      3102.000000
max      5368.000000
Name: college_id, dtype: float64

>>> college_name
(missing: 0)
count                           6929
unique                          1948
top       Sinclair Community College
freq                              48
Name: college_name, dtype: object

>>> unitid
(missing: 53)
count      6876.000000
mean     185103.209133
std       57637.311376
min      100663.000000
25%      145831.000000
50%      182728.000000
75%      212832.000000
max      475121.000000
Name: unitid, dtype: float64

>>> institution name
(missing: 53)
count                           6876
unique                          1902
top       Sinclair Community College
freq                              48
Name: institution name, dtype: object

>>> hbcu
(missing: 53)
No     6805
Yes      71
dtype: int64

>>> pct_unknown_race
(missing: 81)
count    6848.000000
mean        6.830754
std         8.372139
min         0.000000
25%         2.000000
50%         5.000000
75%         9.000000
max        78.000000
Name: pct_unknown_race, dtype: float64

>>> pt_reten_rate
(missing: 1517)
count    5412.000000
mean       49.231707
std        23.684972
min         0.000000
25%        37.000000
50%        47.000000
75%        62.000000
max       100.000000
Name: pt_reten_rate, dtype: float64

>>> expen_pct_labor_private
(missing: 4334)
count    2595.000000
mean       60.499807
std         6.382221
min        14.000000
25%        57.000000
50%        61.000000
75%        65.000000
max        79.000000
Name: expen_pct_labor_private, dtype: float64

>>> transfer_rate
(missing: 3092)
count    3837.000000
mean       19.689341
std        10.353825
min         0.000000
25%        13.000000
50%        20.000000
75%        26.000000
max        59.000000
Name: transfer_rate, dtype: float64

>>> endow_per_fte_private
(missing: 4377)
count       2552.000000
mean       53104.958856
std       107507.580913
min           11.000000
25%         8790.000000
50%        22381.500000
75%        44790.000000
max      1208456.000000
Name: endow_per_fte_private, dtype: float64

>>> pct_ug_18_24
(missing: 1732)
count    5197.000000
mean       71.001732
std        20.091539
min         6.000000
25%        55.000000
50%        75.000000
75%        89.000000
max        99.000000
Name: pct_ug_18_24, dtype: float64

>>> expen_pct_instr_public
(missing: 3023)
count    3906.000000
mean       44.313876
std         8.586551
min         0.000000
25%        40.000000
50%        44.500000
75%        50.000000
max        73.000000
Name: expen_pct_instr_public, dtype: float64

>>> mean_ft_sal_norank
(missing: 4380)
count      2549.000000
mean      54663.330718
std       22232.128614
min        9783.000000
25%       40608.000000
50%       50805.000000
75%       63783.000000
max      195003.000000
Name: mean_ft_sal_norank, dtype: float64

>>> tuition_pct_rev_public
(missing: 3023)
count    3906.000000
mean       28.814900
std        14.012553
min         0.000000
25%        18.000000
50%        28.000000
75%        38.000000
max        78.000000
Name: tuition_pct_rev_public, dtype: float64

>>> tuition_fees
(missing: 140)
count     6789.000000
mean     15835.878038
std      13551.374901
min         80.000000
25%       4703.000000
50%       9676.000000
75%      27718.000000
max      47246.000000
Name: tuition_fees, dtype: float64

>>> instr_expen_per_fte_public
(missing: 3048)
count     3881.000000
mean      7262.180366
std       3940.600839
min       2227.000000
25%       4470.000000
50%       6358.000000
75%       8682.000000
max      35831.000000
Name: instr_expen_per_fte_public, dtype: float64

>>> manag_fte
(missing: 53)
count    6876.000000
mean      193.045957
std       328.085473
min         0.000000
25%        36.000000
50%        76.000000
75%       207.000000
max      3701.000000
Name: manag_fte, dtype: float64

>>> ft_reten_rate
(missing: 296)
count    6633.000000
mean       71.910749
std        16.141858
min         0.000000
25%        61.000000
50%        74.000000
75%        84.000000
max       100.000000
Name: ft_reten_rate, dtype: float64

>>> pct_ug_25_64
(missing: 1732)
count    5197.000000
mean       24.945738
std        18.418666
min         0.000000
25%         9.000000
50%        22.000000
75%        36.000000
max        94.000000
Name: pct_ug_25_64, dtype: float64

>>> tuit_rev_per_fte_private
(missing: 4334)
count     2595.000000
mean     18185.555684
std       6404.205426
min       1842.000000
25%      13645.000000
50%      17537.000000
75%      22774.000000
max      33294.000000
Name: tuit_rev_per_fte_private, dtype: float64

>>> pct_latino
(missing: 81)
count    6848.000000
mean       10.212471
std        11.241145
min         0.000000
25%         4.000000
50%         6.000000
75%        12.000000
max       100.000000
Name: pct_latino, dtype: float64

>>> expen_pct_instlabor_public
(missing: 3025)
count    3904.000000
mean       74.934426
std         6.783373
min        29.000000
25%        71.000000
50%        74.000000
75%        79.000000
max       100.000000
Name: expen_pct_instlabor_public, dtype: float64

>>> tuit_rev_per_fte_public
(missing: 3048)
count     3881.000000
mean      5303.835867
std       3670.825009
min         26.000000
25%       2236.000000
50%       4563.000000
75%       7516.000000
max      21099.000000
Name: tuit_rev_per_fte_public, dtype: float64

>>> total_staff_fte
(missing: 53)
count     6876.000000
mean      2298.144852
std       3454.748920
min         13.000000
25%        438.000000
50%        933.000000
75%       2411.000000
max      25761.000000
Name: total_staff_fte, dtype: float64

>>> size_cat
(missing: 53)
20,000 and above    1816
1,000 - 4,999       1773
10,000 - 19,999     1615
5,000 - 9,999       1347
Under 1,000          297
Not applicable        28
dtype: int64

>>> mean_ft_sal_assis
(missing: 1203)
count      5726.000000
mean      61672.496856
std       13939.754902
min       25470.000000
25%       51538.500000
50%       60111.000000
75%       69192.000000
max      119790.000000
Name: mean_ft_sal_assis, dtype: float64

>>> grad_enrol
(missing: 81)
count     6848.000000
mean      2973.568779
std       5323.164073
min          0.000000
25%          0.000000
50%        778.000000
75%       3960.000000
max      47660.000000
Name: grad_enrol, dtype: float64

>>> urban
(missing: 53)
City: Large        2100
Suburb: Large      1517
City: Small         969
City: Midsize       968
Town: Distant       301
Rural: Fringe       258
Town: Remote        207
Suburb: Midsize     182
Town: Fringe        159
Suburb: Small       142
Rural: Distant       54
Rural: Remote        19
dtype: int64

>>> pt_ug_enrol
(missing: 81)
count     6848.000000
mean      3881.886244
std       6660.046913
min          0.000000
25%        368.750000
50%       1527.000000
75%       4663.000000
max      62610.000000
Name: pt_ug_enrol, dtype: float64

>>> mean_fin_aid
(missing: 119)
count     6810.000000
mean     10537.209398
std       7416.398185
min       1387.000000
25%       4870.250000
50%       7369.000000
75%      15357.000000
max      41555.000000
Name: mean_fin_aid, dtype: float64

>>> teachers_fte
(missing: 53)
count    6876.000000
mean      832.252036
std      1139.817963
min         0.000000
25%       191.000000
50%       398.000000
75%       931.000000
max      6410.000000
Name: teachers_fte, dtype: float64

>>> revenues_public
(missing: 3023)
count    3.906000e+03
mean     3.523928e+08
std      5.029711e+08
min      4.870198e+06
25%      6.961095e+07
50%      1.586425e+08
75%      3.900425e+08
max      3.764557e+09
Name: revenues_public, dtype: float64

>>> ft_ug_enrol
(missing: 81)
count      6848.000000
mean       8652.875292
std       15899.101819
min           0.000000
25%        2161.000000
50%        4965.000000
75%       10177.000000
max      208742.000000
Name: ft_ug_enrol, dtype: float64

>>> pct_ug_65p
(missing: 1732)
0     4640
1      480
2       49
3       13
6        5
18       4
4        2
11       1
8        1
7        1
5        1
dtype: int64

>>> teach_instr_fte
(missing: 53)
count    6876.000000
mean      724.886417
std       910.231440
min         0.000000
25%       189.750000
50%       390.000000
75%       916.000000
max      6410.000000
Name: teach_instr_fte, dtype: float64

>>> instr_expen_per_fte_private
(missing: 4334)
count     2595.000000
mean     13057.551445
std      12116.884731
min        673.000000
25%       7191.500000
50%      10100.000000
75%      13676.000000
max      92131.000000
Name: instr_expen_per_fte_private, dtype: float64

>>> inst_sector
(missing: 53)
Private not-for-profit, 4-year or above    2492
Public, 4-year or above                    2351
Public, 2-year                             1623
Private for-profit, 4-year or above         329
Private for-profit, 2-year                   43
Administrative Unit                          28
Private not-for-profit, 2-year               10
dtype: int64

>>> expen_private
(missing: 4334)
count    2.595000e+03
mean     3.481254e+08
std      6.754215e+08
min      1.563631e+06
25%      3.942754e+07
50%      8.669083e+07
75%      2.509930e+08
max      3.563332e+09
Name: expen_private, dtype: float64

>>> expen_pct_acadsup_public
(missing: 3023)
9     516
11    454
10    422
8     391
13    308
7     267
14    225
6     209
12    202
15    194
5     117
4      96
16     79
3      76
18     67
17     65
22     50
27     37
2      32
19     27
21     22
1      18
23     17
20     11
0       3
29      1
dtype: int64

>>> pct_female
(missing: 81)
count    6848.000000
mean       57.787091
std         9.684821
min         0.000000
25%        53.000000
50%        57.000000
75%        62.000000
max       100.000000
Name: pct_female, dtype: float64

>>> endow_per_fte_public
(missing: 3473)
count      3456.000000
mean       7482.129051
std       18770.674402
min           0.000000
25%         638.000000
50%        2289.000000
75%        7022.250000
max      196338.000000
Name: endow_per_fte_public, dtype: float64

>>> inst_lvl
(missing: 53)
Four or more years                  5181
At least 2 but less than 4 years    1695
dtype: int64

>>> pct_white
(missing: 81)
count    6848.000000
mean       58.107331
std        19.563232
min         0.000000
25%        47.000000
50%        61.000000
75%        73.000000
max        97.000000
Name: pct_white, dtype: float64

>>> grad_ontime_rate
(missing: 5260)
count    1669.000000
mean        9.333134
std         8.590706
min         0.000000
25%         4.000000
50%         7.000000
75%        12.000000
max        92.000000
Name: grad_ontime_rate, dtype: float64

>>> inst_cat
(missing: 53)
Degree-granting, primarily baccalaureate or above          4824
Degree-granting, associate's and certificates              1683
Degree-granting, not primarily baccalaureate or above       317
Not applicable                                               28
Degree-granting, graduate with no undergraduate degrees      24
dtype: int64

>>> mean_ft_sal_lect
(missing: 4269)
count      2660.000000
mean      50733.345113
std       12331.775455
min       14724.000000
25%       43062.750000
50%       49329.000000
75%       58446.000000
max      124632.000000
Name: mean_ft_sal_lect, dtype: float64

>>> admit_yield
(missing: 2332)
count    4597.000000
mean       33.697846
std        15.055766
min         7.000000
25%        23.000000
50%        31.000000
75%        40.000000
max       100.000000
Name: admit_yield, dtype: float64

>>> ug_enrol
(missing: 81)
count      6848.000000
mean      12534.761536
std       17859.945818
min           0.000000
25%        3175.750000
50%        7868.500000
75%       17306.500000
max      208742.000000
Name: ug_enrol, dtype: float64

>>> region
(missing: 53)
Mid East DE DC MD NJ NY PA                       1608
Southeast AL AR FL GA KY LA MS NC SC TN VA WV    1500
Great Lakes IL IN MI OH WI                       1105
Far West AK CA HI NV OR WA                        761
New England CT ME MA NH RI VT                     673
Southwest AZ NM OK TX                             508
Plains IA KS MN MO NE ND SD                       491
Rocky Mountains CO ID MT UT WY                    230
dtype: int64

>>> expen_pct_acadsup_private
(missing: 4334)
count    2595.000000
mean       11.642389
std         5.732744
min         0.000000
25%         8.000000
50%        11.000000
75%        15.000000
max        49.000000
Name: expen_pct_acadsup_private, dtype: float64

>>> mean_ft_sal_assoc
(missing: 1223)
count      5706.000000
mean      71750.555205
std       16505.803558
min       22761.000000
25%       60066.000000
50%       70038.000000
75%       82170.000000
max      134559.000000
Name: mean_ft_sal_assoc, dtype: float64

>>> admit_pct
(missing: 2332)
count    4597.000000
mean       62.062650
std        18.837805
min         6.000000
25%        52.000000
50%        64.000000
75%        75.000000
max       100.000000
Name: admit_pct, dtype: float64

>>> pct_all_dist_ed
(missing: 81)
count    6848.000000
mean        9.224737
std        16.235853
min         0.000000
25%         1.000000
50%         4.000000
75%        10.000000
max       100.000000
Name: pct_all_dist_ed, dtype: float64

>>> mean_ft_sal_all
(missing: 131)
count      6798.000000
mean      69163.125331
std       19790.579360
min        9000.000000
25%       55876.500000
50%       66492.000000
75%       80397.000000
max      171657.000000
Name: mean_ft_sal_all, dtype: float64

>>> expen_public
(missing: 3023)
count    3.906000e+03
mean     3.259404e+08
std      4.590209e+08
min      4.779040e+06
25%      6.574000e+07
50%      1.488648e+08
75%      3.617734e+08
max      3.303740e+09
Name: expen_public, dtype: float64

>>> stud_fac_ratio
(missing: 105)
count    6824.000000
mean       17.071805
std         5.887220
min         2.000000
25%        13.000000
50%        17.000000
75%        20.000000
max        82.000000
Name: stud_fac_ratio, dtype: float64

>>> pct_no_dist_ed
(missing: 81)
count    6848.000000
mean       78.708528
std        21.101825
min         0.000000
25%        70.000000
50%        83.000000
75%        95.000000
max       100.000000
Name: pct_no_dist_ed, dtype: float64

>>> pct_black
(missing: 81)
count    6848.000000
mean       12.277161
std        13.449123
min         0.000000
25%         4.000000
50%         8.000000
75%        16.000000
max        99.000000
Name: pct_black, dtype: float64

>>> expen_pct_labor_public
(missing: 3023)
count    3906.000000
mean       59.348694
std         6.847172
min        23.000000
25%        55.000000
50%        60.000000
75%        64.000000
max        82.000000
Name: expen_pct_labor_public, dtype: float64

>>> degree_grt_status
(missing: 53)
Degree-granting    6876
dtype: int64

>>> expen_pct_instr_private
(missing: 4334)
count    2595.000000
mean       43.594990
std         8.277691
min        15.000000
25%        38.000000
50%        43.000000
75%        49.000000
max        66.000000
Name: expen_pct_instr_private, dtype: float64

>>> inst_ctrl
(missing: 53)
Public                    3999
Private not-for-profit    2502
Private for-profit         375
dtype: int64

>>> revenues_private
(missing: 4334)
count    2.595000e+03
mean     3.396032e+08
std      6.426090e+08
min      3.608620e+05
25%      3.862799e+07
50%      8.516176e+07
75%      2.372660e+08
max      3.898859e+09
Name: revenues_private, dtype: float64

>>> grad_rate
(missing: 160)
count    6769.000000
mean       47.062934
std        24.470604
min         0.000000
25%        26.000000
50%        47.000000
75%        66.000000
max       100.000000
Name: grad_rate, dtype: float64

>>> mean_ft_sal_prof
(missing: 1158)
count      5771.000000
mean      93778.443597
std       29666.568697
min       27000.000000
25%       72351.000000
50%       87849.000000
75%      111429.000000
max      207630.000000
Name: mean_ft_sal_prof, dtype: float64

>>> tuition_pct_rev_private
(missing: 4334)
count    2595.000000
mean       75.997303
std        21.115490
min         4.000000
25%        67.000000
50%        83.000000
75%        90.000000
max       154.000000
Name: tuition_pct_rev_private, dtype: float64

>>> expen_pct_instlabor_private
(missing: 4334)
count    2595.000000
mean       72.462428
std         7.848338
min        41.000000
25%        67.000000
50%        73.000000
75%        78.000000
max        99.000000
Name: expen_pct_instlabor_private, dtype: float64

>>> mean_ft_sal_instr
(missing: 1803)
count      5126.000000
mean      49953.462349
std       14001.472979
min        9000.000000
25%       42192.000000
50%       47601.000000
75%       54801.000000
max      191997.000000
Name: mean_ft_sal_instr, dtype: float64

>>> pct_any_dist_ed
(missing: 81)
count    6848.000000
mean       12.050672
std        12.403961
min         0.000000
25%         2.000000
50%        10.000000
75%        18.000000
max       100.000000
Name: pct_any_dist_ed, dtype: float64

>>> pct_fin_aid
(missing: 119)
count    6810.000000
mean       83.812628
std        13.858351
min        13.000000
25%        75.000000
50%        86.000000
75%        96.000000
max       100.000000
Name: pct_fin_aid, dtype: float64

>>> pt_teachers
(missing: 2139)
count     4790.000000
mean       620.991441
std       1450.325181
min          1.000000
25%        189.000000
50%        373.000000
75%        685.000000
max      17225.000000
Name: pt_teachers, dtype: float64

>>> pt_teachers_pct_female
(missing: 2139)
count    4790.000000
mean        0.521664
std         0.085545
min         0.000000
25%         0.473118
50%         0.521951
75%         0.570093
max         1.000000
Name: pt_teachers_pct_female, dtype: float64

>>> pt_teachers_pct_white
(missing: 2139)
count    4790.000000
mean        0.768698
std         0.154074
min         0.000000
25%         0.704545
50%         0.811676
75%         0.873063
max         1.000000
Name: pt_teachers_pct_white, dtype: float64

>>> pt_teachers_pct_black
(missing: 2139)
count    4790.000000
mean        0.066584
std         0.092897
min         0.000000
25%         0.019966
50%         0.041096
75%         0.073879
max         1.000000
Name: pt_teachers_pct_black, dtype: float64

>>> pt_teachers_pct_latino
(missing: 2139)
count    4790.000000
mean        0.037952
std         0.052704
min         0.000000
25%         0.013436
50%         0.025063
75%         0.042553
max         0.848101
Name: pt_teachers_pct_latino, dtype: float64

>>> pt_teachers_pct_other
(missing: 2139)
count    4790.000000
mean        0.126767
std         0.119147
min         0.000000
25%         0.054140
50%         0.092929
75%         0.155080
max         0.975791
Name: pt_teachers_pct_other, dtype: float64

>>> has_health_ins
(missing: 985)
False    4738
True     1206
dtype: int64

>>> has_retirement
(missing: 1121)
False    4215
True     1593
dtype: int64

>>> has_governance
(missing: 1792)
False    3451
True     1686
dtype: int64

>>> adv_degree
(missing: 2397)
MA/MS    2700
PhD      1564
None      200
JD         62
MD          6
dtype: int64

>>> has_adv_degree
(missing: 2397)
True     4332
False     200
dtype: int64

>>> division
(missing: 288)
Arts/Hum    3245
Prof/ApS    1203
SocSci      1019
Sci/Tech     691
Gen/LibA     474
Other          9
dtype: int64

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6929 entries, 0 to 6928
Data columns (total 107 columns):
department_id                    int64
payaverage                       int64
user_supplied_department_name    object
course_type                      float64
contract_type                    object
period_term                      float64
period_year                      float64
retirement_type                  float64
health_insurance_type            float64
governance_type                  float64
has_union                        float64
union_name                       object
is_school_rep                    int64
course_syllabus                  float64
office_space                     float64
student_evals                    float64
advanced_degree_type             float64
department_type_id               int64
display_name                     object
department_type_name             object
college_class                    object
college_carnegie                 object
college_unit_id                  int64
college_state                    object
college_id                       int64
college_name                     object
unitid                           float64
institution name                 object
hbcu                             object
pct_unknown_race                 float64
pt_reten_rate                    float64
expen_pct_labor_private          float64
transfer_rate                    float64
endow_per_fte_private            float64
pct_ug_18_24                     float64
expen_pct_instr_public           float64
mean_ft_sal_norank               float64
tuition_pct_rev_public           float64
tuition_fees                     float64
instr_expen_per_fte_public       float64
manag_fte                        float64
ft_reten_rate                    float64
pct_ug_25_64                     float64
tuit_rev_per_fte_private         float64
pct_latino                       float64
expen_pct_instlabor_public       float64
tuit_rev_per_fte_public          float64
total_staff_fte                  float64
size_cat                         object
mean_ft_sal_assis                float64
grad_enrol                       float64
urban                            object
pt_ug_enrol                      float64
mean_fin_aid                     float64
teachers_fte                     float64
revenues_public                  float64
ft_ug_enrol                      float64
pct_ug_65p                       float64
teach_instr_fte                  float64
instr_expen_per_fte_private      float64
inst_sector                      object
expen_private                    float64
expen_pct_acadsup_public         float64
pct_female                       float64
endow_per_fte_public             float64
inst_lvl                         object
pct_white                        float64
grad_ontime_rate                 float64
inst_cat                         object
mean_ft_sal_lect                 float64
admit_yield                      float64
ug_enrol                         float64
region                           object
expen_pct_acadsup_private        float64
mean_ft_sal_assoc                float64
admit_pct                        float64
pct_all_dist_ed                  float64
mean_ft_sal_all                  float64
expen_public                     float64
stud_fac_ratio                   float64
pct_no_dist_ed                   float64
pct_black                        float64
expen_pct_labor_public           float64
degree_grt_status                object
expen_pct_instr_private          float64
inst_ctrl                        object
revenues_private                 float64
grad_rate                        float64
mean_ft_sal_prof                 float64
tuition_pct_rev_private          float64
expen_pct_instlabor_private      float64
mean_ft_sal_instr                float64
pct_any_dist_ed                  float64
pct_fin_aid                      float64
pt_teachers                      float64
pt_teachers_pct_female           float64
pt_teachers_pct_white            float64
pt_teachers_pct_black            float64
pt_teachers_pct_latino           float64
pt_teachers_pct_other            float64
has_health_ins                   object
has_retirement                   object
has_governance                   object
adv_degree                       object
has_adv_degree                   object
division                         object
urb                              object
dtypes: float64(75), int64(6), object(26)

res = smf.ols("log(payaverage) ~ C(adv_degree, Treatment(reference='None'))" \
                  "+ C(division, Treatment(reference='Gen/LibA'))" \
                  "+ has_union + has_health_ins + has_retirement + has_governance" \
                  "+ inst_ctrl + inst_lvl + region" \
                  "+ admit_pct + log(tuit_rev_per_fte)" \
                  " + log(mean_ft_sal_prof)", data=df).fit()
res.summary()

df.boxplot('payaverage', by='urban', figsize=(10,10), rot=45)

<matplotlib.axes.AxesSubplot at 0x1081933d0>

def f(x):
    if x.find("Urban") != -1:
        return "Urban"
    elif x.find("Suburban") != -1:
        return "Suburban"
    elif x.find("Rural") != -1:
        return "Rural"
    elif x.find("Town") != -1:
        return "Town"
    else: 
        return np.nan

if "Urban".find("Urb") != -1:
    print "True"

True

def f(x):
    if str(x).find("City") != -1:
        return "Urban/Suburban"
    elif str(x).find("Suburb") != -1:
        return "Urban/Suburban"
    elif str(x).find("Rural") != -1:
        return "Rural/Town"
    elif str(x).find("Town") != -1:
        return "Rural/Town"
    else: 
        return np.nan
df['urb'] = df.urban.apply(f)

df['urb'].value_counts()

Urban/Suburban    5878
Rural/Town         998
dtype: int64

len(df[df.payaverage < 2500])/float(len(df))

0.4202626641651032

df[df.adv_degree.notnull()].adv_degree.value_counts(normalize=True)

MA/MS    0.595763
PhD      0.345102
None     0.044131
JD       0.013680
MD       0.001324
dtype: float64

df[df.has_health_ins.notnull()].has_health_ins.value_counts(normalize=True)

False    0.797106
True     0.202894
dtype: float64

has_union	0.0	1.0
union_name
ACT-UAW #7902	0	1
AAUP	0	114
ACT-UAW	0	1
ACT-UAW Local 7902	0	2
AEA	0	1
AFSCME	0	13
AFT	0	224
AFT 1563 Adjunct Faculty Union	0	1
AFT-AAUP	0	1
APSCUF	0	2
CFA	0	4
CPFA	0	1
CWA	0	1
California Faculty Association	0	1
EMUFT	0	1
Faculty Staff Union	0	1
GSEU	0	1
Graduate Student Union	0	1
LEO	0	1
MAHE	0	1
MCCC/MTA	0	1
MEA affiliate	0	1
MTA	0	2
Mass. Teachers Association	0	1
NEA	0	93
NEARI	0	1
NLRB	0	1
NYSUT	0	2
None	1	3263
OEA	0	1
Oregon Education Assoc (NEA)	0	1
Other	0	512
PAFTA, AFL-CIO	0	1
PSC CUNY	0	1
PTFU	0	2
Part-Time Faculty Asociation, affiliate of California Faculty Association	0	1
Roosevelt Adjunct Faculty Organization, IEA/NEA	0	1
SEA/SEIU	0	1
SEIU	0	68
SEIU Adjunct Union	0	1
UAW	0	4
UC-AFT	0	1
UCE of FIT (AFT)	0	1
UFCT	0	1
UFF	0	2
UUP	0	3
UUP Oswego Chapter	0	1
Union of Teaching Faculty/AFT	0	1
United Academics	0	1
United Faculty	0	1
WCCFT (NYSUT-AFT)	0	1
WEA	0	1

Dep. Variable:	log(payaverage)	R-squared:	0.359
Model:	OLS	Adj. R-squared:	0.358
Method:	Least Squares	F-statistic:	3174.
Date:	Fri, 02 May 2014	Prob (F-statistic):	0.00
Time:	09:37:28	Log-Likelihood:	-2083.9
No. Observations:	5680	AIC:	4172.
Df Residuals:	5678	BIC:	4185.
Df Model:	1

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	-1.7283	0.172	-10.034	0.000	-2.066 -1.391
log(mean_ft_sal_prof)	0.8509	0.015	56.343	0.000	0.821 0.881

Omnibus:	218.249	Durbin-Watson:	1.221
Prob(Omnibus):	0.000	Jarque-Bera (JB):	602.555
Skew:	-0.136	Prob(JB):	1.44e-131
Kurtosis:	4.572	Cond. No.	427.

Dep. Variable:	log(payaverage)	R-squared:	0.468
Model:	OLS	Adj. R-squared:	0.464
Method:	Least Squares	F-statistic:	127.3
Date:	Thu, 01 May 2014	Prob (F-statistic):	0.00
Time:	19:13:28	Log-Likelihood:	-880.73
No. Observations:	3210	AIC:	1807.
Df Residuals:	3187	BIC:	1947.
Df Model:	22

Omnibus:	304.122	Durbin-Watson:	1.514
Prob(Omnibus):	0.000	Jarque-Bera (JB):	864.529
Skew:	-0.511	Prob(JB):	1.86e-188
Kurtosis:	5.328	Cond. No.	1.63e+09

	0
	count	mean	std	min	25%	50%	75%	max
urban
City: Large	2100	3368.070000	1637.962862	550	2213.00	3000.0	4000.0	18000
City: Midsize	968	3059.335744	1660.671490	595	2085.00	2612.5	3500.0	12575
City: Small	969	3158.588235	1651.775865	450	2016.00	2700.0	4000.0	20000
Rural: Distant	54	2713.537037	1459.804093	1000	1737.00	2510.5	3000.0	9945
Rural: Fringe	258	2262.399225	875.212583	300	1725.75	2074.5	2619.5	6400
Rural: Remote	19	1928.947368	612.897442	1200	1500.00	1750.0	2150.0	3450
Suburb: Large	1517	3014.793672	1447.153289	500	2100.00	2700.0	3600.0	18000
Suburb: Midsize	182	3030.274725	1230.616739	1000	2188.75	2850.0	3500.0	8750
Suburb: Small	142	2844.352113	957.004726	1187	2067.50	2700.0	3500.0	5625
Town: Distant	301	2666.398671	1498.749481	525	1650.00	2300.0	3036.0	12000
Town: Fringe	159	2411.295597	805.191034	450	1950.00	2300.0	2700.0	6857
Town: Remote	207	2449.096618	987.689758	450	1800.00	2130.0	3000.0	7667

college_class	2-year Public	2-year private for-profit	2-year private not-for-profit	2-year public	4-year private for-profit	4-year private not-for-profit	4-year public	Administrative Unit
inst_cat
Degree-granting, associate's and certificates	13	43	10	1610	0	0	7	0
Degree-granting, graduate with no undergraduate degrees	0	0	0	0	1	23	0	0
Degree-granting, not primarily baccalaureate or above	0	9	0	21	81	52	154	0
Degree-granting, primarily baccalaureate or above	0	0	0	0	225	2416	2169	14
Not applicable	0	0	0	0	3	0	0	25

	0	1	2	3	4	5	6	7	8	9	10
department_id	3	4	5	6	8	9	11	13	9	14	15
payaverage	3298	3000	3200	3450	1605	1350	1605	1400	1350	1200	2070
user_supplied_department_name	All	English	Liberal Arts	All	Social Sciences	MTH	General Education	All	MTH	English	Social Sciences
course_type	0	1	0	NaN	0	0	NaN	NaN	0	1	0
contract_type	Semester	None	None	NaN	Semester	Semester	NaN	Semester	Semester	Semester	Semester
period_term	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
period_year	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
retirement_type	0	0	0	NaN	0	0	0	0	0	0	0
health_insurance_type	0	0	0	NaN	0	0	0	0	0	0	0
governance_type	1	0	0	NaN	0	0	0	0	0	0	0
has_union	0	1	1	0	0	1	0	0	0	0	0
union_name	NaN	Other	United Academics	NaN	NaN	AEA	NaN	NaN	NaN	NaN	NaN
is_school_rep	0	0	0	0	0	0	0	0	0	0	0
course_syllabus	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
office_space	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
student_evals	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
advanced_degree_type	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
department_type_id	888888	203200	102000	888888	300000	408000	101000	888888	408000	203200	300000
display_name	Not Specified	English	Liberal Arts	Not Specified	Social Sciences	Math	General Education	Not Specified	Math	English	Social Sciences
department_type_name	Not Specified	English	Liberal Arts	Not Specified	Social Sciences	Mathematics	General Studies	Not Specified	Mathematics	English	Social Sciences
college_class	4-year public	4-year public	4-year public	4-year public	2-year public	2-year public	2-year public	4-year public	2-year public	2-year public	4-year private not-for-profit
college_carnegie	Masters Colleges and Universities--larger prog...	Masters Colleges and Universities--larger prog...	Research Universities--high research activity	Masters Colleges and Universities--smaller pro...	Associates--Public Urban-serving Multicampus	Associates--Public Rural-serving Large	Associates--Public Rural-serving Large	Masters Colleges and Universities--larger prog...	Associates--Public Rural-serving Large	Associates--Public Rural-serving Medium	Masters Colleges and Universities--smaller pro...
college_unit_id	102553	102553	102614	102632	102030	101514	101240	101480	101514	102076	102234
college_state	Alaska	Alaska	Alaska	Alaska	Alabama	Alabama	Alabama	Alabama	Alabama	Alabama	Alabama
college_id	1	1	3	4	6	7	9	10	7	11	12
college_name	University of Alaska at Anchorage	University of Alaska at Anchorage	University of Alaska at Fairbanks	University of Alaska-Southeast	Bishop State Community College	Calhoun Community College	Gadsden State Community College	Jacksonville State University	Calhoun Community College	Snead State Community College	Spring Hill College
has_health_ins	False	False	False	NaN	False	False	False	False	False	False	False
has_retirement	False	False	False	NaN	False	False	False	False	False	False	False
has_governance	True	False	False	NaN	False	False	False	False	False	False	False
(adv_degree, has_adv_degree)	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
adv_degree	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
has_adv_degree	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
division	NaN	Arts/Hum	Gen/LibA	NaN	SocSci	Sci/Tech	Gen/LibA	NaN	Sci/Tech	Arts/Hum	SocSci

Omnibus:	199.934	Durbin-Watson:	1.705
Prob(Omnibus):	0.000	Jarque-Bera (JB):	537.225
Skew:	-0.653	Prob(JB):	2.20e-117
Kurtosis:	5.466	Cond. No.	nan

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	0.4754	0.208	2.282	0.023	0.067 0.884
division[T.Gen/LibA]	-0.0587	0.024	-2.452	0.014	-0.106 -0.012
division[T.Prof/ApS]	0.0543	0.016	3.446	0.001	0.023 0.085
division[T.Sci/Tech]	0.1273	0.020	6.231	0.000	0.087 0.167
division[T.SocSci]	0.0513	0.017	3.013	0.003	0.018 0.085
inst_ctrl[T.Private not-for-profit]	0.1695	0.107	1.591	0.112	-0.039 0.378
inst_ctrl[T.Public]	0.3059	0.103	2.964	0.003	0.104 0.508
inst_lvl[T.Four or more years]	0.0206	0.021	1.002	0.316	-0.020 0.061
region[T.Great Lakes IL IN MI OH WI]	-0.0668	0.025	-2.725	0.006	-0.115 -0.019
region[T.Mid East DE DC MD NJ NY PA]	-0.0576	0.023	-2.518	0.012	-0.102 -0.013
region[T.New England CT ME MA NH RI VT]	0.1344	0.027	4.938	0.000	0.081 0.188
region[T.Plains IA KS MN MO NE ND SD]	0.0168	0.030	0.567	0.570	-0.041 0.075
region[T.Rocky Mountains CO ID MT UT WY]	-0.0038	0.040	-0.095	0.925	-0.082 0.074
region[T.Southeast AL AR FL GA KY LA MS NC SC TN VA WV]	-0.1317	0.024	-5.434	0.000	-0.179 -0.084
region[T.Southwest AZ NM OK TX]	-0.1056	0.031	-3.409	0.001	-0.166 -0.045
urban_rural[T.Urban/Suburban]	0.0257	0.018	1.463	0.143	-0.009 0.060
has_health_ins[T.True]	0.1341	0.018	7.276	0.000	0.098 0.170
has_governance[T.True]	0.0911	0.013	6.894	0.000	0.065 0.117
has_retirement[T.True]	-0.0065	0.017	-0.395	0.693	-0.039 0.026
log(ug_enrol)	0.0302	0.009	3.329	0.001	0.012 0.048
log(mean_fin_aid)	0.1401	0.021	6.624	0.000	0.099 0.182
log(mean_ft_sal_prof)	0.4534	0.035	13.142	0.000	0.386 0.521
log(endow_per_fte)	0.0505	0.005	9.570	0.000	0.040 0.061
has_union	0.0909	0.017	5.214	0.000	0.057 0.125

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	0.5561	0.267	2.080	0.038	0.032 1.080
C(adv_degree, Treatment(reference='None'))[T.JD]	-0.0828	0.074	-1.119	0.263	-0.228 0.062
C(adv_degree, Treatment(reference='None'))[T.MA/MS]	0.0349	0.042	0.823	0.411	-0.048 0.118
C(adv_degree, Treatment(reference='None'))[T.MD]	-2.689e-15	1.41e-15	-1.910	0.056	-5.45e-15 7.29e-17
C(adv_degree, Treatment(reference='None'))[T.PhD]	0.1236	0.043	2.893	0.004	0.040 0.207
C(division, Treatment(reference='Gen/LibA'))[T.Arts/Hum]	0.0948	0.037	2.581	0.010	0.023 0.167
C(division, Treatment(reference='Gen/LibA'))[T.Other]	-4.517e-17	7.25e-17	-0.623	0.533	-1.87e-16 9.71e-17
C(division, Treatment(reference='Gen/LibA'))[T.Prof/ApS]	0.1190	0.039	3.081	0.002	0.043 0.195
C(division, Treatment(reference='Gen/LibA'))[T.Sci/Tech]	0.1777	0.044	4.080	0.000	0.092 0.263
C(division, Treatment(reference='Gen/LibA'))[T.SocSci]	0.1117	0.041	2.733	0.006	0.032 0.192
has_health_ins[T.True]	0.1276	0.028	4.606	0.000	0.073 0.182
has_retirement[T.True]	-0.0087	0.026	-0.331	0.740	-0.060 0.043
has_governance[T.True]	0.0917	0.019	4.862	0.000	0.055 0.129
inst_ctrl[T.Private not-for-profit]	0.1371	0.131	1.050	0.294	-0.119 0.393
inst_ctrl[T.Public]	0.4190	0.138	3.029	0.002	0.148 0.690
inst_lvl[T.Four or more years]	-0.2184	0.105	-2.074	0.038	-0.425 -0.012
region[T.Great Lakes IL IN MI OH WI]	-0.0488	0.032	-1.539	0.124	-0.111 0.013
region[T.Mid East DE DC MD NJ NY PA]	-0.1005	0.029	-3.426	0.001	-0.158 -0.043
region[T.New England CT ME MA NH RI VT]	0.0269	0.034	0.782	0.434	-0.040 0.094
region[T.Plains IA KS MN MO NE ND SD]	0.0215	0.037	0.575	0.566	-0.052 0.095
region[T.Rocky Mountains CO ID MT UT WY]	-0.0108	0.055	-0.198	0.843	-0.118 0.096
region[T.Southeast AL AR FL GA KY LA MS NC SC TN VA WV]	-0.1262	0.031	-4.039	0.000	-0.187 -0.065
region[T.Southwest AZ NM OK TX]	-0.1258	0.044	-2.884	0.004	-0.211 -0.040
has_union	0.0624	0.026	2.386	0.017	0.011 0.114
admit_pct	-0.0034	0.001	-6.670	0.000	-0.004 -0.002
log(tuit_rev_per_fte)	0.2997	0.026	11.560	0.000	0.249 0.351
log(mean_ft_sal_prof)	0.4085	0.041	9.891	0.000	0.327 0.490