// Session setup: empty the session first, then raise the matrix-size and
// variable-count limits before any of the programs below load data.
clear all
set matsize 5000
set maxvar 10000

	// SAT scores
	// Reduces the test-history file to one row per (cohort, satrmap) holding the
	// first-sitting section scores, the number of sittings, the best score per
	// section, and the score change between the first and second sitting.
	// Reads final_cohort_history_2004_2014.dta; writes satscores.dta.
	program define satscores
	{
	use final_cohort_history_2004_2014.dta, clear

	// Order each student's records chronologically, then number and count them
	sort cohort satrmap year month
	by cohort satrmap: generate take = _n
	by cohort satrmap: generate takes = _N

	// Best score per section across all sittings (maxm, maxv, maxw)
	foreach section in math verbal writing {
		local s = substr("`section'",1,1)
		by cohort satrmap: egen max`s' = max(`section')
	}
	rename (math verbal writing) (m v w)

	// Gain from the first to the second sitting, copied to every row of the
	// student. Students with no computable gain but a non-missing best score
	// are assigned a gain of zero.
	foreach s in m v w {
		by cohort satrmap: generate gain`s' = `s'[2]-`s'[1]
		replace gain`s' = 0 if (gain`s'==.)&(max`s'!=.)
	}

	// Keep only the first sitting; its m/v/w values are the first scores
	keep if take==1
	// Months from the first sitting to June (month 6) of the cohort year
	generate monthstoretake = 12*(cohort-year)+(6-month)
	drop month year take
	generate retook = (takes>1)
	order cohort satrmap monthstoretake m v w retook takes maxm maxv maxw

	label variable monthstoretake "Months to retake"
	label variable m "First math score"
	label variable v "First verbal score"
	label variable w "First writing score"
	label variable retook "Retook SAT"
	label variable takes "SAT takes"
	label variable maxm "Maximum math score"
	label variable maxv "Maximum verbal score"
	label variable maxw "Maximum writing score"
	label variable gainm "Math gain from 1st retake"
	label variable gainv "Verbal gain from 1st retake"
	label variable gainw "Writing gain from 1st retake"

	compress
	save satscores.dta, replace
	}
	end
	
	// Demographics
	// Stacks the yearly score files for the classes of 2004-2014, keeps students
	// with a non-empty satmap ID, and recodes the raw questionnaire fields
	// (fee waiver, sex, race, income, parental education).
	// Writes demographics.dta.
	program define demographics
	{
	// Pull the needed columns from each yearly file and tag rows with the class year
	forval x=2004/2014 {
		use satmap cb_rec_id p_aicode p_ethnic p_sex p_zip5 fatheduc motheduc income fe* using "C:\Users\jgoodma1\Weather and Human Capital\Data\Test scores\cbs`x'_scores.dta", clear
		keep if satmap!=""	
		rename satmap satrmap
		// Some files carry the income field as income08; harmonize the name when present
		cap rename income08 income
		g cohort = `x'
		save temp_`x'.dta, replace
	}
	// Stack the yearly extracts and remove the temporary files
	clear
	forval x=2004/2014 {
		append using temp_`x'.dta
		rm temp_`x'.dta
	}
	// Fee waiver indicator is only defined for cohorts 2007 onward (missing before)
	g byte feewaiver = (feewvr=="Y") if cohort>=2007
	drop feewvr*
	label var feewaiver "Fee waiver"
	rename (p_aicode p_ethnic p_zip5) (highschool race zipcode)
	label var zipcode "ZIP code"
	label var highschool "High school"
	// Any p_sex value other than "M" (including blank) is coded as female
	g female = (p_sex!="M")
	label var female "Female"
	drop p_sex
	replace race = 0 if race==.
	label var race "Race (1=AmInd,2=Asian,3=Black,456=Hispanic,7=White,8=other,0=missing)"
	// Income: convert bracket codes to dollar values. Cohorts through 2006 and
	// cohorts from 2007 onward use different code schemes. Missing becomes 0.
	replace income = 0 if income==.
	replace income =  5000 if inlist(income,1)
	replace income = 15000 if (inlist(income,2,3)&(cohort<=2006))|((income==2)&(cohort>=2007))
	replace income = 25000 if (inlist(income,4,5)&(cohort<=2006))|((income==3)&(cohort>=2007))
	replace income = 35000 if (inlist(income,6,7)&(cohort<=2006))|((income==4)&(cohort>=2007))
	replace income = 45000 if ((income==8)&(cohort<=2006))|((income==5)&(cohort>=2007))
	replace income = 55000 if ((income==9)&(cohort<=2006))|((income==6)&(cohort>=2007))
	replace income = 65000 if ((income==10)&(cohort<=2006))|((income==7)&(cohort>=2007))
	replace income = 75000 if ((income==11)&(cohort<=2006))|((income==8)&(cohort>=2007))
	replace income = 90000 if ((income==12)&(cohort<=2006))|((income==9)&(cohort>=2007))
	// Top bracket: use the same 2006/2007 scheme boundary as the lines above.
	// Splitting at 2007/2008 left class-of-2007 codes 10-12, 14 and 15 as raw
	// codes, which the division below turned into values like 0.010.
	replace income = 130000 if ((income==13)&(cohort<=2006))|(inrange(income,10,15)&(cohort>=2007))
	// Express income in thousands of dollars
	replace income = income/1000
	label var income "Income"
	replace motheduc = 0 if motheduc==.
	replace fatheduc = 0 if fatheduc==.	
	label var fatheduc "Father's education"
	label var motheduc "Mother's education"
	order cohort satrmap zipcode highschool female race income *educ feewaiver
	compress		
	save demographics.dta, replace	
	}
	end
	
	// College choice
	// Stacks the yearly college-enrollment files for the classes of 2004-2014 and
	// builds on-time enrollment, college quality, ever-enrolled and B.A.
	// completion measures.
	// Reads psat.dta and gradrate.dta; writes college.dta.
	program define college
	{
	
	// Merge high school classes
	
	forval x=2004/2014 {
		use satrmap Transition_Time DI_Code_1 ASC_UnitID_1 ASC_ColYears_* GradDate_* using "C:\Users\jgoodma1\Weather and Human Capital\Data\College\nsc`x'.dta", clear
		keep if satrmap!=""	
		g cohort = `x'
		save temp_`x'.dta, replace
	}
	clear
	forval x=2004/2014 {
		append using temp_`x'.dta
		rm temp_`x'.dta
	}
	
	// Generate sector and quality measures of on-time enrollment
	// (on time = Transition_Time of at most 180; a missing Transition_Time
	// compares as larger than 180 and so counts as not on time)
	
	g coll4 = (ASC_ColYears_1=="4")&(Transition_Time<=180)
	g coll2 = (ASC_ColYears_1=="2")&(Transition_Time<=180)
	label var coll4 "Four-year college"
	label var coll2 "Two-year college"
	replace ASC_UnitID_1 = . if Transition_Time>180
	// psat.dta is keyed on a variable named year, so rename cohort for the merge
	// NOTE(review): keep(match) here and below drops every student whose
	// ASC_UnitID_1 has no row in the using file - confirm this is intended for
	// students with no on-time enrollment.
	rename cohort year
	merge m:1 ASC_UnitID_1 year using psat.dta, keep(match) nogen
	rename year cohort
	merge m:1 ASC_UnitID_1 using gradrate.dta, keep(match) nogen
	label var psat "College's PSAT z-score"
	label var gradrate "College's BA completion rate"
	drop ASC_UnitID_1
		
	// Generate measure of ever enrolling in four-year college
	g evercoll4 = inlist("4",ASC_ColYears_1,ASC_ColYears_2,ASC_ColYears_3,ASC_ColYears_4)
	label var evercoll4 "Four-year college (ever)"
		
	// Generate B.A. completion measures
	// baN = 1 if any of the four enrollment slots is a four-year college with a
	// graduation date on or before June 30 of cohort+N. Both measures use one
	// loop so every slot gets the same cohort-based deadline; the earlier code
	// used the loop macro `x' (not set outside its forvalues loop) for slot 1 of
	// ba6, so that deadline did not depend on the student's cohort.
	
	foreach n in 4 6 {
		g ba`n' = 0
		forval slot=1/4 {
			replace ba`n' = 1 if (ASC_ColYears_`slot'=="4")&(GradDate_`slot'<=mdy(6,30,cohort+`n'))
		}
	}
	// Cohorts past these cutoffs are set to missing rather than zero
	replace ba4 = . if cohort>2010
	replace ba6 = . if cohort>2008
		
	drop *ColYears* GradDate* Transition_Time
	destring DI_Code_1, replace
	order satrmap cohort DI_Code_1 
	compress
	save college.dta, replace
	
	}
	end
	
	// Score sends
	// Counts each student's score sends, overall and to colleges above several
	// graduation-rate and baseline-earnings cutoffs.
	// Reads unitid_ceeb_crosswalk.dta, scoresends.dta, gradrate.dta and
	// earnings.dta; writes scoresends_collapsed.dta (one row per cb_rec_id and cohort).
	program define scoresends
	{
	
	// Start from the code crosswalk and attach it to every score send
	use unitid_ceeb_crosswalk.dta, clear 
	drop if (dicbcode==2660)&(ASC_UnitID_1!=214777)	// Penn State U codes aren't unique
	rename dicbcode di
	merge 1:m di using scoresends.dta, keep(match using) nogenerate
	drop if di==9999
	
	// Attach graduation rates to each score send; sends without a rate get the
	// sample mean before the threshold indicators (0.20, 0.50, 0.80) are built
	
	merge m:1 ASC_UnitID_1 using gradrate.dta, keep(match master) nogenerate
	summarize gradrate
	replace gradrate = r(mean) if gradrate==.
	foreach cut of numlist 20 50 80 {
		generate byte sends_gr`cut' = inrange(gradrate,`cut'/100,1)
	}
	drop gradrate ASC_UnitID_1
	
	// Attach college earnings to each score send
	// Missing college-year earnings are filled with fitted values from a
	// regression on year and college indicators before the baseline is computed
	
	preserve
	use DI_Code_1 inc_* if DI!=. using earnings.dta, clear
	rename DI di
	reshape long inc_, i(di) j(year)
	regress inc_ i.year i.di
	predict inc_hat
	replace inc_ = inc_hat if inc_==.
	drop inc_hat
	reshape wide inc_, i(di) j(year)
	replace inc_baseline = (inc_1980+inc_1981+inc_1982)/3 if inc_baseline==.
	keep di inc_baseline inc_1988
	save temp.dta, replace
	restore
	merge m:1 di using temp.dta, keep(match master) nogenerate
	erase temp.dta
	// Sends still lacking earnings get the sample mean
	foreach earn of varlist inc_baseline inc_1988 {
		summarize `earn'
		replace `earn' = r(mean) if `earn'==.
	}
	foreach cut of numlist 35 50 65 {
		generate byte sends_base`cut' = (inc_base>1000*`cut')
	}	
	drop inc_*
	
	// Generate send variables: each row is one send, so summing the indicators
	// gives per-student counts
	
	generate byte sends = 1
	collapse (sum) sends*, by(cb_rec_id cohort)
	label variable sends "Score sends"
	compress
	save scoresends_collapsed.dta, replace
	
	}
	end
	
	// Final retaking data set
	// Combines demographics, college outcomes, SAT scores and score sends into
	// one student-level file, attaches ZIP-level and college-earnings
	// covariates, and saves two versions:
	//   retaking_splines.dta - one row per student
	//   retaking_stacked.dta - two rows per student, one for the 100-point
	//                          threshold at or below the first score and one
	//                          for the threshold above it
	program define retaking
	{	
	
	// Merge
	
	use demographics.dta, clear
	merge 1:1 cohort satrmap using college.dta, keep(match) nogen
	// Strip the "S04"/"S05" substrings and surrounding blanks so the ID matches satscores.dta
	replace satrmap = subinstr(satrmap,"S04","",.)
	replace satrmap = subinstr(satrmap,"S05","",.)
	replace satrmap = trim(satrmap)
	merge 1:1 cohort satrmap using satscores.dta, keep(match) nogen
	drop satrmap
	merge 1:1 cohort cb_rec_id using scoresends_collapsed, keep(match master) nogen
	// Students absent from the score-send file sent zero scores
	foreach var of varlist sends* {
		replace `var' = 0 if `var'==.
	}
	drop cb_rec_id
	label var cohort "High school class"
	
	// Generate total SAT scores
	
	g int mvw = m+v+w
	g int maxmvw = maxm+maxv+maxw
	label var mvw "First SAT score"
	label var maxmvw "Maximum SAT score"

	// Merge to state
	
	destring zipcode, force replace
	merge m:1 zipcode using zipcodes.dta, keep(match master) nogen
	drop count
	
	// Merge to 2006 ZIP code income from IRS
	// (unmatched ZIP codes receive the sample mean)
	
	replace zipcode = . if !inrange(zipcode,1,99999)
	merge m:1 zipcode using income_by_zip_2006.dta, keep(match master) nogen
	sum zipinc
	replace zipinc = r(mean) if zipinc==.
	replace zipinc = round(zipinc)
	
	// Merge to 2003 USDA rurality measures by ZIP code
	
	merge m:1 zipcode using rurality.dta, keep(match master) nogen
	drop lzden
	
	// Merge to college earnings for class of 2006 (birth cohort of 1988) in 2014 from Chetty et al.
	// Missing college-year earnings are filled with fitted values from a
	// regression on year and college indicators. The missing college code is
	// recoded to 0 for the reshape and regression, then restored to missing.
	
	preserve
	use DI_Code_1 inc_* using earnings.dta, clear
	replace DI = 0 if DI==.
	reshape long inc_, i(DI_Code_1) j(year)
	reg inc_ i.year i.DI_Code_1
	predict inc_hat
	replace inc_ = inc_hat if inc_==.
	drop inc_hat
	reshape wide inc_, i(DI_Code_1) j(year)
	replace inc_baseline = (inc_1980+inc_1981+inc_1982)/3 if inc_baseline==.
	replace DI = . if DI==0
	keep DI inc_baseline
	save temp.dta, replace
	restore
	merge m:1 DI_Code_1 using temp.dta, keep(match master) nogen
	drop DI_Code_1
	rm temp.dta
	
	// Save splines version of data (each row is unique student)
	compress
	save retaking_splines.dta, replace
	
	// Final cleanup for stacked version of data
	
	// Store the row ID as long: the default float type cannot hold distinct
	// integers above 16,777,216, so IDs could collide in a large enough file
	g long id = _n
	// Duplicate each student; copy==0 pairs the student with the threshold at or
	// below the first score, copy==1 with the next threshold up
	expand 2, g(copy)
	g int threshold = 100*(int(mvw/100)+copy)
	drop copy
	g byte distance = (mvw-threshold)
	g byte below = (distance<0)
	g byte below_distance = below*distance
	keep if inrange(threshold,700,2300)&(distance!=.)
	compress 
	save retaking_stacked.dta, replace
		
	}
	end
	
// Pipeline driver. retaking reads the .dta files saved by the four programs
// commented out below (satscores.dta, demographics.dta, college.dta,
// scoresends_collapsed.dta), so uncomment them to rebuild those inputs first.
*satscores
*demographics
*college
*scoresends
retaking
