/************************************************************************************************/
/* This program assembles data that we use in our analysis of bunching. I assume the following: */
/*  1. The unit of observation is the SSN-year. For each SSN, we have a time series of yearly   */
/*      earnings. It does not matter whether we have observations with zero earnings.           */
/*  2. We have the following variables                                                          */
/*      a. id: uniquely identifies SSN (need not be SSN, of course)                             */
/*      b. year:  year of earnings                                                              */
/*      c. startDate: date that the SSN began receiving benefits, formatted as "MM/DD/YYYY"     */
/*      d. tob: (time-invariant) type of benefits, AXR being "retired worker"                   */
/*      e. dob: date of birth, formatted as "MM/DD/YYYY"                                        */
/*      f. ANNUAL_EARNINGS: nominal earnings for the given year, including self-emp earnings    */
/*      g. seEarnings: self-employment earnings for the given year                              */
/*      h. mbc: monthly benefit amount (time invariant)                                         */
/************************************************************************************************/

* Bin width handed to round() when the distance to the threshold is binned.
local bw = 800

* Every sample indicator that the main loop constructs.
local samples "s1All s1Claim s1SE s1Main s1Male s1HighE s1HighV1 s1HighV2 s1HighV3 s1HighV4"
* Only these two are processed: this second definition replaces the list above.
local samples "s1All s1Claim"

qui forvalues d = 0/9{

	/* The raw file carries different variable names depending on where the
	   data are stored, so detect the location once and branch on it. Both
	   branches finish with the same working names: yob, mob, age, deathAge,
	   startMonth, startYear, claimed, ANNUAL_EARNINGS, annual_se_earnings. */
	local altLayout = strpos(lower("$directory"), "dropbox") | strpos(lower("$directory"), "bulk/state_eitc")

	if `altLayout' {

		/* read only the records whose digit7 equals the current loop value */
		use id digit7 year y4start m2start tob y4birth m2birth d2birth y4death ///
			annual_earnings annual_se_earnings sex using "$datadir/DATA" ///
			if digit7==`d' , clear

		/* harmonize names */
		rename y4birth yob
		rename m2birth mob
		rename d2birth dob
		rename y4start startYear
		rename m2start startMonth
		rename annual_earnings ANNUAL_EARNINGS

		/* age in the earnings year, age at death, and claim status */
		gen age = year - yob
		gen deathAge = y4death - yob
		gen claimed = !missing(startYear) & year>=startYear
	}
	else {

		/* read only the records whose digit7 equals the current loop value */
		use id digit7 sex ybirth mbirth dbirth year earnings se bica yent01 ment01 ydeath ///
			using "$datadir/DATA" ///
			if digit7==`d', clear

		/* harmonize names */
		rename ybirth yob
		rename mbirth mob
		rename yent01 startYear
		rename ment01 startMonth
		rename earnings ANNUAL_EARNINGS
		rename se annual_se_earnings

		/* age in the earnings year, age at death, and claim status */
		gen age = year - yob
		gen deathAge = ydeath - yob
		gen claimed = !missing(startYear) & year>=startYear
	}

	/* analysis window: ages 40-73, birth years 1910-1940 */
	keep if inrange(yob, 1910, 1940) & inrange(age, 40, 73)

	/* show the contents of the data once, for the first digit only */
	if `d' == 0 {
		noisily describe 
		noisily summarize
	}

	/* format rules. most important, define thresholds */
		/* Three variables are made: "pre NRA", "post-NRA" and "reaches NRA",
			for the three possible retirement ages */
		/* data from Statistical Supplement to Social Security Bulletin, 
			tables 2.A29 (pre 2000) and 2.A29.1 (2000-onward) */
	gen preNRA = .

	/* runs of years that share one amount */
	replace preNRA = 0     if inlist(year, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950)
	replace preNRA = 600   if inlist(year, 1951, 1952)
	replace preNRA = 900   if inlist(year, 1953, 1954)
	replace preNRA = 1200  if inlist(year, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965)
	replace preNRA = 1500  if inlist(year, 1966, 1967)
	replace preNRA = 1680  if inlist(year, 1968, 1969, 1970, 1971, 1972)
	replace preNRA = 14160 if inlist(year, 2009, 2010)

	/* 1973-2008: the amount changes every year; the list holds "year amount" pairs */
	local preByYear ///
		1973 2100   1974 2400   1975 2520   1976 2760   1977 3000   1978 3240 ///
		1979 3480   1980 3720   1981 4080   1982 4440   1983 4920   1984 5160 ///
		1985 5400   1986 5760   1987 6000   1988 6120   1989 6480   1990 6840 ///
		1991 7080   1992 7440   1993 7680   1994 8040   1995 8160   1996 8280 ///
		1997 8640   1998 9120   1999 9600   2000 10080  2001 10680  2002 11280 ///
		2003 11520  2004 11640  2005 12000  2006 12480  2007 12960  2008 13560
	local nTok : word count `preByYear'
	forvalues k = 1(2)`nTok' {
		local j = `k' + 1
		local yr  : word `k' of `preByYear'
		local amt : word `j' of `preByYear'
		replace preNRA = `amt' if year == `yr'
	}

	/* "post-NRA" schedule; amounts are set for 1940-1999 and stay missing otherwise */
	gen postNRA = .

	/* runs of years that share one amount */
	replace postNRA = 0    if inlist(year, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950)
	replace postNRA = 600  if inlist(year, 1951, 1952)
	replace postNRA = 900  if inlist(year, 1953, 1954)
	replace postNRA = 1200 if inlist(year, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965)
	replace postNRA = 1500 if inlist(year, 1966, 1967)
	replace postNRA = 1680 if inlist(year, 1968, 1969, 1970, 1971, 1972)

	/* 1973-1999: the amount changes every year; the list holds "year amount" pairs */
	local postByYear ///
		1973 2100   1974 2400   1975 2520   1976 2760   1977 3000   1978 4000 ///
		1979 4500   1980 5000   1981 5500   1982 6000   1983 6600   1984 6960 ///
		1985 7320   1986 7800   1987 8160   1988 8400   1989 8880   1990 9360 ///
		1991 9720   1992 10200  1993 10560  1994 11160  1995 11280  1996 12500 ///
		1997 13500  1998 14500  1999 15500
	local nTok : word count `postByYear'
	forvalues k = 1(2)`nTok' {
		local j = `k' + 1
		local yr  : word `k' of `postByYear'
		local amt : word `j' of `postByYear'
		replace postNRA = `amt' if year == `yr'
	}

	/* "reaches NRA" schedule; amounts are set for 2000-2010 and stay missing otherwise */
	gen reachesNRA = .
	local reachByYear ///
		2000 17000  2001 25000  2002 30000  2003 30720  2004 31080  2005 31800 ///
		2006 33240  2007 34440  2008 36120  2009 37680  2010 37680
	local nTok : word count `reachByYear'
	forvalues k = 1(2)`nTok' {
		local j = `k' + 1
		local yr  : word `k' of `reachByYear'
		local amt : word `j' of `reachByYear'
		replace reachesNRA = `amt' if year == `yr'
	}

	/* there are two kinks for 1961-1972; fullKink holds the second one
	   and is missing in every other year */
	gen fullKink = .
	replace fullKink = 1500 if year == 1961
	replace fullKink = 1700 if inlist(year, 1962, 1963, 1964, 1965)
	replace fullKink = 2700 if inlist(year, 1966, 1967, 1968, 1969, 1970, 1971, 1972)

	/* Classify every person-year by where the earner stands relative to the
	   normal retirement age (NRA). Codes, as in the value label defined at the
	   end of this section: -1, 0 = no earnings test, 1 = below NRA,
	   2 = at NRA, 3 = above NRA.
	   The replace statements are order-dependent: later lines overwrite
	   earlier ones for the rows they match. */

	/* first era: 1960-1977, earners aged 71 or younger - not quite true, but coding is correct for our purposes */
	/* every row with year <= 1977 starts out as 1; other rows start missing */
	gen nra = 1 if year<=1977

	/* second era: 1978-1982, pre 65/post65 distinction */
	/* NOTE(review): the next line starts at 1973, not 1978, so it also sets
	   ages 70+ in 1973-1977 to 0. For 1978-1982, ages 70-71 are reset to 3 two
	   lines below, leaving 0 only for ages 72+ in those years. Confirm intent. */
	replace nra = 0 if age >= 70 & year>=1973& year<=1982
	/* NOTE(review): year>=1973 is redundant here given year>=1978 */
	replace nra = 1 if age<65 & year>=1973 & year>=1978 & year<=1982
	replace nra = 3 if age>=65 & age<=71  & year>=1978 & year<=1982

	/* third era: 1983-1999, exempt if age>=70, nra = 65 */
	replace nra = 0 if age>=70 & year>=1983 & year<=1999
	replace nra = 1 if age<65  & year>=1983 & year<=1999
	replace nra = 3 if age>=65 & age<=69 & year>=1983 & year<=1999

	/* fourth era: 2000-onward, exempt if age>=70, nra = 65 */
	/* note that the NRA actually grows slightly,           */
	/* so I set NRA = 66 when NRA>65.5                      */
	/* although our data do not actually go this far        */
	/* 2000-2004: age 65 is "at NRA" and 66 is "above";     */
	/* 2005-2010: age 65 is "below NRA" and 66 is "at NRA"  */

	replace nra = 0 if age>=70 & year>=2000
	replace nra = 1 if age>=62 & age<=64 & year>=2000
	replace nra = 2 if age==65 & year>=2000 & year<=2004
	replace nra = 1 if age==65 & year>=2005 & year<=2010
	replace nra = 3 if age==66 & year>=2000 & year<=2004
	replace nra = 2 if age==66 & year>=2005 & year<=2010
	replace nra = 3 if age>=67 & age<=69 & year>=2000

	/* no nra if you're younger than 62 (age <= 61); this applies to every
	   year and, coming last, overrides whatever was assigned above */
	replace nra = -1 if age <=61
	/* stop if any row is still unclassified. NOTE(review): ages 65 and 66 in
	   years after 2010 are not matched by any rule above and would fail here */
	noisily di "nra assert"
	assert !missing(nra)

	/* NOTE(review): the label text for -1 says "age 62 or less", but the code
	   above assigns -1 only when age <= 61 */
	label var nra "nra - age eligibility"
	label define nra -1 "-1. age 62 or less" 0 "0. no earnings test" 1 "1. age below nra" 2 "2. age at nra" 3 "3. age above nra"
	label values nra nra

	// Pick the earnings-test threshold for each person-year from the
	// schedule that matches its nra code and year:
	//   nra -1 or 1            -> preNRA
	//   nra 0 or 3, to 1999    -> postNRA
	//   nra 2, or 0/3 from 2000 -> reachesNRA
	gen threshold = preNRA if inlist(nra, -1, 1)
	replace threshold = postNRA if inlist(nra, 0, 3) & year<=1999
	replace threshold = reachesNRA if nra == 2 | ( inlist(nra, 0, 3) & year>=2000 )

	// every remaining row must have a threshold
	drop if missing(age) | missing(year)
	noisily di "threshold assert"
	assert !missing(threshold)

	// make real threshold/earnings/etc: rescale nominal amounts to 2010 prices
	merge m:1 year using "$datadir/cpi"

	// The 2010 price level is read before unmatched rows are dropped, so
	// rows that exist only in the cpi file still contribute to it.
	sum price_level if year == 2010
	local p2010 = r(mean)

	// Guard: if the cpi file has no 2010 row, r(mean) is missing and every
	// real_* variable below would silently become missing. Stop instead.
	noisily di "cpi assert"
	assert !missing(`p2010')

	// keep only person-years that found a price level
	keep if _merge == 3
	drop _merge

	rename ANNUAL_EARNINGS earnings
	gen real_se_earnings = annual_se_earnings*`p2010'/price_level
	gen real_threshold = threshold*`p2010'/price_level
	gen real_earnings = earnings*`p2010'/price_level
	gen real_fullKink = fullKink*`p2010'/price_level
	// signed gap between earnings and the applicable threshold
	gen real_distance = real_earnings - real_threshold
	
	// the three schedules themselves, at 2010 prices
	gen realPre = preNRA * `p2010'/price_level
	gen realPost = postNRA * `p2010'/price_level
	gen realReaches = reachesNRA * `p2010'/price_level	

	// lifetime earnings measures, built from real earnings at ages 40-61
	if `d' == 0 noi di "  start lifetime earnings  "

	// conditional on positive earnings: per-id total divided by 22, and
	// per-id standard deviation
	gen posEarn = real_earnings if inrange(age, 40, 61) & real_earnings >0
	egen lifetimeEarning = total(posEarn/22), by(id)
	egen lifetimeSDCOP = sd(posEarn), by(id)
	drop posEarn

	// all years in the age window, zeros included: per-id standard deviation
	gen tt = real_earnings if inrange(age, 40, 61)
	egen lifetimeSD = sd(tt), by(id)
	
	// set samples: main, se, male, high lifetime inc, high vol defns 1-4
	// Each s1* variable is a 0/1 indicator on the person-year row.
	if `d' == 0 noi di "  pre samples  "
	// s1SE:    startYear no later than yob+65, with positive SE earnings this year
	// s1All:   no positive SE earnings this year
	// s1Main:  startYear no later than yob+65 and not in s1SE
	// s1Claim: in s1All and startYear no later than yob+age
	// (a missing startYear fails every "<=" test, so such rows get 0)
	gen s1SE = (startYear<=yob+65) & annual_se_earnings > 0 & !missing(annual_se_earnings)
	gen s1All = ~(annual_se_earnings > 0 & !missing(annual_se_earnings))
	gen s1Main = (startYear<=yob+65) & !s1SE
	gen s1Claim = s1All & startYear <= yob+ age
	
	// sex is compared as the string "M" first; if that command fails,
	// fall back to the numeric coding sex == 1
	if `d' == 0 noi di "  pre male  "
	capture gen s1Male = s1Main & sex == "M"
	if _rc ~= 0 {
		if `d' == 0 {
			noisily tab sex
		}
		gen s1Male = s1Main & sex == 1
		noi di "  error for s1Male, trying numeric"
	}

	// High-X samples: s1Main rows whose measure lies above the median taken
	// over s1Main rows. Stata treats a missing value as larger than any
	// number, so "x > r(p50)" alone is true when x is missing (e.g. an SD
	// with fewer than two observations, or a CV whose denominator is zero).
	// The !missing() guards keep those rows out of the "high" groups.
	sum lifetimeEarning if s1Main, det
	gen s1HighE = s1Main & lifetimeEarning > r(p50) & !missing(lifetimeEarning)
	
	// defn 1: SD of real earnings, zeros included
	sum lifetimeSD if s1Main, det
	gen s1HighV1 = s1Main & lifetimeSD > r(p50) & !missing(lifetimeSD)

	// defn 2: coefficient of variation based on defn 1
	gen lifetimeCV = lifetimeSD/lifetimeEarning
	sum lifetimeCV if s1Main, det
	gen s1HighV2 = s1Main & lifetimeCV > r(p50) & !missing(lifetimeCV)

	// defn 3: SD of real earnings, positive-earnings years only
	sum lifetimeSDCOP if s1Main, det
	gen s1HighV3 = s1Main & lifetimeSDCOP > r(p50) & !missing(lifetimeSDCOP)
	
	// defn 4: coefficient of variation based on defn 3
	gen lifetimeCVCOP = lifetimeSDCOP/lifetimeEarning
	sum lifetimeCVCOP if s1Main, det
	gen s1HighV4 = s1Main & lifetimeCVCOP > r(p50) & !missing(lifetimeCVCOP)
	
	
	// Bin the distance to the threshold, then write one file of cell counts
	// (year x age x bin) per sample for the current value of d
	gen realDistanceRound = round(real_distance, `bw')
	keep if real_earnings != 0
	keep `samples' year age realDistanceRound

	// scratch copy that each sample is re-read from
	local scratch "$datadir/ssaDataAll"
	save "`scratch'", replace

	foreach s of local samples {
		use "`scratch'" if `s', clear
		contract year age realDistanceRound, freq(count)
		save "$datadir/ssa_`s'_`d'", replace
	}

	// the scratch copy is no longer needed
	erase "`scratch'.dta"
}


* For each sample, stack the ten per-digit count files and add up the counts
* within each year x age x bin cell.
foreach s of local samples {
	local stub "$datadir/ssa_`s'"

	clear
	forvalues part = 0/9 {
		append using "`stub'_`part'"
	}

	collapse (sum) count, by(year age realDistanceRound)
	save "$datadir/rr2_`s'_`bw'", replace
}


** Clean up: delete the intermediate per-digit count files
foreach s of local samples {
	forvalues part = 0/9 {
		local piece "$datadir/ssa_`s'_`part'.dta"
		erase "`piece'"
	}
}
