/************************************************************************************************/
/* This program assembles data that we use in our analysis of bunching. I assume the following: */
/*  1. The unit of observation is the SSN-year. For each SSN, we have a time series of yearly   */
/*      earnings. It does not matter whether we have observations with zero earnings.           */
/*  2. We have the following variables                                                          */
/*      a. id: uniquely identifies SSN (need not be SSN, of course)                             */
/*      b. year:  year of earnings                                                              */
/*      c. startDate: date that the SSN began receiving benefits, formatted as "MM/DD/YYYY"     */
/*      d. tob: (time-invariant) type of benefits, AXR being "retired worker"                   */
/*      e. dob: date of birth, formatted as "MM/DD/YYYY"                                        */
/*      f. ANNUAL_EARNINGS: nominal earnings for the given year, including self-emp earnings    */
/*      g. seEarnings: self-employment earnings for the given year                              */
/*      h. mbc: monthly benefit amount (time invariant)                                         */
/************************************************************************************************/

/* Default to the bulk/state_eitc layout when the caller has not set $directory */
if "$directory" == "" {
    global directory "bulk/state_eitc"
}

/* Bin widths used later in this file when real_distance is rounded into bins */
local bwList "400 500 800 1600"

/* 1 when $directory points at a dropbox or bulk/state_eitc location, 0 otherwise. */
/* The two locations carry the same information under different variable names.   */
local bulkNames = strpos(lower("$directory"), "dropbox") | strpos(lower("$directory"), "bulk/state_eitc")

/* Process the data one digit7 value at a time (0 through 9) */
qui forvalues d = 0/9{

/* load different variables depending on where the data are, */
/* then rename them to one common set of names               */
if `bulkNames' {
    use id digit7 year y4start m2start tob y4birth m2birth d2birth y4death annual_earnings annual_se_earnings sex ///
      using "$datadir/DATA" ///
      if digit7==`d', clear

    rename d2birth dob
    rename m2birth mob
    rename y4birth yob
    rename m2start startMonth
    rename y4start startYear
    rename annual_earnings ANNUAL_EARNINGS

    /* name of the year-of-death variable in this layout */
    local deathYear y4death
}
else {
    use id digit7 sex ybirth mbirth dbirth year earnings se bica yent01 ment01 ydeath ///
      using "$datadir/DATA" ///
      if digit7==`d', clear

    rename mbirth mob
    rename ybirth yob
    rename ment01 startMonth
    rename yent01 startYear
    rename earnings ANNUAL_EARNINGS
    rename se annual_se_earnings

    /* name of the year-of-death variable in this layout */
    local deathYear ydeath
}

/* format age/claim: identical definitions for both layouts */
gen age = year - yob
gen deathAge = `deathYear' - yob
/* claimed = 1 from the benefit start year onward; 0 when there is no start year */
gen claimed = year>=startYear & !missing(startYear)


/* format rules. most important, define thresholds */
    /* Three nominal exempt-amount series, one per earnings-test age group:          */
    /*   preNRA, postNRA and reachesNRA.                                             */
    /* data from Statistical Supplement to Social Security Bulletin,                 */
    /* tables 2.A29 (pre 2000) and 2.A29.1 (2000-onward)                             */
    gen preNRA = .
    gen postNRA = .

    /* 1940-1977: the two series carry identical values, so fill them together */
    foreach v of varlist preNRA postNRA {
        replace `v' = 0    if inlist(year, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950)
        replace `v' = 600  if inlist(year, 1951, 1952)
        replace `v' = 900  if inlist(year, 1953, 1954)
        replace `v' = 1200 if inlist(year, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965)
        replace `v' = 1500 if inlist(year, 1966, 1967)
        replace `v' = 1680 if inlist(year, 1968, 1969, 1970, 1971, 1972)
        replace `v' = 2100 if year == 1973
        replace `v' = 2400 if year == 1974
        replace `v' = 2520 if year == 1975
        replace `v' = 2760 if year == 1976
        replace `v' = 3000 if year == 1977
    }

    /* preNRA, 1978-2010: one amount per consecutive year, in order */
    local yr = 1978
    foreach amt in 3240  3480  3720  4080  4440    /// 1978-1982
                   4920  5160  5400  5760  6000    /// 1983-1987
                   6120  6480  6840  7080  7440    /// 1988-1992
                   7680  8040  8160  8280  8640    /// 1993-1997
                   9120  9600  10080 10680 11280   /// 1998-2002
                   11520 11640 12000 12480 12960   /// 2003-2007
                   13560 14160 14160 {
        /* last row above covers 2008-2010 */
        replace preNRA = `amt' if year == `yr'
        local ++yr
    }

    /* postNRA, 1978-1999: one amount per consecutive year, in order */
    local yr = 1978
    foreach amt in 4000  4500  5000  5500  6000    /// 1978-1982
                   6600  6960  7320  7800  8160    /// 1983-1987
                   8400  8880  9360  9720  10200   /// 1988-1992
                   10560 11160 11280 12500 13500   /// 1993-1997
                   14500 15500 {
        /* last row above covers 1998-1999; postNRA stays missing from 2000 on */
        replace postNRA = `amt' if year == `yr'
        local ++yr
    }

    /* reachesNRA, 2000-2010 only (missing in every other year) */
    gen reachesNRA = .
    local yr = 2000
    foreach amt in 17000 25000 30000 30720 31080   /// 2000-2004
                   31800 33240 34440 36120 37680   /// 2005-2009
                   37680 {
        /* last row above is 2010 */
        replace reachesNRA = `amt' if year == `yr'
        local ++yr
    }

    /* there are two kinks for 1961-1972; fullKink is the upper one */
    /* NOTE(review): 1968-1972 carries the same 2700 as 1966-1967 - confirm against table 2.A29 */
    gen     fullKink = 1500 if year == 1961
    replace fullKink = 1700 if inlist(year, 1962, 1963, 1964, 1965)
    replace fullKink = 2700 if inlist(year, 1966, 1967, 1968, 1969, 1970, 1971, 1972)

    /* Rows with no age or no year cannot be classified. Drop them before       */
    /* classifying so the asserts below only test rows that can reach the output */
    drop if missing(age) | missing(year)

    /* nra codes: -1 = age 61 or less, 0 = no earnings test, 1 = below nra,      */
    /*             2 = at nra, 3 = above nra.                                    */
    /* The replaces below are applied in order; later lines override earlier ones */

    /* first era: through 1977 everyone starts in group 1 - not quite true, but coding is correct for our purposes */
    gen nra = 1 if year<=1977

    /* second era: 1978-1982, pre 65/post65 distinction */
    /* NOTE(review): the first line starts at 1973, not 1978, so ages 70+ in 1973-1977 get 0. */
    /* preNRA and postNRA are equal in those years, so the threshold is unaffected - confirm intent */
    replace nra = 0 if age >= 70 & year>=1973 & year<=1982
    replace nra = 1 if age<65 & year>=1978 & year<=1982
    /* overrides the 0 assigned just above for ages 70-71 in 1978-1982 */
    replace nra = 3 if age>=65 & age<=71  & year>=1978 & year<=1982

    /* third era: 1983-1999, exempt if age>=70, nra = 65 */
    replace nra = 0 if age>=70 & year>=1983 & year<=1999
    replace nra = 1 if age<65  & year>=1983 & year<=1999
    replace nra = 3 if age>=65 & age<=69 & year>=1983 & year<=1999

    /* fourth era: 2000-onward, exempt if age>=70                      */
    /* "at nra" is age 65 in 2000-2004 and age 66 in 2005-2010         */
    /* note that the NRA actually grows slightly,                      */
    /* so I set NRA = 66 when NRA>65.5                                 */
    /* ages 65-66 after 2010 are left unassigned and fail the assert,  */
    /* although our data do not actually go this far                   */

    replace nra = 0 if age>=70 & year>=2000
    replace nra = 1 if age>=62 & age<=64 & year>=2000
    replace nra = 2 if age==65 & year>=2000 & year<=2004
    replace nra = 1 if age==65 & year>=2005 & year<=2010
    replace nra = 3 if age==66 & year>=2000 & year<=2004
    replace nra = 2 if age==66 & year>=2005 & year<=2010
    replace nra = 3 if age>=67 & age<=69 & year>=2000

    /* no nra if you're 61 or younger, in any year */
    replace nra = -1 if age <=61
    noisily di "nra assert"
    assert !missing(nra)

    label var nra "nra - age eligibility"
    label define nra -1 "-1. age 61 or less" 0 "0. no earnings test" 1 "1. age below nra" 2 "2. age at nra" 3 "3. age above nra"
    label values nra nra

    /* Earnings thresholds, by year, for SS earnings test                       */
    /*   groups -1 and 1          : preNRA                                      */
    /*   group 2                  : reachesNRA                                  */
    /*   groups 0 and 3, 2000 on  : reachesNRA                                  */
    /*   groups 0 and 3, to 1999  : postNRA                                     */
    gen threshold = preNRA if nra == 1 | nra == -1
    replace threshold = reachesNRA if nra == 2 | ( (nra == 3 | nra == 0) & year>=2000 )
    replace threshold = postNRA if (nra==3 | nra == 0) & year<=1999
    noisily di "threshold assert"
    assert !missing(threshold)

/* make real threshold/earnings/etc (everything is rescaled to the 2010 price level) */
    merge m:1 year using "$datadir/cpi"
    /* read the 2010 price level before unmatched rows are dropped, so it is */
    /* found even when this slice of the data has no 2010 observations       */
    sum price_level if year == 2010
    local p2010 = r(mean)
    /* without a 2010 price level every real variable below would silently be missing */
    assert !missing(`p2010')
    keep if _merge == 3
    drop _merge

    rename ANNUAL_EARNINGS earnings
    gen real_se_earnings = annual_se_earnings*`p2010'/price_level
    gen real_threshold = threshold*`p2010'/price_level
    gen real_earnings = earnings*`p2010'/price_level
    gen real_fullKink = fullKink*`p2010'/price_level
    /* gen real_benefit  = 12*(mbc*`p2010'/price_level) Turned off 20120615 DP */
    /* signed distance of real earnings from the real threshold */
    gen real_distance = real_earnings - real_threshold

    gen realPre = preNRA * `p2010'/price_level
    gen realPost = postNRA * `p2010'/price_level
    gen realReaches = reachesNRA * `p2010'/price_level


    /* make lead earnings/distance/threshold */
    /* NOTE(review): leads and lags take the adjacent observation for the id, */
    /* which is not the adjacent calendar year if an id has gaps in its years */
    sort id year
    by id: gen earnLead1    = real_earnings[_n+1]
    by id: gen distLead1    = real_distance[_n+1]
    by id: gen threshLead1  = real_threshold[_n+1]

    /* make lag earnings/distance/threshold */
    by id: gen earnLag1    = real_earnings[_n-1]
    by id: gen distLag1    = real_distance[_n-1]
    by id: gen threshLag1  = real_threshold[_n-1]

    /* make lifetime earnings (ages 40-61): per-id total of real earnings in that age range */
    egen lifetimeEarning = total(real_earnings * (age>=40 & age<=61)), by(id)

    ** Future LFP
    /* nilf = 1 when real earnings are missing or zero */
    gen nilf = missing(real_earnings) | real_earnings == 0
    /* egen may leave the data in a different order, so re-establish id/year */
    /* order before subscripting with _n                                     */
    bysort id (year): gen nilfLead = nilf[_n+1]
    /* a gap in the id's years counts as not working in the following year */
    by id: replace nilfLead = 1 if year[_n+1] != year+1
    /* the final observation of each id has no following year to look at. */
    /* This must run by id: without the prefix only the last row of the   */
    /* whole dataset is blanked and every other final observation stays 1 */
    by id: replace nilfLead = . if _n == _N

/* Build the 0/1 sample indicators listed in `samples', then report their sizes */

    /* conditions reused below */
    local seFiler "(annual_se_earnings > 0 & !missing(annual_se_earnings))"
    local worked  "!missing(earnings) & earnings>0"

    /* s1Both    : benefit start year no later than the year of turning 65      */
    /* s1Placebo : benefit start year after the current year                    */
    /* s1SE      : s1Both with positive self-employment earnings                */
    /* s1Full    : no positive self-employment earnings                         */
    /* s1Main    : s1Both without positive self-employment earnings             */
    /* s1Claim   : s1Full and (already started benefits or younger than 62)     */
    /* s1JanBorn / s1Born13 / s1Born16 : s1Main born in month 1 / 1-3 / 1-6     */
    gen s1Both = startYear<=yob+65
    gen s1Placebo = startYear>year
    gen s1SE = s1Both & `seFiler'
    gen s1Full = !`seFiler'
    gen s1Main = s1Both & !s1SE
    gen s1Claim = (startYear<=year | age<62) & s1Full
    gen s1JanBorn = mob == 1  & s1Main
    gen s1Born13 = inrange(mob, 1, 3) & s1Main
    gen s1Born16 = inrange(mob, 1, 6) & s1Main

    local samples "s1Both s1Placebo s1SE s1Full s1Main s1Claim s1JanBorn s1Born13 s1Born16"

    /* overall counts among observations with positive earnings */
    sum age if `worked'
    local nTotal = r(N)
    local aMin = r(min)
    local aMax = r(max)
    count if age==62 & `worked'
    local n62 = r(N)
    count if startMonth == 1 & `worked'
    local nJan = r(N)
    noisily di "`d':"
    noisily di "  Total: `nTotal', age 62 count: `n62', jan count: `nJan' age range: `aMin'-`aMax'"

    /* the same counts within each sample (no January count here) */
    foreach sample of local samples{

        sum age if `sample' & `worked'
        local cS = r(N)
        local aMin = r(min)
        local aMax = r(max)

        count if age==62 & `sample' & `worked'
        local n62 = r(N)

        noisily di "  Sample: `sample' ; count: `cS', age 62 count: `n62', age range: `aMin'-`aMax'"
    }


/* save the samples: one file per sample flag for the current value of d */

    /* trim to the variables the later steps use, and park the result on disk */
    keep `samples' ///
        id year earnings age threshold claimed deathAge yob startYear sex ///
        real_earnings real_threshold real_distance lifetimeEarning real_se_earnings ///
        earnLag1 distLag1 threshLag1 earnLead1 distLead1 threshLead1 nilfLead ///
        mob realPre realPost realReaches
    save "$datadir/ssaDataAll", replace

    /* reload the parked file for each flag, keep the flagged rows, and */
    /* drop the flag variables themselves before saving                 */
    foreach flag of local samples{
        use "$datadir/ssaDataAll", clear
        keep if `flag'
        drop `samples'
        save "$datadir/ssa_`flag'_`d'", replace
    }

    /* memory now holds the last sample written; check that it is not empty */
    noisily di "  count assert, d= `d'"
    count
    assert r(N) > 0

    /* the parked file is only scratch space */
    erase "$datadir/ssaDataAll.dta"
}


// Main sample, look at transitions
// Stack the ten s1Both files, keeping ages 57-74 only
use "$datadir/ssa_s1Both_0" if inrange(age, 57, 74), clear
forvalues d = 1/9{
    append using "$datadir/ssa_s1Both_`d'"
    // append has no if qualifier, so trim after each file to keep memory down
    keep if inrange(age, 57, 74)
}
desc
compress
// The three real_* variables are named explicitly rather than as a positional
// range, so the selection does not depend on the order the variables were created in.
// *se* picks up real_se_earnings and also sex.
keep id year age real_threshold real_earnings real_distance ///
    realPre realPost realReaches *se* startYear
save "$datadir/ssa_s1Both_micro", replace


/* samples that get a binned (CFOP) dataset; s1Both is not in this list */
local samples "s1Placebo s1SE s1Full s1Main s1Claim s1JanBorn s1Born13 s1Born16"

/* Make CFOP datasets: collapse to bin counts */
/* One output file per sample and bin width, with one row per (bin, age, year) */
foreach sample of local samples{
    foreach bw of local bwList{

        /* get counts for each decile */
        forvalues d = 0/9{
            use "$datadir/ssa_`sample'_`d'" if !missing(real_earnings), clear
            gen count = 1
            /* bin = real distance from the threshold, rounded to the nearest multiple of bw */
            gen realDistanceRound = round(real_distance, `bw')
            /* lives`age' = reached that age; defined only for cohorts that are  */
            /* at least that old in 2006. A missing deathAge compares as larger  */
            /* than any age, so it counts as having reached it                   */
            forvalues age = 70(5)85{
                gen lives`age' = (deathAge>=`age') if (2006-yob >=`age')
            }
            /* copy of deathAge that is collapsed to an sd and then squared into a variance */
            gen deathAgeVar = deathAge
            collapse (rawsum) count (mean) real_threshold lives* deathAge nilfLead (sd) deathAgeVar, by(realDistanceRound age year) fast
            replace deathAgeVar = deathAgeVar^2
            save "$datadir/temp`d'", replace
        }
        /* memory still holds the d=9 cells; stack the other nine on top */
        forvalues d= 0/8{
            append using "$datadir/temp`d'"
            erase "$datadir/temp`d'.dta"
        }
        /* temp9 was saved as well and is no longer needed; remove it too */
        erase "$datadir/temp9.dta"

        /* add up counts */
        /* cell variance of deathAge = mean of the per-file variances */
        /*                           + variance of the per-file means */
        egen meanVar = mean(deathAgeVar), by(realDistanceRound age year)
        egen sdMean = sd(deathAge), by(realDistanceRound age year)
        replace deathAgeVar = meanVar + sdMean^2
        /* counts are summed as they are; the means are weighted by cell count */
        gen wt = count
        collapse (rawsum) count (mean) real_threshold lives* deathAge* meanVar nilfLead [fw=wt], by(realDistanceRound age year) fast

        save "$datadir/cfopSSA_`sample'_`bw'", replace
    }
}
