/*******************************************************************************************
Match CENCOL lottery names to CENCOL wide-net names
***********************************************************************************************/

* Session setup: empty workspace, no paging, and no log left open by an
* earlier run.
clear all
capture log close
set more off
set mem 12g
set matsize 11000

* Project directory layout
global dir   "/home/research/cavoced/cred"
global build "$dir/build"
global analy "$dir/analysis"

* Log this run, then work from the raw CENCOL data folder
log using "$build/prog/CENCOLlottery/namematch_CENCOL.log", replace
cd "$build/raw/CENCOLData"

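/*
*Logic of the merges
	1) Using the full birth date
		a) Merge on first, last, year, month, day, gender
		b) Merge on        last, year, month, day, gender using the XXX file
		   (records whose first name is "XXX")
		c) Drop if a and b BOTH merge on to a CENCOL student
		   (the code below still counts the applicant as matched when a and b
		   point to the same STUDENT_ID)
	2) Of the unmerged
		repeat a/b/c using first, last, year, gender
	3) Of the unmerged
		repeat a/b/c using first, last, gender

*/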




/*********************************************************/
* Lottery side: load the nursing lottery list and collapse it to one record
* per student id (stid), carrying a second spelling of the name when the
* same stid appears under more than one name.

cd "$build/raw/CENCOLData"

	insheet using NursingLotteries.Data.new_MG.csv, names clear
	generate orderitcamein = _n
	describe

	* numeric copy of stid; the characters "N" and "R" are ignored in the conversion
	destring stid, generate(stid_num) ignore("NR")

	* eligibility: many rows are ineligible (some give the reason); keep only
	* "Eligible" and the misspellings of it listed here
	tabulate evalstat
	keep if inlist(evalstat, "Eligible", "Eigible", "Eiigible", "Eiligible", "Elgible", "eligible")
	drop evalstat

	* rows without a student id are dropped
	drop if inlist(stid, "NR", "")

	* ---- term of first application ---------------------------------------
	* normalise the free-text term: upper case, no blanks, listed typos fixed.
	* The order of these replacements is kept exactly as it was.
	replace appl = upper(appl)
	replace appl = subinstr(appl, " ", "", .)
	replace appl = subinstr(appl, "SPIRING", "SPRING", .)
	replace appl = subinstr(appl, "SPRINTG", "SPRING", .)
	replace appl = subinstr(appl, "SPING", "SPRING", .)
	replace appl = subinstr(appl, "FAL2", "FALL2", .)
	replace appl = subinstr(appl, "`", "", .)
	replace appl = subinstr(appl, "20205", "2005", .)
	replace appl = subinstr(appl, "21013", "2013", .)
	replace appl = subinstr(appl, "213", "2013", .)

	* the season becomes a fractional offset (fall .7, spring .1, summer .3);
	* each season word is stripped right after it has been tested for
	generate firstapp_term = 0.7 if substr(appl, 1, 4) == "FALL"
	replace appl = subinstr(appl, "FALL", "", .)
	replace firstapp_term = 0.1 if substr(appl, 1, 6) == "SPRING"
	replace appl = subinstr(appl, "SPRING", "", .)
	replace firstapp_term = 0.3 if substr(appl, 1, 6) == "SUMMER"
	replace appl = subinstr(appl, "SUMMER", "", .)

	* what is left of appl is converted to a number and added to the offset
	destring appl, generate(tempyearapp)
	replace firstapp_term = round(tempyearapp) + firstapp_term
	drop appl tempyearapp

	* ---- program applied for ---------------------------------------------
	* a flag is switched on for every row of a stid/term group as soon as ANY
	* row of that group names such a program
	tempvar hits

	generate program_TRN = inlist(program, "TRN", "RT", "RN", "PMRN", "PRN")
	bysort stid firstapp: egen `hits' = total(program_TRN)
	replace program_TRN = `hits' > 0
	drop `hits'

	generate program_LVN = strpos(program, "LV") > 0
	bysort stid firstapp: egen `hits' = total(program_LVN)
	replace program_LVN = `hits' > 0
	drop `hits'

	* program_T is a variable-name abbreviation (presumably of program_TRN)
	keep if program_T

	* ---- names -------------------------------------------------------------
	replace lastname = upper(lastname)
	replace firstname = upper(firstname)
	keep firstname_a lastname_a stid day month year gender
	duplicates drop

	* same name + birth date + gender on more than one remaining row: drop all of them
	bysort firstname lastname day month year gender: generate numdup = _N
	tabulate numdup
	drop if numdup > 1
	drop numdup

	* ---- alternate names -----------------------------------------------------
	* a stid that occurs on several rows keeps its first row only; the name on
	* the row that follows it is stored as the alternate name
	duplicates tag stid, generate(multiname)
	bysort stid: generate order = _n
	sort stid
	generate altname_first = ""
	generate altname_last = ""
	replace altname_first = firstname[_n+1] if order == 1 & multiname > 0
	replace altname_last = lastname[_n+1] if order == 1 & multiname > 0
	drop if order > 1

	* ---- flat file -------------------------------------------------------------
	* report, then again remove, anything duplicated on name + birth date + gender
	duplicates report lastname firstname month year day gender
	bysort lastname firstname month year day gender: generate numdup = _N
	tabulate numdup
	drop if numdup > 1
	drop numdup

	tempfile tempfullnurses
	save `tempfullnurses', replace


***********************
* Candidate pool for the first pass: the records in all_potmatches, split into
*   temp_firstlast - records whose first name is not "XXX"
*   temp_last      - records whose first name is "XXX" (first name dropped)

	use $build/raw/all_potmatches, clear
	describe
	keep STUDENT_ID ST_NAME_FIRST ST_NAME_LAST ST_GENDER birthmonth birthyear birthday

	* use the same variable names as the lottery file
	rename ST_NAME_FIRST firstname_a
	rename ST_NAME_LAST  lastname_a
	rename ST_GENDER     gender
	rename birthmonth    month
	rename birthyear     year
	rename birthday      day

	* birth-date parts are turned into strings
	* NOTE(review): presumably to match the storage type of the lottery file - confirm
	tostring month year day, replace

	duplicates drop

	* flag the placeholder first name
	generate xxx = firstname_a == "XXX"
	tabulate xxx

	* anything duplicated on the full key (name, birth date, gender) is dropped
	* before the split, so both halves start from the same de-duplicated base
	bysort firstname_a lastname_a month year day gender: generate numdupl = _N
	tabulate numdupl
	drop if numdupl > 1
	drop numdupl

	* half 1: records with a real first name
	preserve
		keep if firstname_a != "XXX"
		tempfile temp_firstlast
		save `temp_firstlast'
	restore

	* half 2: "XXX" records, which must additionally be unique without the first name
	keep if firstname_a == "XXX"
	drop firstname_a
	bysort lastname_a month year day gender: generate numdupl = _N
	tabulate numdupl
	drop if numdupl > 1
	drop numdupl
	tempfile temp_last
	save `temp_last'
*******************************************************************
* Round 1 - match on name, full birth date and gender
********************************************************************

	* (a) first + last name, birth date, gender against temp_firstlast
	use `tempfullnurses', clear
	di "THE NUMBER OF POTENTIAL MATCHES, UNDUPLICATED, is " _N
	merge 1:1 lastname firstname month year day gender using `temp_firstlast', generate(mergefirstlast)
	* only the matched pairs are stored: stid <-> STUDENT_ID
	keep if mergefirstlast==3
	keep stid mergefirstlast STUDENT_ID
	rename STUDENT_ID ssn_firstlast
	tempfile matched_firstlast
	save `matched_firstlast'

	* (b) last name, birth date, gender against temp_last (the XXX sample);
	*     applicants who are not unique on this shorter key are left out
	use `tempfullnurses', clear
	bysort lastname month year day gender: gen numdup=_N
	tab numdup
	drop if numdup>1
	drop numdup
	merge 1:1 lastname month year day gender using `temp_last', generate(mergelast)
	keep if mergelast==3
	keep stid mergelast STUDENT_ID
	rename STUDENT_ID ssn_last
	tempfile matched_last
	save `matched_last'

	* attach both sets of results to the applicant file; tempfullnurses is
	* overwritten at this point, i.e. with the raw merge codes still in it
	use `tempfullnurses', clear
	merge 1:1 stid using `matched_firstlast', nogenerate
	merge 1:1 stid using `matched_last', nogenerate
	save `tempfullnurses', replace

	* merge codes -> 0/1 flags; anymatch counts how many of (a),(b) hit (0, 1 or 2)
	replace mergelast=mergelast==3
	replace mergefirstlast=mergefirstlast==3
	egen anymatch=rowtotal(mergelast mergefirstlast)
	di "========================================="
	di "INITIAL MATCH RATE IS "
	tab anymatch
	di "___________________________________________"

	save "$build/temp/flat_temp_round1", replace

	* applicants carried into Round 2
	keep if anymatch==0
	tempfile nomatches_round1
	save `nomatches_round1'

*Round 2: Name, birthyear, gender
*	Applies only to applicants with no Round 1 hit (nomatches_round1).
*	Both candidate files are first reduced to records that are unique on the
*	Round 2 keys and are overwritten in place, so Round 3 starts from the
*	reduced files.
*	Matched pairs are saved straight to tempfiles; no preserve/restore is used
*	because every step reloads the data it needs with -use-.
*	Data in memory when this section ends: the still-unmatched applicants.

	*(a) first and last name, birthyear, gender
	use `temp_firstlast', clear
	bysort lastname firstname year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_firstlast', replace
	use `nomatches_round1', clear
	keep stid lastname firstname  year  gender
	bysort lastname firstname  year  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		*match on first name, last name, birthyear, gender
		merge 1:1 lastname firstname  year  gender using `temp_firstlast'
			rename _merge mergefirstlast_nodmo
			*keep the matched pairs only: stid <-> STUDENT_ID
			keep if mergefirstlast_nodmo==3
			keep stid mergefirstlast_nodmo STUDENT_ID
				rename STUDENT_ID ssn_firstlast_nodmo
			tempfile matched_flnodmo
			save `matched_flnodmo', replace

	*(b) last name, birthyear, gender (the XXX sample)
	use `temp_last', clear
	bysort lastname  year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_last', replace
	use `nomatches_round1', clear
	keep stid lastname firstname  year  gender
		bysort lastname  year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup

	merge 1:1 lastname year gender using `temp_last', gen(mergelast_nodmo)
			keep if mergelast_nodmo==3
			keep stid mergelast_nodmo STUDENT_ID
				rename STUDENT_ID ssn_last_nodmo
			tempfile matched_lnodmo
			save `matched_lnodmo', replace

	*attach the Round 2 results to the Round 1 flat file
		use "$build/temp/flat_temp_round1", clear
			merge 1:1 stid using `matched_flnodmo'
				drop _merge
			merge 1:1 stid using `matched_lnodmo'
				drop _merge

		*merge codes -> 0/1 flags; anymatch2 counts how many of (a),(b) hit
		replace mergelast_nodmo=mergelast_nodmo==3
		replace mergefirstlast_nodmo=mergefirstlast_nodmo==3
		egen anymatch2=rowtotal(mergelast_nodmo mergefirstlast_nodmo)
			di "========================================="
			di "SECOND MATCH RATE IS "
			tab anymatch2
			di "___________________________________________"
		save "$build/temp/flat_temp_round12", replace

	*keep a list of the still-unmatched people
			keep if anymatch2==0 & anymatch==0
			tempfile nomatches_round2
			save `nomatches_round2'
*Round 3: Name, gender
*	Applies only to applicants with no hit in Rounds 1 and 2 (nomatches_round2).
*	Both candidate files are again reduced to records unique on the Round 3
*	keys and overwritten in place.

	*(a) first and last name, gender
	use `temp_firstlast', clear
	bysort lastname firstname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_firstlast', replace
	use `nomatches_round2', clear
	bysort lastname firstname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup	
		merge 1:1 lastname firstname    gender using `temp_firstlast'
			rename _merge mergefirstlast_nob
			*keep the matched pairs only: stid <-> STUDENT_ID
			keep stid mergefirstlast_nob STUDENT_ID
				rename STUDENT_ID ssn_firstlast_nob		
			keep if mergefirstlast_nob==3
			tempfile matched_flnob
			save `matched_flnob', replace
			
	*(b) last name, gender (the XXX sample)
	use `temp_last', clear
	bysort lastname   gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_last', replace			
	use `nomatches_round2', clear
	bysort lastname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup	

		merge 1:1 lastname  gender using `temp_last', gen(mergelast_nob)
			keep stid mergelast_nob STUDENT_ID
				rename STUDENT_ID ssn_last_nob		
			keep if mergelast_nob==3
			tempfile matched_lnob
			save `matched_lnob', replace

	*attach the Round 3 results to the Round 1+2 flat file
		use "$build/temp/flat_temp_round12", clear
			merge 1:1 stid using `matched_flnob'
				drop _merge
			merge 1:1 stid using `matched_lnob'
				drop _merge

		*merge codes -> 0/1 flags; anymatch3 counts how many of (a),(b) hit
		replace mergelast_nob=mergelast_nob==3
		replace mergefirstlast_nob=mergefirstlast_nob==3
		egen anymatch3=rowtotal(mergelast_nob mergefirstlast_nob)
			*this banner used to say SECOND, which mislabelled the Round 3 table in the log
			di "========================================="
			di "THIRD MATCH RATE IS "
			tab anymatch3
			di "___________________________________________"
			
			
			
			
*Match file
* An applicant is matched when some round produced exactly one hit, or when a
* round produced two hits (first+last and last-only) that carry the same
* student id.
gen matched = (anymatch==1 | anymatch2==1 | anymatch3==1)
replace matched = 1 if (anymatch==2  & ssn_firstlast==ssn_last)
replace matched = 1 if (anymatch2==2 & ssn_firstlast_nodmo==ssn_last_nodmo)
replace matched = 1 if (anymatch3==2 & ssn_firstlast_nob==ssn_last_nob)
di "//////////////////////////////////////////////////////////////"
di "Final Match Rate"
di "////////////////////////////////////////////////////////////////"
tab matched

* ssn_final: the first non-empty id, taken in round order with the first+last
* result ahead of the last-only result within each round.
* NOTE(review): it is filled whenever any id exists, including rows where
* matched==0 because the two ids of a round disagree - confirm that users of
* this file filter on matched.
gen ssn_final = ssn_firstlast
foreach fallback of varlist ssn_last ssn_firstlast_nodmo ssn_last_nodmo ssn_firstlast_nob ssn_last_nob {
	replace ssn_final = `fallback' if ssn_final==""
}
save "$build/temp/flat_temp_round123", replace
			
			

********************************************************************************************************
* SECOND PASS: match additional people against Central College students only,
* then append the result to the first-pass file.
********************************************************************************************************

* Applicants entering the second pass.
* NOTE(review): the filter is anymatch==0, and anymatch only records Round 1
* hits, so applicants matched in Round 2 or 3 of the first pass are selected
* too. The stid de-duplication after the final append handles the double rows
* this creates - confirm that this, rather than matched==0, is intended.
use $build/temp/flat_temp_round123, clear
	keep if anymatch==0
	keep stid lastname firstname gender day month year multiname order
	duplicates drop
	tempfile unmatchedapps
	save `unmatchedapps'

* Candidate pool: records of allCENCOL_potmatches whose STUDENT_ID is not the
* ssn_final of an applicant with anymatch==1.
use $build/temp/flat_temp_round123, clear
	keep if anymatch==1
	keep ssn_final
	rename ssn_final STUDENT_ID
	merge 1:1 STUDENT_ID using $build/raw/allCENCOL_potmatches, gen(almatch)
	* almatch==2: present only in the non-lottery file, i.e. not yet matched
	keep if almatch==2
	keep STUDENT_ID ST_NAME_FIRST ST_NAME_LAST ST_GENDER birthmonth birthyear birthday

	* use the same variable names as the lottery file
	rename ST_NAME_FIRST firstname_a
	rename ST_NAME_LAST  lastname_a
	rename ST_GENDER     gender
	rename birthmonth    month
	rename birthyear     year
	rename birthday      day

	* birth-date parts are turned into strings
	tostring month year day, replace

	duplicates drop

**************************************************************
* Split the pool: records with a first name vs. records whose first name is "XXX"
*****************************************************************
	generate xxx = firstname_a == "XXX"
	tabulate xxx

	* anything duplicated on the full key is dropped before the split
	bysort firstname_a lastname_a month year day gender: generate numdupl = _N
	tabulate numdupl
	drop if numdupl > 1
	drop numdupl

	* half 1: records with a real first name
	preserve
		keep if firstname_a != "XXX"
		tempfile temp_firstlast
		save `temp_firstlast'
	restore

	* half 2: "XXX" records, unique without the first name
	keep if firstname_a == "XXX"
	drop firstname_a
	bysort lastname_a month year day gender: generate numdupl = _N
	tabulate numdupl
	drop if numdupl > 1
	drop numdupl
	tempfile temp_last
	save `temp_last'
*******************************************************************
* Second pass, Round 1 - match on name, full birth date and gender
********************************************************************

	* (a) first + last name, birth date, gender against temp_firstlast
	use `unmatchedapps', clear
	di "THE NUMBER OF POTENTIAL MATCHES, UNDUPLICATED, is " _N
	merge 1:1 lastname firstname month year day gender using `temp_firstlast', generate(mergefirstlast)
	* only the matched pairs are stored: stid <-> STUDENT_ID
	keep if mergefirstlast==3
	keep stid mergefirstlast STUDENT_ID
	rename STUDENT_ID ssn_firstlast
	tempfile matched_firstlast
	save `matched_firstlast'

	* (b) last name, birth date, gender against temp_last (the XXX sample);
	*     applicants who are not unique on this shorter key are left out
	use `unmatchedapps', clear
	bysort lastname month year day gender: gen numdup=_N
	tab numdup
	drop if numdup>1
	drop numdup
	merge 1:1 lastname month year day gender using `temp_last', generate(mergelast)
	keep if mergelast==3
	keep stid mergelast STUDENT_ID
	rename STUDENT_ID ssn_last
	tempfile matched_last
	save `matched_last'

	* attach both sets of results to the applicant file; unmatchedapps is
	* overwritten at this point, i.e. with the raw merge codes still in it
	use `unmatchedapps', clear
	merge 1:1 stid using `matched_firstlast', nogenerate
	merge 1:1 stid using `matched_last', nogenerate
	save `unmatchedapps', replace

	* merge codes -> 0/1 flags; anymatch counts how many of (a),(b) hit (0, 1 or 2)
	replace mergelast=mergelast==3
	replace mergefirstlast=mergefirstlast==3
	egen anymatch=rowtotal(mergelast mergefirstlast)
	di "========================================="
	di "INITIAL MATCH RATE IS "
	tab anymatch
	di "___________________________________________"

	save "$build/temp/flat_temp_round1_rr", replace

	* applicants carried into Round 2 of the second pass
	keep if anymatch==0
	tempfile nomatches_round1
	save `nomatches_round1'

*Round 2: Name, birthyear, gender
*	Second pass. Applies only to applicants with no second-pass Round 1 hit
*	(nomatches_round1).
*	Both candidate files are first reduced to records that are unique on the
*	Round 2 keys and are overwritten in place, so Round 3 starts from the
*	reduced files.
*	Matched pairs are saved straight to tempfiles; no preserve/restore is used
*	because every step reloads the data it needs with -use-.
*	Data in memory when this section ends: the still-unmatched applicants.

	*(a) first and last name, birthyear, gender
	use `temp_firstlast', clear
	bysort lastname firstname year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_firstlast', replace
	use `nomatches_round1', clear
	keep stid lastname firstname  year  gender
	bysort lastname firstname  year  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		*match on first name, last name, birthyear, gender
		merge 1:1 lastname firstname  year  gender using `temp_firstlast'
			rename _merge mergefirstlast_nodmo
			*keep the matched pairs only: stid <-> STUDENT_ID
			keep if mergefirstlast_nodmo==3
			keep stid mergefirstlast_nodmo STUDENT_ID
				rename STUDENT_ID ssn_firstlast_nodmo
			tempfile matched_flnodmo
			save `matched_flnodmo', replace

	*(b) last name, birthyear, gender (the XXX sample)
	use `temp_last', clear
	bysort lastname  year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_last', replace
	use `nomatches_round1', clear
	keep stid lastname firstname  year  gender
		bysort lastname  year gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup

	merge 1:1 lastname year gender using `temp_last', gen(mergelast_nodmo)
			keep if mergelast_nodmo==3
			keep stid mergelast_nodmo STUDENT_ID
				rename STUDENT_ID ssn_last_nodmo
			tempfile matched_lnodmo
			save `matched_lnodmo', replace

	*attach the Round 2 results to the second-pass Round 1 flat file
		use "$build/temp/flat_temp_round1_rr", clear
			merge 1:1 stid using `matched_flnodmo'
				drop _merge
			merge 1:1 stid using `matched_lnodmo'
				drop _merge

		*merge codes -> 0/1 flags; anymatch2 counts how many of (a),(b) hit
		replace mergelast_nodmo=mergelast_nodmo==3
		replace mergefirstlast_nodmo=mergefirstlast_nodmo==3
		egen anymatch2=rowtotal(mergelast_nodmo mergefirstlast_nodmo)
			di "========================================="
			di "SECOND MATCH RATE IS "
			tab anymatch2
			di "___________________________________________"
		save "$build/temp/flat_temp_round12_rr", replace

	*keep a list of the still-unmatched people
			keep if anymatch2==0 & anymatch==0
			tempfile nomatches_round2
			save `nomatches_round2'
*Round 3: Name, gender
*	Second pass. Applies only to applicants with no hit in second-pass
*	Rounds 1 and 2 (nomatches_round2).
*	Both candidate files are again reduced to records unique on the Round 3
*	keys and overwritten in place.

	*(a) first and last name, gender
	use `temp_firstlast', clear
	bysort lastname firstname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_firstlast', replace
	use `nomatches_round2', clear
	bysort lastname firstname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup	
		merge 1:1 lastname firstname    gender using `temp_firstlast'
			rename _merge mergefirstlast_nob
			*keep the matched pairs only: stid <-> STUDENT_ID
			keep stid mergefirstlast_nob STUDENT_ID
				rename STUDENT_ID ssn_firstlast_nob		
			keep if mergefirstlast_nob==3
			tempfile matched_flnob
			save `matched_flnob', replace
			
	*(b) last name, gender (the XXX sample)
	use `temp_last', clear
	bysort lastname   gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup
		save `temp_last', replace			
	use `nomatches_round2', clear
	bysort lastname  gender: gen numdup=_N
		tab numdup
		drop if numdup>1
		drop numdup	

		merge 1:1 lastname  gender using `temp_last', gen(mergelast_nob)
			keep stid mergelast_nob STUDENT_ID
				rename STUDENT_ID ssn_last_nob		
			keep if mergelast_nob==3
			tempfile matched_lnob
			save `matched_lnob', replace

	*attach the Round 3 results to the second-pass Round 1+2 flat file
		use "$build/temp/flat_temp_round12_rr", clear
			merge 1:1 stid using `matched_flnob'
				drop _merge
			merge 1:1 stid using `matched_lnob'
				drop _merge

		*merge codes -> 0/1 flags; anymatch3 counts how many of (a),(b) hit
		replace mergelast_nob=mergelast_nob==3
		replace mergefirstlast_nob=mergefirstlast_nob==3
		egen anymatch3=rowtotal(mergelast_nob mergefirstlast_nob)
			*this banner used to say SECOND, which mislabelled the Round 3 table in the log
			di "========================================="
			di "THIRD MATCH RATE IS "
			tab anymatch3
			di "___________________________________________"
			
			
			
			
*Match file
* An applicant is matched when some round produced exactly one hit, or when a
* round produced two hits (first+last and last-only) that carry the same
* student id.
gen matched = (anymatch==1 | anymatch2==1 | anymatch3==1)
replace matched = 1 if (anymatch==2  & ssn_firstlast==ssn_last)
replace matched = 1 if (anymatch2==2 & ssn_firstlast_nodmo==ssn_last_nodmo)
replace matched = 1 if (anymatch3==2 & ssn_firstlast_nob==ssn_last_nob)
di "//////////////////////////////////////////////////////////////"
di "Final Match Rate"
di "////////////////////////////////////////////////////////////////"
tab matched

* ssn_final: the first non-empty id, taken in round order with the first+last
* result ahead of the last-only result within each round
gen ssn_final = ssn_firstlast
foreach fallback of varlist ssn_last ssn_firstlast_nodmo ssn_last_nodmo ssn_firstlast_nob ssn_last_nob {
	replace ssn_final = `fallback' if ssn_final==""
}

		*Combine the second-pass matches with the first-pass flat file and make
		*the result unique on stid again.
		*snapshot of the full second-pass result (matched and unmatched)
		tempfile prefinal
		save `prefinal'
		*only second-pass matches are appended to the previous flat file
		keep if matched==1
		*rrmerge marks rows that come from the second pass
		gen rrmerge=1
		append using $build/temp/flat_temp_round123
			*appended first-pass rows have rrmerge missing; turn the marker into 0/1
			replace rrmerge=rrmerge==1
			*a stid can now appear twice: once from each pass
				bysort stid: gen numcases=_N
				bysort stid: egen maxmatch=max(matched)
					*pair with one matched and one unmatched row: drop the unmatched row
					*(the unmatched row can only be the first-pass one, since every
					*second-pass row kept above has matched==1)
					drop if matched==0 & maxmatch==1 & numcases==2
					*recount; in pairs that are still left, keep the row with the
					*smallest rrmerge, i.e. the first-pass row when both passes are present
					drop numcases
					bysort stid: gen numcases=_N
					bysort stid: egen keepme=min(rrmerge)
						drop if numcases==2 & rrmerge>keepme
			drop maxmatch numcases keepme
			
			*cross-tab of match status by pass ("match" is a variable-name abbreviation)
			tab match rrmerge
			save "$build/temp/flat_temp_round123_rr", replace
