/****************************************************************************************** Syntax file to accompany the book Complex Survey Data Analysis with SAS(R) by Taylor Lewis. last updated: 1.2.17 ******************************************************************************************/ ******* Read in SAS Data Sets ************************************************************; * NOTE: assign the macro variable below the path to which you have saved the ZIP file of SAS data sets associated with this book; %let path=C:\Data\; * <- just an example of a path; libname data "&path"; *** Chapter 1; data expenditures; set data.expenditures; run; data CLT_example_toplot; set data.CLT_example_toplot; run; data sample_means; set data.sample_means; run; data sample_SRSWOR; set data.sample_SRSWOR; run; data sample_str_SRSWOR; set data.sample_str_SRSWOR; run; data sample_clus; set data.sample_clus; run; data sample_weights; set data.sample_weights; run; *** Chapter 2; data frame; set data.frame; run; *** Chapter 3/9; data CBECS_2003; set data.CBECS_2003; run; *** Chapter 4/7; data NSFG_0610_F; set data.NSFG_0610_F; run; *** Chapter 5/6/9; data NAMCS_2010; set data.NAMCS_2010; run; *** Chapter 10/11; data pop; set data.pop; run; data samp; set data.samp; run; ****** Chapter 1: Features and Examples of Complex Surveys; *** Figure 1.2; proc format; value $y_type '1'='Variable 1: Normal Distribution' '2'='Variable 2: Right-Skewed' '3'='Variable 3: Bimodal Distribution'; run; proc sgpanel data=CLT_example_toplot noautolegend; panelby y_type / novarname columns=1 rows=3 spacing=20; histogram y; format y_type $y_type.; label y='Value of Variable'; run; *** Figure 1.3; proc format; value $y_type '1'='Variable 1: Normal Distribution' '2'='Variable 2: Right-Skewed' '3'='Variable 3: Bimodal Distribution'; value $group '1'='n = 15' '2'='n = 30' '3'='n = 100'; run; proc sgpanel data=sample_means noautolegend; panelby y_type group / novarname columns=3 rows=3 spacing=20; histogram y; density y; format y_type $y_type. group $group.; label y='Value of Sample Mean'; run; *** Program 1.1; title1 'Simple Random Sampling without Replacement'; title2 'Estimating a Sample Mean and its Variance Ignoring the FPC'; proc surveymeans data=sample_SRSWOR mean var; var exp_OTCmeds; run; title2 'Estimating a Sample Mean and its Variance Accounting for the FPC'; proc surveymeans data=sample_SRSWOR total=2000 mean var; var exp_OTCmeds; run; title; *** Program 1.2; title1 'Stratified Simple Random Sampling without Replacement'; title2 'Estimating a Sample Mean and its Variance Ignoring the Stratification'; proc surveymeans data=sample_str_SRSWOR total=2000 mean var; var exp_travel; run; data totals; length cityside $4; input cityside _TOTAL_; datalines; East 1000 West 1000 ; run; title2 'Estimating a Sample Mean and its Variance Accounting for the Stratification'; proc surveymeans data=sample_str_SRSWOR total=totals mean var; strata cityside; var exp_travel; run; title; *** Figure 1.4; proc sort data=sample_str_SRSWOR out=junk (keep=blockID adultID cityside); by cityside blockID; run; data sample_str_SRSWOR; merge sample_str_SRSWOR junk; by cityside; run; proc sgplot data=sample_str_SRSWOR noautolegend; scatter x=adultID y=exp_travel / group=cityside markerattrs=(symbol='X'); refline 1000 / axis=X lineattrs=(thickness=3); run; * NOTE: stratum mean reference lines were added to plot using Windows image editor; *** Program 1.3; title1 'Cluster Sampling'; title2 'Estimating a Sample Mean and its Variance Ignoring the Clustering'; proc surveymeans data=sample_clus mean var; var exp_OTCmeds exp_travel; run; title2 'Estimating a Sample Mean and its Variance Accounting for the Clustering'; proc surveymeans data=sample_clus mean var; cluster blockID; var exp_OTCmeds exp_travel; run; title; *** Figure 1.5; * visualizing the dispersion of cluster-specific means in the sample; data toplot; set sample_clus (in=a rename=(exp_OTCmeds=expense)) sample_clus (in=b rename=(exp_travel=expense)); if a then type='OTC Meds'; if b then type='Travel'; run; proc sgpanel data=toplot; panelby type / uniscale=row; dot blockID / response=expense stat=mean limits=both; run; *** Program 1.4; proc means data=sample_clus noprint nway; class blockID; var exp_OTCmeds exp_travel; output out=cluster_means mean=; run; proc surveymeans data=cluster_means mean var; var exp_OTCmeds exp_travel; run; *** Program 1.5; title1 'Sampling with Weights'; title2 'Estimating Two Sample Means and Variances Ignoring the Weights'; proc surveymeans data=sample_weights mean var; var exp_OTCmeds exp_travel; run; title2 'Estimating Two Sample Means and Variances Accounting for the Weights'; proc surveymeans data=sample_weights mean var; var exp_OTCmeds exp_travel; weight weight; run; title; ****** Chapter 2: Drawing Random Samples Using PROC SURVEYSELECT; *** Program 2.1; proc surveyselect data=frame out=sample_SRSWOR sampsize=400 method=SRS seed=40029; run; *** Program 2.2; proc surveyselect data=frame out=sample_SRSWR sampsize=400 seed=22207 method=URS outhits; run; *** Program 2.3; proc surveyselect data=frame out=sample_SYS sampsize=400 seed=65401 method=SYS; run; *** Program 2.4; proc surveyselect data=frame out=sample_SYS_c sampsize=400 seed=94424 method=SYS; control income; run; *** Program 2.5; proc surveyselect data=frame out=sample_PPSWOR sampsize=400 seed=32124 method=PPS; size income; run; *** Program 2.6; proc sort data=frame; by cityside; run; data sampsizes; length cityside $4; input cityside _NSIZE_; datalines; East 100 West 300 ; run; proc surveyselect data=frame out=sample_STR_SRS sampsize=sampsizes seed=89045; strata cityside; run; *** Program 2.7; proc surveyselect data=frame out=sample_cluster sampsize=20 seed=82908; cluster blockID; run; *** Program 2.8; proc surveyselect data=frame out=sample_PPS_cluster sampsize=20 seed=49721 method=PPS; cluster blockID; size income; run; *** Program 2.9; proc sort data=frame; by cityside; run; data sampsizes_clusters; length cityside $4; input cityside _NSIZE_; datalines; East 5 West 15 ; run; proc surveyselect data=frame out=sample_str_cluster sampsize=sampsizes_clusters seed=77472; strata cityside; cluster blockID; run; *** Program 2.10; proc sort data=frame; by cityside; run; data sampsizes_clusters2; length cityside $4; input cityside _NSIZE_; datalines; East 10 West 30 ; run; * Step 1) selection of PSUs (blocks); proc surveyselect data=frame seed=78410 sampsize=sampsizes_clusters2 out=sample_prelim (rename=(SelectionProb = SelectProb1 SamplingWeight = SamplingWeight1)); strata cityside; cluster blockID; run; * Step 2) selection of SSUs (adults within blocks); proc surveyselect data=sample_prelim seed=64107 rate=.5 out=sample_multistage (rename=(SelectionProb = SelectProb2 SamplingWeight = SamplingWeight2)); strata cityside blockID; run; * assign variables SelectionProb and SamplingWeight to be the product of the corresponding variables output from the two SURVEYSELECT runs above; data sample_multistage; set sample_multistage; SelectionProb=SelectProb1*SelectProb2; SamplingWeight=SamplingWeight1*SamplingWeight2; run; * verify the weights sum to known totals from the sampling frame; proc means data=sample_multistage nway; class cityside; var SamplingWeight; run; ****** Chapter 3: Analyzing Continuous Variables Using PROC SURVEYMEANS; *** Program 3.1; proc surveymeans data=CBECS_2003 sum; strata STRATUM8; cluster PAIR8; var SQFT8; weight ADJWT8; run; *** Program 3.2; proc format; value YESNO 1='Yes' 2='No'; run; proc surveymeans data=CBECS_2003 sum; strata STRATUM8; cluster PAIR8; class STUSED8; var STUSED8; weight ADJWT8; format STUSED8 YESNO.; run; *** Program 3.3; proc surveymeans data=CBECS_2003 mean; strata STRATUM8; cluster PAIR8; var SQFT8; weight ADJWT8; run; *** Program 3.4; proc format; value YESNO 1='Yes' 2='No'; run; proc surveymeans data=CBECS_2003 mean; strata STRATUM8; cluster PAIR8; class NGUSED8; var NGUSED8; weight ADJWT8; format NGUSED8 YESNO.; run; *** Program 3.5; data ratio_example; set CBECS_2003; num=ADJWT8*SQFT8; den=ADJWT8; run; proc surveymeans data=ratio_example ratio; strata STRATUM8; cluster PAIR8; var num den; ratio num / den; run; *** Program 3.6; proc surveymeans data=CBECS_2003 ratio; strata STRATUM8; cluster PAIR8; var ELCNS8 SQFT8; ratio ELCNS8 / SQFT8; weight ADJWT8; run; *** Program 3.7; * ratio estimator of a total; data ex_ratio_total; set CBECS_2003; ELCNS8_=ELCNS8*71657900522; run; proc surveymeans data=ex_ratio_total ratio; strata STRATUM8; cluster PAIR8; var ELCNS8_ SQFT8; ratio ELCNS8_ / SQFT8; weight ADJWT8; run; * traditional estimator of a total; proc surveymeans data=CBECS_2003 sum; strata STRATUM8; cluster PAIR8; var ELCNS8; weight ADJWT8; run; *** Program 3.8; proc surveymeans data=CBECS_2003 quartiles mean; strata STRATUM8; cluster PAIR8; var ELEXP8; weight ADJWT8; run; ****** Chapter 4: Analyzing Categorical Variables Using PROC SURVEYFREQ; *** Program 4.1; proc format; value EVRMARRY 0='NEVER MARRIED' 1='MARRIED'; run; proc surveymeans data=NSFG_0610_F nobs sum mean; strata SEST; cluster SECU; class EVRMARRY; var EVRMARRY; format EVRMARRY EVRMARRY.; weight WGTQ1Q16; run; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables EVRMARRY; format EVRMARRY EVRMARRY.; weight WGTQ1Q16; run; *** Program 4.2; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables PREGNUM / nowt nofreq cl; weight WGTQ1Q16; run; *** Program 4.3; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables PREGNUM / nowt nofreq cl(psmall=.1 type=logit); weight WGTQ1Q16; run; *** Program 4.4; proc format; value PRG4PLUS 0='NONE' 1='1 PREGNANCY' 2='2 PREGNANCIES' 3='3 PREGNANCIES' other='4 OR MORE PREGNANCIES' ; run; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables PREGNUM / chisq lrchisq testp=(.4 .15 .15 .15 .15); format PREGNUM PRG4PLUS.; weight WGTQ1Q16; run; *** Program 4.5; proc format; value EVRMARRY 0='NEVER MARRIED' 1='MARRIED'; value RELIGION 1='NO RELIGION' 2='CATHOLIC' 3='PROTESTANT' 4='OTHER RELIGIONS'; run; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables RELIGION*EVRMARRY; format RELIGION RELIGION. EVRMARRY EVRMARRY.; weight WGTQ1Q16; run; *** Program 4.6; proc format; value EVRMARRY 0='NEVER MARRIED' 1='MARRIED'; value RELIGION 1='NO RELIGION' 2='CATHOLIC' 3='PROTESTANT' 4='OTHER RELIGIONS' ; run; proc surveyfreq data=NSFG_0610_F; strata SEST; cluster SECU; tables RELIGION*EVRMARRY / chisq lrchisq; format RELIGION RELIGION. EVRMARRY EVRMARRY.; weight WGTQ1Q16; run; *** Program 4.7; proc format; value EVRMARRY 1='1. MARRIED' 0='2. NEVER MARRIED'; value PILLR 1='1. YES' 2='2. NO'; run; proc surveyfreq data=NSFG_0610_F order=formatted; strata SEST; cluster SECU; tables EVRMARRY*PILLR / nofreq nocellpercent row risk relrisk; format EVRMARRY EVRMARRY. PILLR PILLR.; weight WGTQ1Q16; run; *** Program 4.8; proc format; value RELIGION 1='1. NOT RELIGIOUS' 2,3,4='2. RELIGIOUS'; value EVRMARRY 1='1. MARRIED' 0='2. NEVER MARRIED'; value PILLR 1='1. YES' 2='2. NO'; run; ods graphics on; proc surveyfreq data=NSFG_0610_F order=formatted; strata SEST; cluster SECU; tables RELIGION*EVRMARRY*PILLR / nofreq nocellpercent row relrisk plots=relriskplot; format RELIGION RELIGION. EVRMARRY EVRMARRY. PILLR PILLR.; weight WGTQ1Q16; run; ods graphics off; ****** Chapter 5: Fitting Linear Regression Models Using PROC SURVEYREG; *** Figure 5.1; data bivariate; do x=0 to 10 by .5; y=x + 2*rannor(34442); output; end; run; proc sgplot data=bivariate noautolegend; reg X=x Y=y / markerattrs=(size=8 symbol=circlefilled) lineattrs=(color=black thickness=3); xaxis values=(0 to 11 by 1); yaxis values=(0 to 11 by 1); run; *** Figure 5.2; data violations; do x=1 to 10 by .25; hetero=0 + x*.4*rannor(23401); if x le 4 then hetero=hetero/x; mis=-5 + x + rannor(777548); serial=3*sin(x) + .5*rannor(342021); output; end; run; data violations; set violations (in=a keep=X hetero rename=(hetero=y)) violations (in=b keep=X mis rename=(mis=y)) violations (in=c keep=X serial rename=(serial=y)); length Type $25; if a then type='Heteroscedasticity'; if b then type='Model Misspecification'; if c then type='Serial Correlation'; run; proc sgpanel; panelby type / rows=3 spacing=10; scatter X=x Y=y / markerattrs=(size=5 symbol=circlefilled); refline 0 /axis=Y; colaxis values=(1 to 10 by 1); rowaxis values=(-5 to 5 by 5); label x='Predicted Value' y='Residual'; run; *** Program 5.1; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class MED SEX RACER SOLO MAJOR; model TIMEMD = MED SEX RACER SOLO MAJOR AGE TOTCHRON / solution; weight PATWT; run; *** Program 5.2; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class MED SEX RACER SOLO MAJOR; model TIMEMD = MED SEX RACER SOLO MAJOR AGE TOTCHRON / solution; weight PATWT; contrast 'Test of Reduced Model' MED 1 -1, SEX 1 -1, RACER 1 0 -1, RACER 0 1 -1; run; *** Program 5.3; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON / ANOVA; weight PATWT; output out=preds p=y_hat std=se_y_hat l=y_hat_lower u=y_hat_upper; run; * create prediction intervals for each sampling unit; data preds; set preds; * RMSE obtained from output of PROC SURVEYREG run above; MSE=13.8772; t_crit=TINV(0.975,552); y_pred_lower=y_hat - t_crit*SQRT(se_y_hat**2 + MSE); y_pred_upper=y_hat + t_crit*SQRT(se_y_hat**2 + MSE); run; *** Program 5.4; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON; weight PATWT; estimate 'Case 1' INTERCEPT 1 AGE 77 TOTCHRON 1 SOLO 1 0 MAJOR 0 0 0 0 1; estimate 'Case 2' INTERCEPT 1 AGE 45 TOTCHRON 1 SOLO 0 1 MAJOR 0 1 0 0 0; run; *** Program 5.5; data addons; input AGE TOTCHRON SOLO MAJOR; datalines; 77 1 1 5 45 1 2 2 ; run; data example_expectation; set NAMCS_2010 addons; run; proc surveyreg data=example_expectation; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON; weight PATWT; output out=PREDS p=Y_HAT std=SE_Y_HAT; run; proc print data=PREDS; where TIMEMD=.; var TIMEMD SOLO MAJOR AGE TOTCHRON Y_HAT SE_Y_HAT; run; *** Program 5.6; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON; weight PATWT; estimate 'Case 1 - Case 2' AGE 32 SOLO 1 -1 MAJOR 0 -1 0 0 1; run; ****** Chapter 6: Fitting Logistic Regression Models Using PROC SURVEYLOGISTIC; *** Figure 6.1; data toplot; input x y1; datalines; -1 . 0 0 1 0 2 0 3 1 4 0 5 1 6 0 7 1 8 1 9 1 10 1 11 . ; run; * overlay a regression line to the plot above to illustrate how the predicted values can be "out of scope"; proc reg data=toplot; model y1 = x; output out=toplot2 p=pred_ols; run; quit; proc sgplot data=toplot2 noautolegend; scatter X=x Y=y1 / markerattrs=(size=8 symbol=circlefilled); series X=x Y=pred_ols / lineattrs=(color=black thickness=3); xaxis values=(-1 to 11 by 1); yaxis values=(-.2 to 1.2 by .2); label y1='y'; run; *** Figure 6.2; data toplot; input x y1; datalines; -1 . 0 0 1 0 2 0 3 1 4 0 5 1 6 0 7 1 8 1 9 1 10 1 11 . ; run; data toplot3; do x=-1 to 11 by .1; output; end; run; data toplot3; merge toplot3 toplot; by x; run; proc logistic data=toplot3; model y1 (event='1') = x; output out=toplot3 p=pred_logistic; run; quit; proc sgplot data=toplot3 noautolegend; scatter X=x Y=y1 / markerattrs=(size=8 symbol=circlefilled); series X=x Y=pred_logistic / lineattrs=(color=black thickness=3); xaxis values=(-1 to 11 by 1); yaxis values=(-.2 to 1.2 by .2); label y1='y'; run; *** Program 6.1; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SEX RACER MAJOR / param=ref; model MED (event='1') = SEX RACER MAJOR AGE TOTCHRON TIMEMD; weight PATWT; run; *** Program 6.2; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SEX RACER MAJOR / param=ref; model MED (event='1') = SEX RACER MAJOR AGE TOTCHRON TIMEMD; weight PATWT; contrast 'Test of Reduced Model' SEX 1, AGE 1, TIMEMD 1; run; *** Program 6.3; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; output out=residuals p=p_hat; run; data residuals; set residuals; residual=MED-p_hat; run; proc sort data=residuals; by p_hat; run; * assign risk decile thresholds where cumulated sum of weights exceeds (1/g)th of the total sum of weights; data residuals; set residuals; PATWT_sum=1008802005; PATWT_running_sum+PATWT; decile=ceil(10*(PATWT_running_sum/PATWT_sum)); run; proc surveyreg data=residuals; strata CSTRATM; cluster CPSUM; class decile; model residual = decile / noint solution vadjust=none; weight PATWT; contrast 'A-L LOF Test' decile 1 1 1 1 1 1 1 1 1 1; run; *** Program 6.4; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; output out=preds p=p_hat l=p_hat_lower u=p_hat_upper; run; *** Program 6.5; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; contrast 'B vs. W Patients' RACER -1 1 / estimate=both; run; *** Program 6.6; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; contrast 'TOTCHRON + 3' TOTCHRON 3 / estimate=both; run; *** Program 6.7; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON RACER*TOTCHRON; weight PATWT; contrast 'Custom Odds Ratio' MAJOR 0 1 -1 0 RACER -1 1 RACER*TOTCHRON -1 1 / estimate=both e; run; *** Program 6.8; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model NUMMED (ref='0') = RACER MAJOR TOTCHRON / link=GLOGIT; weight PATWT; run; *** Program 6.9; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model NUMMED (ref='0') = RACER MAJOR TOTCHRON / link=GLOGIT; weight PATWT; test INTERCEPT_1=INTERCEPT_2, RACER1_1=RACER1_2, RACER2_1=RACER2_2, MAJOR1_1=MAJOR1_2, MAJOR2_1=MAJOR2_2, MAJOR3_1=MAJOR3_2, MAJOR4_1=MAJOR4_2, TOTCHRON_1=TOTCHRON_2; run; *** Program 6.10; proc surveylogistic data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model NUMMED = RACER MAJOR TOTCHRON / link=CLOGIT; weight PATWT; run; ****** Chapter 7: Survival Analysis with Complex Survey Data; *** Program 7.1; data NSFG_F_MAR; set NSFG_0610_F; * maintain only those respondents who have ever been married; if EVRMARRY=1; * assign an indicator of being censored observations if individual is still married, separated (but not divorced), or if the spouse died; censor=(MAREND01 in(. 2 3)); run; proc format; value RELIGION 1='NO RELIGION' 2='CATHOLIC' 3='PROTESTANT' 4='OTHER RELIGIONS'; run; proc lifetest data=NSFG_F_MAR method=KM plots=(survival(nocensor)) maxtime=240; time MAR1DISS*censor(1); strata RELIGION; weight WGTQ1Q16; format RELIGION RELIGION.; run; *** Program 7.2; proc format; value COHEVER 1='YES' 2='NO'; run; proc lifetest data=NSFG_F_MAR method=KM plots=(survival(failure nocensor)) maxtime=240; time MAR1DISS*censor(1); strata COHEVER; weight WGTQ1Q16; format COHEVER COHEVER.; run; *** Program 7.3; proc surveyphreg data=NSFG_F_MAR; strata SEST; cluster SECU; class RELIGION (ref='1') HISPRACE (ref='2') COHEVER / param=ref; model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE; weight WGTQ1Q16; run; *** Program 7.4; proc surveyphreg data=NSFG_F_MAR; strata SEST; cluster SECU; class RELIGION (ref='1') HISPRACE (ref='2') COHEVER / param=ref; model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE; weight WGTQ1Q16; estimate 'Test of Reduced Model - Dropping Religion' RELIGION 1 0 0, RELIGION 0 1 0, RELIGION 0 0 1 / joint; run; *** Program 7.5; proc format; value COHEVER 1='YES' 2='NO'; run; proc lifetest data=NSFG_F_MAR method=KM plots=(loglogs) maxtime=240; time MAR1DISS*censor(1); strata COHEVER; weight WGTQ1Q16; format COHEVER COHEVER.; run; *** Program 7.6; proc surveyphreg data=NSFG_F_MAR; strata SEST; cluster SECU; class HISPRACE (ref='2') COHEVER / param=ref; model MAR1DISS*censor(1) = HISPRACE FMAR1AGE COHEVER COHEVERT; weight WGTQ1Q16; COHEVERT=(COHEVER=1)*MAR1DISS; run; *** Program 7.7; data NSFG_F_MAR_PP; set NSFG_F_MAR (keep=CASEID SEST SECU WGTQ1Q16 MAR1DISS censor RELIGION HISPRACE COHEVER FMAR1AGE); intervals=ceil(MAR1DISS/12 + .000001); do m=1 to intervals; output; end; run; proc sort data=NSFG_F_MAR_PP; by CASEID; run; data NSFG_F_MAR_PP; set NSFG_F_MAR_PP; by CASEID; diss=0; if last.CASEID and censor ne 1 then diss=1; run; title 'Cox Proportional Hazards Version'; proc surveyphreg data=NSFG_F_MAR; strata SEST; cluster SECU; class RELIGION (ref='1') HISPRACE (ref='2') COHEVER / param=ref; model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE; weight WGTQ1Q16; run; title 'Discrete-Time Hazards Version'; proc surveylogistic data=NSFG_F_MAR_PP; strata SEST; cluster SECU; class RELIGION (ref='1') HISPRACE (ref='2') COHEVER / param=ref; model diss (event='1') = m RELIGION HISPRACE COHEVER FMAR1AGE / expb; weight WGTQ1Q16; run; title; *** Program 7.8; data NSFG_F_MAR_PP2; set NSFG_F_MAR (keep=CASEID SEST SECU WGTQ1Q16 MAR1DISS censor RELIGION HISPRACE COHEVER FMAR1AGE DATBABY1 MARDAT01); intervals=ceil(MAR1DISS/12 + .000001); do m=1 to intervals; output; end; run; proc sort data=NSFG_F_MAR_PP2; by CASEID; run; data NSFG_F_MAR_PP2; set NSFG_F_MAR_PP2; by CASEID; * insert indicator of whether child is present during marriage interval; children=(DATBABY1 ne . and DATBABY1 le MARDAT01+(m*12)); diss=0; if last.CASEID and censor ne 1 then diss=1; run; proc surveylogistic data=NSFG_F_MAR_PP2; strata SEST; cluster SECU; class RELIGION (ref='1') HISPRACE (ref='2') COHEVER / param=ref; model diss (event='1') = m RELIGION HISPRACE COHEVER FMAR1AGE children / expb; weight WGTQ1Q16; run; ****** Chapter 8: Domain Estimation; *** Program 8.1; data test; input class class_type $ homeroom grade1 tutor $ grade2 weight; datalines; 12 upper 1 87 Y 94 49.5 12 upper 1 89 N 89 49.5 12 upper 2 91 Y 94 48 12 upper 2 84 Y 92 48 11 upper 1 82 N 84 47.5 11 upper 1 94 N 95 47.5 11 upper 2 93 N 95 48 11 upper 2 94 Y 97 48 10 under 1 78 N 81 39 10 under 1 84 N 84 39 10 under 2 90 N 87 37.5 10 under 2 82 N 85 37.5 9 under 1 88 N 88 40 9 under 1 93 Y 91 40 9 under 2 77 Y 85 48 9 under 2 81 N 84 48 ; run; title '1) Analyzing Proper Subsets with the BY statement'; proc sort data=test; by class_type; run; proc surveymeans data=test mean stderr; by class_type; stratum class; cluster homeroom; var grade2; weight weight; run; title '2) Analyzing Proper Subsets with the DOMAIN statement'; proc surveymeans data=test mean stderr; stratum class; cluster homeroom; var grade2; weight weight; domain class_type; run; title; *** Program 8.2; title '1) Improperly Analyzing a Domain by Subsetting the Data Set'; proc surveymeans data=test mean stderr; where tutor='Y'; stratum class; cluster homeroom; var grade2; weight weight; run; title '2) Correctly Analyzing a Domain with the DOMAIN Statement'; proc surveymeans data=test mean stderr; stratum class; cluster homeroom; var grade2; weight weight; domain tutor; run; title; *** Program 8.3; data test_d; set test; weight_d=weight*(tutor='Y') + .0000000000001; run; proc surveymeans data=test_d mean stderr median; stratum class; cluster homeroom; var grade2; weight weight_d; run; *** Program 8.4; proc format; value YESNO 1='Yes' 2='No'; value REGION 1='Northeast' 2='Midwest' 3='South' 4='West'; run; proc surveymeans data=CBECS_2003 nobs sum; strata STRATUM8; cluster PAIR8; class STUSED8; var STUSED8; weight ADJWT8; domain REGION8; format STUSED8 YESNO. REGION8 REGION.; run; * an equivalent analysis using PROC SURVEYFREQ; proc surveyfreq data=CBECS_2003; strata STRATUM8; cluster PAIR8; tables REGION8*STUSED8 / nopercent; weight ADJWT8; format STUSED8 YESNO. REGION8 REGION.; run; *** Program 8.5; title1 '1) Illustrating the DOMAIN Statement in PROC SURVEYREG'; proc surveyreg data=NAMCS_2010; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON / solution; weight PATWT; domain PHYS; run; title1 '2) Illustrating the Domain-Specific Weight Approach for Estimating Regression Coefficients'; data NAMCS_2010_d; set NAMCS_2010; PATWT_d=PHYS*PATWT + .0000000000001; run; proc surveyreg data=NAMCS_2010_d; strata CSTRATM; cluster CPSUM; class SOLO MAJOR; model TIMEMD = SOLO MAJOR AGE TOTCHRON / solution; weight PATWT_d; run; title; *** Program 8.6; title '1) Testing Year-End Mean Score for Underclassmen vs. Upperclassmen'; proc surveyreg data=test; stratum class; cluster homeroom; class class_type; model grade2 = class_type / solution vadjust=none; weight weight; run; title '2) Testing Year-End Mean Score for Tutored Students vs. Non-Tutored Students'; proc surveyreg data=test; stratum class; cluster homeroom; class tutor; model grade2 = tutor / solution vadjust=none; weight weight; run; title; *** Program 8.7; * difference variable approach; data test; set test; diff=grade2-grade1; run; proc surveymeans data=test mean stderr t; stratum class; cluster homeroom; var diff; weight weight; run; * data stacking approach; data test_stacked; set test (in=d1 rename=(grade1=grade)) test (in=d2 rename=(grade2=grade)); domain=d2; run; proc surveyreg data=test_stacked; stratum class; cluster homeroom; model grade = domain / vadjust=none; weight weight; run; *** Program 8.8; proc surveyreg data=NAMCS_2010; stratum CSTRATM; cluster CPSUM; class MAJOR; model TIMEMD = MAJOR / vadjust=none; weight PATWT; lsmeans MAJOR / diff; run; *** Program 8.9; proc surveyreg data=NAMCS_2010; stratum CSTRATM; cluster CPSUM; class SOLO; model MED = SOLO / vadjust=none solution clparm; weight PATWT; run; proc surveyfreq data=NAMCS_2010; stratum CSTRATM; cluster CPSUM; tables SOLO*MED / nowt row riskdiff; weight PATWT; run; *** Program 8.10; title '1) 95% Confidence Limits without a DF Adjustment'; data test_d; set test; weight_d=weight*(tutor='Y') + .0000000000001; run; proc surveymeans data=test_d mean stderr clm df; stratum class; cluster homeroom; var grade2; weight weight_d; run; title '2) 95% Confidence Limits with DF Adjustment per Heeringa et al. (2010)'; * preliminary PROC SQL step subsets the data for strata with at least one PSU in the domain defined by TUTOR='Y'; proc sql; create table test_d_adj as select * from test_d where class in(select distinct class from test_d where tutor='Y'); quit; proc surveymeans data=test_d_adj mean stderr clm df; stratum class; cluster homeroom; var grade2; weight weight_d; run; title; ****** Chapter 9: Replication Techniques for Variance Estimation; *** Program 9.1; data test; input class class_type $ homeroom grade1 tutor $ grade2 weight; datalines; 12 upper 1 87 Y 94 49.5 12 upper 1 89 N 89 49.5 12 upper 2 91 Y 94 48 12 upper 2 84 Y 92 48 11 upper 1 82 N 84 47.5 11 upper 1 94 N 95 47.5 11 upper 2 93 N 95 48 11 upper 2 94 Y 97 48 10 under 1 78 N 81 39 10 under 1 84 N 84 39 10 under 2 90 N 87 37.5 10 under 2 82 N 85 37.5 9 under 1 88 N 88 40 9 under 1 93 Y 91 40 9 under 2 77 Y 85 48 9 under 2 81 N 84 48 ; run; title '1) Demonstrating the Woodruff Algorithm for Taylor Series Linearization'; * compute the PSU-level linear substitute term; proc sql; select sum(weight) into :N_hat from test; select sum(weight*grade2) into :Y_hat from test; create table lin_subs as select class,homeroom, (1/&N_hat)*(sum(weight*grade2) - (&Y_hat/&N_hat)*sum(weight)) as u_i from test group by class,homeroom; quit; * estimate variance of the sum of the linear substitute with respect to the design; proc surveymeans data=lin_subs varsum std; stratum class; cluster homeroom; var u_i; run; title '2) Standard PROC SURVEYMEANS Syntax for a Weighted Mean'; proc surveymeans data=test mean var stderr; stratum class; cluster homeroom; var grade2; weight weight; run; title; *** Program 9.2; proc surveymeans data=test varmethod=BRR mean var stderr; stratum class; cluster homeroom; var grade2; weight weight; run; *** Program 9.3; proc surveymeans data=test varmethod=BRR (outweights=test_BRR) mean var stderr; stratum class; cluster homeroom; var grade2; weight weight; run; *** Program 9.4; proc surveymeans data=test_BRR varmethod=BRR mean var stderr; var grade2; weight weight; repweights RepWt_1-RepWt_8; run; *** Program 9.5; proc surveymeans data=test varmethod=BRR (outweights=test_BRR_Fay fay=.3) mean stderr; stratum class; cluster homeroom; var grade2; weight weight; run; *** Program 9.6; title '1) Domain Estimation with BRR'; proc surveymeans data=test_BRR varmethod=BRR mean stderr; var grade2; weight weight; repweights RepWt_1-RepWt_8; domain tutor; run; title "2) Domain Estimation with Fay's BRR"; proc surveymeans data=test_BRR_Fay varmethod=BRR (fay=.3) mean stderr; var grade2; weight weight; repweights RepWt_1-RepWt_8; domain tutor; run; title; *** Program 9.7; proc surveymeans data=test varmethod=JK (outweights=test_JK outjkcoefs=test_JK_coefs) mean stderr; stratum class; cluster homeroom; var grade2; weight weight; run; *** Program 9.8; proc surveymeans data=test_JK varmethod=JK mean stderr; var grade2; weight weight; repweights RepWt_1-RepWt_8 / jkcoefs=test_JK_coefs; run; *** Program 9.9; proc surveymeans data=test_JK varmethod=JK mean stderr; var grade2; weight weight; repweights RepWt_2 RepWt_3 RepWt_5 RepWt_8 / jkcoefs=0.99999; run; *** Program 9.10; * compile bootstrap replicate weight macro; * SOURCE: Lohr, S. (2012). "Using SASŪ for the Design, Analysis, and Visualization of Complex Surveys," Proceedings of the SAS Global Forum. Available on-line at: http://support.sas.com/resources/papers/proceedings12/343-2012.pdf ; * NOTE: a modification made here is adding the macro paramater SEED to ensure weight creation process is reproducible; %macro bootwt (fulldata,wt,stratvar,psuvar,numboot,fullrep,repwt,seed); proc sort data=&fulldata out=fulldata; by &stratvar &psuvar; run; proc sql stimer; create table psulist as select distinct &stratvar, &psuvar from fulldata order by &stratvar, &psuvar; create table numpsu as select distinct &stratvar, count(*)-1 as _nsize_ from psulist group by &stratvar order by &stratvar; quit; data fulldata (drop=_nsize_); merge fulldata (in=inf) numpsu (in=inn); by &stratvar; if inf & inn; wtmult = (_nsize_ + 1) / _nsize_; run; proc surveyselect data=psulist method=urs sampsize=numpsu out=repout outall reps=&numboot seed=&seed; strata &stratvar; id &psuvar; run; proc sort data=repout (keep=&stratvar &psuvar replicate numberhits) out=repout_sorted; by &stratvar &psuvar replicate; run; proc transpose data=repout_sorted (keep=&stratvar &psuvar replicate numberhits) out=repout_tr (keep=&stratvar &psuvar repmult:) prefix=repmult; by &stratvar &psuvar; id replicate; var numberhits; run; data &fullrep (drop=i repmult1-repmult&numboot wtmult); array &repwt (&numboot); array repmult (&numboot); merge fulldata (in=inf) repout_tr (in=inr); by &stratvar &psuvar; do i = 1 to &numboot; &repwt(i) = &wt * repmult(i) * wtmult; end; run; %mend bootwt; %bootwt (test,weight,class,homeroom,200,test_boot,RepWt_,399448); *** Program 9.11; proc surveymeans data=test_boot varmethod=JK mean stderr; var grade2; weight weight; repweights RepWt_1-RepWt_200 / jkcoef=0.005025; run; *** Program 9.12; data test_brr; set test_brr; tutor_Y=(tutor='Y'); run; proc surveyreg data=test_brr varmethod=BRR; model grade2 = tutor_Y / covb; weight weight; repweights RepWt_1-RepWt_8; run; *** Program 9.13; title '1) Taylor Series Linearization'; proc surveylogistic data=NAMCS_2010 varmethod=Taylor; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; run; title '2) Jackknife Repeated Replication'; proc surveylogistic data=NAMCS_2010 varmethod=JK; strata CSTRATM; cluster CPSUM; class RACER MAJOR / param=ref; model MED (event='1') = RACER MAJOR TOTCHRON; weight PATWT; run; title; *** Program 9.14; * generate BRR replicate weights; proc surveymeans data=CBECS_2003 varmethod=BRR (outweights=CBECS_2003_brr); stratum STRATUM8; cluster PAIR8; var ELCNS8 SQFT8; ratio ELCNS8 / SQFT8; weight ADJWT8; domain PBA8; run; %macro BRR_ratiodiff(replicates); * housecleaning; proc sql; drop table BRR_est_all; quit; * store full-sample estimates in macro variables; proc sql noprint; select sum(ADJWT8*ELCNS8)/sum(ADJWT8*SQFT8) into :ratio1 from CBECS_2003_brr where PBA8=16; select sum(ADJWT8*ELCNS8)/sum(ADJWT8*SQFT8) into :ratio2 from CBECS_2003_brr where PBA8=8; select sum((PBA8=16)*ADJWT8*ELCNS8)/sum((PBA8=16)*ADJWT8*SQFT8) - sum((PBA8=8 )*ADJWT8*ELCNS8)/sum((PBA8=8 )*ADJWT8*SQFT8) into :ratio_diff_FS from CBECS_2003_brr; quit; * loop through all replicate weights and get the same estimate; %do r=1 %to &replicates; proc sql; create table BRR_est_&r as select ratio1 - ratio2 as est from (select sum((PBA8=16)*RepWt_&r*ELCNS8)/sum((PBA8=16)*RepWt_&r*SQFT8) as ratio1 from CBECS_2003_brr), (select sum((PBA8=8 )*RepWt_&r*ELCNS8)/sum((PBA8=8 )*RepWt_&r*SQFT8) as ratio2 from CBECS_2003_brr); quit; proc datasets nolist; append data=BRR_est_&r base=BRR_est_all FORCE; delete BRR_est_&r; run; quit; %end; * consolidate replicate-specific estimates and find squared deviations from full-sample estimate; data BRR_est_all; set BRR_est_all; var=(1/&replicates)*(est - &ratio_diff_FS)**2; run; proc sql; select &ratio1 as Ratio1,&ratio2 as Ratio2, &ratio1 - &ratio2 as Difference, sqrt(sum(var)) as SE_brr, sum(var) as var_brr from BRR_est_all; quit; %mend; %BRR_ratiodiff(48); * Woodruff TSL approach to estimate the variance of the same quantity; proc sql noprint; select sum(ADJWT8*ELCNS8) into :T1 from CBECS_2003 where PBA8=16; select sum(ADJWT8*SQFT8) into :T2 from CBECS_2003 where PBA8=16; select sum(ADJWT8*ELCNS8) into :T3 from CBECS_2003 where PBA8=8; select sum(ADJWT8*SQFT8) into :T4 from CBECS_2003 where PBA8=8; create table lin_subs as select STRATUM8,PAIR8, sum((1/&T2)*(PBA8=16)*(ADJWT8*ELCNS8 - (&T1/&T2)*ADJWT8*SQFT8) - (1/&T4)*(PBA8=8 )*(ADJWT8*ELCNS8 - (&T3/&T4)*ADJWT8*SQFT8)) as u_i from CBECS_2003 group by STRATUM8,PAIR8; quit; * estimate variance of the sum of the linear substitute with respect to the design; proc surveymeans data=lin_subs varsum std; stratum STRATUM8; cluster PAIR8; var u_i; run; *** Program 9.15; title '1) Bootstrap CI Using Default DF Calculation'; proc surveymeans data=test_boot varmethod=JK mean stderr clm df; var grade2; weight weight; repweights RepWt_1-RepWt_200 / jkcoef=0.005025; run; title '2) Bootstrap CI Overriding Default DF Calculation'; proc surveymeans data=test_boot varmethod=JK mean stderr clm df; var grade2; weight weight; repweights RepWt_1-RepWt_200 / jkcoef=0.005025 DF=4; run; title; ****** Chapter 10: Weight Adjustment Methods; *** Program 10.1; title 'Sum of Base Weights for Sampling units'; proc freq data=samp; tables gender*minority / list nocum; weight weight_base; run; title 'Sum of Base Weights for Respondents'; proc freq data=samp; where respond=1; tables gender*minority / list nocum; weight weight_base; run; title; *** Program 10.2; * sum the base weights for sampling units; proc freq data=samp; tables gender*minority / out=counts_s (rename=(count=count_s)); weight weight_base; run; * sum the base weights for respondents; proc freq data=samp; where respond=1; tables gender*minority / out=counts_r (rename=(count=count_r)); weight weight_base; run; * create data set of adjustment cell weight inflation factors; data cell_factors; merge counts_s counts_r; by gender minority; factor_adj_cell=count_s/count_r; keep gender minority factor_adj_cell; run; * merge these factors into sample data set and assign a new, nonresponse-adjusted weight; proc sort data=samp; by gender minority; run; proc sort data=cell_factors; by gender minority; run; data samp; merge samp (in=a) cell_factors (in=b); by gender minority; if a; weight_adj_cell=weight_base*factor_adj_cell*(respond=1); run; * verify sum of adjusted weights for respondents match sum of base weights for the full sample; title 'Sum of Adjustment Cell Weights'; proc freq data=samp; where respond=1; tables gender*minority / list nocum; weight weight_adj_cell; run; title; *** Program 10.3; * estimate response propensities via logistic regression; proc logistic data=samp descending; class gender minority / param=ref; model respond (event='1') = age|gender|supervisor|minority @2; output out=samp p=p_hat; run; * stratify the propensities into 5 cells; proc rank data=samp out=samp groups=5; var p_hat; ranks propensity_cell; run; * sum base weights of all sampling units in a cell; proc freq data=samp; tables propensity_cell / out=counts_s (rename=(count=count_s)); weight weight_base; run; * sum base weights of responding sampling units in a cell; proc freq data=samp; where respond=1; tables propensity_cell / out=counts_r (rename=(count=count_r)); weight weight_base; run; * create data set of adjustment cell weight inflation factors; data cell_factors; merge counts_s counts_r; by propensity_cell; factor_prop_cell=count_s/count_r; keep propensity_cell factor_prop_cell; run; * merge these into sample data set and assign new, nonresponse-adjusted weight; proc sort data=samp; by propensity_cell; run; proc sort data=cell_factors; by propensity_cell; run; data samp; merge samp (in=a) cell_factors (in=b); by propensity_cell; if a; weight_adj_prop_cell=weight_base*factor_prop_cell*(respond=1); run; *** Program 10.4; * create supplemental data set with control totals; proc freq data=pop; tables gender*minority / list out=counts_p (rename=(count=_PSTOTAL_)); run; * set nonrespondent weights to missing; data samp2; set samp; if respond=0 then weight_base=.; run; proc surveymeans data=samp2 noprint; weight weight_base; poststrata gender minority / pstotal=counts_p out=samp_weight_ps; run; *** Program 10.5; * create supplemental data set with adjustment cell base-weighted totals; proc freq data=samp; tables gender*minority / out=counts_s (rename=(count=_PSTOTAL_)); weight weight_base; run; * set nonrespondent weights to missing; data samp2; set samp; * set nonrespondent weights to missing; if respond=0 then weight_base=.; run; proc surveymeans data=samp2 noprint; weight weight_base; poststrata gender minority / pstotal=counts_s out=samp_weight_adj_cell; run; *** Program 10.6; * read in same test score data set used in Chapter 8 and 9; data test; input class class_type $ homeroom grade1 tutor $ grade2 weight; datalines; 12 upper 1 87 Y 94 49.5 12 upper 1 89 N 89 49.5 12 upper 2 91 Y 94 48 12 upper 2 84 Y 92 48 11 upper 1 82 N 84 47.5 11 upper 1 94 N 95 47.5 11 upper 2 93 N 95 48 11 upper 2 94 Y 97 48 10 under 1 78 N 81 39 10 under 1 84 N 84 39 10 under 2 90 N 87 37.5 10 under 2 82 N 85 37.5 9 under 1 88 N 88 40 9 under 1 93 Y 91 40 9 under 2 77 Y 85 48 9 under 2 81 N 84 48 ; run; * create supplemental data set with class totals; data class_totals; input class _PSTOTAL_; datalines; 9 400 10 360 11 395 12 405 ; run; proc surveymeans data=test varmethod=BRR (outweights=test_BRR_ps) noprint; stratum class; cluster homeroom; weight weight; poststrata class / pstotal=class_totals; run; *** Program 10.7; * create the supplemental data sets of marginal totals; proc freq data=pop; tables gender / out=gender (drop=percent rename=(count=MRGTOTAL)); tables agecat / out=agecat (drop=percent rename=(count=MRGTOTAL)); tables minority / out=minority (drop=percent rename=(count=MRGTOTAL)); tables supervisor / out=supervisor (drop=percent rename=(count=MRGTOTAL)); run; * filter sample data set for only respondents; data respondents; set samp; where respond=1; run; * complile raking macro; * SOURCE: Izrael, D., Hoaglin, D., and Battaglia, M. (2000). "A SAS Macro for Balancing a Weighted Sample," Proceedings of the SAS Users Group International (SUGI) Conference. Available on-line at: http://www2.sas.com/proceedings/sugi25/25/st/25p258.pdf ; %MACRO RAKING (inds=, /* input data set */ outds=, /*output data set */ inwt=, /*weight to be raked */ freqlist=,/*list of data sets with marginal */ /*freqs or marginal control totals*/ /*they must contain name of raking*/ /* variable and either variable */ /* PERCENT or MRGTOTAL, e.g. direct*/ /* counts of marginal control totals;*/ /* by default it is assigned names of*/ /* raking variables */ outwt=, /* resulting raked weight */ byvar=, /* variable BY which user might want */ /* to set up raking. */ /* By default, raking is done */ /* over the whole input data set */ varlist=, /* list of variables to be raked on */ numvar=, /* number of raking variables */ cntotal=, /* general control total; it is */ /* required if freqlist data sets */ /* contains PERCENT; */ /* if byvar not empty, cntotal */ /* must be present for each byvar */ trmprec=1, /* tolerance,default is 1 */ numiter=50);/*number of iterations,default is 50*/ %macro reqpar (param); /** checking on required **/ %if (&&¶m eq ) %then %do; /** parameters **/ %put **** Program terminated: Macro parameter %upcase(&PARAM) missing ****; endsas; %end; %mend; %reqpar(inds) %reqpar(outds) %reqpar (inwt) %reqpar(outwt) %reqpar(varlist) %reqpar(numvar) %reqpar(trmprec) %reqpar(numiter) %let varlist=%upcase(&varlist); %let byvar=%upcase(&byvar); %let ls=%sysfunc(getoption(ls,keyword)); %let ps=%sysfunc(getoption(ps,keyword)); /* checking on number of raking variables */ %if (%scan(&varlist,&numvar) eq ) or (%scan(&varlist,%eval(&numvar+1)) ne ) %then %do; %put **** Program terminated: Number of variables in the VARLIST ****; %put **** does not match NUMVAR ****; endsas; %end; data __i0; set &inds; weight=&inwt; run; %do i=1 %to &numiter; /* loop on iteration */ %let sumterm = 0; /* cumulative termin flags*/ %do j=1 %to &numvar;/* loop on raking variable */ %let varrake= %scan(&varlist,&j); %if (&freqlist ne ) %then %let dsfreq=%scan(&freqlist,&j); %else %let dsfreq=&varrake; proc sort data=__i0; by &varrake; run; proc summary nway data=__i0 ; *** calculate; class &varrake; *** adjusted; var weight; ** weighted total; output out=__i1(drop=_type_ _freq_) sum=sum&j; run; data __i0; ** merge with marginal data set; merge __i0(in=_1) __i1 &dsfreq(in=_2); by &varrake; %if &i=1 %then %do; if (_1 and ^_2) or (_2 and ^_1) then do; call symput('match','1');stop;end; else call symput('match','2'); if mrgtotal ne . then call symput('mrg','1'); else call symput('mrg','2'); if percent ne . then call symput ('pct','1'); else call symput('pct','2'); %end; run; %if &i=1 %then %do; %if &match=1 %then %do; %put **** Program terminated: levels of variable &varrake do not match ****; %put ****in sample and marginal totals data sets ****; endsas; %end; %if &pct = 1 and (&cntotal eq ) %then %do; %put ** Program terminated: PERCENT is not missing and CNTOTAL is missing **; %put ** for raking variable &varrake **; endsas; %end; %else %if &pct=2 and &mrg=2 %then %do; %put **** Program terminated: Both PERCENT and MRGTOTAL are missing ****; endsas; %end; %end; data __i0; set __i0; %if (&cntotal ne ) %then %do; if mrgtotal ne . then /*case of marginal totals*/ cntmarg=mrgtotal; else if percent ne . then /*case of marginal freqs*/ cntmarg=&cntotal.*percent/100; %end; %else %do; if mrgtotal ne . then /*case of marginal totals*/ cntmarg=mrgtotal; %end; weight=weight*cntmarg/sum&j;***actual raking; drop percent mrgtotal; run; data __i2(keep=&varrake sum&j cntmarg differ); set __i0; by &varrake; if first.&varrake; differ=cntmarg-sum&j; run; proc print label data=__i2; **print diagnostics; %if (&byvar ne) %then %do; title3 "Raking &byvar - &s by &varrake, iteration - &i"; %end; %else %do; title3 "Raking by &varrake, iteration - &i "; %end; sum sum&j cntmarg; label sum&j ='Calculated margin' differ='Difference' cntmarg='Margin Control Total'; run; data __i2; set __i2 end=eof; retain comm 0; *** termination test; if abs( differ)>&trmprec then comm=1; if eof and comm=1 then call symput("term&j",'2'); ** continue with iterations; else if eof then call symput("term&j",'1'); ** convergence ; run; ** achieved; %let sumterm=%eval(&sumterm+&&term&j); %end; data __i0; set __i0; drop sum1-sum&numvar; %if (&byvar ne ) %then %put &byvar=&s iteration=&i numvar=&numvar sumterm=&sumterm; %else %put numvar=&numvar sumterm=&sumterm; %if &sumterm=&numvar or &i=&numiter %then %do; /** termination criterion met **/ %if (&byvar ne ) %then %put **** Terminated &byvar &s at &i-th iteration; %else %put *** Task terminated at &i-th iteration ****; title3 ' '; data _null_; *** write diagostics to listing; set __i0; if _n_=1; file print &ps &ls; put ' '; %if &sumterm=&numvar %then %do; ** convergence; %if (&byvar ne ) %then %do; ** achieved; put "**** Program for &byvar &s terminated at iteration &i because all calculated margins"; %end; %else %do; put "**** Program terminated at iteration &i because all calculated margins"; %end; put "differ from Marginal Control Totals by less than &trmprec"; run; %end; %else %do; ** no convergence ; %if (&byvar ne ) %then %do; put "**** Program for &byvar &s terminated at iteration &i"; %end; %else %do; put "**** Program terminated at iteration &i"; %end; put "**** No convergence achieved"; %end; data &outds(drop=cntmarg); *** write output; set __i0; *** data set; rename weight=&outwt; %let i=&numiter; %end; %end; proc datasets library=work nolist mt=data; delete __i0 __i1 __i2; ** purge work data; run; quit; %mend; * execute the macro; %RAKING( inds=respondents, outds=respondents_raked, inwt=weight_base, freqlist=gender agecat minority supervisor, outwt=weight_raked, varlist=gender agecat minority supervisor, numvar=4, cntotal=, trmprec=1, numiter=50 ); ****** Chapter 11: Imputation Methods; *** Figure 11.1; data respondents_toplot; set samp; * pick a random sample of about 100 data points; if respond and 200 le _N_ le 400; if (los - 20) gt age then los=los-3*abs(rannor(4220)); run; proc sgplot data=respondents_toplot noautolegend; reg y=los x=age / markerattrs=(size=5 symbol=circlefilled color=black) lineattrs=(color=black pattern=5 thickness=3); xaxis label='Age of Employee'; yaxis label='Length of Service'; run; quit; *** Figure 11.2; proc sgplot data=respondents_toplot noautolegend; vbar gender / response=Q1 stat=mean fillattrs=(color=black); xaxis label='Employee Gender'; yaxis label='Proportion Reacting Positively to Q1'; run; *** Program 11.1; proc mi data=samp out=samp_MI_5 nimpute=5 seed=943222; class gender minority; var gender minority age supervisor los; monotone reg (los=age supervisor gender minority / details); run; *** Program 11.2; proc mi data=samp nimpute=5 seed=7000726 out=samp_MI_5; class gender minority Q1; var gender minority age supervisor Q1; monotone logistic (Q1=gender minority age supervisor / details); run; *** Program 11.3; proc mi data=samp nimpute=5 seed=655231 out=samp_MI_5; class gender minority Q1_full; var gender minority age supervisor Q1_full; monotone logistic (Q1_full=gender minority age supervisor / details); run; *** Program 11.4; proc mi data=samp nimpute=5 seed=609210 out=samp_MI_5; class gender minority Q1_full; var gender minority age supervisor Q1_full; monotone logistic (Q1_full=gender minority age supervisor / link=glogit details); run; *** Program 11.5; * get counts of observed and missing cases within cells; proc sql; create table counts_obs as select gender,minority,count(*) as _NSIZE_ from samp where Q1_full ne . group by gender,minority; create table counts_miss as select gender,minority,count(*) as _NSIZE_ from samp where Q1_full = . group by gender,minority; quit; * select donors within cells; proc sort data=samp; by gender minority; run; * first step of ABB is to select WR sample of observed cases; proc surveyselect data=samp (where=(Q1_full ne .)) sampsize=counts_obs method=URS seed=5441254 reps=5 outhits out=ABB_part1 (keep=replicate gender minority Q1_full); strata gender minority; run; * second step is to select WR sample of missing cases; * create supplemental sample size file with the M=5 replicate codes; data counts_miss; set counts_miss; do replicate=1 to 5; output; end; run; proc sort data=counts_miss; by replicate gender minority; run; proc sort data=ABB_part1; by replicate gender minority; run; * select the sample of donors; proc surveyselect data=ABB_part1 sampsize=counts_miss method=URS seed=8744063 outhits out=ABB_part2 (keep=replicate gender minority Q1_full); strata replicate gender minority; run; * randomly sort donor values within a cell; data ABB_part2; set ABB_part2; random=ranuni(34234433); run; proc sort data=ABB_part2 out=ABB_part2 (drop=random); by replicate gender minority random; run; * merge donor values into SAMP to create M=5 completed data sets; data samp_MI_5; set samp; do replicate=1 to 5; output; end; run; * sort stacked completed data sets such that missing values appear first; proc sort data=samp_MI_5; by replicate gender minority Q1_full; run; data samp_MI_5; merge samp_MI_5 ABB_part2 (in=b); by replicate gender minority; run; *** Program 11.6; proc mi data=samp nimpute=5 seed=655231 out=samp_MI_5; class gender minority; var gender minority age supervisor Q1_full; monotone propensity (Q1_full=age|gender|supervisor|minority @2); run; *** Program 11.7; proc mi data=samp nimpute=5 seed=2311109 out=samp_MI_5; class gender minority; var gender minority age supervisor LOS; monotone regpmm (LOS=age gender supervisor minority / k=15); run; *** Program 11.8; proc mi data=samp nimpute=5 seed=2311002 out=samp_MI_5; class gender minority Q1; var age supervisor gender minority LOS Q1; monotone reg (LOS / details); monotone logistic (Q1 / details); run; *** Program 11.9; data samp_arb; set samp; if LOS ne . and ranuni(882377) < .25 then LOS=.; if Q1 ne . and ranuni(243113) < .1 then Q1=.; run; proc mi data=samp_arb nimpute=5 seed=873772 out=samp_MI_5; class gender minority Q1; var age supervisor gender minority Q1 LOS; fcs reg (LOS / details); fcs logistic (Q1 / details); run; *** Program 11.10; * replicating the predictive mean matching imputation model approach from Program 11.7; proc mi data=samp nimpute=5 seed=2311109 out=samp_MI_5; class gender minority; var gender minority age supervisor LOS; monotone regpmm (LOS=age gender supervisor minority / k=15); run; * compute and store the M estimated means and standard errors of LOS; proc sort data=samp_MI_5; by _IMPUTATION_; run; ods output statistics=stats_MI_5; proc surveymeans data=samp_MI_5 mean stderr; by _IMPUTATION_; var LOS; weight weight_base; run; * get the overall MI estimate and standard error; proc mianalyze data=stats_MI_5; modeleffects mean; stderr stderr; run; *** Program 11.11; * replicating the logistic regression imputation model approach from Program 11.2; proc mi data=samp nimpute=5 seed=7000726 out=samp_MI_5; class gender minority Q1; var gender minority age supervisor Q1; monotone logistic (Q1=gender minority age supervisor / details); run; ods output ParameterEstimates=betas covb=cov_matrix; proc surveylogistic data=samp_MI_5; by _IMPUTATION_; class gender minority; model Q1(event='1') = gender minority age supervisor / covb; run; * rename the variables in cov_matrix to match those in betas; data cov_matrix; set cov_matrix; if Parameter='GENDERF' then Parameter='GENDER'; if Parameter='MinorityN' then Parameter='Minority'; rename GENDERF = GENDER MinorityN = Minority; run; proc mianalyze parms=betas covb=cov_matrix; modeleffects intercept gender minority age supervisor; run;